首页
社区
课程
招聘
[旧帖] for循环算法优化 0.00雪花
发表于: 2016-9-14 16:32 2924

[旧帖] for循环算法优化 0.00雪花

2016-9-14 16:32
2924
for一:
    /**
     * Copies a centered {@code ow x oh} window out of a semi-planar YUV 4:2:0 frame
     * (a full-resolution Y plane followed by one plane of interleaved chroma byte pairs).
     *
     * <p>Every cropped row is contiguous in both buffers, so rows are moved with
     * {@link System#arraycopy} instead of one byte at a time, and the source/destination
     * offsets are advanced by addition rather than recomputed with a multiply per pixel.
     *
     * <p>NOTE(review): the chroma window starts at {@code (ih - oh) / 4} and
     * {@code (iw - ow) / 4}, i.e. the floor of half the luma start. When the luma start
     * is odd, chroma is therefore taken one luma pixel earlier than the luma window.
     * Confirm callers use size differences that are multiples of 4 if exact alignment matters.
     *
     * @param input  source frame; read as {@code iw * ih} Y bytes followed by chroma rows of {@code iw} bytes
     * @param iw     source width in pixels
     * @param ih     source height in pixels
     * @param output destination buffer; receives {@code ow * oh} Y bytes followed by the cropped chroma pairs
     * @param ow     cropped width in pixels
     * @param oh     cropped height in pixels
     * @return the {@code output} array that was passed in
     */
    private byte[] cropYUV420SemiPlannerFrame(byte[] input, int iw, int ih, byte[] output, int ow, int oh) {
        int iFrameSize = iw * ih;
        int oFrameSize = ow * oh;

        // Y plane: one bulk copy per output row.
        if (ow > 0) {
            int src = iw * ((ih - oh) / 2) + (iw - ow) / 2;
            int dst = 0;
            for (int r = 0; r < oh; r++) {
                System.arraycopy(input, src, output, dst, ow);
                src += iw;
                dst += ow;
            }
        }

        // Interleaved chroma plane: ow / 2 byte pairs per row, oh / 2 rows.
        int chromaRowBytes = 2 * (ow / 2);
        if (chromaRowBytes > 0) {
            int chromaRows = oh / 2;
            int src = iFrameSize + iw * ((ih - oh) / 4) + 2 * ((iw - ow) / 4);
            int dst = oFrameSize;
            for (int r = 0; r < chromaRows; r++) {
                System.arraycopy(input, src, output, dst, chromaRowBytes);
                src += iw;
                dst += chromaRowBytes;
            }
        }

        return output;
    }


for二:
    /**
     * Copies a centered {@code ow x oh} window out of a planar YUV 4:2:0 frame
     * (a full-resolution Y plane followed by two quarter-size chroma planes).
     *
     * <p>Every cropped row is contiguous in both buffers, so rows are moved with
     * {@link System#arraycopy} instead of one byte at a time. The two chroma planes share
     * the same window geometry and are handled in a single loop.
     *
     * <p>NOTE(review): the chroma window starts at {@code (ih - oh) / 4} and
     * {@code (iw - ow) / 4}, i.e. the floor of half the luma start. When the luma start
     * is odd, chroma is therefore taken one luma pixel earlier than the luma window.
     * Confirm callers use size differences that are multiples of 4 if exact alignment matters.
     *
     * @param input  source frame; read as {@code iw * ih} Y bytes, then two chroma planes of
     *               {@code iw * ih / 4} bytes each with a row stride of {@code iw / 2}
     * @param iw     source width in pixels
     * @param ih     source height in pixels
     * @param output destination buffer; receives {@code ow * oh} Y bytes, then the first cropped chroma
     *               plane at {@code ow * oh} and the second at {@code ow * oh + ow * oh / 4}
     * @param ow     cropped width in pixels
     * @param oh     cropped height in pixels
     * @return the {@code output} array that was passed in
     */
    private byte[] cropYUV420PlannerFrame(byte[] input, int iw, int ih, byte[] output, int ow, int oh) {
        int iFrameSize = iw * ih;
        int iQFrameSize = iFrameSize / 4;
        int oFrameSize = ow * oh;
        int oQFrameSize = oFrameSize / 4;

        // Y plane: one bulk copy per output row.
        if (ow > 0) {
            int src = iw * ((ih - oh) / 2) + (iw - ow) / 2;
            int dst = 0;
            for (int r = 0; r < oh; r++) {
                System.arraycopy(input, src, output, dst, ow);
                src += iw;
                dst += ow;
            }
        }

        // Both chroma planes: ow / 2 bytes per row, oh / 2 rows, source stride iw / 2.
        int chromaCols = ow / 2;
        if (chromaCols > 0) {
            int chromaRows = oh / 2;
            int srcStride = iw / 2;
            int srcU = iFrameSize + srcStride * ((ih - oh) / 4) + (iw - ow) / 4;
            int srcV = srcU + iQFrameSize;
            int dstU = oFrameSize;
            int dstV = oFrameSize + oQFrameSize;
            for (int r = 0; r < chromaRows; r++) {
                System.arraycopy(input, srcU, output, dstU, chromaCols);  // U
                System.arraycopy(input, srcV, output, dstV, chromaCols);  // V
                srcU += srcStride;
                srcV += srcStride;
                dstU += chromaCols;
                dstV += chromaCols;
            }
        }

        return output;
    }


for三:
    /**
     * Transposes a semi-planar YUV 4:2:0 frame column by column, reading each source column
     * from the bottom row upwards. Read as a {@code height}-wide, {@code width}-tall image,
     * the result is the source rotated 90 degrees clockwise.
     *
     * <p>The two bytes of every interleaved chroma pair are written in swapped order: the
     * first source byte (labelled Cb/U) lands at the odd output position and the second
     * (labelled Cr/V) at the even one.
     *
     * @param input  source frame: {@code width * height} Y bytes followed by interleaved chroma rows
     * @param output destination buffer, filled in the same layout with the rotated data
     * @param width  source width in pixels
     * @param height source height in pixels
     * @return the {@code output} array that was passed in
     */
    private byte[] rotateYUV420SemiPlannerFrame(byte[] input, byte[] output, int width, int height) {
        final int frameSize = width * height;

        // Y plane: walk each source column upwards, stepping back one row stride at a time.
        int dst = 0;
        final int bottomRowStart = width * (height - 1);
        for (int x = 0; x < width; x++) {
            int src = bottomRowStart + x;
            for (int y = height - 1; y >= 0; y--) {
                output[dst++] = input[src];
                src -= width;
            }
        }

        // Chroma plane: same traversal over byte pairs, swapping the two bytes of each pair.
        final int halfWidth = width / 2;
        final int halfHeight = height / 2;
        final int bottomChromaRowStart = frameSize + width * (halfHeight - 1);
        int pairDst = frameSize;
        for (int cx = 0; cx < halfWidth; cx++) {
            int src = bottomChromaRowStart + cx * 2;
            for (int cy = halfHeight - 1; cy >= 0; cy--) {
                output[pairDst + 1] = input[src];     // Cb (U)
                output[pairDst] = input[src + 1];     // Cr (V)
                pairDst += 2;
                src -= width;
            }
        }

        return output;
    }


for四:
    /**
     * Transposes a planar YUV 4:2:0 frame column by column, reading each source column from
     * the bottom row upwards. Read as a {@code height}-wide, {@code width}-tall image, the
     * result is the source rotated 90 degrees clockwise.
     *
     * <p>The two chroma planes change places on the way: the first output chroma plane is
     * filled from the second source chroma plane and vice versa.
     *
     * @param input  source frame: {@code width * height} Y bytes, then two chroma planes of
     *               {@code width * height / 4} bytes each with a row stride of {@code width / 2}
     * @param output destination buffer, filled in the same layout with the rotated data
     * @param width  source width in pixels
     * @param height source height in pixels
     * @return the {@code output} array that was passed in
     */
    private byte[] rotateYUV420PlannerFrame(byte[] input, byte[] output, int width, int height) {
        final int frameSize = width * height;
        final int qFrameSize = frameSize / 4;
        final int halfWidth = width / 2;
        final int halfHeight = height / 2;

        // Y plane: walk each source column upwards, stepping back one row stride at a time.
        int dst = 0;
        final int bottomRowStart = width * (height - 1);
        for (int x = 0; x < width; x++) {
            int src = bottomRowStart + x;
            for (int y = height - 1; y >= 0; y--) {
                output[dst++] = input[src];
                src -= width;
            }
        }

        // First output chroma plane, sourced from the second input chroma plane.
        final int bottomChromaRow = halfWidth * (halfHeight - 1);
        dst = frameSize;
        for (int cx = 0; cx < halfWidth; cx++) {
            int src = frameSize + qFrameSize + bottomChromaRow + cx;
            for (int cy = halfHeight - 1; cy >= 0; cy--) {
                output[dst++] = input[src]; // Cb (U)
                src -= halfWidth;
            }
        }

        // Second output chroma plane, sourced from the first input chroma plane.
        dst = frameSize + qFrameSize;
        for (int cx = 0; cx < halfWidth; cx++) {
            int src = frameSize + bottomChromaRow + cx;
            for (int cy = halfHeight - 1; cy >= 0; cy--) {
                output[dst++] = input[src]; // Cr (V)
                src -= halfWidth;
            }
        }

        return output;
    }


for五:
    /**
     * Copies a semi-planar YUV 4:2:0 frame without changing its geometry, swapping the two
     * bytes of every interleaved chroma pair along the way.
     *
     * <p>The Y plane is an unmodified contiguous copy, so it is moved with a single
     * {@link System#arraycopy} call instead of a per-byte nested loop. The chroma loop keeps
     * running source/destination indices rather than recomputing them with multiplications.
     *
     * @param input  source frame: {@code width * height} Y bytes followed by interleaved chroma rows
     *               of {@code width} bytes
     * @param output destination buffer; receives the Y plane unchanged, then {@code width / 2} swapped
     *               chroma pairs for each of the {@code height / 2} chroma rows
     * @param width  frame width in pixels
     * @param height frame height in pixels
     * @return the {@code output} array that was passed in
     */
    private byte[] unrotateYUV420SemiPlannerFrame(byte[] input, byte[] output, int width, int height) {
        int frameSize = width * height;

        // Y plane: straight copy. Guarded so non-positive dimensions stay a no-op.
        if (width > 0 && height > 0) {
            System.arraycopy(input, 0, output, 0, frameSize);
        }

        // Chroma plane: copy pair by pair, exchanging the two bytes of each pair.
        int halfWidth = width / 2;
        int halfHeight = height / 2;
        int dst = frameSize;
        for (int row = 0; row < halfHeight; row++) {
            int src = frameSize + width * row;
            for (int col = 0; col < halfWidth; col++) {
                output[dst + 1] = input[src];     // Cb (U)
                output[dst] = input[src + 1];     // Cr (V)
                dst += 2;
                src += 2;
            }
        }

        return output;
    }


for六:
    /**
     * Copies a planar YUV 4:2:0 frame without changing its geometry, exchanging the two
     * chroma planes along the way.
     *
     * <p>All three planes are unmodified contiguous runs, so each is moved with one
     * {@link System#arraycopy} call instead of a per-byte nested loop.
     *
     * @param input  source frame: {@code width * height} Y bytes, then two chroma planes located at
     *               {@code width * height} and {@code width * height + width * height / 4}
     * @param output destination buffer; receives the Y plane unchanged, the second source chroma plane
     *               at {@code width * height}, and the first at {@code width * height + width * height / 4}
     * @param width  frame width in pixels
     * @param height frame height in pixels
     * @return the {@code output} array that was passed in
     */
    private byte[] unrotateYUV420PlannerFrame(byte[] input, byte[] output, int width, int height) {
        int frameSize = width * height;
        int qFrameSize = frameSize / 4;

        // Y plane: straight copy. Guarded so non-positive dimensions stay a no-op.
        if (width > 0 && height > 0) {
            System.arraycopy(input, 0, output, 0, frameSize);
        }

        // Chroma planes: (height / 2) rows of (width / 2) bytes each, stored back to back.
        int halfWidth = width / 2;
        int halfHeight = height / 2;
        if (halfWidth > 0 && halfHeight > 0) {
            int chromaBytes = halfWidth * halfHeight;
            System.arraycopy(input, frameSize + qFrameSize, output, frameSize, chromaBytes); // Cb (U)
            System.arraycopy(input, frameSize, output, frameSize + qFrameSize, chromaBytes); // Cr (V)
        }

        return output;
    }


for七:
    /**
     * Mirrors a semi-planar YUV 4:2:0 frame left-to-right: every row is written out in
     * reverse column order.
     *
     * <p>In the interleaved chroma plane the pairs are reversed as whole units; the two
     * bytes inside each pair keep their order.
     *
     * @param input  source frame: {@code width * height} Y bytes followed by interleaved chroma rows
     *               of {@code width} bytes
     * @param output destination buffer, filled in the same layout with the mirrored data
     * @param width  frame width in pixels
     * @param height frame height in pixels
     * @return the {@code output} array that was passed in
     */
    private byte[] flipYUV420SemiPlannerFrame(byte[] input, byte[] output, int width, int height) {
        final int frameSize = width * height;

        // Y plane: read each row right-to-left, write left-to-right.
        int dst = 0;
        int rowStart = 0;
        for (int y = 0; y < height; y++) {
            for (int x = width - 1; x >= 0; x--) {
                output[dst++] = input[rowStart + x];
            }
            rowStart += width;
        }

        // Chroma plane: same mirroring applied to two-byte pairs.
        final int halfWidth = width / 2;
        final int halfHeight = height / 2;
        int pairDst = frameSize;
        int chromaRowStart = frameSize;
        for (int cy = 0; cy < halfHeight; cy++) {
            for (int cx = halfWidth - 1; cx >= 0; cx--) {
                int src = chromaRowStart + cx * 2;
                output[pairDst] = input[src];         // Cb (U)
                output[pairDst + 1] = input[src + 1]; // Cr (V)
                pairDst += 2;
            }
            chromaRowStart += width;
        }

        return output;
    }


for八:
    /**
     * Mirrors a planar YUV 4:2:0 frame left-to-right: every row of the Y plane and of both
     * chroma planes is written out in reverse column order. The planes keep their positions.
     *
     * @param input  source frame: {@code width * height} Y bytes, then two chroma planes of
     *               {@code width * height / 4} bytes each with a row stride of {@code width / 2}
     * @param output destination buffer, filled in the same layout with the mirrored data
     * @param width  frame width in pixels
     * @param height frame height in pixels
     * @return the {@code output} array that was passed in
     */
    private byte[] flipYUV420PlannerFrame(byte[] input, byte[] output, int width, int height) {
        final int frameSize = width * height;
        final int qFrameSize = frameSize / 4;
        final int halfWidth = width / 2;
        final int halfHeight = height / 2;

        // Y plane: read each row right-to-left, write left-to-right.
        int dst = 0;
        int rowStart = 0;
        for (int y = 0; y < height; y++) {
            for (int x = width - 1; x >= 0; x--) {
                output[dst++] = input[rowStart + x];
            }
            rowStart += width;
        }

        // First chroma plane.
        dst = frameSize;
        rowStart = frameSize;
        for (int cy = 0; cy < halfHeight; cy++) {
            for (int cx = halfWidth - 1; cx >= 0; cx--) {
                output[dst++] = input[rowStart + cx]; // Cr (V)
            }
            rowStart += halfWidth;
        }

        // Second chroma plane.
        dst = frameSize + qFrameSize;
        rowStart = frameSize + qFrameSize;
        for (int cy = 0; cy < halfHeight; cy++) {
            for (int cx = halfWidth - 1; cx >= 0; cx--) {
                output[dst++] = input[rowStart + cx]; // Cb (U)
            }
            rowStart += halfWidth;
        }

        return output;
    }


本人只会把乘除法改为移位运算,但速度还是不够快,请大家帮忙,先谢谢了.

[招生]科锐逆向工程师培训(2024年11月15日实地,远程教学同时开班, 第51期)

收藏
免费 0
支持
分享
最新回复 (6)
雪    币: 12848
活跃值: (9147)
能力值: ( LV9,RANK:280 )
在线值:
发帖
回帖
粉丝
2
这种用gpu去算比较快吧,写个shader分分钟的事情
2016-9-15 11:43
0
雪    币: 26
活跃值: (27)
能力值: ( LV2,RANK:10 )
在线值:
发帖
回帖
粉丝
3
gpu老爷机容易直接跑死....
2016-9-16 20:07
0
雪    币: 26
活跃值: (10)
能力值: ( LV2,RANK:10 )
在线值:
发帖
回帖
粉丝
4
2楼的思路非常好,但本人对GPU也是一知半解
2016-9-18 09:06
0
雪    币: 31
活跃值: (12)
能力值: ( LV2,RANK:10 )
在线值:
发帖
回帖
粉丝
5
没细看,但至少你第一段里用 memcpy 会快很多,系统会尝试使用 SSE 来加速

另外,你的循环写法也有问题:拷贝图像通常按行进行,也就是以一个 stride 为步长,每次只移动行首地址;而你这里每个像素都做一次乘法。虽然我相信这种程度的运算编译器会优化,但最好还是自己改一下。

另外,零散的复制尽量按 dword(4 字节)拷贝,而不是按 byte;即便不检查对齐,始终用 dword 也比始终用 byte 快得多
2016-9-18 18:31
0
雪    币: 12848
活跃值: (9147)
能力值: ( LV9,RANK:280 )
在线值:
发帖
回帖
粉丝
6
不知道你这段代码是用在哪里的。如果是用来输出视频,那直接用opengl+glsl写个shader就可以了,光是输出视频的话shader挺好写的,c语言语法。

如果是转码的话建议你还是老老实实优化代码。
2016-9-18 20:09
0
雪    币: 26
活跃值: (10)
能力值: ( LV2,RANK:10 )
在线值:
发帖
回帖
粉丝
7
需要怎么优化呢? 有没有方法把for循环搞少一个?
2016-9-19 10:25
0
游客
登录 | 注册 方可回帖
返回
//