图像二值化的指令集加速例子-526互联

以下代码基于 VS2015、Qt 5.9 和 OpenCV 4.3.0，CPU 型号是 Intel Core i5-7400。功能是对图像进行二值化，并分别统计 OpenCV 的 threshold、普通循环、SSE 指令集和 AVX 指令集四种实现的耗时。下面直接上代码：

// Benchmarks four ways of binarizing an 8-bit single-channel image with a
// fixed threshold of 127 (pixel > 127 -> 255, otherwise 0):
//   1. OpenCV's threshold()
//   2. a plain scalar loop
//   3. 128-bit SIMD, 8 pixels per iteration
//   4. 256-bit SIMD, 16 pixels per iteration
// Each variant is timed with getTickCount() and the elapsed time in
// milliseconds is printed through qDebug(). The scalar and both SIMD passes
// all write into binar2, so each pass overwrites the previous result.
//
// Returns 0. (ISO C++ requires main to return int; `void main()` is a
// compiler extension.)
int main()
{
    Mat image(1024, 1024, CV_8UC1, Scalar(255));
    circle(image, Point2i(500, 500), 200, Scalar(0), -1);
    int64 t1, t2;
    Mat binar1(image.size(), image.type());
    Mat binar2(image.size(), image.type());
    // Make sure the pixel buffers are 32-byte aligned.
    ASSERT(int64(image.data) % 32 == 0);
    ASSERT(int64(binar1.data) % 32 == 0);
    ASSERT(int64(binar2.data) % 32 == 0);

    // Take the loop bounds from the image instead of repeating the literal
    // 1024, so the loops stay correct if the test image size is changed.
    const int rows = image.rows;
    const int cols = image.cols;

    // --- 1. OpenCV reference implementation ---
    t1 = getTickCount();
    threshold(image, binar1, 127, 255, THRESH_BINARY);
    t2 = getTickCount();
    qDebug() << u8"OPENCV(ms):" << (t2 - t1) / getTickFrequency() * 1000;

    // --- 2. Plain scalar loop ---
    t1 = getTickCount();
    for (int i = 0; i < rows; i++)
    {
        const uchar* line = image.ptr<uchar>(i);
        uchar* dest = binar2.ptr<uchar>(i);
        for (int j = 0; j < cols; j++)
        {
            dest[j] = line[j] > 127 ? 255 : 0;
        }
    }
    t2 = getTickCount();
    qDebug() << u8"NONE(ms):" << (t2 - t1) / getTickFrequency() * 1000;

    // --- 3. 128-bit SIMD: 8 pixels per iteration ---
    t1 = getTickCount();
    // Threshold broadcast to all eight 16-bit lanes.
    const __m128i m128t = _mm_set1_epi16(127);
    // Shuffle mask: gather the low byte of each 16-bit lane into the low
    // 8 bytes; -1 entries zero the upper 8 bytes.
    const __m128i m128h = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0);
    // Number of columns that can be processed in whole groups of 8.
    const int sseCols = cols - cols % 8;
    for (int i = 0; i < rows; i++)
    {
        const uchar* line = image.ptr<uchar>(i);
        uchar* dest = binar2.ptr<uchar>(i);
        int j = 0;
        for (; j < sseCols; j += 8)
        {
            // Load 8 pixels into the low half of the register (upper half is
            // zeroed). Using the load intrinsic avoids reading uchar data
            // through an int64 pointer.
            const __m128i mmx08 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(line + j));
            // Zero-extend to 16 bits so the signed compare handles 128..255.
            const __m128i mmx16 = _mm_cvtepu8_epi16(mmx08);
            // 127 < pixel -> 0xFFFF, else 0x0000.
            const __m128i res = _mm_cmplt_epi16(m128t, mmx16);
            // Narrow each 16-bit result back to one byte (0xFF or 0x00).
            const __m128i half = _mm_shuffle_epi8(res, m128h);
            // Store the low 8 bytes.
            _mm_storel_epi64(reinterpret_cast<__m128i*>(dest + j), half);
        }
        // Scalar tail for widths that are not a multiple of 8.
        for (; j < cols; j++)
        {
            dest[j] = line[j] > 127 ? 255 : 0;
        }
    }
    t2 = getTickCount();
    qDebug() << u8"SSE(ms):" << (t2 - t1) / getTickFrequency() * 1000;

    // --- 4. 256-bit SIMD: 16 pixels per iteration ---
    t1 = getTickCount();
    const __m256i m256t = _mm256_set1_epi16(127);
    // Same byte-gathering mask as above, repeated for each 128-bit lane
    // because the 256-bit byte shuffle works within each lane separately.
    const __m256i m256h = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0,
        -1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0);
    // Number of columns that can be processed in whole groups of 16.
    const int avxCols = cols - cols % 16;
    for (int i = 0; i < rows; i++)
    {
        const uchar* line = image.ptr<uchar>(i);
        uchar* dest = binar2.ptr<uchar>(i);
        int j = 0;
        for (; j < avxCols; j += 16)
        {
            // Load 16 pixels (unaligned load, no alignment requirement).
            const __m128i mmx08 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(line + j));
            const __m256i mmx16 = _mm256_cvtepu8_epi16(mmx08);
            const __m256i res = _mm256_cmpgt_epi16(mmx16, m256t);
            const __m256i half = _mm256_shuffle_epi8(res, m256h);
            // After the shuffle, pixels 0..7 sit in the low 8 bytes of the
            // lower 128-bit lane and pixels 8..15 in the low 8 bytes of the
            // upper lane; store each group of 8 bytes separately.
            _mm_storel_epi64(reinterpret_cast<__m128i*>(dest + j), _mm256_castsi256_si128(half));
            _mm_storel_epi64(reinterpret_cast<__m128i*>(dest + j + 8), _mm256_extracti128_si256(half, 1));
        }
        // Scalar tail for widths that are not a multiple of 16.
        for (; j < cols; j++)
        {
            dest[j] = line[j] > 127 ? 255 : 0;
        }
    }
    t2 = getTickCount();
    qDebug() << u8"AVX(ms):" << (t2 - t1) / getTickFrequency() * 1000;

    return 0;
}

在 Release 版下执行 50 次的输出如下（单位为毫秒）。从这一批输出可以看出，AVX 优化版本在大多数轮次中都比 OpenCV 的 threshold 更快，但并非每一轮都如此：

OPENCV(ms): 2.0732
NONE(ms): 0.7314
SSE(ms): 0.2543
AVX(ms): 0.2199
OPENCV(ms): 0.4455
NONE(ms): 0.7666
SSE(ms): 0.293
AVX(ms): 0.179
OPENCV(ms): 0.6254
NONE(ms): 0.8789
SSE(ms): 0.2223
AVX(ms): 0.1512
OPENCV(ms): 0.4486
NONE(ms): 0.7306
SSE(ms): 0.2154
AVX(ms): 0.175
OPENCV(ms): 0.5774
NONE(ms): 2.3402
SSE(ms): 0.2871
AVX(ms): 0.2766
OPENCV(ms): 0.3737
NONE(ms): 0.7787
SSE(ms): 0.3047
AVX(ms): 0.3284
OPENCV(ms): 0.3145
NONE(ms): 0.7349
SSE(ms): 0.3549
AVX(ms): 0.3025
OPENCV(ms): 0.4318
NONE(ms): 0.7679
SSE(ms): 2.4315
AVX(ms): 0.2681
OPENCV(ms): 0.3959
NONE(ms): 0.9343
SSE(ms): 0.3756
AVX(ms): 0.439
OPENCV(ms): 0.3512
NONE(ms): 2.4505
SSE(ms): 0.377
AVX(ms): 0.2237
OPENCV(ms): 0.5284
NONE(ms): 0.7935
SSE(ms): 0.4699
AVX(ms): 0.2633
OPENCV(ms): 0.4671
NONE(ms): 0.8124
SSE(ms): 0.2919
AVX(ms): 0.2929
OPENCV(ms): 0.5293
NONE(ms): 0.7665
SSE(ms): 0.3181
AVX(ms): 0.408
OPENCV(ms): 0.6264
NONE(ms): 0.8933
SSE(ms): 0.2657
AVX(ms): 0.3929
OPENCV(ms): 0.5343
NONE(ms): 0.8591
SSE(ms): 0.3004
AVX(ms): 0.8155
...<输出太多删除一部分>
OPENCV(ms): 0.3946
NONE(ms): 1.2074
SSE(ms): 0.3121
AVX(ms): 0.3349
OPENCV(ms): 0.6635
NONE(ms): 0.8499
SSE(ms): 0.2915
AVX(ms): 0.3152
OPENCV(ms): 0.6398
NONE(ms): 0.9685
SSE(ms): 0.3917
AVX(ms): 0.2999
OPENCV(ms): 0.3454
NONE(ms): 0.9082
SSE(ms): 0.3983
AVX(ms): 0.3385
OPENCV(ms): 0.3415
NONE(ms): 1.035
SSE(ms): 0.3842
AVX(ms): 0.2633
OPENCV(ms): 0.4105
NONE(ms): 1.1947
SSE(ms): 0.3958
AVX(ms): 0.3525
OPENCV(ms): 0.612
NONE(ms): 0.9998
SSE(ms): 0.3176
AVX(ms): 0.3837
OPENCV(ms): 0.4727
NONE(ms): 0.8645
SSE(ms): 0.2794
AVX(ms): 0.2068
OPENCV(ms): 0.6206
NONE(ms): 0.9266
SSE(ms): 0.3822
AVX(ms): 0.3107
OPENCV(ms): 0.6847
NONE(ms): 0.9386
SSE(ms): 0.3073
AVX(ms): 0.4238
OPENCV(ms): 0.4841
NONE(ms): 1.002
SSE(ms): 0.2424
AVX(ms): 0.2825
OPENCV(ms): 0.5021
NONE(ms): 1.2102
SSE(ms): 0.3045
AVX(ms): 0.2816
OPENCV(ms): 0.6298
NONE(ms): 1.6238
SSE(ms): 0.4122
AVX(ms): 0.2643
OPENCV(ms): 0.8655
NONE(ms): 1.0023
SSE(ms): 0.3301
AVX(ms): 0.3396
OPENCV(ms): 0.6918
NONE(ms): 0.8999
SSE(ms): 0.2622
AVX(ms): 0.1829