使用 SIMD 内部函数进行高效的行列转换



我是 SIMD 编程的初学者。我想按如下方式处理我的数据:

假设我有 4 个 simd 变量 ( __m128i ),数据如下:

__m128i a = {a1, a2, a3, a4}
__m128i b = {b1, b2, b3, b4}
__m128i c = {c1, c2, c3, c4}
__m128i d = {d1, d2, d3, d4}

现在我想按如下方式初始化 x1、x2、x3、x4:

__m128i x1 = {a1, b1, c1, d1}
__m128i x2 = {a2, b2, c2, d2}
__m128i x3 = {a3, b3, c3, d3}
__m128i x4 = {a4, b4, c4, d4}

有人能建议我如何高效地做到这一点吗?

我假设您的问题中有错别字,并且您实际上想进行 4x4 转置。如果是这样,那么您可以使用 8 条指令进行 4x4 转置,如下所示:

#include "emmintrin.h"
// Transposes, in place, the 4x4 matrix of 32-bit integers whose rows are
// held in v0..v3:
//
//   v0: a1, a2, a3, a4  =>  a1, b1, c1, d1
//   v1: b1, b2, b3, b4  =>  a2, b2, c2, d2
//   v2: c1, c2, c3, c4  =>  a3, b3, c3, d3
//   v3: d1, d2, d3, d4  =>  a4, b4, c4, d4
inline void Transpose_4_4(__m128i &v0, __m128i &v1, __m128i &v2, __m128i &v3)
{
    // Step 1: interleave the 32-bit lanes of each pair of rows.
    const __m128i ab_lo = _mm_unpacklo_epi32(v0, v1);   // a1, b1, a2, b2
    const __m128i cd_lo = _mm_unpacklo_epi32(v2, v3);   // c1, d1, c2, d2
    const __m128i ab_hi = _mm_unpackhi_epi32(v0, v1);   // a3, b3, a4, b4
    const __m128i cd_hi = _mm_unpackhi_epi32(v2, v3);   // c3, d3, c4, d4

    // Step 2: combine the 64-bit halves to form the transposed rows.
    v0 = _mm_unpacklo_epi64(ab_lo, cd_lo);              // a1, b1, c1, d1
    v1 = _mm_unpackhi_epi64(ab_lo, cd_lo);              // a2, b2, c2, d2
    v2 = _mm_unpacklo_epi64(ab_hi, cd_hi);              // a3, b3, c3, d3
    v3 = _mm_unpackhi_epi64(ab_hi, cd_hi);              // a4, b4, c4, d4
}

演示:

//
// transpose_4_4.cpp
//
#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>
// Transposes, in place, the 4x4 matrix of 32-bit integers whose rows are
// held in v0..v3:
//
//   v0: a1, a2, a3, a4  =>  a1, b1, c1, d1
//   v1: b1, b2, b3, b4  =>  a2, b2, c2, d2
//   v2: c1, c2, c3, c4  =>  a3, b3, c3, d3
//   v3: d1, d2, d3, d4  =>  a4, b4, c4, d4
inline void Transpose_4_4(__m128i &v0, __m128i &v1, __m128i &v2, __m128i &v3)
{
    // Step 1: interleave the 32-bit lanes of each pair of rows.
    const __m128i ab_lo = _mm_unpacklo_epi32(v0, v1);   // a1, b1, a2, b2
    const __m128i cd_lo = _mm_unpacklo_epi32(v2, v3);   // c1, d1, c2, d2
    const __m128i ab_hi = _mm_unpackhi_epi32(v0, v1);   // a3, b3, a4, b4
    const __m128i cd_hi = _mm_unpackhi_epi32(v2, v3);   // c3, d3, c4, d4

    // Step 2: combine the 64-bit halves to form the transposed rows.
    v0 = _mm_unpacklo_epi64(ab_lo, cd_lo);              // a1, b1, c1, d1
    v1 = _mm_unpackhi_epi64(ab_lo, cd_lo);              // a2, b2, c2, d2
    v2 = _mm_unpacklo_epi64(ab_hi, cd_hi);              // a3, b3, c3, d3
    v3 = _mm_unpackhi_epi64(ab_hi, cd_hi);              // a4, b4, c4, d4
}
// Demo driver: fills a 4x4 int32_t matrix with 0..15, prints it, transposes
// it in place with Transpose_4_4, and prints it again.
int main(void)
{
    // 16-byte alignment lets each 16-byte row be accessed directly as a
    // __m128i in the Transpose_4_4 call below.
    int32_t buff[4][4] __attribute__ ((aligned(16)));
    int i, j;
    int k = 0;
    // init buff with 0..15 in row-major order
    for (i = 0; i < 4; ++i)
    {
        for (j = 0; j < 4; ++j)
        {
            buff[i][j] = k++;
        }
    }
    // print buff
    printf("\nBEFORE:\n");
    for (i = 0; i < 4; ++i)
    {
        for (j = 0; j < 4; ++j)
        {
            printf("%4d", buff[i][j]);
        }
        printf("\n");
    }
    // transpose: each row is passed by reference, so buff is modified in place
    Transpose_4_4(*(__m128i *)buff[0], *(__m128i *)buff[1], *(__m128i *)buff[2], *(__m128i *)buff[3]);
    // print buff
    printf("\nAFTER:\n");
    for (i = 0; i < 4; ++i)
    {
        for (j = 0; j < 4; ++j)
        {
            printf("%4d", buff[i][j]);
        }
        printf("\n");
    }
    return 0;
}

编译并运行:

$ g++ -Wall -msse3 transpose_4_4.cpp && ./a.out 
BEFORE:
   0   1   2   3
   4   5   6   7
   8   9  10  11
  12  13  14  15
AFTER:
   0   4   8  12
   1   5   9  13
   2   6  10  14
   3   7  11  15
$ 

最新更新