如何强制emscripten/em++/llvm从.rodata加载常量和/或执行更好的SIMD优化



我是SSIM.js和jest图像快照的活跃作者和维护者。目前,我正在优化我们的图像处理实现,以利用WebAssembly来提高性能。

现在,我注意到,无论是从LLVM汇编(WebAssembly文本格式?)输出来看,还是从Node.js的实际汇编输出(--print-wasm-code)来看,生成的代码中都添加了不必要的指令。特别值得注意的是,它在加载常量时会做一些非常奇怪的事情。例如,在下面的三段代码中,请查看名为multiplier的数组或常量rounder。在GCC上,multiplier会存储在汇编的.rodata段中,只需加载一次,或者被转换为整数;rounder则通过movd或movq内联。而在这里,它似乎是在循环的每一轮中插入值。它还用vpblendw做了一些我完全看不懂的事情。

我该如何解决这个问题?

alignas(64) const static uint16_t multiplierArray[8]= {77,150,29,1,77,160,29,1};
extern "C"
int rgba2y(void* inputDataBuffer, ptrdiff_t length) {
typedef __u8x16 v8x16;
typedef __u16x8 v16x8;
v8x16* pInputPtr = (v8x16*) inputDataBuffer;
v8x16* pInputPtrEnd = (v8x16*)((uint8_t*)inputDataBuffer + length);
v8x16* pOutputPtr = (v8x16*) inputDataBuffer;
__m128i rounder = _mm_cvtsi32_si128(0x80808080);
v8x16 zero;
zero ^= zero;
__m128i multiplier = *((__m128i*)multiplierArray);
//      v16x8 multiplier = wasm_i64x2_splat(0x1001d0096004d);
unsigned i = 0;
for (; (i+4)*sizeof(__m128i)<= length; i+= 4) {
v8x16 iv0 = wasm_v8x16_shuffle(pInputPtr[i/4],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
v8x16 iv1 = wasm_v8x16_shuffle(pInputPtr[i/4+1],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
v8x16 iv2 = wasm_v8x16_shuffle(pInputPtr[i/4+2],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
v8x16 iv3 = wasm_v8x16_shuffle(pInputPtr[i/4+3],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
// rg ba rg ba rg ba rg ba rg ba rg ba rg ba
__m128i rg0 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv0, (__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv0,(__m128i)zero),(__m128i)multiplier));
__m128i rg1 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier));
__m128i rg2 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier));
__m128i rg3 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier));
// rgba rgba rgba rgba rgba rgba rgba rgba
__m128i rgba0 = wasm_u16x8_shr(_mm_hadd_epi16(rg0,rg1), 8);
__m128i rgba1 = wasm_u16x8_shr(_mm_hadd_epi16(rg2,rg3), 8);
pOutputPtr[i/4] = wasm_u8x16_narrow_i16x8(rgba0,rgba1);
}
// abbreviated...
return 0;
}

LLVM汇编输出如下:

.section    .text.rgba2y,"",@
.hidden rgba2y                          # -- Begin function rgba2y
.globl  rgba2y
.type   rgba2y,@function
rgba2y:                                 # @rgba2y
.Lfunc_begin0:
.loc    2 56 0                          # rgb2y-sample.cpp:56:0
.functype   rgba2y (i32, i32) -> (i32)
.local      i32, i32, v128, v128, v128, v128, v128, v128
# %bb.0:                                # %entry
#DEBUG_VALUE: rgba2y:length <- %4
#DEBUG_VALUE: rgba2y:pInputPtrEnd <- undef
#DEBUG_VALUE: rgba2y:i <- 0
#DEBUG_VALUE: rgba2y:inputDataBuffer <- %3
#DEBUG_VALUE: rgba2y:pInputPtr <- %3
#DEBUG_VALUE: rgba2y:pOutputPtr <- %3
#DEBUG_VALUE: rgba2y:rounder <- undef
#DEBUG_VALUE: rgba2y:zero <- undef
#DEBUG_VALUE: rgba2y:multiplier <- undef
block
.Ltmp0:
.loc    2 68 30 prologue_end            # rgb2y-sample.cpp:68:30
local.get   1
i32.const   64
i32.lt_u
.Ltmp1:
.loc    2 68 2 is_stmt 0                # rgb2y-sample.cpp:68:2
br_if       0                               # 0: down to label0
.Ltmp2:
# %bb.1:
.loc    2 0 2                           # rgb2y-sample.cpp:0:2
i32.const   0
local.set   2
i32.const   4
local.set   3
.LBB0_2:                                # %for.body
# =>This Inner Loop Header: Depth=1
loop                                        # label1:
.Ltmp3:
#DEBUG_VALUE: rgba2y:i <- %101
#DEBUG_VALUE: rgba0 <- undef
#DEBUG_VALUE: rgba1 <- undef
.loc    2 69 15 is_stmt 1               # rgb2y-sample.cpp:69:15
local.get   0
local.get   2
i32.const   2
i32.shl
i32.add
local.tee   2
local.get   2
v128.load   0
i32.const   0
i8x16.splat
local.tee   4
i32.const   -128
i8x16.replace_lane  0
i32.const   -128
i8x16.replace_lane  1
i32.const   -128
i8x16.replace_lane  2
i32.const   -128
i8x16.replace_lane  3
local.tee   5
v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp4:
.loc    2 74 48                         # rgb2y-sample.cpp:74:48
local.tee   6
.Ltmp5:
#DEBUG_VALUE: iv0 <- undef
#DEBUG_VALUE: iv0 <- %153
local.get   4
v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
i32.const   77
.loc    2 74 32 is_stmt 0               # rgb2y-sample.cpp:74:32
i16x8.splat
i32.const   150
i16x8.replace_lane  1
i32.const   29
i16x8.replace_lane  2
i32.const   1
i16x8.replace_lane  3
i32.const   160
i16x8.replace_lane  5
i32.const   29
i16x8.replace_lane  6
i32.const   1
i16x8.replace_lane  7
local.tee   7
i16x8.mul
.loc    2 74 133                        # rgb2y-sample.cpp:74:133
local.tee   8
local.get   6
local.get   4
v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc    2 74 117                        # rgb2y-sample.cpp:74:117
local.get   7
i16x8.mul
.loc    2 74 17                         # rgb2y-sample.cpp:74:17
local.tee   6
v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get   8
local.get   6
v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp6:
.loc    2 0 17                          # rgb2y-sample.cpp:0:17
local.tee   6
.Ltmp7:
#DEBUG_VALUE: rg0 <- undef
#DEBUG_VALUE: rg0 <- %153
.loc    2 70 15 is_stmt 1               # rgb2y-sample.cpp:70:15
local.get   2
i32.const   16
i32.add
v128.load   0
local.get   5
v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp8:
.loc    2 75 62                         # rgb2y-sample.cpp:75:62
local.tee   8
.Ltmp9:
#DEBUG_VALUE: iv1 <- undef
#DEBUG_VALUE: iv1 <- %157
local.get   4
v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
.loc    2 75 46 is_stmt 0               # rgb2y-sample.cpp:75:46
local.get   7
i16x8.mul
.loc    2 75 146                        # rgb2y-sample.cpp:75:146
local.tee   9
local.get   8
local.get   4
v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc    2 75 130                        # rgb2y-sample.cpp:75:130
local.get   7
i16x8.mul
.loc    2 75 31                         # rgb2y-sample.cpp:75:31
local.tee   8
v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get   9
local.get   8
v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp10:
.loc    2 79 33 is_stmt 1               # rgb2y-sample.cpp:79:33
local.tee   8
.Ltmp11:
#DEBUG_VALUE: rg1 <- undef
#DEBUG_VALUE: rg1 <- %157
v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get   6
local.get   8
v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
i32.const   8
.loc    2 79 18 is_stmt 0               # rgb2y-sample.cpp:79:18
i16x8.shr_u
.loc    2 71 15 is_stmt 1               # rgb2y-sample.cpp:71:15
local.get   2
i32.const   32
i32.add
v128.load   0
local.get   5
v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp12:
.loc    2 76 62                         # rgb2y-sample.cpp:76:62
local.tee   6
.Ltmp13:
#DEBUG_VALUE: iv2 <- undef
#DEBUG_VALUE: iv2 <- %153
local.get   4
v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
.loc    2 76 46 is_stmt 0               # rgb2y-sample.cpp:76:46
local.get   7
i16x8.mul
.loc    2 76 146                        # rgb2y-sample.cpp:76:146
local.tee   8
local.get   6
local.get   4
v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc    2 76 130                        # rgb2y-sample.cpp:76:130
local.get   7
i16x8.mul
.loc    2 76 31                         # rgb2y-sample.cpp:76:31
local.tee   6
v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get   8
local.get   6
v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp14:
.loc    2 0 31                          # rgb2y-sample.cpp:0:31
local.tee   6
.Ltmp15:
#DEBUG_VALUE: rg2 <- undef
#DEBUG_VALUE: rg2 <- %153
.loc    2 72 15 is_stmt 1               # rgb2y-sample.cpp:72:15
local.get   2
i32.const   48
i32.add
v128.load   0
local.get   5
v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp16:
.loc    2 77 62                         # rgb2y-sample.cpp:77:62
local.tee   5
.Ltmp17:
#DEBUG_VALUE: iv3 <- undef
#DEBUG_VALUE: iv3 <- %98
local.get   4
v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
.loc    2 77 46 is_stmt 0               # rgb2y-sample.cpp:77:46
local.get   7
i16x8.mul
.loc    2 77 146                        # rgb2y-sample.cpp:77:146
local.tee   8
local.get   5
local.get   4
v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
.loc    2 77 130                        # rgb2y-sample.cpp:77:130
local.get   7
i16x8.mul
.loc    2 77 31                         # rgb2y-sample.cpp:77:31
local.tee   4
v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get   8
local.get   4
v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
.Ltmp18:
.loc    2 80 33 is_stmt 1               # rgb2y-sample.cpp:80:33
local.tee   4
.Ltmp19:
#DEBUG_VALUE: rg3 <- undef
#DEBUG_VALUE: rg3 <- %93
v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
local.get   6
local.get   4
v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
i16x8.add
i32.const   8
.loc    2 80 18 is_stmt 0               # rgb2y-sample.cpp:80:18
i16x8.shr_u
.loc    2 81 21 is_stmt 1               # rgb2y-sample.cpp:81:21
i8x16.narrow_i16x8_u
.loc    2 81 19 is_stmt 0               # rgb2y-sample.cpp:81:19
v128.store  0
.Ltmp20:
#DEBUG_VALUE: rgba2y:i <- %170
.loc    2 0 19                          # rgb2y-sample.cpp:0:19
local.get   3
local.tee   3
local.set   2
.Ltmp21:
.loc    2 68 11 is_stmt 1               # rgb2y-sample.cpp:68:11
local.get   3
i32.const   4
i32.add
local.tee   3
i32.const   4
.loc    2 68 14 is_stmt 0               # rgb2y-sample.cpp:68:14
i32.shl
.loc    2 68 30                         # rgb2y-sample.cpp:68:30
local.get   1
i32.le_u
.Ltmp22:
.loc    2 68 2                          # rgb2y-sample.cpp:68:2
br_if       0                               # 0: up to label1
.Ltmp23:
.LBB0_3:                                # %for.end
end_loop
end_block                               # label0:
i32.const   0
.Ltmp24:
.loc    2 84 2 is_stmt 1                # rgb2y-sample.cpp:84:2
# fallthrough-return
end_function
.Ltmp25:
.Lfunc_end0:
.size   rgba2y, .Lfunc_end0-rgba2y
# -- End function

汇编/反汇编输出:

--- WebAssembly code ---
index: 2
kind: wasm function
compiler: TurboFan
Body (size = 1088 = 1086 + 2 padding)
Instructions (size = 1064)
0xa5976359180     0  55             push rbp
0xa5976359181     1  4889e5         REX.W movq rbp,rsp
0xa5976359184     4  6a0a           push 0xa
0xa5976359186     6  56             push rsi
0xa5976359187     7  4883ec58       REX.W subq rsp,0x58
0xa597635918b     b  488b5e17       REX.W movq rbx,[rsi+0x17]
0xa597635918f     f  83fa40         cmpl rdx,0x40
0xa5976359192    12  0f8307000000   jnc 0xa597635919f  <+0x1f>
0xa5976359198    18  33c9           xorl rcx,rcx
0xa597635919a    1a  e990030000     jmp 0xa597635952f  <+0x3af>
0xa597635919f    1f  b94d000000     movl rcx,0x4d
0xa59763591a4    24  c5f96ec1       vmovd xmm0,rcx
0xa59763591a8    28  c5fb70c000     vpshuflw xmm0,xmm0,0x0
0xa59763591ad    2d  c5f970c000     vpshufd xmm0,xmm0,0x0
0xa59763591b2    32  33c9           xorl rcx,rcx
0xa59763591b4    34  c5f96ec9       vmovd xmm1,rcx
0xa59763591b8    38  c4410057ff     vxorps xmm15,xmm15,xmm15
0xa59763591bd    3d  c4c27100cf     vpshufb xmm1,xmm1,xmm15
0xa59763591c2    42  bf96000000     movl rdi,0x96
0xa59763591c7    47  c5f9c4c701     vpinsrw xmm0,xmm0,rdi,0x1
0xa59763591cc    4c  bf80ffffff     movl rdi,0xffffff80
0xa59763591d1    51  c5f928d1       vmovapd xmm2,xmm1
0xa59763591d5    55  c4e36920d700   vpinsrb xmm2,xmm2,dil,0x0
0xa59763591db    5b  41b81d000000   movl r8,0x1d
0xa59763591e1    61  c4c179c4c002   vpinsrw xmm0,xmm0,r8,0x2
0xa59763591e7    67  c4e36920d701   vpinsrb xmm2,xmm2,dil,0x1
0xa59763591ed    6d  41b901000000   movl r9,0x1
0xa59763591f3    73  c4c179c4c103   vpinsrw xmm0,xmm0,r9,0x3
0xa59763591f9    79  c4e36920d702   vpinsrb xmm2,xmm2,dil,0x2
0xa59763591ff    7f  41bba0000000   movl r11,0xa0
0xa5976359205    85  c4c179c4c305   vpinsrw xmm0,xmm0,r11,0x5
0xa597635920b    8b  c4e36920d703   vpinsrb xmm2,xmm2,dil,0x3
0xa5976359211    91  c4c179c4c006   vpinsrw xmm0,xmm0,r8,0x6
0xa5976359217    97  c4c179c4c107   vpinsrw xmm0,xmm0,r9,0x7
0xa597635921d    9d  488bf9         REX.W movq rdi,rcx
0xa5976359220    a0  41b804000000   movl r8,0x4
0xa5976359226    a6  e90b000000     jmp 0xa5976359236  <+0xb6>
0xa597635922b    ab  0f1f440000     nop
0xa5976359230    b0  498bf8         REX.W movq rdi,r8
0xa5976359233    b3  4d8bc1         REX.W movq r8,r9
0xa5976359236    b6  4c8b4e2f       REX.W movq r9,[rsi+0x2f]
0xa597635923a    ba  493b21         REX.W cmpq rsp,[r9]
0xa597635923d    bd  0f86f4020000   jna 0xa5976359537  <+0x3b7>
0xa5976359243    c3  458d4804       leal r9,[r8+0x4]
0xa5976359247    c7  4d8bd9         REX.W movq r11,r9
0xa597635924a    ca  41c1e304       shll r11, 4
0xa597635924e    ce  8d3cb8         leal rdi,[rax+rdi*4]
0xa5976359251    d1  c5fa6f1c3b     vmovdqu xmm3,[rbx+rdi*1]
0xa5976359256    d6  c5fa6f641f10   vmovdqu xmm4,[rdi+rbx*1+0x10]
0xa597635925c    dc  c5fa6f6c1f20   vmovdqu xmm5,[rdi+rbx*1+0x20]
0xa5976359262    e2  c5fa6f741f30   vmovdqu xmm6,[rdi+rbx*1+0x30]
0xa5976359268    e8  c57810fe       vmovups xmm15,xmm6
0xa597635926c    ec  49ba0001028004050680 REX.W movq r10,0x8006050480020100
0xa5976359276    f6  c441f96ec2     vmovq xmm8,r10
0xa597635927b    fb  49ba08090a800c0d0e80 REX.W movq r10,0x800e0d0c800a0908
0xa5976359285   105  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635928b   10b  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa5976359290   110  0f10fa         movups xmm7,xmm2
0xa5976359293   113  49ba8080800080808000 REX.W movq r10,0x80808000808080
0xa597635929d   11d  c441f96ec2     vmovq xmm8,r10
0xa59763592a2   122  4c8b15ecffffff REX.W movq r10,[rip+0xffffffec]
0xa59763592a9   129  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592af   12f  c4c24100f8     vpshufb xmm7,xmm7,xmm8
0xa59763592b4   134  c4c141ebff     vpor xmm7,xmm7,xmm15
0xa59763592b9   139  c57810fd       vmovups xmm15,xmm5
0xa59763592bd   13d  4c8b15aaffffff REX.W movq r10,[rip+0xffffffaa]
0xa59763592c4   144  c441f96ec2     vmovq xmm8,r10
0xa59763592c9   149  4c8b15adffffff REX.W movq r10,[rip+0xffffffad]
0xa59763592d0   150  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592d6   156  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa59763592db   15b  0f10f2         movups xmm6,xmm2
0xa59763592de   15e  4c8b15b0ffffff REX.W movq r10,[rip+0xffffffb0]
0xa59763592e5   165  c441f96ec2     vmovq xmm8,r10
0xa59763592ea   16a  4c8b15a4ffffff REX.W movq r10,[rip+0xffffffa4]
0xa59763592f1   171  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592f7   177  c4c24900f0     vpshufb xmm6,xmm6,xmm8
0xa59763592fc   17c  c4c149ebf7     vpor xmm6,xmm6,xmm15
0xa5976359301   181  c57810fc       vmovups xmm15,xmm4
0xa5976359305   185  4c8b1562ffffff REX.W movq r10,[rip+0xffffff62]
0xa597635930c   18c  c441f96ec2     vmovq xmm8,r10
0xa5976359311   191  4c8b1565ffffff REX.W movq r10,[rip+0xffffff65]
0xa5976359318   198  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635931e   19e  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa5976359323   1a3  0f10ea         movups xmm5,xmm2
0xa5976359326   1a6  4c8b1568ffffff REX.W movq r10,[rip+0xffffff68]
0xa597635932d   1ad  c441f96ec2     vmovq xmm8,r10
0xa5976359332   1b2  4c8b155cffffff REX.W movq r10,[rip+0xffffff5c]
0xa5976359339   1b9  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635933f   1bf  c4c25100e8     vpshufb xmm5,xmm5,xmm8
0xa5976359344   1c4  c4c151ebef     vpor xmm5,xmm5,xmm15
0xa5976359349   1c9  c57810fb       vmovups xmm15,xmm3
0xa597635934d   1cd  4c8b151affffff REX.W movq r10,[rip+0xffffff1a]
0xa5976359354   1d4  c441f96ec2     vmovq xmm8,r10
0xa5976359359   1d9  4c8b151dffffff REX.W movq r10,[rip+0xffffff1d]
0xa5976359360   1e0  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa5976359366   1e6  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa597635936b   1eb  0f10e2         movups xmm4,xmm2
0xa597635936e   1ee  4c8b1520ffffff REX.W movq r10,[rip+0xffffff20]
0xa5976359375   1f5  c441f96ec2     vmovq xmm8,r10
0xa597635937a   1fa  4c8b1514ffffff REX.W movq r10,[rip+0xffffff14]
0xa5976359381   201  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa5976359387   207  c4c25900e0     vpshufb xmm4,xmm4,xmm8
0xa597635938c   20c  c4c159ebe7     vpor xmm4,xmm4,xmm15
0xa5976359391   211  c5f928df       vmovapd xmm3,xmm7
0xa5976359395   215  c5e168d9       vpunpckhbw xmm3,xmm3,xmm1
0xa5976359399   219  c5c160f9       vpunpcklbw xmm7,xmm7,xmm1
0xa597635939d   21d  c57928c6       vmovapd xmm8,xmm6
0xa59763593a1   221  c53968c1       vpunpckhbw xmm8,xmm8,xmm1
0xa59763593a5   225  c5c960f1       vpunpcklbw xmm6,xmm6,xmm1
0xa59763593a9   229  c57928cd       vmovapd xmm9,xmm5
0xa59763593ad   22d  c53168c9       vpunpckhbw xmm9,xmm9,xmm1
0xa59763593b1   231  c5d160e9       vpunpcklbw xmm5,xmm5,xmm1
0xa59763593b5   235  c57928d4       vmovapd xmm10,xmm4
0xa59763593b9   239  c52968d1       vpunpckhbw xmm10,xmm10,xmm1
0xa59763593bd   23d  c5d960e1       vpunpcklbw xmm4,xmm4,xmm1
0xa59763593c1   241  c5e1d5d8       vpmullw xmm3,xmm3,xmm0
0xa59763593c5   245  c5c1d5f8       vpmullw xmm7,xmm7,xmm0
0xa59763593c9   249  c539d5c0       vpmullw xmm8,xmm8,xmm0
0xa59763593cd   24d  c5c9d5f0       vpmullw xmm6,xmm6,xmm0
0xa59763593d1   251  c531d5c8       vpmullw xmm9,xmm9,xmm0
0xa59763593d5   255  c5d1d5e8       vpmullw xmm5,xmm5,xmm0
0xa59763593d9   259  c529d5d0       vpmullw xmm10,xmm10,xmm0
0xa59763593dd   25d  c5d9d5e0       vpmullw xmm4,xmm4,xmm0
0xa59763593e1   261  c57928df       vmovapd xmm11,xmm7
0xa59763593e5   265  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763593ea   26a  c463010efb55   vpblendw xmm15,xmm15,xmm3,0x55
0xa59763593f0   270  c443210edfaa   vpblendw xmm11,xmm11,xmm15,0xaa
0xa59763593f6   276  c442212bdf     vpackusdw xmm11,xmm11,xmm15
0xa59763593fb   27b  c57810fb       vmovups xmm15,xmm3
0xa59763593ff   27f  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359405   285  c5c172d710     vpsrld xmm7,xmm7,16
0xa597635940a   28a  c4c2412bff     vpackusdw xmm7,xmm7,xmm15
0xa597635940f   28f  c5f928de       vmovapd xmm3,xmm6
0xa5976359413   293  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359418   298  c443010ef855   vpblendw xmm15,xmm15,xmm8,0x55
0xa597635941e   29e  c4c3610edfaa   vpblendw xmm3,xmm3,xmm15,0xaa
0xa5976359424   2a4  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xa5976359429   2a9  c4417810f8     vmovups xmm15,xmm8
0xa597635942e   2ae  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359434   2b4  c5c972d610     vpsrld xmm6,xmm6,16
0xa5976359439   2b9  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xa597635943e   2be  c57928c5       vmovapd xmm8,xmm5
0xa5976359442   2c2  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359447   2c7  c443010ef955   vpblendw xmm15,xmm15,xmm9,0x55
0xa597635944d   2cd  c443390ec7aa   vpblendw xmm8,xmm8,xmm15,0xaa
0xa5976359453   2d3  c442392bc7     vpackusdw xmm8,xmm8,xmm15
0xa5976359458   2d8  c4417810f9     vmovups xmm15,xmm9
0xa597635945d   2dd  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359463   2e3  c5d172d510     vpsrld xmm5,xmm5,16
0xa5976359468   2e8  c4c2512bef     vpackusdw xmm5,xmm5,xmm15
0xa597635946d   2ed  c57928cc       vmovapd xmm9,xmm4
0xa5976359471   2f1  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359476   2f6  c443010efa55   vpblendw xmm15,xmm15,xmm10,0x55
0xa597635947c   2fc  c443310ecfaa   vpblendw xmm9,xmm9,xmm15,0xaa
0xa5976359482   302  c442312bcf     vpackusdw xmm9,xmm9,xmm15
0xa5976359487   307  c4417810fa     vmovups xmm15,xmm10
0xa597635948c   30c  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359492   312  c5d972d410     vpsrld xmm4,xmm4,16
0xa5976359497   317  c4c2592be7     vpackusdw xmm4,xmm4,xmm15
0xa597635949c   31c  c4c141fdfb     vpaddw xmm7,xmm7,xmm11
0xa59763594a1   321  c5c9fdf3       vpaddw xmm6,xmm6,xmm3
0xa59763594a5   325  c4c151fde8     vpaddw xmm5,xmm5,xmm8
0xa59763594aa   32a  c4c159fde1     vpaddw xmm4,xmm4,xmm9
0xa59763594af   32f  c5f928de       vmovapd xmm3,xmm6
0xa59763594b3   333  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763594b8   338  c463010eff55   vpblendw xmm15,xmm15,xmm7,0x55
0xa59763594be   33e  c4c3610edfaa   vpblendw xmm3,xmm3,xmm15,0xaa
0xa59763594c4   344  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xa59763594c9   349  c57810ff       vmovups xmm15,xmm7
0xa59763594cd   34d  c4c10172d710   vpsrld xmm15,xmm15,16
0xa59763594d3   353  c5c972d610     vpsrld xmm6,xmm6,16
0xa59763594d8   358  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xa59763594dd   35d  c5f928fc       vmovapd xmm7,xmm4
0xa59763594e1   361  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763594e6   366  c463010efd55   vpblendw xmm15,xmm15,xmm5,0x55
0xa59763594ec   36c  c4c3410effaa   vpblendw xmm7,xmm7,xmm15,0xaa
0xa59763594f2   372  c4c2412bff     vpackusdw xmm7,xmm7,xmm15
0xa59763594f7   377  c57810fd       vmovups xmm15,xmm5
0xa59763594fb   37b  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359501   381  c5d972d410     vpsrld xmm4,xmm4,16
0xa5976359506   386  c4c2592be7     vpackusdw xmm4,xmm4,xmm15
0xa597635950b   38b  c5c9fdf3       vpaddw xmm6,xmm6,xmm3
0xa597635950f   38f  c5d9fde7       vpaddw xmm4,xmm4,xmm7
0xa5976359513   393  c5c971d608     vpsrlw xmm6,xmm6,8
0xa5976359518   398  c5d971d408     vpsrlw xmm4,xmm4,8
0xa597635951d   39d  c5d967e6       vpackuswb xmm4,xmm4,xmm6
0xa5976359521   3a1  c5fa7f243b     vmovdqu [rbx+rdi*1],xmm4
0xa5976359526   3a6  443bda         cmpl r11,rdx
0xa5976359529   3a9  0f8601fdffff   jna 0xa5976359230  <+0xb0>
0xa597635952f   3af  488bc1         REX.W movq rax,rcx
0xa5976359532   3b2  488be5         REX.W movq rsp,rbp
0xa5976359535   3b5  5d             pop rbp
0xa5976359536   3b6  c3             retl
0xa5976359537   3b7  488955e8       REX.W movq [rbp-0x18],rdx
0xa597635953b   3bb  48895de0       REX.W movq [rbp-0x20],rbx
0xa597635953f   3bf  c5f81145d0     vmovups [rbp-0x30],xmm0
0xa5976359544   3c4  c5f8114dc0     vmovups [rbp-0x40],xmm1
0xa5976359549   3c9  c5f81155b0     vmovups [rbp-0x50],xmm2
0xa597635954e   3ce  488945a8       REX.W movq [rbp-0x58],rax
0xa5976359552   3d2  48897da0       REX.W movq [rbp-0x60],rdi
0xa5976359556   3d6  4c894598       REX.W movq [rbp-0x68],r8
0xa597635955a   3da  e8615dffff     call 0xa597634f2c0       ;; wasm stub: WasmStackGuard
0xa597635955f   3df  33c9           xorl rcx,rcx
0xa5976359561   3e1  488b55e8       REX.W movq rdx,[rbp-0x18]
0xa5976359565   3e5  488b5de0       REX.W movq rbx,[rbp-0x20]
0xa5976359569   3e9  c5f81045d0     vmovups xmm0,[rbp-0x30]
0xa597635956e   3ee  c5f8104dc0     vmovups xmm1,[rbp-0x40]
0xa5976359573   3f3  c5f81055b0     vmovups xmm2,[rbp-0x50]
0xa5976359578   3f8  488b45a8       REX.W movq rax,[rbp-0x58]
0xa597635957c   3fc  488b7da0       REX.W movq rdi,[rbp-0x60]
0xa5976359580   400  4c8b4598       REX.W movq r8,[rbp-0x68]
0xa5976359584   404  488b75f0       REX.W movq rsi,[rbp-0x10]
0xa5976359588   408  e9b6fcffff     jmp 0xa5976359243  <+0xc3>
0xa597635958d   40d  e8fe5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359592   412  e8f95affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359597   417  e8f45affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa597635959c   41c  e8ef5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a1   421  e8ea5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a6   426  90             nop
0xa59763595a7   427  90             nop
Protected instructions:
pc offset  land pad
3a1       40d
e2       412
dc       417
d6       41c
d1       421
Source positions:
pc offset  position
d1        43
d6       239
dc       416
e2       545
3a1       722
3b7        29
40d       722
412       545
417       416
41c       239
421        43
Safepoints (size = 22)
0xa5a7635917fffffffff  000000000000000 (sp -> fp)
RelocInfo (size = 8)
0xa597635955b  wasm stub call
0xa597635958e  wasm stub call
0xa5976359593  wasm stub call
0xa5976359598  wasm stub call
0xa597635959d  wasm stub call
0xa59763595a2  wasm stub call
--- End code ---

复制Emscripten问题的答案:

我们之所以不使用v128.const,是因为v128.const最近才在V8中实现。为了避免影响源试用(origin trial)用户,在相关的V8补丁进入Chrome稳定版之前,我们不能更新LLVM以发出v128.const。我一直在关注这个仪表板,以确定什么时候是做出这一改变的好时机。如果你使用的是Chrome的最新版本或其他支持v128.const的执行环境,你可以尝试使用-munimplemented-simd128标志编译你的项目,这将在LLVM中启用v128.const(但也可能会引入其他你不想要的更改)。一旦v128.const广泛可用,LLVM使用v128.const将比从内存加载向量更好,因为这允许引擎根据运行时平台来确定实现该向量的最佳方式。

还可能值得考虑移植代码中对性能敏感的部分,以便直接使用WebAssembly内部函数头,而不是依赖于模拟的SSE。这将减少代码和底层机器代码之间的阻抗不匹配。

最后,如果您注意到任何地方的指令选择不够理想,那么请针对您看到的具体问题提交LLVM错误报告(如果问题在代码端)或V8错误报告(如果问题在本机端),这将非常有帮助。这种反馈对我们来说非常有价值。

@PeterOrderes

我已经对LLVM实现进行了一些更改,现在正在对其进行测试。您觉得现在为常量生成的汇编代码如何?我认为这要好得多,但我希望听听其他人的意见。

--- WebAssembly code ---
index: 3
kind: wasm function
compiler: TurboFan
Body (size = 1280 = 1278 + 2 padding)
Instructions (size = 1256)
0xfbcf162d3c0     0  55             push rbp
0xfbcf162d3c1     1  4889e5         REX.W movq rbp,rsp
0xfbcf162d3c4     4  6a0a           push 0xa
0xfbcf162d3c6     6  56             push rsi
0xfbcf162d3c7     7  4883ec50       REX.W subq rsp,0x50
0xfbcf162d3cb     b  488b4e17       REX.W movq rcx,[rsi+0x17]
0xfbcf162d3cf     f  488bd8         REX.W movq rbx,rax
0xfbcf162d3d2    12  83fa40         cmpl rdx,0x40
0xfbcf162d3d5    15  0f8308000000   jnc 0xfbcf162d3e3  <+0x23>
0xfbcf162d3db    1b  4533c0         xorl r8,r8
0xfbcf162d3de    1e  e950030000     jmp 0xfbcf162d733  <+0x373>
0xfbcf162d3e3    23  48bf8080808080808080 REX.W movq rdi,0x8080808080808080
0xfbcf162d3ed    2d  c4e1f96ec7     vmovq xmm0,rdi
0xfbcf162d3f2    32  c5fb12c0       vmovddup xmm0,xmm0
0xfbcf162d3f6    36  48bf4d0096001d000100 REX.W movq rdi,0x1001d0096004d
0xfbcf162d400    40  c4e1f96ecf     vmovq xmm1,rdi
0xfbcf162d405    45  c5fb12c9       vmovddup xmm1,xmm1
0xfbcf162d409    49  bf04000000     movl rdi,0x4
0xfbcf162d40e    4e  4533c9         xorl r9,r9
0xfbcf162d411    51  4c8bc7         REX.W movq r8,rdi
0xfbcf162d414    54  e90d000000     jmp 0xfbcf162d426  <+0x66>
0xfbcf162d419    59  0f1f8000000000 nop
0xfbcf162d420    60  4d8bc8         REX.W movq r9,r8
0xfbcf162d423    63  4d8bc3         REX.W movq r8,r11
0xfbcf162d426    66  4c8b5e2f       REX.W movq r11,[rsi+0x2f]
0xfbcf162d42a    6a  493b23         REX.W cmpq rsp,[r11]
0xfbcf162d42d    6d  0f86aa030000   jna 0xfbcf162d7dd  <+0x41d>
0xfbcf162d433    73  458d5804       leal r11,[r8+0x4]
0xfbcf162d437    77  4d8be3         REX.W movq r12,r11
0xfbcf162d43a    7a  41c1e404       shll r12, 4
0xfbcf162d43e    7e  468d0c8b       leal r9,[rbx+r9*4]
0xfbcf162d442    82  c4a17a6f1409   vmovdqu xmm2,[rcx+r9*1]
0xfbcf162d448    88  c4c17a6f5c0910 vmovdqu xmm3,[r9+rcx*1+0x10]
0xfbcf162d44f    8f  c4c17a6f640920 vmovdqu xmm4,[r9+rcx*1+0x20]
0xfbcf162d456    96  c4c17a6f6c0930 vmovdqu xmm5,[r9+rcx*1+0x30]
0xfbcf162d45d    9d  c57810fd       vmovups xmm15,xmm5
0xfbcf162d461    a1  49ba0001028004050680 REX.W movq r10,0x8006050480020100
0xfbcf162d46b    ab  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d470    b0  49ba08090a800c0d0e80 REX.W movq r10,0x800e0d0c800a0908
0xfbcf162d47a    ba  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d480    c0  c4620100ff     vpshufb xmm15,xmm15,xmm7
0xfbcf162d485    c5  0f10f0         movups xmm6,xmm0
0xfbcf162d488    c8  49ba8080800080808000 REX.W movq r10,0x80808000808080
0xfbcf162d492    d2  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d497    d7  4c8b15ecffffff REX.W movq r10,[rip+0xffffffec]
0xfbcf162d49e    de  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d4a4    e4  c4e24900f7     vpshufb xmm6,xmm6,xmm7
0xfbcf162d4a9    e9  c4c149ebf7     vpor xmm6,xmm6,xmm15
0xfbcf162d4ae    ee  c57810fc       vmovups xmm15,xmm4
0xfbcf162d4b2    f2  4c8b15aaffffff REX.W movq r10,[rip+0xffffffaa]
0xfbcf162d4b9    f9  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d4be    fe  4c8b15adffffff REX.W movq r10,[rip+0xffffffad]
0xfbcf162d4c5   105  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d4cb   10b  c4620100ff     vpshufb xmm15,xmm15,xmm7
0xfbcf162d4d0   110  0f10e8         movups xmm5,xmm0
0xfbcf162d4d3   113  4c8b15b0ffffff REX.W movq r10,[rip+0xffffffb0]
0xfbcf162d4da   11a  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d4df   11f  4c8b15a4ffffff REX.W movq r10,[rip+0xffffffa4]
0xfbcf162d4e6   126  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d4ec   12c  c4e25100ef     vpshufb xmm5,xmm5,xmm7
0xfbcf162d4f1   131  c4c151ebef     vpor xmm5,xmm5,xmm15
0xfbcf162d4f6   136  c57810fb       vmovups xmm15,xmm3
0xfbcf162d4fa   13a  4c8b1562ffffff REX.W movq r10,[rip+0xffffff62]
0xfbcf162d501   141  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d506   146  4c8b1565ffffff REX.W movq r10,[rip+0xffffff65]
0xfbcf162d50d   14d  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d513   153  c4620100ff     vpshufb xmm15,xmm15,xmm7
0xfbcf162d518   158  0f10e0         movups xmm4,xmm0
0xfbcf162d51b   15b  4c8b1568ffffff REX.W movq r10,[rip+0xffffff68]
0xfbcf162d522   162  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d527   167  4c8b155cffffff REX.W movq r10,[rip+0xffffff5c]
0xfbcf162d52e   16e  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d534   174  c4e25900e7     vpshufb xmm4,xmm4,xmm7
0xfbcf162d539   179  c4c159ebe7     vpor xmm4,xmm4,xmm15
0xfbcf162d53e   17e  c57810fa       vmovups xmm15,xmm2
0xfbcf162d542   182  4c8b151affffff REX.W movq r10,[rip+0xffffff1a]
0xfbcf162d549   189  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d54e   18e  4c8b151dffffff REX.W movq r10,[rip+0xffffff1d]
0xfbcf162d555   195  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d55b   19b  c4620100ff     vpshufb xmm15,xmm15,xmm7
0xfbcf162d560   1a0  0f10d8         movups xmm3,xmm0
0xfbcf162d563   1a3  4c8b1520ffffff REX.W movq r10,[rip+0xffffff20]
0xfbcf162d56a   1aa  c4c1f96efa     vmovq xmm7,r10
0xfbcf162d56f   1af  4c8b1514ffffff REX.W movq r10,[rip+0xffffff14]
0xfbcf162d576   1b6  c4c3c122fa01   vpinsrq xmm7,xmm7,r10,0x1
0xfbcf162d57c   1bc  c4e26100df     vpshufb xmm3,xmm3,xmm7
0xfbcf162d581   1c1  c4c161ebdf     vpor xmm3,xmm3,xmm15
0xfbcf162d586   1c6  c4e3690fd608   vpalignr xmm2,xmm2,xmm6,0x8
0xfbcf162d58c   1cc  c4e27930d2     vpmovzxbw xmm2,xmm2
0xfbcf162d591   1d1  c4e27930f6     vpmovzxbw xmm6,xmm6
0xfbcf162d596   1d6  c4e3410ffd08   vpalignr xmm7,xmm7,xmm5,0x8
0xfbcf162d59c   1dc  c4e27930ff     vpmovzxbw xmm7,xmm7
0xfbcf162d5a1   1e1  c4e27930ed     vpmovzxbw xmm5,xmm5
0xfbcf162d5a6   1e6  c463390fc408   vpalignr xmm8,xmm8,xmm4,0x8
0xfbcf162d5ac   1ec  c4427930c0     vpmovzxbw xmm8,xmm8
0xfbcf162d5b1   1f1  c4e27930e4     vpmovzxbw xmm4,xmm4
0xfbcf162d5b6   1f6  c463310fcb08   vpalignr xmm9,xmm9,xmm3,0x8
0xfbcf162d5bc   1fc  c4427930c9     vpmovzxbw xmm9,xmm9
0xfbcf162d5c1   201  c4e27930db     vpmovzxbw xmm3,xmm3
0xfbcf162d5c6   206  c5e9d5d1       vpmullw xmm2,xmm2,xmm1
0xfbcf162d5ca   20a  c5c9d5f1       vpmullw xmm6,xmm6,xmm1
0xfbcf162d5ce   20e  c5c1d5f9       vpmullw xmm7,xmm7,xmm1
0xfbcf162d5d2   212  c5d1d5e9       vpmullw xmm5,xmm5,xmm1
0xfbcf162d5d6   216  c539d5c1       vpmullw xmm8,xmm8,xmm1
0xfbcf162d5da   21a  c5d9d5e1       vpmullw xmm4,xmm4,xmm1
0xfbcf162d5de   21e  c531d5c9       vpmullw xmm9,xmm9,xmm1
0xfbcf162d5e2   222  c5e1d5d9       vpmullw xmm3,xmm3,xmm1
0xfbcf162d5e6   226  c57928d6       vmovapd xmm10,xmm6
0xfbcf162d5ea   22a  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d5ef   22f  c463010efa55   vpblendw xmm15,xmm15,xmm2,0x55
0xfbcf162d5f5   235  c443290ed7aa   vpblendw xmm10,xmm10,xmm15,0xaa
0xfbcf162d5fb   23b  c442292bd7     vpackusdw xmm10,xmm10,xmm15
0xfbcf162d600   240  c57810fa       vmovups xmm15,xmm2
0xfbcf162d604   244  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d60a   24a  c5c972d610     vpsrld xmm6,xmm6,16
0xfbcf162d60f   24f  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xfbcf162d614   254  c5f928d5       vmovapd xmm2,xmm5
0xfbcf162d618   258  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d61d   25d  c463010eff55   vpblendw xmm15,xmm15,xmm7,0x55
0xfbcf162d623   263  c4c3690ed7aa   vpblendw xmm2,xmm2,xmm15,0xaa
0xfbcf162d629   269  c4c2692bd7     vpackusdw xmm2,xmm2,xmm15
0xfbcf162d62e   26e  c57810ff       vmovups xmm15,xmm7
0xfbcf162d632   272  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d638   278  c5d172d510     vpsrld xmm5,xmm5,16
0xfbcf162d63d   27d  c4c2512bef     vpackusdw xmm5,xmm5,xmm15
0xfbcf162d642   282  c5f928fc       vmovapd xmm7,xmm4
0xfbcf162d646   286  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d64b   28b  c443010ef855   vpblendw xmm15,xmm15,xmm8,0x55
0xfbcf162d651   291  c4c3410effaa   vpblendw xmm7,xmm7,xmm15,0xaa
0xfbcf162d657   297  c4c2412bff     vpackusdw xmm7,xmm7,xmm15
0xfbcf162d65c   29c  c4417810f8     vmovups xmm15,xmm8
0xfbcf162d661   2a1  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d667   2a7  c5d972d410     vpsrld xmm4,xmm4,16
0xfbcf162d66c   2ac  c4c2592be7     vpackusdw xmm4,xmm4,xmm15
0xfbcf162d671   2b1  c57928c3       vmovapd xmm8,xmm3
0xfbcf162d675   2b5  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d67a   2ba  c443010ef955   vpblendw xmm15,xmm15,xmm9,0x55
0xfbcf162d680   2c0  c443390ec7aa   vpblendw xmm8,xmm8,xmm15,0xaa
0xfbcf162d686   2c6  c442392bc7     vpackusdw xmm8,xmm8,xmm15
0xfbcf162d68b   2cb  c4417810f9     vmovups xmm15,xmm9
0xfbcf162d690   2d0  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d696   2d6  c5e172d310     vpsrld xmm3,xmm3,16
0xfbcf162d69b   2db  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xfbcf162d6a0   2e0  c4c149fdf2     vpaddw xmm6,xmm6,xmm10
0xfbcf162d6a5   2e5  c5d1fdea       vpaddw xmm5,xmm5,xmm2
0xfbcf162d6a9   2e9  c5d9fde7       vpaddw xmm4,xmm4,xmm7
0xfbcf162d6ad   2ed  c4c161fdd8     vpaddw xmm3,xmm3,xmm8
0xfbcf162d6b2   2f2  c5f928d5       vmovapd xmm2,xmm5
0xfbcf162d6b6   2f6  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d6bb   2fb  c463010efe55   vpblendw xmm15,xmm15,xmm6,0x55
0xfbcf162d6c1   301  c4c3690ed7aa   vpblendw xmm2,xmm2,xmm15,0xaa
0xfbcf162d6c7   307  c4c2692bd7     vpackusdw xmm2,xmm2,xmm15
0xfbcf162d6cc   30c  c57810fe       vmovups xmm15,xmm6
0xfbcf162d6d0   310  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d6d6   316  c5d172d510     vpsrld xmm5,xmm5,16
0xfbcf162d6db   31b  c4c2512bef     vpackusdw xmm5,xmm5,xmm15
0xfbcf162d6e0   320  c5f928f3       vmovapd xmm6,xmm3
0xfbcf162d6e4   324  c44101efff     vpxor xmm15,xmm15,xmm15
0xfbcf162d6e9   329  c463010efc55   vpblendw xmm15,xmm15,xmm4,0x55
0xfbcf162d6ef   32f  c4c3490ef7aa   vpblendw xmm6,xmm6,xmm15,0xaa
0xfbcf162d6f5   335  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xfbcf162d6fa   33a  c57810fc       vmovups xmm15,xmm4
0xfbcf162d6fe   33e  c4c10172d710   vpsrld xmm15,xmm15,16
0xfbcf162d704   344  c5e172d310     vpsrld xmm3,xmm3,16
0xfbcf162d709   349  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xfbcf162d70e   34e  c5d1fdea       vpaddw xmm5,xmm5,xmm2
0xfbcf162d712   352  c5e1fdde       vpaddw xmm3,xmm3,xmm6
0xfbcf162d716   356  c5d171d508     vpsrlw xmm5,xmm5,8
0xfbcf162d71b   35b  c5e171d308     vpsrlw xmm3,xmm3,8
0xfbcf162d720   360  c5e167dd       vpackuswb xmm3,xmm3,xmm5
0xfbcf162d724   364  c4a17a7f1c09   vmovdqu [rcx+r9*1],xmm3
0xfbcf162d72a   36a  443be2         cmpl r12,rdx
0xfbcf162d72d   36d  0f86edfcffff   jna 0xfbcf162d420  <+0x60>
0xfbcf162d733   373  33ff           xorl rdi,rdi
0xfbcf162d735   375  41b904000000   movl r9,0x4
0xfbcf162d73b   37b  4183f9ff       cmpl r9,0xff
0xfbcf162d73f   37f  0f84e7000000   jz 0xfbcf162d82c  <+0x46c>
0xfbcf162d745   385  41c1e004       shll r8, 4
0xfbcf162d749   389  488bc2         REX.W movq rax,rdx
0xfbcf162d74c   38c  99             cdql
0xfbcf162d74d   38d  41f7f9         idivl r9
0xfbcf162d750   390  428d1403       leal rdx,[rbx+r8*1]
0xfbcf162d754   394  03d8           addl rbx,rax
0xfbcf162d756   396  3bd3           cmpl rdx,rbx
0xfbcf162d758   398  0f8777000000   ja 0xfbcf162d7d5  <+0x415>
0xfbcf162d75e   39e  4c8bc7         REX.W movq r8,rdi
0xfbcf162d761   3a1  4c8bca         REX.W movq r9,rdx
0xfbcf162d764   3a4  e90d000000     jmp 0xfbcf162d776  <+0x3b6>
0xfbcf162d769   3a9  0f1f8000000000 nop
0xfbcf162d770   3b0  4d8bc3         REX.W movq r8,r11
0xfbcf162d773   3b3  4d89e1         REX.W movq r9,r12
0xfbcf162d776   3b6  4c8b5e2f       REX.W movq r11,[rsi+0x2f]
0xfbcf162d77a   3ba  493b23         REX.W cmpq rsp,[r11]
0xfbcf162d77d   3bd  0f86ba000000   jna 0xfbcf162d83d  <+0x47d>
0xfbcf162d783   3c3  458d5804       leal r11,[r8+0x4]
0xfbcf162d787   3c7  468d241a       leal r12,[rdx+r11*1]
0xfbcf162d78b   3cb  4d8bf0         REX.W movq r14,r8
0xfbcf162d78e   3ce  4183ce01       orl r14,0x1
0xfbcf162d792   3d2  458bc9         movl r9,r9
0xfbcf162d795   3d5  4403f2         addl r14,rdx
0xfbcf162d798   3d8  4183c802       orl r8,0x2
0xfbcf162d79c   3dc  460fb63c09     movzxbl r15,[rcx+r9*1]
0xfbcf162d7a1   3e1  4403c2         addl r8,rdx
0xfbcf162d7a4   3e4  460fb63431     movzxbl r14,[rcx+r14*1]
0xfbcf162d7a9   3e9  460fb60401     movzxbl r8,[rcx+r8*1]
0xfbcf162d7ae   3ee  4569f696000000 imull r14,r14,0x96
0xfbcf162d7b5   3f5  456bff4d       imull r15,r15,0x4d
0xfbcf162d7b9   3f9  456bc01d       imull r8,r8,0x1d
0xfbcf162d7bd   3fd  4503f7         addl r14,r15
0xfbcf162d7c0   400  478d843080000000 leal r8,[r8+r14*1+0x80]
0xfbcf162d7c8   408  41c1e808       shrl r8, 8
0xfbcf162d7cc   40c  46880409       movb [rcx+r9*1],r8l
0xfbcf162d7d0   410  443be3         cmpl r12,rbx
0xfbcf162d7d3   413  769b           jna 0xfbcf162d770  <+0x3b0>
0xfbcf162d7d5   415  488bc7         REX.W movq rax,rdi
0xfbcf162d7d8   418  488be5         REX.W movq rsp,rbp
0xfbcf162d7db   41b  5d             pop rbp
0xfbcf162d7dc   41c  c3             retl
0xfbcf162d7dd   41d  48894de0       REX.W movq [rbp-0x20],rcx
0xfbcf162d7e1   421  48895de8       REX.W movq [rbp-0x18],rbx
0xfbcf162d7e5   425  488955d8       REX.W movq [rbp-0x28],rdx
0xfbcf162d7e9   429  4c8945d0       REX.W movq [rbp-0x30],r8
0xfbcf162d7ed   42d  c5f8114db0     vmovups [rbp-0x50],xmm1
0xfbcf162d7f2   432  c5f81145a0     vmovups [rbp-0x60],xmm0
0xfbcf162d7f7   437  4c894dc8       REX.W movq [rbp-0x38],r9
0xfbcf162d7fb   43b  e8c04affff     call 0xfbcf16222c0       ;; wasm stub: WasmStackGuard
0xfbcf162d800   440  488b4de0       REX.W movq rcx,[rbp-0x20]
0xfbcf162d804   444  488b75f0       REX.W movq rsi,[rbp-0x10]
0xfbcf162d808   448  488b5de8       REX.W movq rbx,[rbp-0x18]
0xfbcf162d80c   44c  bf04000000     movl rdi,0x4
0xfbcf162d811   451  488b55d8       REX.W movq rdx,[rbp-0x28]
0xfbcf162d815   455  4c8b45d0       REX.W movq r8,[rbp-0x30]
0xfbcf162d819   459  c5f8104db0     vmovups xmm1,[rbp-0x50]
0xfbcf162d81e   45e  c5f81045a0     vmovups xmm0,[rbp-0x60]
0xfbcf162d823   463  4c8b4dc8       REX.W movq r9,[rbp-0x38]
0xfbcf162d827   467  e907fcffff     jmp 0xfbcf162d433  <+0x73>
0xfbcf162d82c   46c  81fa00000080   cmpl rdx,0x80000000
0xfbcf162d832   472  0f843d000000   jz 0xfbcf162d875  <+0x4b5>
0xfbcf162d838   478  e908ffffff     jmp 0xfbcf162d745  <+0x385>
0xfbcf162d83d   47d  48895de8       REX.W movq [rbp-0x18],rbx
0xfbcf162d841   481  48894de0       REX.W movq [rbp-0x20],rcx
0xfbcf162d845   485  488955d8       REX.W movq [rbp-0x28],rdx
0xfbcf162d849   489  4c8945d0       REX.W movq [rbp-0x30],r8
0xfbcf162d84d   48d  4c894dc8       REX.W movq [rbp-0x38],r9
0xfbcf162d851   491  e86a4affff     call 0xfbcf16222c0       ;; wasm stub: WasmStackGuard
0xfbcf162d856   496  33ff           xorl rdi,rdi
0xfbcf162d858   498  488b5de8       REX.W movq rbx,[rbp-0x18]
0xfbcf162d85c   49c  488b4de0       REX.W movq rcx,[rbp-0x20]
0xfbcf162d860   4a0  488b55d8       REX.W movq rdx,[rbp-0x28]
0xfbcf162d864   4a4  4c8b45d0       REX.W movq r8,[rbp-0x30]
0xfbcf162d868   4a8  4c8b4dc8       REX.W movq r9,[rbp-0x38]
0xfbcf162d86c   4ac  488b75f0       REX.W movq rsi,[rbp-0x10]
0xfbcf162d870   4b0  e90effffff     jmp 0xfbcf162d783  <+0x3c3>
0xfbcf162d875   4b5  e84648ffff     call 0xfbcf16220c0       ;; wasm stub: ThrowWasmTrapDivUnrepresentable
0xfbcf162d87a   4ba  e81148ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d87f   4bf  e80c48ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d884   4c4  e80748ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d889   4c9  e80248ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d88e   4ce  e8fd47ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d893   4d3  e8f847ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d898   4d8  e8f347ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d89d   4dd  e8ee47ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d8a2   4e2  e8e947ffff     call 0xfbcf1622090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xfbcf162d8a7   4e7  90             nop

相关内容

  • 没有找到相关文章

最新更新