/// <summary>
/// Console micro-benchmark that compares element-wise addition of two int arrays
/// using a plain scalar loop against a <see cref="Vector{T}"/>-based loop.
/// </summary>
class Program
{
    // Number of elements in each benchmark array.
    private const int ElementCount = 10000000;

    static void Main(string[] args)
    {
        Console.WriteLine(Vector.IsHardwareAccelerated ? "SIMD supported" : "SIMD not supported.");

        var rand = new Random();
        var arr1 = CreateRandomArray(rand, ElementCount);
        var arr2 = CreateRandomArray(rand, ElementCount);
        var simdResult = new int[ElementCount];
        var conventionalResult = new int[ElementCount];

        // Call both implementations once, untimed, so that one-time JIT compilation
        // of each method is not charged to whichever call is measured first.
        WarmUp();

        var watch = System.Diagnostics.Stopwatch.StartNew();
        ConventionalArrayAddition(arr1, arr2, conventionalResult);
        watch.Stop();
        Console.WriteLine("Conventional time :" + watch.ElapsedMilliseconds);

        var watch2 = System.Diagnostics.Stopwatch.StartNew();
        SIMDArrayAddition(arr1, arr2, simdResult);
        watch2.Stop();
        Console.WriteLine("Simd time :" + watch2.ElapsedMilliseconds);

        // Console.ReadKey throws InvalidOperationException when stdin is redirected
        // (piped input, CI runs), so only wait for a key on an interactive console.
        if (!Console.IsInputRedirected)
        {
            Console.ReadKey();
        }
    }

    /// <summary>
    /// Creates an array of <paramref name="length"/> values, each computed as
    /// <c>(int)(rand.NextDouble() * 100)</c>.
    /// </summary>
    private static int[] CreateRandomArray(Random rand, int length)
    {
        var values = new int[length];
        for (var i = 0; i < values.Length; i++)
        {
            values[i] = (int)(rand.NextDouble() * 100);
        }
        return values;
    }

    /// <summary>
    /// Runs both addition routines once on tiny arrays before any timing starts.
    /// </summary>
    private static void WarmUp()
    {
        // Two full vectors plus one element, so the vector loop and the scalar
        // tail loop of SIMDArrayAddition are both executed.
        var length = Vector<int>.Count * 2 + 1;
        var a = new int[length];
        var b = new int[length];
        var sink = new int[length];
        ConventionalArrayAddition(a, b, sink);
        SIMDArrayAddition(a, b, sink);
    }

    /// <summary>
    /// Writes <c>lhs[i] + rhs[i]</c> into <c>result[i]</c> for every index of
    /// <paramref name="lhs"/>, processing <c>Vector&lt;int&gt;.Count</c> elements per
    /// iteration and finishing any remainder with a scalar loop.
    /// </summary>
    /// <param name="lhs">First operand; its length determines how many elements are processed.</param>
    /// <param name="rhs">Second operand; must be at least as long as <paramref name="lhs"/>.</param>
    /// <param name="result">Destination; must be at least as long as <paramref name="lhs"/>.</param>
    public static void SIMDArrayAddition(int[] lhs, int[] rhs, int[] result)
    {
        var simdLength = Vector<int>.Count;
        // Last index at which a whole vector still fits inside lhs; negative when
        // lhs is shorter than one vector, in which case the vector loop is skipped.
        var lastVectorStart = lhs.Length - simdLength;
        var i = 0;
        for (; i <= lastVectorStart; i += simdLength)
        {
            var va = new Vector<int>(lhs, i);
            var vb = new Vector<int>(rhs, i);
            (va + vb).CopyTo(result, i);
        }

        // Scalar tail for the elements that do not fill a whole vector.
        for (; i < lhs.Length; ++i)
        {
            result[i] = lhs[i] + rhs[i];
        }
    }

    /// <summary>
    /// Writes <c>lhs[i] + rhs[i]</c> into <c>result[i]</c> for every index of
    /// <paramref name="lhs"/> using a plain scalar loop.
    /// </summary>
    /// <param name="lhs">First operand; its length determines how many elements are processed.</param>
    /// <param name="rhs">Second operand; must be at least as long as <paramref name="lhs"/>.</param>
    /// <param name="result">Destination; must be at least as long as <paramref name="lhs"/>.</param>
    public static void ConventionalArrayAddition(int[] lhs, int[] rhs, int[] result)
    {
        for (int i = 0; i < lhs.Length; i++)
        {
            result[i] = lhs[i] + rhs[i];
        }
    }
}
此代码改编自以下网址上的一个示例:https://instil.co/2016/03/21/parallelism-on-a-single-core-simd-with-c/
我将其编译为 .NET Framework 控制台应用程序(4.6.1 和 4.7 都试过),目标平台为 x64,并勾选了"优化代码"。
我得到的结果是:
常规时间:22
SIMD 时间:23
如果我在 .NET Core 中进行类似的测试,向量化方法确实更快,但这只是因为在 .NET Core 下朴素实现要慢得多(大约需要 55 ms)。.NET Core 中的向量化实现通常比我在 .NET Framework 中得到的结果还稍慢一些(约 24 ms)。
我的处理器是i5-7500T,我在i5-7200上也得到了类似的结果。
是不是还有什么我忽略了的简单设置?或者是编译器做了某种优化,在朴素实现的代码中也使用了 SIMD 指令?
更新:按照 https://blogs.msdn.microsoft.com/clrcodegeneration/2007/10/19/how-to-see-the-assembly-code-generated-by-the-jit-using-visual-studio/ 中的说明,以下是 ConventionalArrayAddition() 经 JIT 生成的汇编代码:
for (int i = 0; i < lhs.Length; i++)
00000000 sub rsp,28h
00000004 xor eax,eax
00000006 mov r9d,dword ptr [rcx+8]
0000000a test r9d,r9d
0000000d jle 000000000000008A
0000000f test rdx,rdx
00000012 setne r10b
00000016 movzx r10d,r10b
0000001a and r10d,1
0000001e test r8,r8
00000021 setne r11b
00000025 movzx r11d,r11b
00000029 test r11d,r10d
0000002c je 0000000000000066
0000002e cmp dword ptr [rdx+8],r9d
00000032 setge r10b
00000036 movzx r10d,r10b
0000003a cmp dword ptr [r8+8],r9d
0000003e setge r11b
00000042 movzx r11d,r11b
00000046 test r11d,r10d
00000049 je 0000000000000066
{
result[i] = lhs[i] + rhs[i];
0000004b movsxd r10,eax
0000004e mov r11d,dword ptr [rcx+r10*4+10h]
00000053 add r11d,dword ptr [rdx+r10*4+10h]
00000058 mov dword ptr [r8+r10*4+10h],r11d
for (int i = 0; i < lhs.Length; i++)
0000005d inc eax
0000005f cmp r9d,eax
00000062 jg 000000000000004B
00000064 jmp 000000000000008A
00000066 movsxd r10,eax
00000069 mov r11d,dword ptr [rcx+r10*4+10h]
0000006e cmp eax,dword ptr [rdx+8]
00000071 jae 000000000000008F
00000073 add r11d,dword ptr [rdx+r10*4+10h]
00000078 cmp eax,dword ptr [r8+8]
0000007c jae 000000000000008F
0000007e mov dword ptr [r8+r10*4+10h],r11d
00000083 inc eax
00000085 cmp r9d,eax
00000088 jg 0000000000000066
0000008a add rsp,28h
}
}
0000008e ret
0000008f call 000000005FA91300
00000094 int 3
以下是 SIMDArrayAddition() 的汇编代码:
var simdLength = Vector<int>.Count;
00000000 push rdi
00000001 push rsi
00000002 sub rsp,28h
00000006 vzeroupper
00000009 xor eax,eax
for (; i <= lhs.Length - simdLength; i += simdLength)
0000000b mov r9d,dword ptr [rcx+8]
0000000f mov r10d,r9d
00000012 sub r10d,8
00000016 test r10d,r10d
00000019 jl 0000000000000064
0000001b mov r11d,dword ptr [rdx+8]
0000001f mov esi,dword ptr [r8+8]
00000023 cmp eax,r9d
00000026 jae 00000000000000A2
00000028 lea edi,[rax+7]
0000002b cmp edi,r9d
0000002e jae 00000000000000A2
00000030 vmovupd ymm0,ymmword ptr [rcx+rax*4+10h]
var vb = new Vector<int>(rhs, i);
00000037 cmp eax,r11d
0000003a jae 00000000000000A2
0000003c cmp edi,r11d
0000003f jae 00000000000000A2
00000041 vmovupd ymm1,ymmword ptr [rdx+rax*4+10h]
(va + vb).CopyTo(result, i);
00000048 vpaddd ymm0,ymm0,ymm1
0000004d cmp eax,esi
0000004f jae 00000000000000A7
00000051 cmp edi,esi
00000053 jae 00000000000000AC
00000055 vmovupd ymmword ptr [r8+rax*4+10h],ymm0
for (; i <= lhs.Length - simdLength; i += simdLength)
0000005c add eax,8
0000005f cmp r10d,eax
00000062 jge 0000000000000023
}
for (; i < lhs.Length; ++i)
00000064 cmp r9d,eax
00000067 jle 0000000000000098
00000069 mov r11d,dword ptr [rdx+8]
0000006d mov esi,dword ptr [r8+8]
{
result[i] = lhs[i] + rhs[i];
00000071 cmp eax,r9d
00000074 jae 00000000000000A2
00000076 movsxd r10,eax
00000079 mov edi,dword ptr [rcx+r10*4+10h]
0000007e cmp eax,r11d
00000081 jae 00000000000000A2
00000083 add edi,dword ptr [rdx+r10*4+10h]
00000088 cmp eax,esi
0000008a jae 00000000000000A2
0000008c mov dword ptr [r8+r10*4+10h],edi
for (; i < lhs.Length; ++i)
00000091 inc eax
00000093 cmp r9d,eax
00000096 jg 0000000000000071
00000098 vzeroupper
}
}
0000009b add rsp,28h
0000009f pop rsi
000000a0 pop rdi
000000a1 ret
000000a2 call 000000005FA91250
000000a7 call 000000005FA91B00
000000ac call 000000005FA91A50
000000b1 int 3
以上汇编是在另一台机器(i7-4790)上获得的,该机器上测得的耗时与前述结果相近。
将实现改为就地累加(AddTo)的形式,
减少源数组和目标数组的数量,可将性能提高约 70%。这种就地加法在许多场景下都适用,大多数 CPU 内部的加法指令也是这样工作的,它降低了对内存带宽和缓存的需求。
/// <summary>
/// Adds each element of <paramref name="rhs"/> to the element of
/// <paramref name="lhs"/> at the same index, storing the sum back into
/// <paramref name="lhs"/>.
/// </summary>
/// <param name="lhs">Array that is both an operand and the destination; its length determines how many elements are processed.</param>
/// <param name="rhs">Array of values to add; must be at least as long as <paramref name="lhs"/>.</param>
public static void SIMDArrayAddTo(int[] lhs, int[] rhs)
{
    int lanes = Vector<int>.Count;
    // Highest start index at which a full vector still fits inside lhs.
    int lastVectorStart = lhs.Length - lanes;
    int pos = 0;

    // Vector part: one full vector of elements per iteration.
    while (pos <= lastVectorStart)
    {
        Vector<int> sum = new Vector<int>(lhs, pos) + new Vector<int>(rhs, pos);
        sum.CopyTo(lhs, pos);
        pos += lanes;
    }

    // Scalar part: the remaining elements that do not fill a vector.
    while (pos < lhs.Length)
    {
        lhs[pos] += rhs[pos];
        pos++;
    }
}
我还尝试过对 SSE 循环做循环展开,但似乎没有帮助。HPCsharp NuGet 包中已加入与此类似的实现,其中也包括多核版本。
此外,在上述函数的基础上加入多核并行并没有提高性能。如果有人能用上内存通道多于 2 个的 CPU,那么看看在系统内存带宽更充裕时这段代码的扩展性如何,会很有意思。