CUDA SIMD instructions for per-byte multiplication with unsigned saturation



CUDA has a nice set of integer SIMD intrinsics that enable efficient SIMD computation. Among them are functions that compute per-byte or per-halfword addition and subtraction (such as __vadd2 and __vadd4). However, I could not find a comparable function that computes a per-byte multiplication of a 32-bit register. I would appreciate it if someone could point me to a suitable solution.
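For reference, a minimal sketch of how the existing per-byte SIMD intrinsics mentioned above are used (the kernel name and operand values are purely illustrative; __vaddus4 is the unsigned-saturating counterpart of __vadd4):

#include <cstdio>
__global__ void simd_add_demo(){
  unsigned int a = 0x40808001u;        // packed bytes 0x40, 0x80, 0x80, 0x01 (illustrative values)
  unsigned int b = 0x01808002u;        // packed bytes 0x01, 0x80, 0x80, 0x02
  unsigned int wrap = __vadd4(a, b);   // per-byte add, wraps on overflow
  unsigned int sat  = __vaddus4(a, b); // per-byte add with unsigned saturation
  printf("wrap = %08x  sat = %08x\n", wrap, sat);
}
int main(){
  simd_add_demo<<<1,1>>>();
  cudaDeviceSynchronize();
}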

However, I could not find a comparable function that computes a per-byte multiplication of a 32-bit register.

There isn't one that returns the 4 individual products.

The closest thing is the __dp4a() intrinsic, which returns the sum of the 4 products as a single 32-bit integer.
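For reference, a minimal sketch of what __dp4a() gives you (it requires compute capability 6.1 or newer, i.e. compile with -arch=sm_61 or higher; the kernel name is illustrative and the byte values mirror the test further below):

#include <cstdio>
__global__ void dp4a_demo(){
  // packed unsigned bytes a = {1,2,4,8}, b = {64,64,64,1}
  unsigned int a = (8u << 24) | (4u << 16) | (2u << 8) | 1u;
  unsigned int b = (1u << 24) | (64u << 16) | (64u << 8) | 64u;
  // __dp4a returns the single 32-bit sum of the four byte products plus the accumulator,
  // i.e. 1*64 + 2*64 + 4*64 + 8*1 = 456, not the four individual products
  unsigned int dot = __dp4a(a, b, 0u);
  printf("dp4a = %u\n", dot);
}
int main(){
  dp4a_demo<<<1,1>>>();
  cudaDeviceSynchronize();
}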

You could write a saturating 8-bit packed unsigned multiply like this:

$ cat t2048.cu
#include <cstdio>
#include <cstdint>
__host__ __device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){
  const unsigned sv = 255;
  uchar4 result;
  unsigned t;
  t = a.x*b.x;
  if (t > sv) t = sv;
  result.x = t;
  t = a.y*b.y;
  if (t > sv) t = sv;
  result.y = t;
  t = a.z*b.z;
  if (t > sv) t = sv;
  result.z = t;
  t = a.w*b.w;
  if (t > sv) t = sv;
  result.w = t;
  return result;
}
__global__ void k(uchar4 a, uchar4 b, uchar4 *c){
  *c = u8mulsat(a, b);
}
int main(){
  uchar4 a,b,c, *d_c;
  cudaMalloc(&d_c, sizeof(uchar4));
  a.x = 1;
  a.y = 2;
  a.z = 4;
  a.w = 8;
  b.x = 64;
  b.y = 64;
  b.z = 64;
  b.w = 1;
  k<<<1,1>>>(a, b, d_c);
  cudaMemcpy(&c, d_c, sizeof(uchar4), cudaMemcpyDeviceToHost);
  printf("c.x = %u\n", (unsigned)c.x);
  printf("c.y = %u\n", (unsigned)c.y);
  printf("c.z = %u\n", (unsigned)c.z);
  printf("c.w = %u\n", (unsigned)c.w);
}
$ nvcc -o t2048 t2048.cu
$ compute-sanitizer ./t2048
========= COMPUTE-SANITIZER
c.x = 64
c.y = 128
c.z = 255
c.w = 8
========= ERROR SUMMARY: 0 errors
$ cuobjdump -sass ./t2048
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Function : _Z1k6uchar4S_PS_
.headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001c4400e22007f6 */
/*0008*/                   MOV R1, c[0x0][0x20] ;        /* 0x4c98078000870001 */
/*0010*/                   LDC.U8 R0, c[0x0][0x140] ;    /* 0xef9000001407ff00 */
/*0018*/                   LDC.U8 R2, c[0x0][0x144] ;    /* 0xef9000001447ff02 */
/* 0x001d4400e6200731 */
/*0028*/                   LDC.U8 R3, c[0x0][0x141] ;    /* 0xef9000001417ff03 */
/*0030*/                   LDC.U8 R4, c[0x0][0x145] ;    /* 0xef9000001457ff04 */
/*0038*/                   LDC.U8 R5, c[0x0][0x142] ;    /* 0xef9000001427ff05 */
/* 0x001dfc00ee200751 */
/*0048*/                   LDC.U8 R6, c[0x0][0x146] ;    /* 0xef9000001467ff06 */
/*0050*/                   LDC.U8 R7, c[0x0][0x143] ;    /* 0xef9000001437ff07 */
/*0058*/                   LDC.U8 R8, c[0x0][0x147] ;    /* 0xef9000001477ff08 */
/* 0x009fd002fe200fe1 */
/*0068*/                   XMAD R0, R2, R0, RZ ;         /* 0x5b007f8000070200 */
/*0070*/                   XMAD R2, R4, R3, RZ ;         /* 0x5b007f8000370402 */
/*0078*/                   XMAD R3, R6, R5, RZ ;         /* 0x5b007f8000570603 */
/* 0x001fc408fe2007f1 */
/*0088*/                   IMNMX.U32 R0, R0, 0xff, PT ;  /* 0x382003800ff70000 */
/*0090*/                   XMAD R4, R8, R7, RZ ;         /* 0x5b007f8000770804 */
/*0098*/                   IMNMX.U32 R2, R2, 0xff, PT ;  /* 0x382003800ff70202 */
/* 0x001fc400fe2007e4 */
/*00a8*/                   IMNMX.U32 R3, R3, 0xff, PT ;  /* 0x382003800ff70303 */
/*00b0*/                   IMNMX.U32 R4, R4, 0xff, PT ;  /* 0x382003800ff70404 */
/*00b8*/                   BFI R0, R2, 0x808, R0 ;       /* 0x36f0000080870200 */
/* 0x001fd400fe2007f5 */
/*00c8*/                   MOV R2, c[0x0][0x148] ;       /* 0x4c98078005270002 */
/*00d0*/                   BFI R5, R3, 0x810, R0 ;       /* 0x36f0000081070305 */
/*00d8*/                   MOV R3, c[0x0][0x14c] ;       /* 0x4c98078005370003 */
/* 0x001ffc00fe2007e2 */
/*00e8*/                   BFI R4, R4, 0x818, R5 ;       /* 0x36f0028081870404 */
/*00f0*/                   STG.E [R2], R4 ;              /* 0xeedc200000070204 */
/*00f8*/                   EXIT ;                        /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0108*/                   BRA 0x100 ;                   /* 0xe2400fffff07000f */
/*0110*/                   NOP;                          /* 0x50b0000000070f00 */
/*0118*/                   NOP;                          /* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*0128*/                   NOP;                          /* 0x50b0000000070f00 */
/*0130*/                   NOP;                          /* 0x50b0000000070f00 */
/*0138*/                   NOP;                          /* 0x50b0000000070f00 */
..........

Fatbin ptx code:
================
arch = sm_52
code version = [7,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$

The SASS code is about as long as I would expect, roughly matching the length of the C++ code, if we ignore the LDC and STG instructions.

FWIW, on a Tesla V100 with CUDA 11.4, njuffa's implementation and mine are quite close in register usage (njuffa: 16, mine: 17) and performance (njuffa's is about 1% faster):

$ cat t2048.cu
#include <iostream>
#include <cstdint>
__device__ unsigned int vmulus4 (unsigned int a, unsigned int b)
{
  unsigned int plo, phi, res;
  // compute products
  plo = ((a & 0x000000ff) * (b & 0x000000ff) +
         (a & 0x0000ff00) * (b & 0x0000ff00));
  phi = (__umulhi (a & 0x00ff0000, b & 0x00ff0000) +
         __umulhi (a & 0xff000000, b & 0xff000000));
  // clamp products to 255
  plo |= __vcmpne2 (plo & 0xff00ff00, 0x00000000);
  phi |= __vcmpne2 (phi & 0xff00ff00, 0x00000000);
  // extract least significant eight bits of each product
  res = __byte_perm (plo, phi, 0x6420);
  return res;
}
__host__ __device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){
  const unsigned sv = 255;
  uchar4 result;
  unsigned t;
  t = a.x*b.x;
  if (t > sv) t = sv;
  result.x = t;
  t = a.y*b.y;
  if (t > sv) t = sv;
  result.y = t;
  t = a.z*b.z;
  if (t > sv) t = sv;
  result.z = t;
  t = a.w*b.w;
  if (t > sv) t = sv;
  result.w = t;
  return result;
}
__global__ void k(const uchar4 * __restrict__ a, const uchar4 * __restrict__ b, uchar4 * __restrict__ c, unsigned N){
  unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
  if (idx < N)
    c[idx] = u8mulsat(a[idx], b[idx]);
}
__global__ void k1(const unsigned * __restrict__ a, const unsigned * __restrict__ b, unsigned * __restrict__ c, unsigned N){
  unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
  if (idx < N)
    c[idx] = vmulus4(a[idx], b[idx]);
}
int main(){
  unsigned N = 256U*80U*8U*400U;
  uchar4 *d_a,*d_b, *d_c;
  cudaMalloc(&d_c, sizeof(uchar4)*N);
  cudaMalloc(&d_a, sizeof(uchar4)*N);
  cudaMalloc(&d_b, sizeof(uchar4)*N);
  for (int i = 0; i < 100; i++) {
    k<<<N/256,256>>>(d_a, d_b, d_c, N);
    k1<<<N/256,256>>>((unsigned *)d_a, (unsigned *)d_b, (unsigned *)d_c, N);}
  cudaDeviceSynchronize();
}
$ nvcc -o t2048 t2048.cu -arch=sm_70 -Xptxas -v
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z2k1PKjS0_Pjj' for 'sm_70'
ptxas info    : Function properties for _Z2k1PKjS0_Pjj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 16 registers, 380 bytes cmem[0]
ptxas info    : Compiling entry function '_Z1kPK6uchar4S1_PS_j' for 'sm_70'
ptxas info    : Function properties for _Z1kPK6uchar4S1_PS_j
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 17 registers, 380 bytes cmem[0]
$ nvprof ./t2048
==2696== NVPROF is profiling process 2696, command: ./t2048
==2696== Profiling application: ./t2048
==2696== Profiling result:
Type  Time(%)      Time     Calls       Avg       Min       Max  Name
GPU activities:   50.21%  100.24ms       100  1.0024ms  998.26us  1.0084ms  k(uchar4 const *, uchar4 const *, uchar4*, unsigned int)
49.79%  99.412ms       100  994.12us  990.33us  1.0015ms  k1(unsigned int const *, unsigned int const *, unsigned int*, unsigned int)
API calls:   57.39%  279.76ms         3  93.254ms  557.75us  278.64ms  cudaMalloc
40.69%  198.31ms         1  198.31ms  198.31ms  198.31ms  cudaDeviceSynchronize
1.03%  5.0147ms         4  1.2537ms  589.80us  3.2328ms  cuDeviceTotalMem
0.51%  2.4799ms       404  6.1380us     333ns  272.34us  cuDeviceGetAttribute
0.30%  1.4715ms       200  7.3570us  6.5220us  68.684us  cudaLaunchKernel
0.07%  354.69us         4  88.672us  61.927us  166.60us  cuDeviceGetName
0.00%  20.956us         4  5.2390us  3.1200us  7.8000us  cuDeviceGetPCIBusId
0.00%  10.445us         8  1.3050us     522ns  4.9100us  cuDeviceGet
0.00%  3.7970us         4     949ns     780ns  1.2230us  cuDeviceGetUuid
0.00%  3.2030us         3  1.0670us     751ns  1.5050us  cuDeviceGetCount
$

Later: here is a slightly faster routine (a few percent faster on sm_70) compared to my previous one:

__device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){
  uchar4 result;
  const half sv = 255;
  const short svi = 255;
  __half2 ah2, bh2, rh2;
  ah2 = __floats2half2_rn(a.x, a.y);
  bh2 = __floats2half2_rn(b.x, b.y);
  rh2 = __hmul2(ah2, bh2);
  result.x = (rh2.x > sv) ? (svi):((short)rh2.x);
  result.y = (rh2.y > sv) ? (svi):((short)rh2.y);
  ah2 = __floats2half2_rn(a.z, a.w);
  bh2 = __floats2half2_rn(b.z, b.w);
  rh2 = __hmul2(ah2, bh2);
  result.z = (rh2.x > sv) ? (svi):((short)rh2.x);
  result.w = (rh2.y > sv) ? (svi):((short)rh2.y);
  return result;
}

Its drawback is that it uses the CUDA half-precision intrinsics, so it is less "portable", and likewise cannot be decorated with __host__.

There is no __vmulus8() intrinsic in CUDA. However, it can be emulated using existing intrinsics. Basically, we can use two 32-bit variables to hold the four 16-bit products of the four 8-bit quantities. We then clamp each product to 255 and, with the help of a permute operation, extract the least significant byte of each product into the final result. The code generated by CUDA 11 for compute capability >= 7.0 looks reasonable. Whether the performance is sufficient will depend on the use case; it should be if this operation sits in the middle of a processing pipeline that computes with packed bytes throughout.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/* byte-wise multiply with unsigned saturation */
__device__ unsigned int vmulus4 (unsigned int a, unsigned int b)
{
    unsigned int plo, phi, res;
    // compute products
    plo = ((a & 0x000000ff) * (b & 0x000000ff) +
           (a & 0x0000ff00) * (b & 0x0000ff00));
    phi = (__umulhi (a & 0x00ff0000, b & 0x00ff0000) +
           __umulhi (a & 0xff000000, b & 0xff000000));
    // clamp products to 255
    plo |= __vcmpne2 (plo & 0xff00ff00, 0x00000000);
    phi |= __vcmpne2 (phi & 0xff00ff00, 0x00000000);
    // extract least significant eight bits of each product
    res = __byte_perm (plo, phi, 0x6420);
    return res;
}
__global__ void kernel (unsigned int a, unsigned int b, unsigned int *res)
{
    *res = vmulus4 (a, b);
}
unsigned int vmulus4_ref (unsigned int a, unsigned int b)
{
    unsigned char a0, a1, a2, a3, b0, b1, b2, b3;
    unsigned int p0, p1, p2, p3;
    a0 = (a >>  0) & 0xff;
    a1 = (a >>  8) & 0xff;
    a2 = (a >> 16) & 0xff;
    a3 = (a >> 24) & 0xff;
    b0 = (b >>  0) & 0xff;
    b1 = (b >>  8) & 0xff;
    b2 = (b >> 16) & 0xff;
    b3 = (b >> 24) & 0xff;
    p0 = (unsigned int)a0 * (unsigned int)b0;
    p1 = (unsigned int)a1 * (unsigned int)b1;
    p2 = (unsigned int)a2 * (unsigned int)b2;
    p3 = (unsigned int)a3 * (unsigned int)b3;
    if (p0 > 255) p0 = 255;
    if (p1 > 255) p1 = 255;
    if (p2 > 255) p2 = 255;
    if (p3 > 255) p3 = 255;
    return (p0 << 0) + (p1 << 8) + (p2 << 16) + (p3 << 24);
}
// George Marsaglia's KISS PRNG, period 2**123. Newsgroup sci.math, 21 Jan 1999
// Bug fix: Greg Rose, "KISS: A Bit Too Simple" http://eprint.iacr.org/2011/007
static uint32_t kiss_z=362436069, kiss_w=521288629;
static uint32_t kiss_jsr=123456789, kiss_jcong=380116160;
#define znew (kiss_z=36969*(kiss_z&65535)+(kiss_z>>16))
#define wnew (kiss_w=18000*(kiss_w&65535)+(kiss_w>>16))
#define MWC  ((znew<<16)+wnew )
#define SHR3 (kiss_jsr^=(kiss_jsr<<13),kiss_jsr^=(kiss_jsr>>17), \
              kiss_jsr^=(kiss_jsr<<5))
#define CONG (kiss_jcong=69069*kiss_jcong+1234567)
#define KISS ((MWC^CONG)+SHR3)
int main (void)
{
    unsigned int *resD = 0;
    unsigned int a, b, res, ref;
    cudaMalloc ((void**)&resD, sizeof resD[0]);
    for (int i = 0; i < 1000000; i++) {
        a = KISS;
        b = KISS;
        kernel<<<1,1>>>(a, b, resD);
        cudaMemcpy (&res, resD, sizeof res, cudaMemcpyDeviceToHost);
        ref = vmulus4_ref (a, b);
        if (res != ref) {
            printf ("error: a=%08x b=%08x res=%08x ref=%08x\n", a, b, res, ref);
            return EXIT_FAILURE;
        }
    }
    cudaFree (resD);
    return EXIT_SUCCESS;
}
