在单个流中使用对等gpu

在我目前的项目中，我使用gpu进行信号处理和可视化。我已经使用流来允许异步操作。信号在帧中处理，对于每个帧，流中的处理步骤如下

内存到设备
图像处理

现在这些步骤发生在单个GPU上，但是我的机器有一个多GPU卡(GeForce GTX 690)，我想在两个设备之间分配操作。基本上，我想在设备A上执行步骤1和2，在设备B上执行步骤3和4，而操作1,2,3和4仍然作为单个异步流执行。期望的结果是一个流布局，看起来像这样

Device A Stream a 1 2       1 2  ...
         Stream b      1 2  ...
Device B Stream a    3 4       3 4 ...
         Stream b         3 4  ...

我该怎么做?

我之前的尝试是不正确的，因为流与它创建的设备相关联。所以我认为对你题目中提出的问题最直接的回答是"这是不可能做到的"。您不能创建单个流并从它向多个GPU发出命令。从这里:

Stream and Event Behavior
A kernel launch or memory copy will fail if it is issued to a stream that is not associated to the current device

然而，在研究它的时候，我注意到事件是一种在两个不同设备上同步两个流的建议方法:

cudaStreamWaitEvent()将成功，即使输入流和输入事件关联到不同的设备。cudaStreamWaitEvent()可以因此，用于多个设备之间的同步。

因此，按照这种思路，我创建了以下代码来说明这一点:

#include <stdio.h>
#define SIZE 32
#define K1VAL 5
#define K3VAL 3
#define cudaCheckErrors(msg) 
    do { 
        cudaError_t __err = cudaGetLastError(); 
        if (__err != cudaSuccess) { 
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)n", 
                msg, cudaGetErrorString(__err), 
                __FILE__, __LINE__); 
            fprintf(stderr, "*** FAILED - ABORTINGn"); 
            exit(1); 
        } 
    } while (0)

__global__ void kernel1(int *frame, int size){
  int idx = threadIdx.x + (blockDim.x * blockIdx.x);
  if (idx == 0){
   int *a = new int[10000]; // just to make this kernel take a while
   for (int i = 0; i<10000; i++)
     a[i] = 0;
   for (int i = 0; i < size; i++)
     frame[i] += K1VAL;
   }
}

__global__ void kernel3(int *frame, int size){
  int idx = threadIdx.x + (blockDim.x * blockIdx.x);
  if (idx == 0)
   for (int i = 0; i < size; i++)
     frame[i] -= K3VAL;
}
void set_device(int dev){
  int ldev;
  cudaSetDevice(dev);
  cudaGetDevice(&ldev);
  cudaCheckErrors("set device error");
  if (ldev != dev){
    printf("set device mismatch errorn");
    exit(1);
    }
}

int main(){
  int A=0;
  int B=1;
  int framesize = SIZE*sizeof(int);
  int *h_frame;
  int *d_frame_aA,  *d_frame_bB;
  int numdev = 0;
  cudaGetDeviceCount(&numdev);
  cudaCheckErrors("can't determine number of devices");
  if (numdev < 2){
   printf("not enough devices!n");
   return 1;
   }
  set_device(A);
  cudaMalloc((void **) &d_frame_aA, framesize);  // stream_a
  cudaMemset(d_frame_aA, 0, framesize);
  set_device(B);
  cudaMalloc((void **) &d_frame_bB, framesize);  // stream_b
  cudaMemset(d_frame_bB, 0, framesize);
  cudaHostAlloc((void **) &h_frame, framesize, cudaHostAllocDefault);
  cudaCheckErrors("allocations failure");
  set_device(A);
  cudaStream_t stream_a, stream_b;
  cudaStreamCreate(&stream_a);
  cudaEvent_t absync;
  cudaEventCreate(&absync);
  set_device(B);
  cudaStreamCreate(&stream_b);
  cudaCheckErrors("stream creation failure");

  for (int i = 0; i < SIZE; i++)
    h_frame[i] = 0;
  set_device(A);
  cudaDeviceEnablePeerAccess(B, 0);
  set_device(B);
  cudaDeviceEnablePeerAccess(A, 0);
  cudaCheckErrors("enable peer access fail");

  set_device(A);
  cudaMemcpyAsync(d_frame_aA, h_frame, framesize, cudaMemcpyHostToDevice, stream_a);
  kernel1<<<1,1,0, stream_a>>>(d_frame_aA, SIZE);
  cudaCheckErrors("kernel1 fail");
  cudaMemcpyPeerAsync(d_frame_bB, B, d_frame_aA, A, framesize, stream_a );
  cudaCheckErrors("memcpypeer fail");
  cudaEventRecord(absync, stream_a);
  set_device(B);
  // comment out the next line to see the failure
  cudaStreamWaitEvent(stream_b, absync, 0);
  kernel3<<<1,1,0, stream_b>>>(d_frame_bB, SIZE);
  cudaCheckErrors("main sequence fail");

//  cudaCheckErrors("main sequence failure");
  cudaMemcpy(h_frame, d_frame_bB, framesize, cudaMemcpyDeviceToHost);
  cudaCheckErrors("results_a memcpy fail");
  for (int i = 0; i < SIZE; i++)
    if (h_frame[i] != (K1VAL - K3VAL)) {
      printf("results errorn");
      return 1;
      }
  printf("successn");
  return 0;
}

如果按原样运行代码，您应该得到一条success消息。如果您注释掉强制流b(在设备b上)等待流a(在设备a上)的行，那么您将看到results error消息。这演示了如何将一个设备上的流同步到另一个设备上的流。希望能有所帮助。

cudaStreamWaitEvent()允许gpu间同步，因为您可以在属于另一个设备的CUDA事件上插入等待。

所以你需要生产者和消费者之间的gpu间同步是为2个gpu中的每一个分配一些事件(至少2个)，然后在同一个事件上有生产者cudaEventRecord()和消费者cudaStreamWaitEvent()。cudaStreamWaitEvent()将一个命令插入当前设备的命令缓冲区，导致它暂停执行，直到给定的事件被记录。

下面是使用cudaStreamWaitEvent()以这种方式实现对等内存的代码片段。一旦泵启动，生产者和消费者都应该并发地进行PCIe传输，每个都到两个staging缓冲区中的一个(它们分配在便携式固定内存中)。

cudaError_t
chMemcpyPeerToPeer( 
    void *_dst, int dstDevice, 
    const void *_src, int srcDevice, 
    size_t N ) 
{
    cudaError_t status;
    char *dst = (char *) _dst;
    const char *src = (const char *) _src;
    int stagingIndex = 0;
    while ( N ) {
        size_t thisCopySize = min( N, STAGING_BUFFER_SIZE );
        CUDART_CHECK( cudaSetDevice( srcDevice ) );
        CUDART_CHECK( cudaStreamWaitEvent( NULL, g_events[dstDevice][stagingIndex], 0 ) );
        CUDART_CHECK( cudaMemcpyAsync( g_hostBuffers[stagingIndex], src, thisCopySize, 
            cudaMemcpyDeviceToHost, NULL ) );
        CUDART_CHECK( cudaEventRecord( g_events[srcDevice][stagingIndex] ) );
        CUDART_CHECK( cudaSetDevice( dstDevice ) );
        CUDART_CHECK( cudaStreamWaitEvent( NULL, g_events[srcDevice][stagingIndex], 0 ) );
        CUDART_CHECK( cudaMemcpyAsync( dst, g_hostBuffers[stagingIndex], thisCopySize, 
            cudaMemcpyHostToDevice, NULL ) );
        CUDART_CHECK( cudaEventRecord( g_events[dstDevice][stagingIndex] ) );
        dst += thisCopySize;
        src += thisCopySize;
        N -= thisCopySize;
        stagingIndex = 1 - stagingIndex;
    }
    // Wait until both devices are done
    CUDART_CHECK( cudaSetDevice( srcDevice ) );
    CUDART_CHECK( cudaDeviceSynchronize() );
    CUDART_CHECK( cudaSetDevice( dstDevice ) );
    CUDART_CHECK( cudaDeviceSynchronize() );
Error:
    return status;
}

完整源代码见https://github.com/ArchaeaSoftware/cudahandbook/blob/master/memory/peer2peerMemcpy.cu

相关内容

最新更新

热门标签：