C语言 用于矩阵批量乘法的 Cuda 程序



我是 CUDA 程序领域的新手,我正在尝试重复 cublasSgemmBatched 的功能,这意味着我想执行一批矩阵的矩阵-矩阵乘法。我尝试将我的想法实现为以下代码。

#include <stdio.h>
__global__ void BatchMulCUDA(float* array1, float* array2, int narray1, int dim, float* result)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    if (tx < narray1 * dim)
    {
        float temp = 0;
        int index = tx / dim;
#pragma
        for (int i = 0; i < dim; i++)
        {
            temp += array1[tx * dim + i] * array2[index * dim + i];
        }
        result[tx] = temp;
    }
} 
void BatchMulGPU(float* array1, float* array2, int narray1, int dim, float* result)
{
    dim3 threads(1024, 1);
    dim3 grid(narray1 / 1024 + 1, 1);
    int threadsPerBlock = threads.x * threads.y;
    int blocksPerGrid = grid.x * grid.y;
    printf("CUDA kernel launch with %d blocks of %d threadsn", blocksPerGrid, threadsPerBlock);
    BatchMulCUDA<<<grid, threads>>>(array1, array2, narray1, dim, result);
}

然而,奇怪的是,我发现我可以在索引 19730 之前获得正确的输出。在 19730 元素之后,GPU 的输出始终为 0。我不知道问题是什么。我的代码和测试函数的CPU版本如下。是否有任何我没有意识到的硬件限制?

#include "kernel.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sys/time.h>
#include <math.h>
double cpuSecond()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double) tp.tv_sec + (double)tp.tv_usec*1e-6);
}
void BatchMulCPU(float* array1, float* array2, int narray1, int dim, float* result)
{
    for (int i = 0; i < narray1 * dim; i++)
    {
        float temp = 0;
        int index = i / dim;
        for (int j = 0; j < dim; j++)
        {
            temp += array1[i * dim + j] * array2[index * dim + j];
        }
        result[i] = temp;
    }
}
int main(int argc, char** argv)
{
    int narray1 = 6980;
    int dim = 4;
    float* array1 = new float[narray1 * dim * dim];
    float* array2 = new float[narray1 * dim];
    float* resultGPU = new float[narray1 * dim];
    float* resultCPU = new float[narray1 * dim];
    float* d_array1;
    float* d_array2;
    float* d_result;
    for (int i = 0; i < narray1 * dim * dim; i++)
    {
        array1[i] = static_cast<float> (rand() / (static_cast<float> (RAND_MAX / 10)));
    }
    for (int i = 0; i < narray1 * dim; i++)
    {
        array2[i] = static_cast<float> (rand() / (static_cast<float> (RAND_MAX / 10)));
    }
    cudaError_t err;
    double iStart = cpuSecond();
    err = cudaMalloc((void**)&d_array1, narray1 * dim * dim * sizeof(float));
    err = cudaMalloc((void**)&d_array2, narray1 * dim * sizeof(float));
    err = cudaMalloc((void**)&d_result, narray1 * dim * sizeof(float));
    err = cudaMemcpy(d_array1, array1, narray1 * dim * dim * sizeof(float), cudaMemcpyHostToDevice);
    err = cudaMemcpy(d_array2, array2, narray1 * dim * sizeof(float), cudaMemcpyHostToDevice);
    BatchMulGPU(d_array1, d_array2, narray1, dim, d_result);
    err = cudaMemcpy(resultGPU, d_result, narray1 * dim * sizeof(float), cudaMemcpyDeviceToHost);
    double iElaps = cpuSecond() - iStart;
    printf("Total GPU computation time is %lf n" , iElaps);
    iStart = cpuSecond();
    BatchMulCPU(array1, array2, narray1, dim, resultCPU);
    iElaps = cpuSecond() - iStart;
    printf("Total CPU computation time is %lf n" , iElaps);
    float error = 0;
    float temp = 0;
    for (long i = 0; i < narray1 * dim; i++)
    {
        // temp = abs(resultCPU[i] - resultGPU[i]);
        // if (temp > 0.5)
        // {
        //  std::cout << i << std::endl;
        // }
        error += abs(resultCPU[i] - resultGPU[i]);
    }
    printf("Error is %f n", error);
    // for (int i = 19730; i < 19750; i++)
    // {
    //  std::cout << "GPU " << resultGPU[i] << std::endl;
    //  std::cout << "CPU " << resultCPU[i] << std::endl;
    // }
    cudaFree(d_array1);
    cudaFree(d_array2);
    cudaFree(d_result);
    return 0;
}

除了注释中讨论的 WDDM TDR 超时的可能性之外,代码还存在错误。

很明显,内核设计期望启动的总网格大小(线程总数(等于或大于数组数乘以侧尺寸:

int tx = blockIdx.x * blockDim.x + threadIdx.x;
if (tx < narray1 * dim)

narray1*dim是所需的线程数

但是,正在启动的数字仅为narray1

dim3 threads(1024, 1);
dim3 grid(narray1 / 1024 + 1, 1);

如果我们将上面的最后一行更改为:

dim3 grid((narray1*dim) / 1024 + 1, 1);

将解决此代码设计错误。

代码对于少量矩阵(最多 256 个(正常工作的原因是网格大小调整为最小 1024 个线程(即 256*4 ( narray1 * dim (。

顺便说一句,这段代码在功能上与我所看到的cublasSgemmBatched并不相似。 我不认为这段代码是我熟悉的任何矩阵乘法(矩阵点积(。

最新更新