如何在没有内存分配的情况下将指针从 CUDA 返回到 C?



我有返回 3 个指针的 CUDA 函数:csrVal、csrRowPtr、csrColInd。

/* NOTE(review): this is the broken version from the question, kept as-is.
 * Root cause of the SIGSEGV: the three output parameters (csrVal,
 * csrRowPtr, csrColInd) are received BY VALUE. The cudaMallocHost() calls
 * below overwrite only the local copies of these pointers; the caller's
 * pointers are never updated, so the caller later dereferences
 * unallocated memory. The fix requires an interface change: pass
 * cuComplex ** / int ** out-parameters, or return a struct by value as
 * the answer below does.
 * Additional issues visible here: d_dnMatr and d_nnzRow are never freed,
 * descrA is never destroyed, most API return codes are ignored, and the
 * function's closing brace is missing from this paste. */
void dense2Csr (int dim,
cuComplex *dnMatr,
cuComplex *csrVal,       /* by value: writes to it below are lost -- bug */
int *csrRowPtr,          /* by value: same problem */
int *csrColInd)          /* by value: same problem */
{
cusparseHandle_t   cusparseH = NULL;   // residual evaluation
cudaStream_t stream = NULL;
cusparseMatDescr_t descrA = NULL; // A is a base-0 general matrix
cusparseStatus_t cudaStat1 = CUSPARSE_STATUS_SUCCESS;
int nnZ;                 /* total nonzero count, filled by cusparseCnnz */
//Input GPU Copy
cuComplex *d_dnMatr;
int *d_nnzRow;           /* per-row nonzero counts on the device */

//Output GPU Copy
cuComplex *d_csrVal;
int *d_csrRowPtr;
int *d_csrColInd;

cusparseCreate(&cusparseH); //Create SparseStructure
cudaStreamCreate(&stream);
cusparseSetStream(cusparseH, stream);
cusparseCreateMatDescr(&descrA);
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); //Set First Element RowPtr eq. to zero

/* copy the dense dim x dim input to the device */
cudaMalloc((void **)&d_dnMatr   , sizeof(cuComplex)*dim*dim);
cudaMalloc((void **)&d_nnzRow   , sizeof(int)*dim);
cudaMemcpy(d_dnMatr  , dnMatr   , sizeof(cuComplex)*dim*dim  , cudaMemcpyHostToDevice);

/* count nonzeros: per row into d_nnzRow, total into nnZ
 * (return status ignored -- nnZ is garbage if this fails) */
cusparseCnnz(cusparseH,
CUSPARSE_DIRECTION_ROW,
dim,
dim,
descrA,
d_dnMatr,
dim,
d_nnzRow,
&nnZ);


/* size the device-side CSR arrays from nnZ */
cudaMalloc((void **)&d_csrRowPtr   , sizeof(int)*(dim+1));
cudaMalloc((void **)&d_csrColInd   , sizeof(int)*nnZ);
cudaMalloc((void **)&d_csrVal   , sizeof(cuComplex)*nnZ);

cudaStat1 = cusparseCdense2csr(cusparseH,
dim,
dim,
descrA,
d_dnMatr,
dim,
d_nnzRow,
d_csrVal,
d_csrRowPtr,
d_csrColInd);
assert(cudaStat1 == CUSPARSE_STATUS_SUCCESS);
/* BUG: these overwrite the local parameter copies only; the caller's
 * csrRowPtr/csrColInd/csrVal remain whatever they were before the call */
cudaMallocHost((void **)&csrRowPtr   , sizeof(int)*(dim+1));
cudaMallocHost((void **)&csrColInd   , sizeof(int)*nnZ);
cudaMallocHost((void **)&csrVal   , sizeof(cuComplex)*nnZ);
cudaMemcpy(csrVal, d_csrVal, sizeof(cuComplex)*nnZ, cudaMemcpyDeviceToHost);
cudaMemcpy(csrRowPtr, d_csrRowPtr, sizeof(int)*(dim+1), cudaMemcpyDeviceToHost);
cudaMemcpy(csrColInd, d_csrColInd, sizeof(int)*(nnZ), cudaMemcpyDeviceToHost);

/* cleanup (d_dnMatr, d_nnzRow and descrA are leaked here) */
if (d_csrVal) cudaFree(d_csrVal);
if (d_csrRowPtr) cudaFree(d_csrRowPtr);
if (d_csrColInd) cudaFree(d_csrColInd);
if (cusparseH  ) cusparseDestroy(cusparseH);
if (stream     ) cudaStreamDestroy(stream);

我用 C 代码调用它(具有 100% 正确的链接):

dense2Csr(dim, Sigma, csrValSigma, csrRowPtrSigma, csrColIndSigma);

dense2Csr(dim, Sigma, &csrValSigma[0], &csrRowPtrSigma[0], &csrColIndSigma[0]);

两种调用方式都会产生同样的错误：

Process finished with exit code 139 (interrupted by signal 11: SIGSEGV)

所以,这是一个内存错误。我之前的解决办法是在调用 dense2Csr 之前就在主程序中分配好主机内存(并去掉函数内部的 cudaMallocHost)。但现在我无法采用这种方式。那么,有没有一种办法可以让函数接收空指针,并在这种设置下让它返回指向已分配内存区域的指针?

看来您已经自己找到了 C 语言的按引用传递(pass-by-reference)惯用法,它完全可以满足您的需求。一种更优雅、更合乎逻辑的做法是定义一个结构体,把函数中分配的那几个指针放进去,然后让函数按值返回该结构体。

因此,您的代码可以像这样修改:

#include <cusparse.h>
#include <cuda_runtime_api.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
/* Bundle of the three host-side CSR arrays produced by dense2Csr().
 * Returned by value, so the caller needs no out-parameters; the caller
 * owns the pinned buffers and releases them with cudaFreeHost(). */
typedef struct
{
cuComplex *csrVal;   /* nonzero values, length nnz */
int *csrRowPtr;      /* row offsets, length dim+1 (base-0) */
int *csrColInd;      /* column indices, length nnz */
} csr_struct;
/* Dense-to-CSR conversion on the GPU via cuSPARSE.
 *
 * dim    - matrix order: the input is a dim x dim matrix with leading
 *          dimension dim
 * dnMatr - host pointer to the dense input (dim*dim cuComplex elements)
 *
 * Returns a csr_struct whose members point at pinned host memory
 * allocated here with cudaMallocHost(); ownership transfers to the
 * caller, who must release each member with cudaFreeHost().
 *
 * Fixes over the original: d_dnMatr and d_nnzRow are now freed, the
 * matrix descriptor is destroyed, and the cusparseCnnz() status is
 * checked before nnZ is used to size allocations. */
csr_struct dense2Csr (int dim, cuComplex *dnMatr)
{
    cusparseHandle_t cusparseH = NULL;
    cudaStream_t stream = NULL;
    cusparseMatDescr_t descrA = NULL;       /* A is a base-0 general matrix */
    cusparseStatus_t status = CUSPARSE_STATUS_SUCCESS;
    int nnZ = 0;                            /* total nonzero count */

    /* device copies of the input and the CSR output arrays */
    cuComplex *d_dnMatr = NULL;
    int *d_nnzRow = NULL;                   /* per-row nonzero counts */
    cuComplex *d_csrVal = NULL;
    int *d_csrRowPtr = NULL;
    int *d_csrColInd = NULL;

    csr_struct mat;                         /* returned by value */

    cusparseCreate(&cusparseH);
    cudaStreamCreate(&stream);
    cusparseSetStream(cusparseH, stream);
    cusparseCreateMatDescr(&descrA);
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); /* rowPtr[0] == 0 */

    cudaMalloc((void **)&d_dnMatr, sizeof(cuComplex)*dim*dim);
    cudaMalloc((void **)&d_nnzRow, sizeof(int)*dim);
    cudaMemcpy(d_dnMatr, dnMatr, sizeof(cuComplex)*dim*dim, cudaMemcpyHostToDevice);

    /* Count nonzeros: per row into d_nnzRow, total into nnZ. The status
     * must be checked here -- nnZ sizes the three allocations below. */
    status = cusparseCnnz(cusparseH, CUSPARSE_DIRECTION_ROW,
                          dim, dim, descrA, d_dnMatr, dim,
                          d_nnzRow, &nnZ);
    assert(status == CUSPARSE_STATUS_SUCCESS);

    cudaMalloc((void **)&d_csrRowPtr, sizeof(int)*(dim+1));
    cudaMalloc((void **)&d_csrColInd, sizeof(int)*nnZ);
    cudaMalloc((void **)&d_csrVal,    sizeof(cuComplex)*nnZ);

    status = cusparseCdense2csr(cusparseH,
                                dim, dim, descrA, d_dnMatr, dim, d_nnzRow,
                                d_csrVal, d_csrRowPtr, d_csrColInd);
    assert(status == CUSPARSE_STATUS_SUCCESS);

    /* Pinned host copies of the results; the caller owns these. */
    cudaMallocHost((void **)&mat.csrRowPtr, sizeof(int)*(dim+1));
    cudaMallocHost((void **)&mat.csrColInd, sizeof(int)*nnZ);
    cudaMallocHost((void **)&mat.csrVal,    sizeof(cuComplex)*nnZ);
    cudaMemcpy(mat.csrVal,    d_csrVal,    sizeof(cuComplex)*nnZ, cudaMemcpyDeviceToHost);
    cudaMemcpy(mat.csrRowPtr, d_csrRowPtr, sizeof(int)*(dim+1),   cudaMemcpyDeviceToHost);
    cudaMemcpy(mat.csrColInd, d_csrColInd, sizeof(int)*nnZ,       cudaMemcpyDeviceToHost);

    /* Release every device resource this function created (the original
     * leaked d_dnMatr and d_nnzRow and never destroyed descrA). */
    if (d_dnMatr)    cudaFree(d_dnMatr);
    if (d_nnzRow)    cudaFree(d_nnzRow);
    if (d_csrVal)    cudaFree(d_csrVal);
    if (d_csrRowPtr) cudaFree(d_csrRowPtr);
    if (d_csrColInd) cudaFree(d_csrColInd);
    if (descrA)      cusparseDestroyMatDescr(descrA);
    if (cusparseH)   cusparseDestroy(cusparseH);
    if (stream)      cudaStreamDestroy(stream);

    return mat;
}
int main()
{
const int dim = 1024;
const size_t sz = sizeof(cuComplex) * dim * dim;
cuComplex* dmat = malloc(sz);
memset(dmat, 0, sz);
const cuComplex ten_plus_nine_i = { 10.0, 9.0 };
for(int i=0; i<dim; i++)
dmat[i * (dim + 1)] = ten_plus_nine_i;
csr_struct smat = dense2Csr(dim, dmat);
for(int j=0; j<10; j++) {
cuComplex x = smat.csrVal[j];
printf("%d %d %f + %f in", smat.csrColInd[j], smat.csrRowPtr[j], x.x, x.y);
}
return 0;
}

它似乎工作正常(请注意,此示例本身需要符合 C99 的编译器,但"按值返回结构体"这一技巧并不需要 C99):

$ nvcc -Xcompiler="-std=c99" -o intialainen intialainen.c -lcudart -lcusparse
cc1plus: warning: command line option -std=c99 is valid for C/ObjC but not for C++ [enabled by default]
$ ./intialainen 
0 0 10.000000 + 9.000000 i
1 1 10.000000 + 9.000000 i
2 2 10.000000 + 9.000000 i
3 3 10.000000 + 9.000000 i
4 4 10.000000 + 9.000000 i
5 5 10.000000 + 9.000000 i
6 6 10.000000 + 9.000000 i
7 7 10.000000 + 9.000000 i
8 8 10.000000 + 9.000000 i
9 9 10.000000 + 9.000000 i

或直接使用 GCC:

$ gcc -std=c99 -o intialainen intialainen.c -I /opt/cuda-9.0/include -L /opt/cuda-9.0/lib64 -lcudart -lcusparse -lcuda
$ ./intialainen 
0 0 10.000000 + 9.000000 i
1 1 10.000000 + 9.000000 i
2 2 10.000000 + 9.000000 i
3 3 10.000000 + 9.000000 i
4 4 10.000000 + 9.000000 i
5 5 10.000000 + 9.000000 i
6 6 10.000000 + 9.000000 i
7 7 10.000000 + 9.000000 i
8 8 10.000000 + 9.000000 i
9 9 10.000000 + 9.000000 i

函数调用 csr_struct smat = dense2Csr(dim, dmat) 比 dense2Csr(dim, dmat, &p1, &p2, &p3) 之类的写法更简单,也更容易理解,尽管这在很大程度上是个人偏好问题。

最新更新