我正在尝试使用 CUDA 7.0 的新cusolverDnSgesvd
例程来计算奇异值。完整代码报告如下:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main(){
int M = 10;
int N = 10;
// --- Setting the host matrix
float *h_A = (float *)malloc(M * N * sizeof(float));
for(unsigned int i = 0; i < M; i++){
for(unsigned int j = 0; j < N; j++){
h_A[j*M + i] = (i + j) * (i + j);
}
}
// --- Setting the device matrix and moving the host matrix to the device
float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));
// --- host side SVD results space
float *h_U = (float *)malloc(M * M * sizeof(float));
float *h_V = (float *)malloc(N * N * sizeof(float));
float *h_S = (float *)malloc(N * sizeof(float));
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));
cusolverStatus_t stat;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolverDnCreate(&solver_handle);
stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
if(stat != CUSOLVER_STATUS_SUCCESS ) std::cout << "Initialization of cuSolver failed. N";
float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
//float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));
// --- CUDA SVD execution
//stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
cudaDeviceSynchronize();
int devInfo_h = 0;
gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
std::cout << "devInfo = " << devInfo_h << "n";
switch(stat){
case CUSOLVER_STATUS_SUCCESS: std::cout << "SVD computation successn"; break;
case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctlyn"; break;
case CUSOLVER_STATUS_INVALID_VALUE: std::cout << "Invalid parameters passedn"; break;
case CUSOLVER_STATUS_INTERNAL_ERROR: std::cout << "Internal operation failedn"; break;
}
if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS) std::cout << "SVD successfulnn";
// --- Moving the results from device to host
gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
for(int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;
cusolverDnDestroy(solver_handle);
return 0;
}
如果我要求计算完整的 SVD(带有 jobu = 'A'
和 jobvt = 'A'
的注释行),一切正常。如果我只要求计算奇异值(带有 jobu = 'N'
和 jobvt = 'N'
的行),cusolverDnSgesvd
返回
CUSOLVER_STATUS_INVALID_VALUE
请注意,在这种情况下devInfo = 0
,所以我无法发现无效参数。
另请注意,文档 PDF 缺少有关 rwork
参数的信息,因此我将其作为虚拟参数处理。
目前 cuSolver gesvd
函数仅支持 jobu = 'A'
和 jobvt = 'A'
因此,当您指定其他组合时会出现错误。 从文档中:
注2:gesvd只支持jobu='A'和jobvt='A',并返回矩阵U和VH
OF cusolver<T>nSgesvd
正如列别多夫所说,从 CUDA 8.0 开始,现在只能通过 cusolverDnSgesvd
计算奇异值。我在下面报告了您的代码的略微修改版本,其中包含两次对cusolverDnSgesvd
的调用,一个仅执行奇异值计算
cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)
一个执行完整的 SVD 计算
cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)
如前所述,完整 SVD 案例的两个'A'
字段在仅单数值情况下更改为'N'
。请注意,在仅奇异值的情况下,不需要为奇异向量矩阵U
和V
存储空间。实际上,传递了一个NULL
指针。
仅奇异值计算比完整 SVD 计算更快。在GTX 960上,对于1000x1000
矩阵,计时如下:
Singular values only: 559 ms
Full SVD: 2239 ms
以下是完整的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
/********/
/* MAIN */
/********/
int main(){
int M = 1000;
int N = 1000;
TimingGPU timerGPU;
float elapsedTime;
// --- Setting the host matrix
float *h_A = (float *)malloc(M * N * sizeof(float));
for (unsigned int i = 0; i < M; i++){
for (unsigned int j = 0; j < N; j++){
h_A[j*M + i] = (i + j) * (i + j);
}
}
// --- Setting the device matrix and moving the host matrix to the device
float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));
// --- host side SVD results space
float *h_U = (float *)malloc(M * M * sizeof(float));
float *h_V = (float *)malloc(N * N * sizeof(float));
float *h_S = (float *)malloc(N * sizeof(float));
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));
cusolverStatus_t stat;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));
float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
// --- CUDA SVD execution - Singular values only
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
elapsedTime = timerGPU.GetCounter();
int devInfo_h = 0;
gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h == 0)
printf("SVD successfull for the singular values calculation onlynn");
else if (devInfo_h < 0)
printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrongn", -devInfo_h);
else
printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zeron", devInfo_h);
printf("Calculation of the singular values only: %f msnn", elapsedTime);
// --- Moving the results from device to host
//gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
//for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;
// --- CUDA SVD execution - Full SVD
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
elapsedTime = timerGPU.GetCounter();
devInfo_h = 0;
gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h == 0)
printf("SVD successfull for the full SVD calculationnn");
else if (devInfo_h < 0)
printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrongn", -devInfo_h);
else
printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zeron", devInfo_h);
printf("Calculation of the full SVD calculation: %f msnn", elapsedTime);
cusolveSafeCall(cusolverDnDestroy(solver_handle));
return 0;
}
编辑 - 不同版本的 CUDA 的性能
我比较了5000x5000
矩阵的仅奇异值计算和CUDA 8.0
、CUDA 9.1
和CUDA 10.0
的全SVD计算的性能。以下是GTX 960的结果。
Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0
__________________________________________________________________
Singular values only 17s 15s 15s
Full SVD 161s 159s 457s
__________________________________________________________________