我正在尝试在我的GTS450
上实现基于Cyclic Reduction
方法的三对角线系统求解器。
本文说明了循环还原
Y. Zhang, J. Cohen, J.D. Owens, "GPU 上的快速三对角求解器"
但是,无论我做什么,我的 CUDA 代码都比顺序代码慢得多。我总共 512 x 512
分的结果是 7ms
,但是在我的 i7 3.4GHz 上它是 5ms
.GPU 没有加速!
这可能是问题所在?
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(double *a,double *b,double *c,double *d,double *x)
{
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ double asub[512];
__shared__ double bsub[512];
__shared__ double csub[512];
__shared__ double dsub[512];
double at=0;
double bt=0;
double ct=0;
double dt=0;
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
for(int stride=1;stride<N;stride*=2)
{
int margin_left,margin_right;
margin_left=idx-stride;
margin_right=idx+stride;
at=(margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f;
bt=bsub[idx]+((margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?asub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
ct=(margin_right<512)?(-csub[idx+stride]*asub[idx]/bsub[idx+stride]):0.f;
dt=dsub[idx]+((margin_left>=0)?(-dsub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?dsub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
__syncthreads();
asub[idx]=at;
bsub[idx]=bt;
csub[idx]=ct;
dsub[idx]=dt;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
我在 cutrid_RC_1b<<<512,512>>>(d_a,d_b,d_c,d_d,d_x)
年启动了这个内核,并达到了100%
设备占用率。这个结果让我困惑了好几天。
我的代码有一个改进版本:
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(float *a,float *b,float *c,float *d,float *x)
{/*{{{*/
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ float asub[512];
__shared__ float bsub[512];
__shared__ float csub[512];
__shared__ float dsub[512];
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
__syncthreads();
//Reduction
for(int stride=1;stride<512;stride*=2)
{
int margin_left=(idx-stride);
int margin_right=(idx+stride);
if(margin_left<0) margin_left=0;
if(margin_right>=512) margin_right=511;
float tmp1 = asub[idx] / bsub[margin_left];
float tmp2 = csub[idx] / bsub[margin_right];
float tmp3 = dsub[margin_right];
float tmp4 = dsub[margin_left];
__syncthreads();
dsub[idx] = dsub[idx] - tmp4*tmp1-tmp3*tmp2;
bsub[idx] = bsub[idx]-csub[margin_left]*tmp1-asub[margin_right]*tmp2;
tmp3 = -csub[margin_right];
tmp4 = -asub[margin_left];
__syncthreads();
asub[idx] = tmp3*tmp1;
csub[idx] = tmp4*tmp2;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
速度提高到512 x 512
系统的Quadro k4000
上0.73ms
,但是上述论文中的代码在GTX280
上运行0.5ms
。
求解三对角线方程组是一个具有挑战性的平行问题,因为经典解方案,即高斯消除,本质上是顺序的。
循环还原包括两个阶段:
-
前向减持。原始系统分为两个独立的三对角线系统,用于两组未知数,一组具有奇数指数,一组具有偶数指数。这样的系统可以独立解决,这一步可以看作是分而治之方案的第一步。两个较小的系统在两个子系统中以相同的方式再次分裂,并重复该过程,直到达到仅
2
方程组。 -
向后替换。首先求解
2
方程组。然后,通过独立求解不同核心上的子系统来爬上分界线结构。
不确定(但如果我错了,请纠正我)您的代码是否会返回一致的结果。 N
似乎没有定义。另外,您正在访问csub[idx-stride]
,但我不确定idx==0
和stride>1
时是什么意思。此外,您正在使用多个条件语句,主要用于边界检查。最后,您的代码缺乏能够处理上述 divide et impera 方案的适当线程结构,从概念上讲,这与 CUDA SDK 缩减示例中使用的非常相似。
正如我在上面的一条评论中提到的,我记得在三对角线求解器中,您可以找到用于求解三对角方程组的循环约简方案的实现。浏览相关的谷歌页面,在我看来,该代码是由上述论文的第一作者(张耀)保留的。代码被复制并粘贴到下面。注意边界检查只做一次(if (iRight >= systemSize) iRight = systemSize - 1;
),从而限制了所涉及的条件语句的数量。还要注意能够处理分治和阻抗方案的线程结构。
张、科恩和欧文斯的代码
__global__ void crKernel(T *d_a, T *d_b, T *d_c, T *d_d, T *d_x)
{
int thid = threadIdx.x;
int blid = blockIdx.x;
int stride = 1;
int numThreads = blockDim.x;
const unsigned int systemSize = blockDim.x * 2;
int iteration = (int)log2(T(systemSize/2));
#ifdef GPU_PRINTF
if (thid == 0 && blid == 0) printf("iteration = %dn", iteration);
#endif
__syncthreads();
extern __shared__ char shared[];
T* a = (T*)shared;
T* b = (T*)&a[systemSize];
T* c = (T*)&b[systemSize];
T* d = (T*)&c[systemSize];
T* x = (T*)&d[systemSize];
a[thid] = d_a[thid + blid * systemSize];
a[thid + blockDim.x] = d_a[thid + blockDim.x + blid * systemSize];
b[thid] = d_b[thid + blid * systemSize];
b[thid + blockDim.x] = d_b[thid + blockDim.x + blid * systemSize];
c[thid] = d_c[thid + blid * systemSize];
c[thid + blockDim.x] = d_c[thid + blockDim.x + blid * systemSize];
d[thid] = d_d[thid + blid * systemSize];
d[thid + blockDim.x] = d_d[thid + blockDim.x + blid * systemSize];
__syncthreads();
//forward elimination
for (int j = 0; j <iteration; j++)
{
__syncthreads();
stride *= 2;
int delta = stride/2;
if (threadIdx.x < numThreads)
{
int i = stride * threadIdx.x + stride - 1;
int iLeft = i - delta;
int iRight = i + delta;
if (iRight >= systemSize) iRight = systemSize - 1;
T tmp1 = a[i] / b[iLeft];
T tmp2 = c[i] / b[iRight];
b[i] = b[i] - c[iLeft] * tmp1 - a[iRight] * tmp2;
d[i] = d[i] - d[iLeft] * tmp1 - d[iRight] * tmp2;
a[i] = -a[iLeft] * tmp1;
c[i] = -c[iRight] * tmp2;
}
numThreads /= 2;
}
if (thid < 2)
{
int addr1 = stride - 1;
int addr2 = 2 * stride - 1;
T tmp3 = b[addr2]*b[addr1]-c[addr1]*a[addr2];
x[addr1] = (b[addr2]*d[addr1]-c[addr1]*d[addr2])/tmp3;
x[addr2] = (d[addr2]*b[addr1]-d[addr1]*a[addr2])/tmp3;
}
// backward substitution
numThreads = 2;
for (int j = 0; j <iteration; j++)
{
int delta = stride/2;
__syncthreads();
if (thid < numThreads)
{
int i = stride * thid + stride/2 - 1;
if(i == delta - 1)
x[i] = (d[i] - c[i]*x[i+delta])/b[i];
else
x[i] = (d[i] - a[i]*x[i-delta] - c[i]*x[i+delta])/b[i];
}
stride /= 2;
numThreads *= 2;
}
__syncthreads();
d_x[thid + blid * systemSize] = x[thid];
d_x[thid + blockDim.x + blid * systemSize] = x[thid + blockDim.x];
}
再添加一个答案,提到三对角线系统可以在cuSPARSE
库的框架内借助函数轻松解决
cusparse<t>gtsv()
cuSPARSE
还提供
cusparse<t>gtsv_nopivot()
与第一个提到的例程不一致,不执行枢轴。上述两个函数都求解具有多个右侧的同一线性系统。批处理例程
cusparse<t>gtsvStridedBatch()
也存在求解多个线性系统的问题。
对于上述所有例程,只需指定下对角线、主对角线和上对角线即可固定系统矩阵。
下面,我将报告一个使用cusparse<t>gtsv()
求解三对角线线性系统的完整示例。
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if(CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %NdimsNobjs %snerror %Ndims: %snterminating!Nobjs",__FILE__, __LINE__,err,
_cusparseGetErrorEnum(err));
cudaDeviceReset(); assert(0);
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int N = 5; // --- Size of the linear system
// --- Lower diagonal, diagonal and upper diagonal of the system matrix
double *h_ld = (double*)malloc(N * sizeof(double));
double *h_d = (double*)malloc(N * sizeof(double));
double *h_ud = (double*)malloc(N * sizeof(double));
h_ld[0] = 0.;
h_ud[N-1] = 0.;
for (int k = 0; k < N - 1; k++) {
h_ld[k + 1] = -1.;
h_ud[k] = -1.;
}
for (int k = 0; k < N; k++) h_d[k] = 2.;
double *d_ld; gpuErrchk(cudaMalloc(&d_ld, N * sizeof(double)));
double *d_d; gpuErrchk(cudaMalloc(&d_d, N * sizeof(double)));
double *d_ud; gpuErrchk(cudaMalloc(&d_ud, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_ld, h_ld, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_d, h_d, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ud, h_ud, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating and defining dense host and device data vectors
double *h_x = (double *)malloc(N * sizeof(double));
h_x[0] = 100.0; h_x[1] = 200.0; h_x[2] = 400.0; h_x[3] = 500.0; h_x[4] = 300.0;
double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, h_x, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating the host and device side result vector
double *h_y = (double *)malloc(N * sizeof(double));
double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
cusparseSafeCall(cusparseDgtsv(handle, N, 1, d_ld, d_d, d_ud, d_x, N));
cudaMemcpy(h_x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost);
for (int k=0; k<N; k++) printf("%fn", h_x[k]);
}
在此 gitHub 存储库中,报告了 cuSOLVER 库中可用于解决三对角线线性系统的不同 CUDA 例程的比较。
我看到的东西:
-
第一个
__syncthreads()
似乎是多余的。 -
代码中存在重复的操作集,例如
(-csub[idx-stride]*asub[idx]/bsub[idx-stride])
。使用中间变量来保存结果并重用它们,而不是让 GPU 每次都计算这些集。 -
使用 NVIDIA 探查器查看问题所在。