Does anyone know how to compute a QR decomposition via the modified Gram-Schmidt method in C and CUDA? Any examples, sources, papers, or anything else? Thanks a lot.
Edit: I can't answer my own question because somebody closed it, so I've decided to update the question instead.
/*
* QR decomposition via modified Gram-Schmidt algorithm
*
* @Package = QR-decomposition
* @Program = QR_gpu
* @Version = 13.0928
*/
// Libraries
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
#include <cuda.h>
// Constant
#define PROG "QR_cpu"
#define VERSION "13.1003"
#define PACKAGE "QR-Decomposition"
// Threads per block
#define THREAD_P_BLOCK 512
// Blocks per grid
#define BLOCKS_P_GRID 512
// macro
/* wrap each API call with the gpuErrchk macro, which will process the return
status of the API call it wraps: http://bit.ly/1dTD0ZE */
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prototypes
__global__ void xTA (float *, float *, int, int, int, int, int);
__global__ void scale (float *, int, int, float *);
__global__ void r1_update(float *, float *, int, int, int, int);
__host__ void gpuAssert(cudaError_t, const char *, int);
__host__ void print_matrix(float *, int, int, int);
__host__ void print_help (int);
__host__ int option_parser(int, char **, int *, int *);
// Host code
int main (int argc, char **argv) {
int m, n, lda, ldr, i;
float *A_d, *R_d;
//cudaEvent_t t_start, t_stop;
// Get "m" and "n" from command line
if (0 != option_parser(argc, argv, &m, &n)) {
fprintf(stderr, "Can't continue, exiting now!n");
exit(EXIT_FAILURE);
}
size_t A_size = m * n * sizeof(float);
size_t R_size = n * n * sizeof(float);
lda = n; ldr = n;
// Allocate input matrices A_h and R_h in host memory
float *A_h = (float *) malloc(A_size);
float *R_h = (float *) malloc(R_size);
memset(R_h, 0, R_size);
// Initialize input matrix
for (i = 0; i < n; i++)
A_h[i*lda + i] = i + 1;
// Allocate matrices in device memory
gpuErrchk (cudaMalloc(&A_d, A_size));
gpuErrchk (cudaMalloc(&R_d, R_size));
// Copy the A matrix from host memory to device memory
gpuErrchk (cudaMemcpy(A_d, A_h, A_size, cudaMemcpyHostToDevice));
// Set R matrix to 0
gpuErrchk (cudaMemset(R_d, 0, R_size));
/**** Invoke kernel ****/
dim3 dimBlock (THREAD_P_BLOCK, 1, 1);
// dimGrid 'monodimensional' (just x value)
dim3 dimGrid_M ((m + THREAD_P_BLOCK - 1) / THREAD_P_BLOCK, 1, 1);
// dimGrid 'bidimensional' (x and y values)
dim3 dimGrid_B (BLOCKS_P_GRID, (m + THREAD_P_BLOCK - 1) / THREAD_P_BLOCK,1);
// Gram-Schmidt algorithm step by step
for (i = 0; i < n; i++) {
// Step #1 --> R(i,i:n-1) = A'(:,i) * A(:,i:n-1)
xTA <<< dimBlock, dimGrid_B >>> (R_d, A_d, m, n, lda, ldr, i);
// Step #3 (Is the scale of a column vector)
scale <<< dimBlock, dimGrid_M >>> (A_d + i, m, lda, R_d + i*ldr + i);
// Step #4 (Is the scale of a row)
scale <<< dimBlock, dimGrid_M >>> (R_d + ldr*i, m, 1, R_d + i*ldr + i);
// Step #5 --> A(:,i+1:n−1) = A(:,i+1:n−1) − A(:,i) ∗ R(i,i+1:n−1)
r1_update <<< dimBlock, dimGrid_B >>> (A_d, R_d + i*lda, m, n, lda, i);
}
// Copy the results from device memory to host memory
gpuErrchk (cudaMemcpy(A_h, A_d, A_size, cudaMemcpyDeviceToHost));
// Free device memory
cudaFree(A_d); cudaFree(R_d);
// Free host memory
free(A_h); free(R_h);
return 0;
}
/**
* ## Kernel 1
*
* Rank 1 update of columns of A
*/
__global__ void r1_update (float *A, float *R, int m, int n, int lda, int k) {
// get x,y coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.x;
if (x < m && y < n-k-1)
A[x*lda + y + k + 1] -= A[x*lda + k] * R[y + k + 1];
}
/**
* ## Kernel 2
*
* matrix vector product
* Performs R[i] = x'A where x' is a row of A
*
* How leading dimension is used for matrices: http://ibm.co/19PLtIX
*/
__global__ void xTA (float *R, float *A, int m, int n, int lda, int ldr, int k){
// block column * block dim + column (computed by each thread)
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
int j;
// upper triangular matrix
if (i < n - k) {
for (j = 0; j < m; j++)
R[k*ldr + k + i] += A[k*lda + j] * A[j*lda + k + i];
}
}
/**
* ## Kernel 3
*
* mult. for constant s
* d vector
* ld leading dimension (distance from elements)
*/
__global__ void scale (float *d, int m, int ld, float *R_x) {
// block column * block dim + column (computed by each thread)
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
// s = sqrt(R(i,i))
// Static initialization of shared variables is illegal in CUDA.
// The problem is that the semantics of how every thread should treat
// static initialization of shared memory is undefined in the
// programming model. Which thread should do the write? What happens if
// the value is not uniform between threads? How should the compiler
// emit code for such a case and how should the hardware run it?
__shared__ float s; s = sqrt(*R_x);
// and scale
if (i < m) d[i*ld] /= s;
}
/*
* GPU Error Handler (CUDA_SAFE_CALL deprecated from CUDA 5.0)
*/
__host__ void gpuAssert(cudaError_t code, const char *file, int line) {
if (code != cudaSuccess) {
fprintf(stderr,"GPUassert: %s %s %dn",
cudaGetErrorString(code), file, line);
exit(code);
}
}
/*
* Print matrix
*
* Print a matrix passed as argument
*/
__host__ void print_matrix (float * matrix, int m, int n, int ld) {
int i, j;
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++)
printf("%0.5f ", matrix[i*ld + j]);
printf("n");
}
}
/*
* The option parser
*
* The function parses the parameters passed from the command line and run
* their own procedures.
*
* Return value:
* 0 on success
* -1 on failure
*
* Please, see http://www.gnu.org/software/libc/manual/html_node/Getopt.html
* for further information. (thanks to Frodo Looijaard)
*/
__host__ int option_parser (int argc, char **argv, int * m, int * n) {
int opt;
if (argc < 2) {
fprintf(stderr, "The program needs arguments...nn");
print_help(1);
}
opterr = 0;
while ( -1 != (opt = getopt (argc, argv, "hr:c:"))) {
switch (opt) {
case 'h':
print_help(0);
case 'r':
printf("optarg: %sn", optarg);
if ((*m = atoi(optarg)) < 2) return -1;
break;
case 'c':
printf("optarg: %sn", optarg);
if ((*n = atoi(optarg)) < 2 || *n > *m) return -1;
break;
case '?':
if (optopt == 'r' || optopt == 'c')
fprintf(stderr,"Option -%c requires an argument.n",optopt);
else if (isprint (optopt))
fprintf(stderr,"Unknown option `-%c'.n", optopt);
else
fprintf(stderr,"Unknown option chr `\x%x'.n", optopt);
return -1;
default:
fprintf(stderr, "default switch-case statement reachedn");
return -1;
}
//for (ii = optind; ii < argc; ii++)
// printf ("non-option argument %sn", argv[ii]);
}
return 0;
}
/*
* The helper
*
* Show the info to run the program in the correct way
*/
__host__ void print_help (int exit_code) {
printf("nPKG : %snPROGRAM : %snVERSION : %snn",PACKAGE,PROG,VERSION);
printf("%s [-h] [-r num of rows] [-c num of columns]nn", PROG);
printf(" -h print this help and exitn");
printf(" -r provide the number of rowsn");
printf(" -c provide the number of columsnn");
printf(" Example: ./qr_gpu -r 800 -c 600nn");
exit_code == -1 ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS);
}
When I run the program with cuda-memcheck I get the following output:
[mcrociara@tesla project_CUDA]$ cuda-memcheck ./qr_gpu -r 4 -c 4
========= CUDA-MEMCHECK
optarg: 4
optarg: 4
GPUassert: unspecified launch failure src/qr_gpu.cu 99
========= Invalid __global__ read of size 4
=========     at 0x000000c8 in xTA
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x3b5273104 is out of bounds
========= Invalid __global__ read of size 4
=========     at 0x000000c8 in xTA
=========     by thread (1,0,0) in block (0,0,0)
=========     Address 0x3b5273108 is out of bounds
========= Invalid __global__ read of size 4
=========     at 0x000000c8 in xTA
=========     by thread (2,0,0) in block (0,0,0)
=========     Address 0x3b527310c is out of bounds
========= ERROR SUMMARY: 3 errors
Can somebody help me understand why? I implemented a serial version of this algorithm that seems to work correctly:
/*
* QR decomposition via modified Gram-Schmidt algorithm
*/
// Libraries
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <ctype.h>
#include <time.h>
#include <math.h>
// Constant
#define PROG "QR_cpu"
#define VERSION "28.0913"
#define PACKAGE "QR-Decomposition"
// Prototypes
void r1_update(double *, double *, int);
void xTA (double *, double *, int);
void scale (double *, int, double);
void gram (double *, double *);
void print_matrix(double *);
int option_parser(int, char **);
void print_help (int);
// Number of rows
int M = -1;
// Number of columns
int N = -1;
// Leading Dimension of A and R
int LDA = -1;
int LDR = -1;
// The clock
clock_t start, stop;
/*
*
*/
int main (int argc, char **argv) {
int i;
// Get M, N from command line
if (0 != option_parser(argc, argv)) {
fprintf(stderr, "Bad option man!!!n");
exit(EXIT_FAILURE);
}
// Check the size of M and N
if (M > 5000 || N > 5000) {
fprintf(stderr, "Too big man!!!n");
exit(EXIT_FAILURE);
}
// Set the leading dimension of A and R
LDA = N; LDR = N;
// Allocate memory for A and R
double *A = calloc(M * N, sizeof(*A));
double *R = calloc(N * N, sizeof(*R));
// Set the diagonal of A as A(i,i) = i + 1 with i from 0 to N−1
for (i = 0; i < N; i++)
A[i*LDA + i] = i + 1;
start = clock();
gram(A, R);
stop = clock();
printf("nTime: %0.4fnn",(stop - start) / (double)(CLOCKS_PER_SEC));
// print_matrix(A);
// print_matrix(R);
free(A); free(R);
return 0;
}
/**
* Rank 1 update of columns of A
*/
void r1_update (double *A, double *R, int k) {
int i, j;
for(i = 0; i < M; i++)
for(j = k + 1; j < N; j++)
A[i*LDA + j] -= A[i*LDA + k] * R[j];
}
/**
* Matrix vector product
* Performs R[i] = x'A where x' is a row of A
* A : m x k, leading dimension lda
*
* How leading dimension is used for matrices: http://ibm.co/19PLtIX
*/
void xTA (double *R, double *A, int k) {
int i, j;
// upper triangular matrix
for (i = 0; i < N-k; i++)
for (j = 0; j < M; j++)
R[k*LDR + k + i] += A[k*LDA + j] * A[j*LDA + k + i];
}
/**
* Mult. for constant s
* d vector
* ld leading dimension (distance from elements)
*/
void scale (double *d, int ld, double s) {
int i;
for (i = 0; i < M; i++) d[i*ld] *= s;
}
/**
* Performs Modified Gram Schmidt
* orthogonalization of columns of A
* A m x n
* R n x n
*/
void gram (double *A, double *R) {
int i;
double s;
// Modified Gram Schmidt algorithm step by step
for (i = 0; i < N; i++) {
// Step #1 --> R(i,i:n-1) = A'(:,i) * A(:,i:n-1)
xTA(R, A, i);
// Step #2 (Normalizing) --> s = sqrt(R(i,i))
s = 1 / sqrt(R[i*LDR + i]);
// Step #3 (Is the scale of a column vector)
scale(A + i, LDA, s);
// Step #4 (Is the scale of a row)
scale(R + LDR*i, 1, s);
// Step #5 --> A(:,i+1:n−1) = A(:,i+1:n−1) − A(:,i) ∗ R(i,i+1:n−1)
r1_update(A, R + i*LDA, i);
}
}
/*
* Print Matrix
*
* Print a matrix passed as argument
*/
void print_matrix (double * matrix) {
int i, j;
for (i = 0; i < M; i++) {
for (j = 0; j < N; j++)
printf("%0.2f ", matrix[i*LDA + j]);
printf("n");
}
}
/*
* The option parser
*
* The function parses the parameters passed from the command line and run
* their own procedures.
*
* Return value:
* 0 on success
* -1 on failure
*
* Please, see http://www.gnu.org/software/libc/manual/html_node/Getopt.html
* for further information. (thanks to Frodo Looijaard)
*/
int option_parser (int argc, char **argv) {
int opt;
if (argc < 2) {
fprintf(stderr, "This program needs arguments...nn");
print_help(1);
}
opterr = 0;
while ( -1 != (opt = getopt (argc, argv, "hr:c:"))) {
switch (opt) {
case 'h':
print_help(0);
case 'r':
printf("optarg: %sn", optarg);
if ((M = atoi(optarg)) < 2) return -1;
break;
case 'c':
printf("optarg: %sn", optarg);
if ((N = atoi(optarg)) < 2 || N > M) return -1;
break;
case '?':
if (optopt == 'r' || optopt == 'c')
fprintf(stderr,"Option -%c requires an argument.n",optopt);
else if (isprint (optopt))
fprintf(stderr,"Unknown option `-%c'.n", optopt);
else
fprintf(stderr,"Unknown option chr `\x%x'.n", optopt);
return -1;
default:
fprintf(stderr, "default switch-case statement reachedn");
return -1;
}
//for (ii = optind; ii < argc; ii++)
// printf ("Non-option argument %sn", argv[ii]);
}
return 0;
}
/*
* The helper
*
* Shows the info to run the program in the correct way
*/
void print_help (int exit_val) {
printf("nPKG : %snPROGRAM : %snVERSION : %snn",PACKAGE,PROG,VERSION);
printf("%s [-h] [-r num of rows] [-c num of columns]nn", PROG);
printf(" -h print this help and exitn");
printf(" -r provide the number of rowsn");
printf(" -c provide the number of columsnn");
printf(" Example: ./qr_cpu -r 800 -c 600nn");
exit_val == -1 ? exit(EXIT_FAILURE) : exit(EXIT_SUCCESS);
}
Thanks in advance!
There are different approaches to compute the QR decomposition of a matrix. The main methods are:
- the Gram-Schmidt process;
- Householder reflections;
- Givens rotations.
Gram-Schmidt is a sequence of projections and vector subtractions, which may be implemented as a sequence of kernels performing reductions (for the projections) and element-wise array operations (for the vector subtractions); a minimal sketch of this structure follows the paper list below. You can have a look at the papers
a) QR Decomposition on GPUs
b) Parallel implementation of classical Gram-Schmidt orthogonalization on CUDA graphics cards
c) CPU vs GPU - Performance comparison for the Gram-Schmidt algorithm
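To make this kernel structure concrete, here is a minimal sketch of one (unnormalized) Gram-Schmidt step for column k of a row-major m x n matrix A with leading dimension lda. The kernel names, the fixed block size of 256, and the coefficient buffer r are hypothetical choices for illustration, not code from the papers above.
// Reduction kernel (the projection step): r[j] = dot(A(:,k), A(:,j))
// for j = k..n-1. One block per column j; launch as
// proj_coeffs<<<n - k, 256>>>(...) so that blockDim.x == 256,
// a power of two matching the shared array below.
__global__ void proj_coeffs(const float *A, float *r,
                            int m, int n, int lda, int k) {
    int j = k + blockIdx.x;                      // target column
    if (j >= n) return;
    __shared__ float sdata[256];
    float sum = 0.0f;
    for (int i = threadIdx.x; i < m; i += blockDim.x)
        sum += A[i*lda + k] * A[i*lda + j];      // strided partial dot product
    sdata[threadIdx.x] = sum;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {   // tree reduction
        if (threadIdx.x < s) sdata[threadIdx.x] += sdata[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0) r[j] = sdata[0];
}
// Element-wise kernel (the vector subtraction): for every trailing
// column j > k, A(:,j) -= (r[j] / r[k]) * A(:,k), i.e. remove the
// projection onto column k. Launch with a 2D grid of 1D blocks:
// x over trailing columns, y over tiles of rows.
__global__ void subtract_proj(float *A, const float *r,
                              int m, int n, int lda, int k) {
    int i = blockIdx.y * blockDim.x + threadIdx.x;   // row
    int j = k + 1 + blockIdx.x;                      // trailing column
    if (i < m && j < n)
        A[i*lda + j] -= (r[j] / r[k]) * A[i*lda + k];
}
Looping these two kernels over k = 0, ..., n-1 (plus a normalization of column k, as your scale kernel does) yields the factorization.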
QR decomposition by Householder reflections is instead dominated by matrix-vector operations; a sketch of the core update appears after the references below. You can find some information in paper a) above and in the paper
d) Benchmarking GPUs to tune dense linear algebra
An implementation by V. Volkov and J.W. Demmel is available in
LU, QR and Cholesky Factorizations using Vector Capabilities of GPUs
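To see why matrix-vector operations dominate: each Householder step applies A <- A - v * (beta * (v' * A)) to the trailing submatrix, i.e. one matrix-vector product producing w = beta * (v' * A), computable with a reduction kernel like the one sketched above, followed by a rank-1 update. A hypothetical sketch of that rank-1 update (the kernel name and layout are illustrative assumptions, not Volkov and Demmel's code):
// Rank-1 update on the trailing columns k..n-1 of a row-major matrix:
// A(i,j) -= v[i] * w[j-k], where w = beta * (v' * A(:,k:n-1)) has been
// precomputed by a separate matrix-vector (reduction) kernel.
// Launch with 2D blocks and a 2D grid covering rows and trailing columns.
__global__ void householder_update(float *A, const float *v, const float *w,
                                   int m, int n, int lda, int k) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;       // row
    int j = k + blockIdx.x * blockDim.x + threadIdx.x;   // column
    if (i < m && j < n)
        A[i*lda + j] -= v[i] * w[j - k];
}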
In my opinion, Givens rotations are not a popular approach to parallel QR decomposition. Basically, each Givens rotation modifies only two rows, so some parallelization is possible with the Sameh-Kuck pattern, which allows up to n concurrent rotations; a sketch of a single rotation follows the reference below. You can find some information in
Benchmarking the NVIDIA 8800GTX with the CUDA Development Platform
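To see why each rotation touches only two rows, and hence why disjoint row pairs in the Sameh-Kuck ordering can be processed concurrently, consider this hypothetical kernel (illustrative only, not from the reference above) that applies a single precomputed rotation (c, s) to rows p and q:
// Apply one Givens rotation with precomputed cosine c and sine s to
// rows p and q of a row-major matrix with n columns and leading
// dimension lda. One thread per column; blocks working on disjoint
// (p, q) pairs could run concurrently.
__global__ void givens_rows(float *A, int n, int lda,
                            int p, int q, float c, float s) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;   // column index
    if (j < n) {
        float ap = A[p*lda + j];
        float aq = A[q*lda + j];
        A[p*lda + j] =  c * ap + s * aq;
        A[q*lda + j] = -s * ap + c * aq;
    }
}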
Actually, a clean performance comparison between the different approaches as implemented on CUDA architectures is not available. Note also that some of the published material concerns optimizations for "old" architectures, so further improvements may be achievable by tuning for the "newer" GPU generations.