在CUDA中的字符串匹配在增加块时显示不同的结果



我正试图在C中实现与CUDA的字符串匹配程序,我有以下问题。

当我设置1块和每个块1个线程时,模式dfh的结果是2。这是正确的,但当我增加块的结果是4。

文本文件为:

ffskdfhksdjhfksdfksjdfhksdhfksjdhfkjer654yrkhjkfgjhdsrtrhkjchgkjthyoirthygfnbkjgkjdhykhkjchgkjfdhsfykhkbhkjfghkfgjy

这是我的代码:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *result){
int tid, i;
__shared__ int local_matches;
if(threadIdx.x == 0) local_matches = 0;
__syncthreads();
for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
atomicAdd(&local_matches, 1);
}
}
__syncthreads();
if(threadIdx.x == 0) 
atomicAdd(result, local_matches);
}

int main(int argc, char *argv[]){
FILE *pFile;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *match, total_matches;
//CUDA variables
int blocks, threads_per_block;
int *result_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);

pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File errorn"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ldn", file_size);

// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory errorn"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading errorn"); return 4;} 

pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;

match = (int *) malloc (sizeof(int)*match_size);
if (match == NULL) {printf ("Malloc errorn"); return 5;}
cudaMalloc((void **)&result_dev, sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, result_dev);
cudaThreadSynchronize();
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(&total_matches, result_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(result_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %dn", total_matches);
printf("nnnN: %d, Blocks: %d, Threads: %dn", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3fn", total_time);
printf("Kernel time (ms): %.3fn", comp_time);
printf("Data transfer time(ms): %.3fnnn", total_time-comp_time);
}

您需要使用__syncthreads同步同一块的线程。例如,local_matches = 0理论上可以与主回路的atomicAdd同时进行。因此,两者之间需要一个__syncthreads。出于同样的原因,在最后一个if(threadIdx.x == 0)之前还需要一个__syncthreads。我不确定这是唯一的错误。

我建议你使用CUDA-GDB来跟踪这种bug,因为内核很短,而且相对简单。

注意local_matches不需要被共享。事实上,这样做是没有效率的。On可以在本地内存中执行缩减,然后执行最后的atomicAdd。此外,您不需要最内层的条件。您可以简单地执行:local_matches += i >= pattern_size;(编译器可能已经做了这样的优化)。

我终于找到了一个解决办法。

我为每个缓冲区位置设置了一个0值的匹配表,为模式找到的每个位置设置1,并在CPU中添加1。

如果你认为有更好的,请添加一个答案。

这是代码:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int *match){
int tid, i;
for(tid=blockIdx.x*blockDim.x+threadIdx.x; tid<match_size; tid+=blockDim.x){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
match[tid] = 1;
}
else{
match[tid] = 0;
}
}
}

int main(int argc, char *argv[]){
FILE *pFile;
int i;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *match, total_matches;
//CUDA variables
int blocks, threads_per_block;
int *match_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);

pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File errorn"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ldn", file_size);

// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory errorn"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading errorn"); return 4;} 

pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;

match = (int *) malloc (sizeof(int)*match_size);
if (match == NULL) {printf ("Malloc errorn"); return 5;}
cudaMalloc((void **)&match_dev, match_size*sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, match_dev);
cudaThreadSynchronize();
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(match, match_dev, match_size*sizeof(int), cudaMemcpyDeviceToHost);
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
total_matches = 0;
for(i=0; i<match_size; i++){
total_matches += match[i];
}
cudaFree(match_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %dn", total_matches);
printf("nnnN: %d, Blocks: %d, Threads: %dn", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3fn", total_time);
printf("Kernel time (ms): %.3fn", comp_time);
printf("Data transfer time(ms): %.3fnnn", total_time-comp_time);
}

我认为这是一个更好的解决方案。

它只适用于每个块2个线程的能力。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#define MAX_THREADS_PER_BLOCK 100
__global__ void string_matching(char *buffer, char *pattern, int match_size, int pattern_size, int blocks, int slice, int extra, int *gout){
int tid, i;
int thread_index = blockIdx.x*blockDim.x + threadIdx.x;
int start = thread_index*slice;
int stop = start + slice;
if(thread_index == blocks*blockDim.x - 1){
stop += extra;
}
if(stop > match_size){
stop = match_size;
}
__shared__ int r[MAX_THREADS_PER_BLOCK];
int sum = 0;
for(tid=start; tid<stop; tid++){
for (i = 0; i < pattern_size && pattern[i] == buffer[i + tid]; ++i);
if(i >= pattern_size){
sum++;
}
}
r[threadIdx.x] = sum;
__syncthreads();
//works only for power of 2 threads_per_block
for (int size = blockDim.x/2; size>0; size/=2) { //uniform
if (threadIdx.x<size)
r[threadIdx.x] += r[threadIdx.x+size];
__syncthreads();
}

printf("Block: %d, Thread: %d, Global Thread: %d, Start: %d, Stop: %d, Matches: %d, Block Matches: %dn", blockIdx.x, threadIdx.x, thread_index, start, stop, r[threadIdx.x], r[0]);

if(threadIdx.x == 0){
gout[blockIdx.x] = r[0];
}

}

int main(int argc, char *argv[]){
int i;
FILE *pFile;
long file_size, match_size, pattern_size;
char * buffer;
char * filename, *pattern;
size_t result;
int *results;
int total_matches;
//CUDA variables
int blocks, threads_per_block, total_threads, slice, extra;
int *results_dev;
char *buffer_dev, *pattern_dev;
float total_time, comp_time;
cudaEvent_t total_start, total_stop, comp_start, comp_stop;
cudaEventCreate(&total_start);
cudaEventCreate(&total_stop);
cudaEventCreate(&comp_start);
cudaEventCreate(&comp_stop);
if (argc != 5) {
printf ("Usage : %s <file_name> <string> <blocks> <threads_per_block>n", argv[0]);
return 1;
}
filename = argv[1];
pattern = argv[2];
blocks = strtol(argv[3], NULL, 10);
threads_per_block = strtol(argv[4], NULL, 10);

pFile = fopen ( filename , "rb" );
if (pFile==NULL) {printf ("File errorn"); return 2;}
// obtain file size:
fseek (pFile , 0 , SEEK_END);
file_size = ftell (pFile);
rewind (pFile);
printf("file size is %ldn", file_size);

// allocate memory to contain the file:
buffer = (char*) malloc (sizeof(char)*file_size);
if (buffer == NULL) {printf ("Memory errorn"); return 3;}
// copy the file into the buffer:
result = fread (buffer,1,file_size,pFile);
if (result != file_size) {printf ("Reading errorn"); return 4;} 

pattern_size = strlen(pattern);
match_size = file_size - pattern_size + 1;
results = (int *)malloc(blocks*sizeof(int));
cudaMalloc((void **)&results_dev, blocks*sizeof(int));
cudaMalloc((void **)&buffer_dev, file_size*sizeof(char));
cudaMalloc((void **)&pattern_dev, pattern_size*sizeof(char));
cudaEventRecord(total_start);
cudaEventRecord(comp_start);
cudaMemcpy(buffer_dev, buffer, file_size*sizeof(char), cudaMemcpyHostToDevice);
cudaMemcpy(pattern_dev, pattern, pattern_size*sizeof(char), cudaMemcpyHostToDevice);
total_threads = blocks*threads_per_block;
slice = match_size/total_threads;
extra = match_size%total_threads;
string_matching<<<blocks, threads_per_block>>>(buffer_dev, pattern_dev, match_size, pattern_size, blocks, slice, extra, results_dev);
cudaEventRecord(comp_stop);
cudaEventSynchronize(comp_stop);
cudaEventElapsedTime(&comp_time, comp_start, comp_stop);
cudaMemcpy(results, results_dev, blocks*sizeof(int), cudaMemcpyDeviceToHost);
total_matches = 0;
for(i=0; i<blocks; i++){
total_matches += results[i];
}
cudaEventRecord(total_stop);
cudaEventSynchronize(total_stop);
cudaEventElapsedTime(&total_time, total_start, total_stop);
cudaFree(results_dev);
cudaFree(buffer_dev);
cudaFree(pattern_dev);
fclose (pFile);
free (buffer);
//Print result
printf("Total matches: %dn", total_matches);
printf("nnnN: %d, Blocks: %d, Threads: %dn", file_size, blocks, blocks*threads_per_block);
printf("Total time (ms): %.3fn", total_time);
printf("Kernel time (ms): %.3fn", comp_time);
printf("Data transfer time(ms): %.3fnnn", total_time-comp_time);
}

循环上的步幅对于使用多个块是不正确的,即两个块都在做全部的工作(因此每个块都找到两个条目)。正确的grid-stride循环如下所示

for (int tid = blockIdx.x * blockDim.x + threadIdx.x; 
tid < match_size; 
tid += blockDim.x * gridDim.x /* <-- fix */) {
// ...
}

最新更新