下面的代码是关于Sha-1算法的。
内核中有一个很大的循环(HashRounds=0x40000,262144次)。现在的情况是:
1.在调试模式下运行时,它将报告错误30.但是如果我减少周期时间,例如50000次,则不会报告任何错误。
2.在释放模式下运行时,这是正常的,但是当我增加线程规模时,例如块= 48,线程= 192,它会报告相同的问题。
环境:GTX560Ti+Win8+Visual Studio 2012+Cuda5.5
恳求你的帮助!!
#include <stdio.h>
#include <string.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
typedef unsigned int uint32;
typedef unsigned char byte;
typedef struct {
uint32 state[5];
uint32 count[2];
byte buffer[64];
byte workspace[64]; // Temporary buffer.
} hash_context;
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00)
|(rol(block->l[i],8)&0x00FF00FF))
#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15]
^block->l[(i+2)&15]^block->l[i&15],1))
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v,w,x,y,z,i) {z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);}
#define R1(v,w,x,y,z,i) {z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);}
#define R2(v,w,x,y,z,i) {z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);}
#define R3(v,w,x,y,z,i) {z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);}
#define R4(v,w,x,y,z,i) {z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);}
cudaError_t addWithCuda();
__global__ void cryptKernel();
__device__ void hash_initial(hash_context* context);
__device__ void hash_process( hash_context * context, byte * data, size_t len);
__device__ void SHA1Transform(uint32 state[5], byte workspace[64], byte buffer[64]);
int main()
{
cudaError_t cudaStatus = addWithCuda();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
printf("over");
getchar();
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda()
{
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
cryptKernel<<<1,1>>>();
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %sn", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!n", cudaStatus);
goto Error;
}
Error:
return cudaStatus;
}
__global__ void cryptKernel()
{
int i = blockIdx.x*blockDim.x+threadIdx.x;
byte RawPsw[24] = {' '};
hash_context c;
hash_initial(&c);
const int HashRounds=0x40000;
for (int I=0;I<HashRounds;I++)
{
hash_process( &c, RawPsw, 24);
}
}
__device__ void SHA1Transform(uint32 state[5], byte workspace[64], byte buffer[64])
{
uint32 a, b, c, d, e;
typedef union {
byte c[64];
uint32 l[16];
} CHAR64LONG16;
CHAR64LONG16* block;
block = (CHAR64LONG16*)buffer;
/* Copy context->state[] to working vars */
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
/* 4 rounds of 20 operations each. Loop unrolled. */
R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
/* Add the working vars back into context.state[] */
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
}
__device__ void hash_initial(hash_context* context)
{
/* SHA1 initialization constants */
context->state[0] = 0x67452301;
context->state[1] = 0xEFCDAB89;
context->state[2] = 0x98BADCFE;
context->state[3] = 0x10325476;
context->state[4] = 0xC3D2E1F0;
context->count[0] = context->count[1] = 0;
}
/* Run your data through this. */
__device__ void hash_process( hash_context * context, byte * data, size_t len)
{
unsigned int i, j;
uint32 blen = ((uint32)len)<<3;
j = (context->count[0] >> 3) & 63;
if ((context->count[0] += blen) < blen ) context->count[1]++;
context->count[1] += (uint32)(len >> 29);
if ((j + len) > 63) {
memcpy(&context->buffer[j], data, (i = 64-j));
SHA1Transform(context->state, context->workspace, context->buffer);
for ( ; i + 63 < len; i += 64) {
SHA1Transform(context->state, context->workspace, &data[i]);
}
j = 0;
}
else i = 0;
if (len > i)
memcpy(&context->buffer[j], &data[i], len - i);
}
当我增加循环时间或线程规模时,它会报告错误代码 30.It 这意味着内核执行时间太长。因此,它击中了Windows TDR机制。关闭TDR将解决此问题。