当配置为在多个卡上运行时,以下代码会在一段时间后可靠地导致分段故障(我有一个4 GPU系统(NVIDIA Titan的))。这是我以前试图解决的问题的一个最小的例子(请参阅此处:从libcuda.so.1调用的__pthread_getspecific中的分段错误)
代码功能描述:
首先,它在每张卡上分配大量内存(在前N张卡上,其中N为1-4),其想法是尽可能地强调卡。然后它继续启动16个线程,每个线程在一个随机卡上运行几个内核。它重复几次,每次随机挑选一张牌,然后退出。主线程等待所有16个线程完成,然后无限期地重复该过程,或者直到其中一个线程由于某种原因失败。
在主线程的几次(通常在20次左右,但可能或多或少)迭代之后,它会导致分段错误。堆栈通常如下所示:
#0 0x00007f164a71f43c in ?? () from /usr/lib/libcuda.so
#1 0x00007f164a6bd1b5 in ?? () from /usr/lib/libcuda.so
#2 0x00007f164a5dff1a in ?? () from /usr/lib/libcuda.so
#3 0x00007f164a6c0b34 in ?? () from /usr/lib/libcuda.so
#4 0x00007f164a6c0c92 in ?? () from /usr/lib/libcuda.so
#5 0x00007f164a5e009f in ?? () from /usr/lib/libcuda.so
#6 0x00007f164a5d03c0 in ?? () from /usr/lib/libcuda.so
#7 0x00007f164a5c43bf in ?? () from /usr/lib/libcuda.so
#8 0x00007f164c131c39 in ?? () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#9 0x00007f164c152879 in cudaDeviceSynchronize () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#10 0x0000000000401911 in TestCUDA(int, unsigned int, unsigned int, unsigned int const*, unsigned short*) ()
#11 0x00000000004012b0 in main (argc=0, argv=0x100000200) at main.cpp:208
完整的源代码:
main.cpp:
#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <pthread.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>
class CriticalSection
{
pthread_mutex_t cs;
public:
CriticalSection();
~CriticalSection();
void Lock( void );
void Unlock( void );
};
CriticalSection::CriticalSection()
{
assert( pthread_mutex_init( &cs, NULL ) == 0 );
}
CriticalSection::~CriticalSection()
{
assert( pthread_mutex_destroy( &cs ) == 0 );
}
void CriticalSection::Lock( void )
{
assert( pthread_mutex_lock( &cs ) == 0 );
}
void CriticalSection::Unlock( void )
{
assert( pthread_mutex_unlock( &cs ) == 0 );
}
class DeviceWrapper
{
protected:
CriticalSection m_cs;
public:
int32_t m_i32DeviceId;
uint32_t* m_pdu32Data;
uint16_t* m_pdu16Res;
uint32_t m_u32Count;
DeviceWrapper();
~DeviceWrapper();
void Lock( void );
void Unlock( void );
bool Init( const int32_t i32DevId, const uint32_t u32Count );
bool Free();
};
DeviceWrapper::DeviceWrapper()
{
m_i32DeviceId = 0;
m_pdu32Data = NULL;
m_pdu16Res = NULL;
m_u32Count = 0;
}
DeviceWrapper::~DeviceWrapper()
{
}
void DeviceWrapper::Lock( void )
{
m_cs.Lock();
}
void DeviceWrapper::Unlock( void )
{
m_cs.Unlock();
}
bool DeviceWrapper::Init( const int32_t i32DevId, const uint32_t u32Count )
{
if ( cudaSetDevice( i32DevId ) != cudaSuccess )
{
printf( "DeviceWrapper::Init: Failed to set device %dn", i32DevId );
return false;
}
if ( cudaMalloc( &m_pdu32Data, sizeof( uint32_t ) * u32Count ) != cudaSuccess )
{
printf( "DeviceWrapper::Init: Failed to allocate %u unsigned int's on device %dn", u32Count, i32DevId );
return false;
}
if ( cudaMalloc( &m_pdu16Res, sizeof( uint16_t ) * u32Count ) != cudaSuccess )
{
printf( "DeviceWrapper::Init: Failed to allocate %u unsigned short's on device %dn", u32Count, i32DevId );
return false;
}
m_u32Count = u32Count;
m_i32DeviceId = i32DevId;
return true;
}
bool DeviceWrapper::Free()
{
if ( cudaSetDevice( m_i32DeviceId ) != cudaSuccess )
{
printf( "DeviceWrapper::Free: Failed to set device %dn", m_i32DeviceId );
return false;
}
if ( cudaFree( m_pdu32Data ) != cudaSuccess )
{
printf( "DeviceWrapper::Free: Failed to free pdu32Mem on device %dn", m_i32DeviceId );
return false;
}
if ( cudaFree( m_pdu16Res ) != cudaSuccess )
{
printf( "DeviceWrapper::Free: Failed to free pdu16Mem on device %dn", m_i32DeviceId );
return false;
}
m_pdu32Data = NULL;
m_pdu16Res = NULL;
m_u32Count = 0;
m_i32DeviceId = 0;
return true;
}
bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res );
void* DoWork( void* pArg );
static bool bRun = true;
static DeviceWrapper devices[4];
int main( int argc, char* argv[] )
{
if ( argc != 2 )
{
printf( "Usage: %s <number of cards to use>n", argv[0] );
return 1;
}
uint32_t u32CardsToUse = strtoul( argv[1], NULL, 0 );
if ( !u32CardsToUse || u32CardsToUse > 4 )
{
printf( "Invalid argument, must be in range 1-4n" );
return 2;
}
for ( int32_t i = 0; i < u32CardsToUse; i++ )
{
if ( !devices[i].Init( i, 0x20000000 ) )
{
for ( uint32_t j = 0; j < i; j++ )
{
devices[j].Free();
}
printf( "Failed to init device %dn", i );
return 3;
}
}
uint32_t u32IterationsCompleted = 0;
while ( bRun )
{
pthread_t pWorkers[ 16 ];
memset( pWorkers, 0, 16 * sizeof( pthread_t ) );
for ( uint32_t i = 0; i < 16; i++ )
{
int iReturnValue = pthread_create( &pWorkers[i], NULL, &DoWork, (void*)u32CardsToUse );
if ( iReturnValue != 0 )
{
printf( "Error calling pthread_create: %dn", iReturnValue );
return 4;
}
}
for ( uint32_t i = 0; i < 16; i++ )
{
pthread_join( pWorkers[i], NULL );
}
printf( "Iterations completed: %un", ++u32IterationsCompleted );
}
printf( "Finishedn" );
fflush( stdout );
return 0;
}
void* DoWork( void* pArg )
{
uint32_t u32CardsToUse = uint32_t( pArg );
uint32_t u32TestCount = (rand() % 4) + 4;
for ( uint32_t i = 0; i < u32TestCount; i++ )
{
int32_t i32DeviceId = int32_t( rand() % u32CardsToUse );
devices[ i32DeviceId ].Lock();
if ( !TestCUDA( i32DeviceId, 1, devices[i32DeviceId].m_u32Count, devices[i32DeviceId].m_pdu32Data, devices[i32DeviceId].m_pdu16Res ) )
{
printf( "DoWork: Failure in executing TestCUDA for device %d (test number %u)n", i32DeviceId, i );
bRun = false;
devices[ i32DeviceId ].Unlock();
return NULL;
}
devices[ i32DeviceId ].Unlock();
}
return NULL;
}
cuda_test.cu:
#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <cuda_runtime.h>
__global__ void HammingU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
while ( gidx < u32Count )
{
pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx );
gidx += blockDim.x * gridDim.x;
}
}
__global__ void EqualU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
while ( gidx < u32Count )
{
if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ]++;
gidx += blockDim.x * gridDim.x;
}
}
__global__ void EqualByteU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
while ( gidx < u32Count )
{
if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 4;
gidx += blockDim.x * gridDim.x;
}
}
__global__ void EqualBitU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
while ( gidx < u32Count )
{
if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;
gidx += blockDim.x * gridDim.x;
}
}
__global__ void OrderU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
while ( gidx < u32Count )
{
uint32_t u32File = pu32Data[gidx]; // 32-bit value to find the log2 of
uint32_t u32FileLog = 0; // result of log2 will go here
uint32_t u32Shift = 0;
u32FileLog = (u32File > 0xFFFF) << 4;
u32File >>= u32FileLog;
u32Shift = (u32File > 0xFF) << 3;
u32File >>= u32Shift;
u32FileLog |= u32Shift;
u32Shift = (u32File > 0xF) << 2;
u32File >>= u32Shift;
u32FileLog |= u32Shift;
u32Shift = (u32File > 0x3) << 1;
u32File >>= u32Shift;
u32FileLog |= u32Shift;
u32FileLog |= (u32File >> 1);
uint32_t u32Other = gidx; // 32-bit value to find the log2 of
uint32_t u32OtherLog = 0; // result of log2 will go here
u32Shift = 0;
u32OtherLog = (u32Other > 0xFFFF) << 4;
u32Other >>= u32OtherLog;
u32Shift = (u32Other > 0xFF) << 3;
u32Other >>= u32Shift;
u32OtherLog |= u32Shift;
u32Shift = (u32Other > 0xF) << 2;
u32Other >>= u32Shift;
u32OtherLog |= u32Shift;
u32Shift = (u32Other > 0x3) << 1;
u32Other >>= u32Shift;
u32OtherLog |= u32Shift;
u32OtherLog |= (u32Other >> 1);
if ( u32FileLog >= u32OtherLog )
{
pu16Results[ gidx ] += uint16_t( u32FileLog - u32OtherLog );
}
else
{
pu16Results[ gidx ] += uint16_t( u32OtherLog - u32FileLog );
}
gidx += blockDim.x * gridDim.x;
}
}
__global__ void LogU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
while ( gidx < u32Count )
{
uint32_t u32Value = 0;
if ( pu32Data[gidx] >= gidx )
{
u32Value = pu32Data[gidx] - gidx;
}
else
{
u32Value = gidx - pu32Data[gidx];
}
uint32_t u32Log = 0; // result of log2 will go here
uint32_t u32Shift = 0;
u32Log = (u32Value > 0xFFFF) << 4;
u32Value >>= u32Log;
u32Shift = (u32Value > 0xFF) << 3;
u32Value >>= u32Shift;
u32Log |= u32Shift;
u32Shift = (u32Value > 0xF) << 2;
u32Value >>= u32Shift;
u32Log |= u32Shift;
u32Shift = (u32Value > 0x3) << 1;
u32Value >>= u32Shift;
u32Log |= u32Shift;
u32Log |= (u32Value >> 1);
pu16Results[ gidx ] += (uint16_t)u32Log;
gidx += blockDim.x * gridDim.x;
}
}
__global__ void EqualRetU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
while ( gidx < u32Count )
{
if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;
gidx += blockDim.x * gridDim.x;
}
}
__global__ void HammingMulU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
while ( gidx < u32Count )
{
pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx ) << 5;
gidx += blockDim.x * gridDim.x;
}
}
bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res )
{
for ( uint32_t i = 0; i < u32Iterations; i++ )
{
if ( cudaSetDevice( i32DeviceId ) != cudaSuccess )
{
return false;
}
if ( cudaMemset( pdu16Res, 0, u32Count * sizeof( uint16_t ) ) != cudaSuccess )
{
return false;
}
for ( uint32_t j = 0; j < 3; j++ )
{
HammingU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
EqualU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
EqualByteU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
EqualBitU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
OrderU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
LogU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
EqualRetU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
HammingMulU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
}
cudaDeviceSynchronize();
}
return true;
}
生成文件:
IDIR_CUDA = -I. -I/usr/local/cuda-5.5/include
CC_CUDA = g++
CFLAGS_CUDA = -g $(IDIR_CUDA)
LIBS_CUDA = -lz -lpthread -lrt -ldl -L/usr/local/cuda-5.5/lib64 -lcudart -lcuda
all:
nvcc -I/usr/include -arch=compute_35 -code=sm_35 --machine 64 --compile cuda_test.cu
$(CC_CUDA) -fpermissive *.cpp *.o -o test_cuda.out $(CFLAGS_CUDA) $(LIBS_CUDA)
@echo DONE TEST_CUDA BUILD
clean:
rm -f *.o test_cuda.out
我使用CUDA 5.5和最新的驱动程序(319.32),运行Ubuntu Linux(64位)。
我的问题是:代码中是否存在可能导致这种行为的错误?为什么我只使用一张卡时没有看到崩溃?只是撞车的可能性要小得多吗?(似乎使用更多的牌会比使用更少的牌更快地出现崩溃)额外奖励:是否有其他人使用此代码看到崩溃
我找到了一个解决崩溃的解决方法:
其想法是,每个使用的GPU恰好有一个线程,其他工作线程将工作提交给这些线程以在GPU上运行。我能够在机器上运行测试应用程序一天,没有任何问题(完成了6000多次迭代)。