cooperative_groups::this_grid() 会导致任何 CUDA API 调用返回'unknown error'



按照CUDA示例中的相同步骤启动内核并使用cooperative_groups::this_grid().sync()在网格上同步会导致任何CUDA API调用失败。使用时cooperative_groups::this_thread_block().sync()工作良好,给出了正确的结果。

我使用以下代码和CMakeLists.txt(cmake版本3.11.1(在TITAN V GPU(驱动程序版本410.73(和Ubuntu 16.04.5 LTS上使用CUDA 10进行测试。该代码也可在github上获得,以便于重现错误。

该代码读取一个数组,然后将其反转(从[0 1 2 ... 9]反转为[9 8 7 ... 0](。为了做到这一点,每个线程从数组中读取一个元素,同步,然后将其元素写入正确的目标。代码可以很容易地修改,以确保this_thread_block().sync()工作良好。只需将arr_size更改为小于1024,然后使用cg::thread_block barrier = cg::this_thread_block();即可。

test_cg.cu

#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdint.h>
#include <cstdint>
#include <numeric>
#include <cuda.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
//********************** CUDA_ERROR
inline void HandleError(cudaError_t err, const char *file, int line) {
//Error handling micro, wrap it around function whenever possible
if (err != cudaSuccess) {
printf("n%s in %s at line %dn", cudaGetErrorString(err), file, line);
#ifdef _WIN32
system("pause");
#else
exit(EXIT_FAILURE);
#endif
}
}
#define CUDA_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
//******************************************************************************

//********************** cg kernel 
__global__ void testing_cg_grid_sync(const uint32_t num_elements,
uint32_t *d_arr){
uint32_t tid = threadIdx.x + blockDim.x*blockIdx.x;
if (tid < num_elements){
uint32_t my_element = d_arr[tid];
//to sync across the whole grid 
cg::grid_group barrier = cg::this_grid();
//to sync within a single block 
//cg::thread_block barrier = cg::this_thread_block();
//wait for all reads 
barrier.sync();
uint32_t tar_id = num_elements - tid - 1;
d_arr[tar_id] = my_element;
}
}
//******************************************************************************

//********************** execute  
void execute_test(const int sm_count){
//host array 
const uint32_t arr_size = 1 << 20; //1M 
uint32_t* h_arr = (uint32_t*)malloc(arr_size * sizeof(uint32_t));
//fill with sequential numbers
std::iota(h_arr, h_arr + arr_size, 0);
//device array 
uint32_t* d_arr;
CUDA_ERROR(cudaMalloc((void**)&d_arr, arr_size*sizeof(uint32_t)));
CUDA_ERROR(cudaMemcpy(d_arr, h_arr, arr_size*sizeof(uint32_t),
cudaMemcpyHostToDevice));
//launch config
const int threads = 512;
//following the same steps done in conjugateGradientMultiBlockCG.cu 
//cuda sample to launch kernel that sync across grid 
//https://github.com/NVIDIA/cuda-samples/blob/master/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu#L436
int num_blocks_per_sm = 0;
CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
(void*)testing_cg_grid_sync, threads, 0));
dim3 grid_dim(sm_count * num_blocks_per_sm, 1, 1), block_dim(threads, 1, 1);
if(arr_size > grid_dim.x*block_dim.x){
printf("n The grid size (numBlocks*numThreads) is less than array size.n");
exit(EXIT_FAILURE);
}
printf("n Launching %d blocks, each containing %d threads", grid_dim.x,
block_dim.x);
//argument passed to the kernel     
void *kernel_args[] = {
(void *)&arr_size,
(void *)&d_arr, };

//finally launch the kernel 
cudaLaunchCooperativeKernel((void*)testing_cg_grid_sync,
grid_dim, block_dim, kernel_args);

//make sure everything went okay
CUDA_ERROR(cudaGetLastError());
CUDA_ERROR(cudaDeviceSynchronize());

//get results on the host 
CUDA_ERROR(cudaMemcpy(h_arr, d_arr, arr_size*sizeof(uint32_t),
cudaMemcpyDeviceToHost));
//validate 
for (uint32_t i = 0; i < arr_size; i++){
if (h_arr[i] != arr_size - i - 1){
printf("n Result mismatch in h_arr[%u] = %un", i, h_arr[i]);
exit(EXIT_FAILURE);
}
}
}
//******************************************************************************
int main(int argc, char**argv) {
//set to Titan V
uint32_t device_id = 0;
cudaSetDevice(device_id);
//get sm count 
cudaDeviceProp devProp;
CUDA_ERROR(cudaGetDeviceProperties(&devProp, device_id));
int sm_count = devProp.multiProcessorCount;
//execute 
execute_test(sm_count);
printf("n Mission accomplished n");
return 0;
}

CMakeLists.txt

cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
set(PROJECT_NAME "test_cg")
project(${PROJECT_NAME} LANGUAGES CXX CUDA)  
#default build type is Release
if (CMAKE_BUILD_TYPE STREQUAL "")
set(CMAKE_BUILD_TYPE Release)
endif ()
SET(CUDA_SEPARABLE_COMPILATION ON)
########## Libraries/flags Starts Here ######################
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -lineinfo; -std=c++11; -expt-extended-lambda; -O3; -use_fast_math; -rdc=true;)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode=arch=compute_70,code=sm_70) #for TITAN V
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -Wall -std=c++11")
########## Libraries/flags Ends Here ######################

########## inc/libs/exe/features Starts Here ######################
set(CMAKE_INCLUDE_CURRENT_DIR ON)
CUDA_ADD_EXECUTABLE(${PROJECT_NAME} test_cg.cu)
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE  ON)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)    
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY})
########## inc/libs/exe/features Ends Here ######################

运行此代码可以得到:

unknown error in /home/ahdhn/test_cg/test_cg.cu at line 67

这是使用cudaMalloc的第一行。我通过从设备中查询__CUDA_ARCH__来确保代码被编译为正确的体系结构,结果是700。如果您发现我在代码或CMakeLists.txt文件中出错,请告诉我。

在外部帮助下,使代码正常工作的解决方案是在第二个set(CUDA_NVCC_FLAGS.....之后添加string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70 --cudart shared")。原因是我的/usr/local/cuda-10.0/lib64/下只有libcudadevrt.a,所以我必须用信号通知CUDA链接共享/动态运行时库,因为默认情况是链接到静态在第二个CCD_ 18之后的CCD_。原因是sm_70标志未正确传递给链接器。

此外,仅使用CUDA_NVCC_FLAGS只会将sm_70信息传递给编译器,而不是链接器。而仅使用CMAKE_NVCC_FLAGS将报告error: namespace "cooperative_groups" has no member "grid_group"错误。

最新更新