我正在学习如何使用cmake来编译包含.cpp文件的项目和.cu文件。目前,我正在使用一个只有两个文件的玩具示例:main.cpp和kernel.cu。文件是
main.cpp:
#include <stdlib.h>
#include <string.h>
extern void kernel_wrapper(int *a, int *b);
int main(int argc, char *argv[]){
int a = 2;
int b = 3;
printf("Input: a = %d, b = %dn",a,b);
kernel_wrapper(&a, &b);
printf("Ran: a = %d, b = %dn",a,b);
return 0;
}
kernel.cu:
//#include "cuPrintf.cu"
#include <stdio.h>
__global__ void kernel(int *a, int *b){
int tx = threadIdx.x;
// cuPrintf("tx = %dn", tx);
switch( tx ){
case 0:
*a = *a + 10;
break;
case 1:
*b = *b + 3;
break;
default:
break;
}
}
void kernel_wrapper(int *a, int *b){
// cudaPrintfInit();
//cuPrintf("Anything...?");
printf("Anything...?n");
int *d_1, *d_2;
dim3 threads( 2, 1 );
dim3 blocks( 1, 1 );
cudaMalloc( (void **)&d_1, sizeof(int) );
cudaMalloc( (void **)&d_2, sizeof(int) );
cudaMemcpy( d_1, a, sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( d_2, b, sizeof(int), cudaMemcpyHostToDevice );
kernel<<< blocks, threads >>>( d_1, d_2 );
cudaMemcpy( a, d_1, sizeof(int), cudaMemcpyDeviceToHost );
cudaMemcpy( b, d_2, sizeof(int), cudaMemcpyDeviceToHost );
printf("Output: a = %dn", a[0]);
cudaFree(d_1);
cudaFree(d_2);
// cudaPrintfDisplay(stdout, true);
// cudaPrintfEnd();
}
cmake文件的灵感来自这篇文章:cmake脚本CUDA 6.0与c++ 11
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -lpthread")
cmake_minimum_required(VERSION 3.2)
project( CUDAAndCP )
find_package(CUDA REQUIRED)
# For compilation ...
# Specify target & source files to compile it from
# Pass options to NVCC
set(
CUDA_NVCC_FLAGS
${CUDA_NVCC_FLAGS};
-O3 -gencode arch=compute_30,code=sm_30;
--std=c++11
)
cuda_add_library(kernel_obj kernel.cu)
cuda_add_executable(main main.cpp)
target_link_libraries(main ${CUDA_LIBRARIES})
target_link_libraries(main kernel_obj)
我确实编译了主二进制文件,但当我运行它时,结果是
Input: a = 2, b = 3
Anything...?
Output: a = 2
Ran: a = 2, b = 3
代替
Input: a = 2, b = 3
Anything...?
Output: a = 2
Ran: a = 12, b = 6
我通过运行
得到了正确的主二进制文件g++ -c main.cpp
nvcc -c kernel.cu
nvcc -o main main.o kernel.o
所以看起来cuda文件没有正确链接到主二进制文件。我真的不明白为什么,任何帮助都很感激!
我在ubuntu 14.04, cuda 7.5和cmake 3.2.0上运行这个程序
我认为代码生成有些棘手。我的平台和你的一样,我用的是GTX 980。
真正的问题是arch=compute_30,code=sm_30;
。如果将其改为52,则一切正常。
如果使用裸命令行构建,CUDA 7.5 nvcc
默认生成compute_20
和sm_20
代码。