CUDA 无法使用所有可用的常量内存



我有一个代码,它使用协作组来执行一些操作。因此,我用编译代码

/usr/local/cuda/bin/nvcc -arch=sm_61 -gencode=arch=compute_61,code=sm_61, --device-c -g -O2 foo.cu

然后我尝试调用设备链接器:

/usr/local/cuda/bin/nvcc -arch=sm_61 -gencode=arch=compute_61,code=sm_61, -g -dlink foo.o

然后产生错误:

ptxas错误:文件使用了太多全局常量数据(0x10100字节,最大0x10000)

问题是由我分配常量内存的方式引起的:

__constant__ float d_cnst_centers[CONST_MEM / sizeof(float)];

其中CONST_MEM=65536字节,这是我从SM_61的设备查询中得到的。然而,如果我将恒定内存减少到64536,问题就消失了。这几乎就像常数记忆是";"保留";出于编译期间的某些目的。我搜索了CUDA的文档,但没有找到令人满意的答案。使用最大可用的恒定内存量是否安全?为什么会出现这个问题?

编辑:这是触发SM_61:错误的代码片段

#include <algorithm>
#include <vector>
#include <type_traits>
#include <cuda_runtime.h>
#include <cfloat>
#include <iostream>
#include <cooperative_groups.h>
using namespace cooperative_groups;

struct foo_params {
float * points;
float * centers;
int * centersDist;
int * centersIndex;
int numPoints;
};
__constant__ float d_cnst_centers[65536 / sizeof(float)];
template <int R, int C>
__device__ int 
nearestCenter(float * points, float * pC) {
float mindist = FLT_MAX;
int minidx = 0;
int clistidx = 0;
for(int i=0; i<C;i++) {
clistidx = i*R;
float dist;
{
float *point = points;
float *center = &pC[clistidx];
float accum;
for(int i = 0; i<R; i++) {
float delta = point[i] - center[i];
accum += delta*delta;
}
dist = sqrt(accum);
}
/* ... */
}
return minidx;
}

template<int R, int C, bool bRO, bool ROWMAJ=true>
__global__ void getNeatestCenter(struct foo_params params) {
float * points = params.points;
float * centers = params.centers;
int * centersDist = params.centersDist;
int * centersIndex = params.centersIndex;
int numPoints = params.numPoints;
grid_group grid = this_grid();
{
int idx = blockIdx.x*blockDim.x+threadIdx.x;
if (idx < numPoints) {
centersIndex[idx] = nearestCenter<R,C>(&points[idx*R], d_cnst_centers);
}
}
/* ... other code */
}
int main () {
// foo paramaters, for illustration purposes
struct foo_params param;
param.points = NULL;
param.centers = NULL;
param.centersDist = NULL;
param.centersIndex = NULL;
param.numPoints = 1000000;
void *p_params = &param;
int minGridSize = 0, blockSize = 0;
cudaOccupancyMaxPotentialBlockSize(
&minGridSize,
&blockSize,
(void*)getNeatestCenter<128, 64, true>,
0,
0);
dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
cudaLaunchCooperativeKernel((void *)getNeatestCenter<32, 32, true>, dimGrid, dimBlock, &p_params);
}

问题似乎是由以下线路引起的:

grid_group grid = this_grid();

其似乎在没有已知原因的情况下使用大约0x100字节的恒定存储器。

这个答案是推测性的,因为OP没有提供最小但完整的重编程代码。

GPU包含用于程序存储的不同部分的多个常量内存组。其中一个库供程序员使用。重要的是,CUDA标准数学库代码使用相同的库,因为数学库代码通过函数内联成为程序员代码的一部分。在过去,这是显而易见的,因为整个CUDA数学库最初只是几个头文件。

一些数学函数内部需要常量数据的小表。具体的例子是sincostan。当使用这些数学函数时,程序员可用的__constant__数据量从64KB减少了一小部分。以下是一些示例程序,用于演示,使用CUDA 8工具链和-arch=sm_61:编译

#include <stdio.h>
#include <stdlib.h>
#define CONST_MEM (65536)
__constant__ float d_cnst_centers[CONST_MEM / sizeof(float)] = {1};
__global__ void kernel (int i, float f)
{
float r = d_cnst_centers[i] * expf(f);
printf ("r=%15.8fn", r);
}
int main (void)
{
kernel<<<1,1>>>(0,25.0f);
cudaDeviceSynchronize();
return EXIT_SUCCESS;
}

这将进行精细编译,并在运行时打印r=72004902912.00000000。现在让我们将expf更改为sinf:

#include <stdio.h>
#include <stdlib.h>
#define CONST_MEM (65536)
__constant__ float d_cnst_centers[CONST_MEM / sizeof(float)] = {1};
__global__ void kernel (int i, float f)
{
float r = d_cnst_centers[i] * sinf(f);
printf ("r=%15.8fn", r);
}
int main (void)
{
kernel<<<1,1>>>(0,25.0f);
cudaDeviceSynchronize();
return EXIT_SUCCESS;
}

这会在编译过程中引发错误:ptxas error : File uses too much global constant data (0x10018 bytes, 0x10000 max)

如果我们使用双精度函数sin,则需要更多的恒定内存:

#include <stdio.h>
#include <stdlib.h>
#define CONST_MEM (65536)
__constant__ float d_cnst_centers[CONST_MEM / sizeof(float)] = {1};
__global__ void kernel (int i, float f)
{
float r = d_cnst_centers[i] * sin((double)f);
printf ("r=%15.8fn", r);
}
int main (void)
{
kernel<<<1,1>>>(0,25.0f);
cudaDeviceSynchronize();
return EXIT_SUCCESS;
}

我们收到错误消息:ptxas error : File uses too much global constant data (0x10110 bytes, 0x10000 max)

为了记录这个用例中到底发生了什么,我在编译过程中拼凑了以下工作。希望它能揭示这个问题是如何产生的,以及一些有用的诊断工具,同时消除一些误解。

请注意,这是一项正在进行的工作,随着更多信息的曝光,可能会定期更新。请根据您的意愿进行编辑和贡献

首先,正如注释中所指出的,完全可以分配每个字节的恒定内存,直到64kb的限制。这个例子与最初的问题中描述的用例非常相似

const int sz = 65536;
const int NMax = sz / sizeof(float);
__constant__ float buffer[NMax];
__global__ 
void akernel(const float* __restrict__ arg1, float* __restrict__ arg2, int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float ans = 0;
#pragma unroll 128
for(int i=0; i<NMax; i++)  {
float val = buffer[i];
float y = (i%2 == 0) ? 1.f : -1.f;
float x = val / 255.f;
ans = ans + y * sinf(x);
}
arg2[tid] = ans + arg1[tid];
}
}

并且它编译起来没有问题(此处为Godbolt链接)。这证明了问题中的链接器阶段必须从其他代码中引入额外的常量内存分配,无论是用户代码、其他设备库还是设备运行时支持。

因此,让我们将注意力转向更新问题中发布的repo案例,经过适度修改,它将通过略微减少恒定内存占用来通过编译和链接阶段,缓冲区为64536字节:

$ nvcc -arch=sm_61 --device-c -g -O2 -Xptxas="-v" -o constmemuse.cu.o constmemuse.cu 
constmemuse.cu(51): warning: variable "centers" was declared but never referenced
constmemuse.cu(52): warning: variable "centersDist" was declared but never referenced
constmemuse.cu(31): warning: variable "dist" was set but never used
detected during instantiation of "void getNeatestCenter<R,C,bRO,ROWMAJ>(foo_params) [with R=128, C=64, bRO=true, ROWMAJ=true]" 
constmemuse.cu(26): warning: variable "mindist" was declared but never referenced
detected during instantiation of "void getNeatestCenter<R,C,bRO,ROWMAJ>(foo_params) [with R=128, C=64, bRO=true, ROWMAJ=true]" 
ptxas info    : 0 bytes gmem, 64536 bytes cmem[3]
ptxas info    : Function properties for cudaDeviceGetAttribute
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Compiling entry function '_Z16getNeatestCenterILi128ELi64ELb1ELb1EEv10foo_params' for 'sm_61'
ptxas info    : Function properties for _Z16getNeatestCenterILi128ELi64ELb1ELb1EEv10foo_params
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 5 registers, 360 bytes cmem[0]
ptxas info    : Function properties for cudaMalloc
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessor
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Function properties for cudaGetDevice
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Compiling entry function '_Z16getNeatestCenterILi32ELi32ELb1ELb1EEv10foo_params' for 'sm_61'
ptxas info    : Function properties for _Z16getNeatestCenterILi32ELi32ELb1ELb1EEv10foo_params
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 5 registers, 360 bytes cmem[0]
ptxas info    : Function properties for cudaFuncGetAttributes
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Function properties for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads

几点:

  • 64536 bytes cmem[3]显示了我们指定的用户可控常量内存组的大小
  • ptxas info : Used 5 registers, 360 bytes cmem[0]显示了函数的寄存器用法,cmem[0]是内部保留的常量内存组,用于保存内核参数和编译器放入常量内存的任何其他内容。请注意,寄存器溢出会进入本地内存,而不是常量内存

所以现在让我们运行设备链接步骤:

$ nvcc -arch=sm_61 -gencode=arch=compute_61,code=sm_61, -g -dlink -Xnvlink="-v" -o constmemuse.o constmemuse.cu.o
nvlink info    : 9944 bytes gmem, 64792 bytes cmem[3] (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 10 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 10 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 10 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 10 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 20 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 23 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 28 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 23 registers, 0 stack, 2056 bytes smem, 448 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 10 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 10 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 10 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 10 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 12 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 17 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 14 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 4 bytes cmem[2], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_': (target: sm_61)
nvlink info    : used 16 registers, 0 stack, 2056 bytes smem, 416 bytes cmem[0], 4 bytes cmem[2], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 6 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 6 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 6 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 6 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 16 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 14 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 17 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 6 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 6 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 6 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 6 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 8 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 11 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 12 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 4 bytes cmem[2], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 11 registers, 0 stack, 0 bytes smem, 400 bytes cmem[0], 4 bytes cmem[2], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '__nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_': (target: sm_61)
nvlink info    : used 21 registers, 0 stack, 0 bytes smem, 424 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '_Z16getNeatestCenterILi32ELi32ELb1ELb1EEv10foo_params': (target: sm_61)
nvlink info    : used 6 registers, 0 stack, 0 bytes smem, 360 bytes cmem[0], 0 bytes lmem (target: sm_61)
nvlink info    : Function properties for '_Z16getNeatestCenterILi128ELi64ELb1ELb1EEv10foo_params': (target: sm_61)
nvlink info    : used 6 registers, 0 stack, 0 bytes smem, 360 bytes cmem[0], 0 bytes lmem (target: sm_61)

更多备注:

  • 9944 bytes gmem, 64792 bytes cmem[3]现在显示链接模块的全局和恒定内存预留。正如您所看到的,我们在常量库0中继承了256个额外的字节,这是用户可修改的库,加上9944个静态保留的全局内存。如果数组分配为65536字节(如问题所述),则链接将失败,因为它超过了64kb的限制
  • 您可以看到,在链接阶段,许多设备运行库函数(memcpy和memset)已经被自动链接

很明显,连接设备运行时的额外恒定内存使用量,可以通过cuobjdumppost-hoc来确认。编译的对象:

$ cuobjdump -res-usage constmemuse.cu.o
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
Resource usage:
Common:
GLOBAL:0 CONSTANT[3]:64536
Function cudaDeviceGetAttribute:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Function _Z16getNeatestCenterILi128ELi64ELb1ELb1EEv10foo_params:
REG:5 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:360 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaMalloc:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaOccupancyMaxActiveBlocksPerMultiprocessor:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaGetDevice:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Function _Z16getNeatestCenterILi32ELi32ELb1ELb1EEv10foo_params:
REG:5 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:360 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaFuncGetAttributes:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags:
REG:5 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0
Fatbin ptx code:
================
arch = sm_61
code version = [6,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
ptxasOptions = -v --compile-only  

以及链接后的对象:

$ cuobjdump -res-usage constmemuse.o
Fatbin elf code:
================
arch = sm_61
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
Resource usage:
Common:
GLOBAL:9944 CONSTANT[3]:64792
Function _Z16getNeatestCenterILi128ELi64ELb1ELb1EEv10foo_params:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:360 TEXTURE:0 SURFACE:0 SAMPLER:0
Function _Z16getNeatestCenterILi32ELi32ELb1ELb1EEv10foo_params:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:360 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:21 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:11 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 CONSTANT[2]:4 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:12 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 CONSTANT[2]:4 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:11 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi0ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:8 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceIjLi1ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:400 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:17 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:14 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi0ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:16 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi0ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi0ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi1ELi0EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memset_3d_deviceImLi1ELi1ELi1EEvPhhjT_S1_S1_S1_S1_jjjjjjjS1_S0_:
REG:6 STACK:0 SHARED:0 LOCAL:0 CONSTANT[0]:424 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:16 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 CONSTANT[2]:4 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:14 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 CONSTANT[2]:4 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:17 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi0ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:12 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceIjLi1ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:416 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:23 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:28 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:23 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi0ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:20 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi0ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi0ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi1ELi0EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function __nv_static_51__38_cuda_device_runtime_compute_75_cpp1_ii_8b1a5d37__Z16memcpy_3d_deviceImLi1ELi1ELi1EEvPKhPhT_S3_S3_S3_S3_S3_S3_jjjjjjjjS3_S1_S2_:
REG:10 STACK:0 SHARED:2056 LOCAL:0 CONSTANT[0]:448 TEXTURE:0 SURFACE:0 SAMPLER:0
Function cudaCGGetIntrinsicHandle:
REG:6 STACK:0 SHARED:0 LOCAL:0 TEXTURE:0 SURFACE:0 SAMPLER:0

在公认的答案中已经证明,数学库可以为一些三角函数和超越函数的系数和查找表保留常量内存。然而,在这种情况下,原因似乎是在内核中使用协作组所发出的支持样板。进一步深入研究额外的bank 0常量内存的确切来源需要对该代码进行反汇编和逆向工程,我现在不打算这样做。

最新更新