如何在不将指针声明为易失性的情况下强制执行 CUDA 全局内存一致性

我先做一些情境化。我正在尝试使用 CUDA 中的 deques 实现一种非阻塞工作窃取方法。deques（aDeques）位于全局内存中的块分段数组中，popWork（）设备函数的目标是弹出工作以馈送线程。除了全局 deques 之外，每个块在共享内存（aLocalStack）中都有一个堆栈，它可以在本地工作。流行音乐分为 3 个级别。第一次尝试是在共享堆栈中，第二次尝试是在块拥有的双端，第三次尝试是工作窃取其他双端。每个 deque 都有全局底部和弹出指针，它们位于全局内存数组（aiDequesBottoms 和 auiDequesAges）中。我的问题是，当我在 GTS450 中测试代码时，当一个块更改全局双端指针时，其他块看不到这些更改。似乎缓存没有更新。我还在GT520卡中进行了测试，其中没有出现问题。我在aiDequeFlags数组中遇到过类似的问题。这些问题通过声明其易失性来解决。不幸的是，我不能对十进制指针数组做同样的事情，因为我稍后需要在它们上使用原子函数。很抱歉没有将问题放在更简单的示例中，但我无法重现此行为。第一个片段解释了 popWork（）接口。

template <int iDequeSize> //Size of each segment in aDeques 
bool __inline__ __device__ popWork(
    volatile int *aiDequeFlags , //Flags that indicates if a deque is active (has work)
    int *aiDequesBottoms , //Deque bottom pointers
    unsigned int *auiDequesAges , //Deque top pointers (29 higher bits) + 
                                  //Tag bits(3 lower bits).
    const Int2Array *aDeques , //Deques (Int2Array is an interface for 2 int arrays)
    int &uiStackBot , //Shared memory stack pointer
    int2 *aLocalStack , //Shared memory local stack
    const int &iTid , //threadIdx.x
    const int &iBid , //blockIdx.x
    //All other parameters are output
unsigned int &uiPopDequeIdx , //Choosen deque for pop
    int2 *popStartIdxAndSize , //Arrays of pop start index and sizes
    bool *bPopFlag , //Array of flags for pop in each level
unsigned int &uiActiveDequesIdx , //Flag to indicate pop failed (no more work)
    int2 &work //Actual acquired thread work)

第二个代码段具有整个函数。使用该函数的内核以 8 个块、64 个线程启动，开始时只有 deque 0 有 1 个工作，而所有其他 deques 都是空的。有一些调试 printf 调用来生成日志，该日志将显示在下一个代码片段中。

template <int iDequeSize>
bool __inline__ __device__ popWork(volatile int *aiDequeFlags , int *aiDequesBottoms , unsigned int *auiDequesAges ,
const Int2Array *aDeques , int &uiStackBot , int2 *aLocalStack , const int &iTid , const int &iBid ,
unsigned int &uiPopDequeIdx , int2 *popStartIdxAndSize , bool *bPopFlag , unsigned int &uiActiveDequesIdx , int2 &work)
{
//Pop from local stack
if(iTid == 0)
{
    unsigned int uiAge = 0;
    bPopFlag[0] = popBottom(uiStackBot , uiAge , popStartIdxAndSize[iBid]); 
    bPopFlag[3] = bPopFlag[0];
}
__syncthreads();
if(bPopFlag[0])
{
    if(iTid < popStartIdxAndSize[iBid].y)
    {
        work = aLocalStack[popStartIdxAndSize[iBid].x + iTid];
    }
}
else
{
    if(iTid == 0)
    {   //Try to pop from block deque
        bPopFlag[1] = popBottom(aiDequesBottoms[iBid] , auiDequesAges[iBid] , popStartIdxAndSize[iBid]);
        if(bPopFlag[1])
        {
            uiPopDequeIdx = iBid;
            //Debug
            if(iBid == 0)
            {
                printf("Block %d pop global deque. Bottom=%dn" , iBid , aiDequesBottoms[iBid]);
            }
            //
        }
        else
        {
            aiDequeFlags[iBid] = 0;
            popStartIdxAndSize[iBid].x = INFTY;
            uiPopDequeIdx = INFTY;
        }
        bPopFlag[3] = bPopFlag[1];
        bPopFlag[2] = false;
    }
    __syncthreads();
    if(!bPopFlag[1])
    {
        //Verify if lazy steal can be done.
        if(iTid < NDEQUES)
        {
            if(popStartIdxAndSize[iTid].x != INFTY && iTid != iBid)
            {
                atomicMin(&uiPopDequeIdx , iTid);
                bPopFlag[2] = true;
                bPopFlag[3] = true;
            }
        }
        __syncthreads();
        if(iTid == uiPopDequeIdx)
        {
            popStartIdxAndSize[iBid] = popStartIdxAndSize[iTid];
            popStartIdxAndSize[iTid].x = INFTY;
        }
        while(!bPopFlag[3])
        {   //No more work, try to steal some!
            __syncthreads();
            if(iTid == 0)
            {
                uiActiveDequesIdx = 0;
            }
            __syncthreads();
            if(iTid < NDEQUES)
            {
                if(aiDequeFlags[iTid] == 1)
                {
                    uiActiveDequesIdx = 1;
                    //Debug
                    printf("Block %d steal attempt on block %d. Victim bottom=%dn" , blockIdx.x , threadIdx.x , aiDequesBottoms[iTid]);
                    //
                    if(popTop(aiDequesBottoms , auiDequesAges , iTid , popStartIdxAndSize[iTid]))
                    {
                        aiDequeFlags[iBid] = 1;
                        atomicMin(&uiPopDequeIdx , iTid);
                        bPopFlag[3] = true;
                        //Debug
                        //printf("%d ss %d %d %dn" , iBid , iTid , popStartIdxAndSize[iTid].x , popStartIdxAndSize[iTid].y);
                        //
                    }
                }
            }
            __syncthreads();
            if(uiActiveDequesIdx == 0)
            { //No more work to steal. End.
                break;
            }
            if(iTid == uiPopDequeIdx)
            {
                popStartIdxAndSize[iBid] = popStartIdxAndSize[iTid];
                popStartIdxAndSize[iTid].x = INFTY;
            }
            __syncthreads();
        }
    }
    __syncthreads();
    if(bPopFlag[3] && iTid < popStartIdxAndSize[iBid].y) //assuming number of threads >= WORK_SIZE
    {
        aDeques->getElement(work , uiPopDequeIdx*iDequeSize + popStartIdxAndSize[iBid].x + iTid);
    }
}
return bPopFlag[3];

}

最后一个代码段是生成的日志。推送线（"块 X 推送。底部=Y"）是由此处未显示的推送函数生成的。请记住，在开始时，只有块 0 有 1 个工作。

Block 0 pop global deque. Bottom=0
Block 4 steal attempt on block 0. Victim bottom=0
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 7 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 4 steal attempt on block 0. Victim bottom=0
Block 7 steal attempt on block 0. Victim bottom=1
Block 0 push. Bottom=448
Block 2 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 6 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 0. Victim bottom=1
Block 3 steal attempt on block 0. Victim bottom=1
Block 7 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 0. Victim bottom=1
Block 2 steal attempt on block 4. Victim bottom=0
Block 1 steal attempt on block 0. Victim bottom=1
Block 1 steal attempt on block 4. Victim bottom=0
Block 5 steal attempt on block 0. Victim bottom=1
Block 5 steal attempt on block 4. Victim bottom=0
Block 4 push. Bottom=384

可以看出，只有块 4 可以看到块 0 deque 底部指针的变化。我尝试在指针发生任何更改后添加一些 __threadfence（）调用，但没有成功。感谢您的关注！

根据评论，似乎唯一有效的解决方案是关闭 L1 缓存。这可以通过在编译时将以下开关传递给 nvcc 在程序范围内完成：

–Xptxas –dlcm=cg

L1 缓存是 SM 的属性/资源，而不是整个设备。由于线程块在特定 SM 上执行，因此其 L1 缓存中一个线程块的活动可能与另一个线程块及其 L1 缓存的活动不一致（假设它恰好在不同的 SM 上运行），即使它们都引用全局内存中的相同位置。不同SM中的L1缓存彼此之间没有连接，不能保证彼此一致。

请注意，L2 缓存是设备范围的，因此从各个线程块的角度来看是"一致的"。关闭 L1 缓存对 L2 缓存没有影响，因此仍有可能获得一些缓存好处，但是满足 L2 请求所需的时间比满足 L1 请求所需的时间长，因此关闭 L1 缓存程序范围是一个相当大的锤子，试图让事情正常工作。

变量定义前面的 volatile 关键字应该具有告诉编译器在加载时跳过 L1 缓存的效果（根据我的理解）。但是易失性本身不会解决写入路径，因此一个 SM 中的一个线程块可以执行volatile读取，从 L2 中提取一个值，修改该值，然后将其写回，最终在 L1 中结束（直到它被逐出）。如果另一个线程块读取相同的全局值，则可能看不到更新的效果。

勤奋地使用 __threadfence（）虽然很乏味，但应该强制任何此类更新从 L1 到 L2，以便其他线程块可以读取它们。但是，这仍然会留下从写入值到其他SM/线程块可观察到的同步间隙。

（全局）原子还应该具有直接进入"全局内存"来读取和写入所用值的效果。

还应该遍历代码，以确保正确处理从全局同步位置的每个可能的读取（例如，使用 volatile 或使用原子学），并且正确处理对全局同步位置的每个可能的写入（例如，使用 __threadfence() 或原子），并检查不同块之间的竞争条件。

正如所发现的，在GPU中创建稳定的全局同步环境的过程并非易事。这些其他问题也可能引起人们的兴趣（例如关于开普勒）（以及例如讨论全局信号量）。

编辑：为了回答评论中发布的问题，我会这样说：

也许没有问题。但是__threadfence()不保证（我知道）最长完成时间。因此，在对全局位置进行更新时，仅更新与执行线程块/SM 关联的 L1。然后我们击中了__threadfence(). 据推测，threadfence 需要一些时间才能完成，在此期间，另一个线程块可能驻留在同一个 SM 上，被引入执行（而前一个线程/warp/块在线程围栏处停滞），并在与该 SM 关联的（本地）L1 中"看到"更新的全局值。在其他 SM 中执行的其他线程块将看到"过时"值，直到__threadfence()完成。这就是我所说的可能的"同步差距"。两个不同的块仍然可以在短时间内看到两个不同的值。这是否重要将取决于全局值如何用于块之间的同步（因为这是正在讨论的主题）。因此，原子 + 易失性可能是比易失性 + 线程围栏更好的选择，以尝试覆盖同步的读取和写入路径。

编辑#2：从评论来看，使用原子加volatile的组合也解决了这个问题。

坦率地说，我发现您的代码过于复杂，而且 - 更重要的是 - 不完整。popBottom和popTop如何运作？此外，push操作是如何实施的？必须仔细设计这两个，以便正常工作并确保不会发生某些同步问题。

例如：当一个块试图将某些东西推送到其全局内存队列，而另一个块试图在同一时刻从中读取时，会发生什么？这非常重要，如果做得不好，它可能会在某些非常罕见的情况下崩溃，例如，您可能会从尚未写入的数据单元格中弹出。

当我实现类似的东西时 - 在所有块之间共享的单个全局内存双倍，我另外将每个数据单元标记为：空，占用和死。在伪代码中，算法或多或少是这样工作的：

/* Objects of this class should reside in CUDA global memory */
template <typename T, size_t size>
class WorkQueue {
private:
    size_t head, tail;
    size_t status[size];
    T data[size];
    enum {
        FieldFree = 0,
        FieldDead = 1,
        FieldTaken = 2
    };      
public:
    /* 
       This construction should actually be done by host on the device,
       before the actual kernel using it is launched!
       Zeroing the memory should suffice.
    */
    WorkQueue() : head(0), tail(0) {
        for (size_t i=0; i<size; ++i)
            status[i]=FieldFree;
    }   
    __device__ bool isEmpty() { return head==tail; }
    /* single thread of a block should call this */
    __device__ bool push(const T& val) {
        size_t oldFieldStatus;
        do {
            size_t cell = atomicInc(&tail,size-1);
            data[cell]=val;
            __threadfence(); //wait untill all blocks see the above change
            oldFieldStatus=atomicCAS(&status[cell],FieldFree,FieldTaken); //mark the cell as occupied
        } while (oldFieldStatus!=FieldFree); 
        return true;
    }
    /* single thread of a block should call this */
    __device__ bool pop(T& out) {
        size_t cellStatus;
        size_t cell;
        do {
            cell=atomicInc(&head,size-1);
            cellStatus=atomicCAS(&status[cell],FieldFree,FieldDead);
            //If cell was free, make it dead - any data stored there will not be processed. Ever.
        } while (cellStatus==FieldDead);
        if (cellStatus!=FieldTaken)
            return false;
        out = data[cell];
        status[cell]=FieldFree;
        return true;
    }
};

如果没有单元状态，我没有看到实现它的可靠方法 - 否则，如果来自两个不同块的两个线程尝试将/弹出到同一个取消队列单元中，就会发生不好的事情。使用上述方法，可能发生最坏的情况，弹出线程将无法弹出，返回 false 并将单元格标记为 dead ，并且推送线程将重试推送到下一个单元格。背后的想法是，如果弹出线程无法弹出，那么无论如何可能没有太多工作要做，并且块可以终止。使用这种方法，您将只"杀死"与并行运行的块一样多的单元格。

请注意，在上面的代码中，我不检查溢出！

相关内容

最新更新

热门标签：