为什么这个极其简单的 OpenGL 程序会占用 100% 的 CPU?



下面的 OpenGL 代码被刻意写成对 GPU 负载极重的形式,这会迫使 CPU 在 GPU 完成工作期间等待一段时间。具体来说,CPU 是在 glFinish() 调用处等待的,每帧有 99.87% 的时间都花在这里。该程序在我的系统(Windows 10,GTX 1070)上禁用垂直同步后以约 10 fps 的速度运行。这些都在意料之中,问题在于:CPU 本应处于等待状态,却莫名其妙地占用了 100% 的 CPU 时间,导致过热。

在 6 台使用 Intel GPU 的系统、4 台使用 AMD GPU 的系统和 5 台使用 NVIDIA GPU 的系统上测试后,只有使用 NVIDIA GPU 的系统出现该问题。到目前为止,我能得出的结论是:这个问题是 NVIDIA 与 OpenGL 组合所特有的。DirectX 应用程序不会出现这个问题;事实上,在 Firefox 中禁用 ANGLE 后运行一个让 GPU 满载的 WebGL 页面也能重现该问题(启用 ANGLE 时则不会发生)。

我用下面的命令编译:

C:\mingw64\bin\x86_64-w64-mingw32-gcc.exe %~dp0main.c -o %~dp0main.exe -static-libgcc -std=c11 -ggdb -O2 -Wall -BC:\mingw64\bin -LC:\mingw64\lib -IC:\mingw64\include -lgdi32 -lopengl32

最小示例代码(建议调整片段着色器中的循环次数,使帧率降到大约 10 fps,这样问题会更明显):

#include <windows.h>
#include <GL/gl.h>
/* Types and enum values this file declares for itself; presumably the
   <GL/gl.h> in use does not provide them — TODO confirm. */
typedef signed long long int GLsizeiptr; /* type of the size argument of the glBufferData pointer in main() */
typedef char GLchar;                     /* type of the name argument of the glGetAttribLocation pointer in main() */
#define GL_ARRAY_BUFFER     0x8892 /* passed to glBindBuffer / glBufferData */
#define GL_DYNAMIC_DRAW     0x88E8 /* passed to glBufferData as the usage argument */
#define GL_FRAGMENT_SHADER  0x8B30 /* passed to glCreateShader */
#define GL_VERTEX_SHADER    0x8B31 /* passed to glCreateShader */
/*
 * Window procedure for the demo window.
 *
 * Posts WM_QUIT when the window is destroyed, because the message loop in
 * main() only leaves its render loop when it sees WM_QUIT. Every other
 * message is forwarded unchanged to DefWindowProc.
 */
LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam) {
    if (msg == WM_DESTROY) {
        PostQuitMessage(0);
        return 0;
    }
    return DefWindowProc(hwnd, msg, wParam, lParam);
}
/*
 * Entry point of the minimal reproduction.
 *
 * Creates a borderless window with an OpenGL context, builds a shader program
 * whose fragment shader is deliberately expensive, uploads a full-screen quad
 * and renders it in an endless loop, calling glFinish() after every draw.
 *
 * Returns 0 when the window class cannot be registered or once WM_QUIT has
 * been retrieved from the message queue.
 */
int main() {
    HDC hdc;
    {
        WNDCLASS wc;
        memset(&wc, 0, sizeof(wc));
        wc.style         = CS_OWNDC;
        wc.lpfnWndProc   = WndProc;
        wc.lpszClassName = "gldemo";
        if (!RegisterClass(&wc)) return 0;
        HWND hwnd = CreateWindow("gldemo", "Demo", WS_POPUP, 0, 0, 1920/2, 1080/2, 0, 0, NULL, 0);
        hdc = GetDC(hwnd);

        // Only the flags field is non-zero; everything else is left for ChoosePixelFormat to pick.
        const PIXELFORMATDESCRIPTOR pfd = {0,0, PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
        SetPixelFormat(hdc, ChoosePixelFormat(hdc, &pfd), &pfd);
        wglMakeCurrent(hdc, wglCreateContext(hdc));

        ShowWindow(hwnd, SW_SHOW);
    }

    // Entry points resolved at run time; this must happen after wglMakeCurrent above.
    void (*glGenBuffers)(GLsizei, GLuint *) = (void*)wglGetProcAddress("glGenBuffers");
    void (*glBindBuffer)(GLenum, GLuint) = (void*)wglGetProcAddress("glBindBuffer");
    void (*glBufferData)(GLenum, GLsizeiptr, void *, GLenum) = (void*)wglGetProcAddress("glBufferData");
    GLuint (*glCreateShader)(GLuint) = (void*)wglGetProcAddress("glCreateShader");
    void (*glAttachShader)(GLuint, GLuint) = (void*)wglGetProcAddress("glAttachShader");
    void (*glCompileShader)(GLuint) = (void*)wglGetProcAddress("glCompileShader");
    void (*glShaderSource)(GLuint, GLuint, const char **, const GLint *) = (void*)wglGetProcAddress("glShaderSource");
    void (*glEnableVertexAttribArray)(GLuint) = (void*)wglGetProcAddress("glEnableVertexAttribArray");
    GLuint (*glGetAttribLocation)(GLuint, GLchar *) = (void*)wglGetProcAddress("glGetAttribLocation");
    void (*glVertexAttribPointer)(GLuint, GLint, GLenum, GLboolean, GLsizei, void *) = (void*)wglGetProcAddress("glVertexAttribPointer");
    GLuint (*glCreateProgram)() = (void*)wglGetProcAddress("glCreateProgram");
    void (*glLinkProgram)(GLuint) = (void*)wglGetProcAddress("glLinkProgram");
    void (*glUseProgram)(GLuint) = (void*)wglGetProcAddress("glUseProgram");

    // The #version directive has to end with a newline; without it the
    // directive and the first declaration end up on one line of shader source.
    const char *g_vertCode =
        "#version 420\n"
        "in vec3 vertexPosition;"
        "void main() {gl_Position = vec4(vertexPosition.xyz, 1.);}";

    const char *g_fragCode =
        "#version 420\n"
        "void main() {"
            "float res = 0.5;"
            "for (int t=0;t<58000;t++) {" // tweak this to make sure you're outputting ~10 fps. 58000 is ok for a gtx 1070.
                "res = fract(sin(dot(gl_FragCoord.xy+res, vec2(12.9898,78.233))) * 43758.5453);"
            "}"
            "gl_FragColor = vec4(vec3(res)*0.4, 1.0);"
        "}";

    GLuint prog = glCreateProgram();

    GLuint vertshader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertshader, 1, &g_vertCode, 0);
    glCompileShader(vertshader);
    glAttachShader(prog, vertshader);

    GLuint fragshader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragshader, 1, &g_fragCode, 0);
    glCompileShader(fragshader);
    glAttachShader(prog, fragshader);

    glLinkProgram(prog);
    glUseProgram(prog);

    GLuint attribVertexPosition = glGetAttribLocation(prog, "vertexPosition");

    // Four corners of clip space, drawn below as a triangle strip.
    float verts[4*3] = {1.0f, 1.0f, 0.0f,  -1.0f, 1.0f, 0.0f,  1.0f, -1.0f, 0.0f,  -1.0f, -1.0f, 0.0f};

    GLuint vboId;
    glGenBuffers(1, &vboId);
    glBindBuffer(GL_ARRAY_BUFFER, vboId);
    glBufferData(GL_ARRAY_BUFFER, 4*3*4, verts, GL_DYNAMIC_DRAW); // 4 vertices * 3 floats * 4 bytes
    glEnableVertexAttribArray(attribVertexPosition);
    glVertexAttribPointer(attribVertexPosition, 3, GL_FLOAT, 0, 12, (void*)0); // stride: 3 floats = 12 bytes

    for (;;) {
        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
        //__asm__("int $3");
        glFinish(); // Halts here with 100% cpu on my system.
        SwapBuffers(hdc);

        // Drain the message queue once per frame; WM_QUIT ends the render loop.
        MSG msg; char done = 0;
        while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE)) {
            if (msg.message == WM_QUIT) done = 1;
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        }
        if (done) break;
    }

    return 0;
}

这算不上真正的答案,所以我不会把它标记为答案,但目前我可以提供一个变通办法。

思路是:glFlush 函数保证 GPU 已经开始处理,因此理论上我们可以在 glFlush 之后、glFinish 之前插入一次等待,以减少 glFinish 占用 CPU 的时间。这只有在等待函数的分辨率达到 1 ms 或更精细时才可行。如果使用 Sleep 或 Windows API 中其他带可调超时的函数,可以在程序开头调用 timeBeginPeriod(1) 来设置该分辨率。为了确保等待不会过久(否则可能导致 GPU 空闲),下一帧的等待时间按如下公式计算:本帧等待所花费的时间,加上 glFinish 上花费的剩余时间,再减去 0.5 毫秒。

这样做的结果是:根据任务管理器,GPU 利用率保持在 100%,而 CPU 利用率只有在原始 glFinish 的耗时低于 4 毫秒时才开始明显上升;如果低于大约 1.5 毫秒,该机制会自动完全失效。在实际实现中,可以考虑把 SwapBuffers 调用也放入计时区域内,以便把可能存在的帧率限制机制纳入计算,并且可以考虑使用可等待计时器(Waitable Timer)代替 Sleep。

编译:

C:\mingw64\bin\x86_64-w64-mingw32-gcc.exe %~dp0main.c -o %~dp0main.exe -static-libgcc -std=c11 -ggdb -O2 -Wall -BC:\mingw64\bin -LC:\mingw64\lib -IC:\mingw64\include -lgdi32 -lopengl32 -lwinmm

更新代码:

#include <windows.h>
#include <stdio.h>
#include <GL/gl.h>
/* Types and enum values this file declares for itself; presumably the
   <GL/gl.h> in use does not provide them — TODO confirm. */
typedef signed long long int GLsizeiptr; /* type of the size argument of the glBufferData pointer in main() */
typedef char GLchar;                     /* type of the name argument of the glGetAttribLocation pointer in main() */
#define GL_ARRAY_BUFFER     0x8892 /* passed to glBindBuffer / glBufferData */
#define GL_DYNAMIC_DRAW     0x88E8 /* passed to glBufferData as the usage argument */
#define GL_FRAGMENT_SHADER  0x8B30 /* passed to glCreateShader */
#define GL_VERTEX_SHADER    0x8B31 /* passed to glCreateShader */
/* Filled in by QueryPerformanceFrequency() at the start of main(); used as the
   divisor in elapsedMicroseconds(). */
LARGE_INTEGER Frequency;
/*
 * Returns the time elapsed since StartingTime, in microseconds.
 *
 * StartingTime is a counter value previously obtained from
 * QueryPerformanceCounter(). The tick difference is scaled by 1000000 and
 * divided by the file-level Frequency, which must have been filled in
 * beforehand.
 */
unsigned long long int elapsedMicroseconds(LARGE_INTEGER StartingTime) {
    LARGE_INTEGER now;
    QueryPerformanceCounter(&now);

    const LONGLONG ticks = now.QuadPart - StartingTime.QuadPart;
    return (1000000 * ticks) / Frequency.QuadPart;
}
/*
 * Window procedure for the demo window.
 *
 * Posts WM_QUIT when the window is destroyed, because the message loop in
 * main() only leaves its render loop when it sees WM_QUIT. Every other
 * message is forwarded unchanged to DefWindowProc.
 */
LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam) {
    if (msg == WM_DESTROY) {
        PostQuitMessage(0);
        return 0;
    }
    return DefWindowProc(hwnd, msg, wParam, lParam);
}
/*
 * Entry point of the workaround version.
 *
 * Same setup as the reproduction (borderless window, OpenGL context, expensive
 * fragment shader, full-screen quad), but each frame sleeps between glFlush()
 * and glFinish(). The sleep length is derived from how long the previous
 * frame's sleep + glFinish() took, minus 0.5 ms.
 *
 * Returns 0 when the window class cannot be registered or once WM_QUIT has
 * been retrieved from the message queue. The timeBeginPeriod(1) request is
 * released with timeEndPeriod(1) on both paths.
 */
int main() {
    QueryPerformanceFrequency(&Frequency);
    timeBeginPeriod(1); // request 1 ms timer resolution so Sleep() is fine-grained enough

    HDC hdc;
    {
        WNDCLASS wc;
        memset(&wc, 0, sizeof(wc));
        wc.style         = CS_OWNDC;
        wc.lpfnWndProc   = WndProc;
        wc.lpszClassName = "gldemo";
        if (!RegisterClass(&wc)) {
            timeEndPeriod(1);
            return 0;
        }
        HWND hwnd = CreateWindow("gldemo", "Demo", WS_POPUP, 0, 0, 1920/2, 1080/2, 0, 0, NULL, 0);
        hdc = GetDC(hwnd);

        // Only the flags field is non-zero; everything else is left for ChoosePixelFormat to pick.
        const PIXELFORMATDESCRIPTOR pfd = {0,0, PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
        SetPixelFormat(hdc, ChoosePixelFormat(hdc, &pfd), &pfd);
        wglMakeCurrent(hdc, wglCreateContext(hdc));

        ShowWindow(hwnd, SW_SHOW);
    }

    // Entry points resolved at run time; this must happen after wglMakeCurrent above.
    void (*glGenBuffers)(GLsizei, GLuint *) = (void*)wglGetProcAddress("glGenBuffers");
    void (*glBindBuffer)(GLenum, GLuint) = (void*)wglGetProcAddress("glBindBuffer");
    void (*glBufferData)(GLenum, GLsizeiptr, void *, GLenum) = (void*)wglGetProcAddress("glBufferData");
    GLuint (*glCreateShader)(GLuint) = (void*)wglGetProcAddress("glCreateShader");
    void (*glAttachShader)(GLuint, GLuint) = (void*)wglGetProcAddress("glAttachShader");
    void (*glCompileShader)(GLuint) = (void*)wglGetProcAddress("glCompileShader");
    void (*glShaderSource)(GLuint, GLuint, const char **, const GLint *) = (void*)wglGetProcAddress("glShaderSource");
    void (*glEnableVertexAttribArray)(GLuint) = (void*)wglGetProcAddress("glEnableVertexAttribArray");
    GLuint (*glGetAttribLocation)(GLuint, GLchar *) = (void*)wglGetProcAddress("glGetAttribLocation");
    void (*glVertexAttribPointer)(GLuint, GLint, GLenum, GLboolean, GLsizei, void *) = (void*)wglGetProcAddress("glVertexAttribPointer");
    GLuint (*glCreateProgram)() = (void*)wglGetProcAddress("glCreateProgram");
    void (*glLinkProgram)(GLuint) = (void*)wglGetProcAddress("glLinkProgram");
    void (*glUseProgram)(GLuint) = (void*)wglGetProcAddress("glUseProgram");

    // The #version directive has to end with a newline; without it the
    // directive and the first declaration end up on one line of shader source.
    const char *g_vertCode =
        "#version 420\n"
        "in vec3 vertexPosition;"
        "void main() {gl_Position = vec4(vertexPosition.xyz, 1.);}";

    const char *g_fragCode =
        "#version 420\n"
        "void main() {"
            "float res = 0.5;"
            "for (int t=0;t<58000;t++) {" // tweak this to make sure you're outputting ~10 fps. 58000 is ok for a gtx 1070.
                "res = fract(sin(dot(gl_FragCoord.xy+res, vec2(12.9898,78.233))) * 43758.5453);"
            "}"
            "gl_FragColor = vec4(vec3(res)*0.4, 1.0);"
        "}";

    GLuint prog = glCreateProgram();

    GLuint vertshader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertshader, 1, &g_vertCode, 0);
    glCompileShader(vertshader);
    glAttachShader(prog, vertshader);

    GLuint fragshader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragshader, 1, &g_fragCode, 0);
    glCompileShader(fragshader);
    glAttachShader(prog, fragshader);

    glLinkProgram(prog);
    glUseProgram(prog);

    GLuint attribVertexPosition = glGetAttribLocation(prog, "vertexPosition");

    // Four corners of clip space, drawn below as a triangle strip.
    float verts[4*3] = {1.0f, 1.0f, 0.0f,  -1.0f, 1.0f, 0.0f,  1.0f, -1.0f, 0.0f,  -1.0f, -1.0f, 0.0f};

    GLuint vboId;
    glGenBuffers(1, &vboId);
    glBindBuffer(GL_ARRAY_BUFFER, vboId);
    glBufferData(GL_ARRAY_BUFFER, 4*3*4, verts, GL_DYNAMIC_DRAW); // 4 vertices * 3 floats * 4 bytes
    glEnableVertexAttribArray(attribVertexPosition);
    glVertexAttribPointer(attribVertexPosition, 3, GL_FLOAT, 0, 12, (void*)0); // stride: 3 floats = 12 bytes

    LARGE_INTEGER syncer;
    long long int waitfor = 0; // microseconds to sleep before glFinish() in the next frame

    for (;;) {
        //__asm__("int $3");

        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
        glFlush();

        // Timed region: sleep for the duration estimated from the previous
        // frame, then let glFinish() cover whatever GPU time is left.
        QueryPerformanceCounter(&syncer);
        if (waitfor > 0) Sleep((DWORD)(waitfor / 1000));
        glFinish();
        // Subtract in signed arithmetic: a region shorter than 500 us must
        // produce a negative value (no sleep next frame), not an unsigned wrap.
        waitfor = (long long int)elapsedMicroseconds(syncer) - 500;

        SwapBuffers(hdc);

        // Drain the message queue once per frame; WM_QUIT ends the render loop.
        MSG msg; char done = FALSE;
        while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE)) {
            if (msg.message == WM_QUIT) done = TRUE;
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        }
        if (done) break;
    }

    timeEndPeriod(1); // matches the timeBeginPeriod(1) at the top
    return 0;
}

最新更新