下面的 OpenGL 代码刻意让 GPU 负载极高,迫使 CPU 等待 GPU 完成工作。具体来说,它在 glFinish()
调用处等待,CPU 每帧约 99.87% 的时间都在等待。该程序在我的系统(Windows 10, GTX 1070)上以约 10 fps 运行,且已禁用垂直同步。这些本来都在意料之中——问题在于:CPU 本应处于等待状态,却莫名其妙地占用了 100% 的 CPU 时间,导致过热。
在 6 台使用 Intel GPU、4 台使用 AMD GPU 和 5 台使用 NVIDIA GPU 的系统上测试后,只有 NVIDIA GPU 的系统出现该问题。到目前为止我能得出的结论是:这个问题是 NVIDIA 和 OpenGL 特有的。DirectX 应用程序没有这个问题;事实上,在 Firefox 中禁用 ANGLE 后运行一个占满 GPU 的 WebGL 页面也能复现该问题(启用 ANGLE 时不会发生)。
我用下面的代码编译:
C:\mingw64\bin\x86_64-w64-mingw32-gcc.exe %~dp0main.c -o %~dp0main.exe -static-libgcc -std=c11 -ggdb -O2 -Wall -BC:\mingw64\bin -LC:\mingw64\lib -IC:\mingw64\include -lgdi32 -lopengl32
最少的代码(我建议调整片段着色器,使你击中大约10 fps,使问题更加明显):
#include <windows.h>
#include <GL/gl.h>
typedef signed long long int GLsizeiptr;
typedef char GLchar;
#define GL_ARRAY_BUFFER 0x8892
#define GL_DYNAMIC_DRAW 0x88E8
#define GL_FRAGMENT_SHADER 0x8B30
#define GL_VERTEX_SHADER 0x8B31
/* Window procedure for the demo window: no messages are handled here,
 * everything is delegated to the system default handler. */
LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)
{
    return DefWindowProc(hwnd, msg, wParam, lParam);
}
/*
 * Minimal repro: creates a borderless window with a legacy OpenGL context,
 * builds a deliberately expensive fragment shader, then renders a fullscreen
 * quad in a loop, blocking on glFinish() each frame to expose the NVIDIA
 * driver busy-wait. Returns 0 on normal shutdown (or if window-class
 * registration fails), 1 if the shader program failed to build.
 */
int main() {
    HDC hdc;
    {
        WNDCLASS wc;
        memset(&wc, 0, sizeof(wc));
        wc.style = CS_OWNDC;            /* one permanent DC per window, needed for wgl */
        wc.lpfnWndProc = WndProc;
        wc.lpszClassName = "gldemo";
        if (!RegisterClass(&wc)) return 0;
        HWND hwnd = CreateWindow("gldemo", "Demo", WS_POPUP, 0, 0, 1920/2, 1080/2, 0, 0, NULL, 0);
        hdc = GetDC(hwnd);
        /* Fix: nSize/nVersion were zero, which is formally invalid input to
         * ChoosePixelFormat; also request a window-drawable format. */
        const PIXELFORMATDESCRIPTOR pfd = {
            .nSize = sizeof(PIXELFORMATDESCRIPTOR),
            .nVersion = 1,
            .dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER,
        };
        SetPixelFormat(hdc, ChoosePixelFormat(hdc, &pfd), &pfd);
        wglMakeCurrent(hdc, wglCreateContext(hdc));
        ShowWindow(hwnd, SW_SHOW);
    }
    /* GL >1.1 entry points must be fetched at runtime on Windows. */
    void (*glGenBuffers)(GLsizei, GLuint *) = (void*)wglGetProcAddress("glGenBuffers");
    void (*glBindBuffer)(GLenum, GLuint) = (void*)wglGetProcAddress("glBindBuffer");
    void (*glBufferData)(GLenum, GLsizeiptr, void *, GLenum) = (void*)wglGetProcAddress("glBufferData");
    GLuint (*glCreateShader)(GLenum) = (void*)wglGetProcAddress("glCreateShader");
    void (*glAttachShader)(GLuint, GLuint) = (void*)wglGetProcAddress("glAttachShader");
    void (*glCompileShader)(GLuint) = (void*)wglGetProcAddress("glCompileShader");
    void (*glShaderSource)(GLuint, GLsizei, const char **, const GLint *) = (void*)wglGetProcAddress("glShaderSource");
    void (*glEnableVertexAttribArray)(GLuint) = (void*)wglGetProcAddress("glEnableVertexAttribArray");
    /* Fix: glGetAttribLocation returns GLint (-1 on failure), not GLuint. */
    GLint (*glGetAttribLocation)(GLuint, const GLchar *) = (void*)wglGetProcAddress("glGetAttribLocation");
    void (*glVertexAttribPointer)(GLuint, GLint, GLenum, GLboolean, GLsizei, void *) = (void*)wglGetProcAddress("glVertexAttribPointer");
    GLuint (*glCreateProgram)(void) = (void*)wglGetProcAddress("glCreateProgram");
    void (*glLinkProgram)(GLuint) = (void*)wglGetProcAddress("glLinkProgram");
    void (*glUseProgram)(GLuint) = (void*)wglGetProcAddress("glUseProgram");
    /* Fix: the "#version" directive must be terminated by a newline;
     * without "\n" the directive swallows the rest of the source and the
     * shader cannot compile. */
    const char *g_vertCode =
        "#version 420\n"
        "in vec3 vertexPosition;"
        "void main() {gl_Position = vec4(vertexPosition.xyz, 1.);}";
    const char *g_fragCode =
        "#version 420\n"
        "void main() {"
        "float res = 0.5;"
        "for (int t=0;t<58000;t++) {" /* tweak so you output ~10 fps; 58000 is ok for a GTX 1070 */
        "res = fract(sin(dot(gl_FragCoord.xy+res, vec2(12.9898,78.233))) * 43758.5453);"
        "}"
        "gl_FragColor = vec4(vec3(res)*0.4, 1.0);"
        "}";
    GLuint prog = glCreateProgram();
    GLuint vertshader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertshader, 1, &g_vertCode, 0);
    glCompileShader(vertshader);
    glAttachShader(prog, vertshader);
    GLuint fragshader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragshader, 1, &g_fragCode, 0);
    glCompileShader(fragshader);
    glAttachShader(prog, fragshader);
    glLinkProgram(prog);
    glUseProgram(prog);
    GLint attribVertexPosition = glGetAttribLocation(prog, "vertexPosition");
    if (attribVertexPosition < 0) return 1;   /* compile/link failed */
    /* Fullscreen quad as a 4-vertex triangle strip. */
    float verts[4*3] = {1.0f, 1.0f, 0.0f, -1.0f, 1.0f, 0.0f, 1.0f, -1.0f, 0.0f, -1.0f, -1.0f, 0.0f};
    GLuint vboId;
    glGenBuffers(1, &vboId);
    glBindBuffer(GL_ARRAY_BUFFER, vboId);
    glBufferData(GL_ARRAY_BUFFER, sizeof(verts), verts, GL_DYNAMIC_DRAW);
    glEnableVertexAttribArray(attribVertexPosition);
    glVertexAttribPointer(attribVertexPosition, 3, GL_FLOAT, 0, 3*sizeof(float), (void*)0);
    for (;;) {
        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
        //__asm__("int $3");
        glFinish(); /* Halts here with 100% cpu on my system. */
        SwapBuffers(hdc);
        MSG msg; char done = 0;
        while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE)) {
            if (msg.message == WM_QUIT) done = 1;
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        }
        if (done) break;
    }
    return 0;
}
这不算一个真正的答案(所以我不会把它标记为答案),但我现在可以提供一个变通方案。思路是:glFlush 函数保证 GPU 已经开始处理,因此理论上可以在 glFlush 之后、glFinish 之前插入一段等待,以减少 glFinish 占用 CPU 的时间。这只有在等待函数的分辨率不超过 1ms 时才可行;如果使用 Sleep 或 Windows API 中任何其他带可调超时的函数,可以在程序开始时调用 timeBeginPeriod(1) 来设置该分辨率。为了确保等待不会超调(超调会导致 GPU 空闲),下一帧等待时长的计算公式为:上一帧等待所花的时间,加上 glFinish 剩余耗时,再减去 0.5 毫秒。
最终效果是:根据任务管理器,GPU 保持 100% 占用;只有当纯 glFinish 的耗时低于约 4 毫秒时,CPU 占用才开始明显上升;低于约 1.5 毫秒时,该机制就完全失效了。在实际实现中,可以考虑把 SwapBuffers 调用也放进计时区域,以便把可能存在的帧率限制机制计入计算;还可以考虑用 Waitable Timer 代替 Sleep。
编译:
C:\mingw64\bin\x86_64-w64-mingw32-gcc.exe %~dp0main.c -o %~dp0main.exe -static-libgcc -std=c11 -ggdb -O2 -Wall -BC:\mingw64\bin -LC:\mingw64\lib -IC:\mingw64\include -lgdi32 -lopengl32 -lwinmm
更新代码:
#include <windows.h>
#include <stdio.h>
#include <GL/gl.h>
typedef signed long long int GLsizeiptr;
typedef char GLchar;
#define GL_ARRAY_BUFFER 0x8892
#define GL_DYNAMIC_DRAW 0x88E8
#define GL_FRAGMENT_SHADER 0x8B30
#define GL_VERTEX_SHADER 0x8B31
/* Ticks-per-second of the performance counter; set once in main(). */
LARGE_INTEGER Frequency;

/*
 * Returns the number of microseconds elapsed since StartingTime (a stamp
 * previously taken with QueryPerformanceCounter).
 * Fix: the original computed 1000000*(delta) before dividing, which can
 * overflow 64 bits for large tick deltas; converting whole seconds and the
 * remainder separately keeps every intermediate value in range.
 */
unsigned long long int elapsedMicroseconds(LARGE_INTEGER StartingTime) {
    LARGE_INTEGER EndingTime;
    QueryPerformanceCounter(&EndingTime);
    unsigned long long ticks = (unsigned long long)(EndingTime.QuadPart - StartingTime.QuadPart);
    unsigned long long freq  = (unsigned long long)Frequency.QuadPart;
    return (ticks / freq) * 1000000ULL + ((ticks % freq) * 1000000ULL) / freq;
}
/* Window procedure for the demo window: handles nothing itself and simply
 * forwards every message to DefWindowProc. */
LRESULT CALLBACK WndProc(HWND hwnd, UINT msg, WPARAM wParam, LPARAM lParam)
{
    return DefWindowProc(hwnd, msg, wParam, lParam);
}
/*
 * Workaround build: same GPU-heavy demo, but sleeps off most of the expected
 * GPU time between glFlush() and glFinish() so the driver's busy-wait inside
 * glFinish() burns as little CPU as possible. Requires timeBeginPeriod(1)
 * for ~1ms Sleep resolution (link with -lwinmm). Returns 0 on normal
 * shutdown (or if window-class registration fails), 1 if the GL program
 * failed to build.
 */
int main() {
    QueryPerformanceFrequency(&Frequency);
    timeBeginPeriod(1);   /* raise system timer resolution so Sleep(1) ~= 1ms */
    HDC hdc;
    {
        WNDCLASS wc;
        memset(&wc, 0, sizeof(wc));
        wc.style = CS_OWNDC;            /* one permanent DC per window, needed for wgl */
        wc.lpfnWndProc = WndProc;
        wc.lpszClassName = "gldemo";
        if (!RegisterClass(&wc)) return 0;
        HWND hwnd = CreateWindow("gldemo", "Demo", WS_POPUP, 0, 0, 1920/2, 1080/2, 0, 0, NULL, 0);
        hdc = GetDC(hwnd);
        /* Fix: nSize/nVersion were zero, which is formally invalid input to
         * ChoosePixelFormat; also request a window-drawable format. */
        const PIXELFORMATDESCRIPTOR pfd = {
            .nSize = sizeof(PIXELFORMATDESCRIPTOR),
            .nVersion = 1,
            .dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER,
        };
        SetPixelFormat(hdc, ChoosePixelFormat(hdc, &pfd), &pfd);
        wglMakeCurrent(hdc, wglCreateContext(hdc));
        ShowWindow(hwnd, SW_SHOW);
    }
    /* GL >1.1 entry points must be fetched at runtime on Windows. */
    void (*glGenBuffers)(GLsizei, GLuint *) = (void*)wglGetProcAddress("glGenBuffers");
    void (*glBindBuffer)(GLenum, GLuint) = (void*)wglGetProcAddress("glBindBuffer");
    void (*glBufferData)(GLenum, GLsizeiptr, void *, GLenum) = (void*)wglGetProcAddress("glBufferData");
    GLuint (*glCreateShader)(GLenum) = (void*)wglGetProcAddress("glCreateShader");
    void (*glAttachShader)(GLuint, GLuint) = (void*)wglGetProcAddress("glAttachShader");
    void (*glCompileShader)(GLuint) = (void*)wglGetProcAddress("glCompileShader");
    void (*glShaderSource)(GLuint, GLsizei, const char **, const GLint *) = (void*)wglGetProcAddress("glShaderSource");
    void (*glEnableVertexAttribArray)(GLuint) = (void*)wglGetProcAddress("glEnableVertexAttribArray");
    /* Fix: glGetAttribLocation returns GLint (-1 on failure), not GLuint. */
    GLint (*glGetAttribLocation)(GLuint, const GLchar *) = (void*)wglGetProcAddress("glGetAttribLocation");
    void (*glVertexAttribPointer)(GLuint, GLint, GLenum, GLboolean, GLsizei, void *) = (void*)wglGetProcAddress("glVertexAttribPointer");
    GLuint (*glCreateProgram)(void) = (void*)wglGetProcAddress("glCreateProgram");
    void (*glLinkProgram)(GLuint) = (void*)wglGetProcAddress("glLinkProgram");
    void (*glUseProgram)(GLuint) = (void*)wglGetProcAddress("glUseProgram");
    /* Fix: the "#version" directive must be terminated by a newline;
     * without "\n" the directive swallows the rest of the source and the
     * shader cannot compile. */
    const char *g_vertCode =
        "#version 420\n"
        "in vec3 vertexPosition;"
        "void main() {gl_Position = vec4(vertexPosition.xyz, 1.);}";
    const char *g_fragCode =
        "#version 420\n"
        "void main() {"
        "float res = 0.5;"
        "for (int t=0;t<58000;t++) {" /* tweak so you output ~10 fps; 58000 is ok for a GTX 1070 */
        "res = fract(sin(dot(gl_FragCoord.xy+res, vec2(12.9898,78.233))) * 43758.5453);"
        "}"
        "gl_FragColor = vec4(vec3(res)*0.4, 1.0);"
        "}";
    GLuint prog = glCreateProgram();
    GLuint vertshader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertshader, 1, &g_vertCode, 0);
    glCompileShader(vertshader);
    glAttachShader(prog, vertshader);
    GLuint fragshader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragshader, 1, &g_fragCode, 0);
    glCompileShader(fragshader);
    glAttachShader(prog, fragshader);
    glLinkProgram(prog);
    glUseProgram(prog);
    GLint attribVertexPosition = glGetAttribLocation(prog, "vertexPosition");
    if (attribVertexPosition < 0) return 1;   /* compile/link failed */
    /* Fullscreen quad as a 4-vertex triangle strip. */
    float verts[4*3] = {1.0f, 1.0f, 0.0f, -1.0f, 1.0f, 0.0f, 1.0f, -1.0f, 0.0f, -1.0f, -1.0f, 0.0f};
    GLuint vboId;
    glGenBuffers(1, &vboId);
    glBindBuffer(GL_ARRAY_BUFFER, vboId);
    glBufferData(GL_ARRAY_BUFFER, sizeof(verts), verts, GL_DYNAMIC_DRAW);
    glEnableVertexAttribArray(attribVertexPosition);
    glVertexAttribPointer(attribVertexPosition, 3, GL_FLOAT, 0, 3*sizeof(float), (void*)0);
    LARGE_INTEGER syncer;
    long long int waitfor = 0;   /* microseconds to sleep away before the next glFinish */
    for (;;) {
        //__asm__("int $3");
        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
        glFlush();               /* guarantees the GPU has started on this frame */
        QueryPerformanceCounter(&syncer);
        /* Sleep off most of the expected GPU time so glFinish busy-waits as
         * briefly as possible. Fix: Sleep takes a DWORD; cast explicitly. */
        if (waitfor > 0) Sleep((DWORD)(waitfor / 1000));
        glFinish();
        /* Next frame: wait (sleep + finish) time minus a 0.5ms safety margin
         * so the sleep never overshoots and leaves the GPU idle.
         * Fix: cast before subtracting to avoid unsigned wraparound when the
         * elapsed time is under 500us. */
        waitfor = (long long)elapsedMicroseconds(syncer) - 500;
        SwapBuffers(hdc);
        MSG msg; char done = FALSE;
        while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE)) {
            if (msg.message == WM_QUIT) done = TRUE;
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        }
        if (done) break;
    }
    timeEndPeriod(1);   /* pair with timeBeginPeriod(1) */
    return 0;
}