Optimize CUDA support to avoid roundtrip to CPU memory

2026-06-16 05:30:58 +00:00 · 2021-12-06 18:22:39 -06:00
parent f0c292f508
commit 3e9aea1f7a
5 changed files with 141 additions and 8 deletions
@@ -88,6 +88,11 @@ unix:!macx {
            PKGCONFIG += libdrm
            CONFIG += libdrm
        }
+
+        # The CUDA renderer includes headers from ffnvcodec, so only turn on
+        # the cuda CONFIG option when pkg-config can find that package.
+        packagesExist(ffnvcodec) {
+            PKGCONFIG += ffnvcodec
+            CONFIG += cuda
+        }
    }

    packagesExist(wayland-client) {
@@ -196,7 +201,6 @@ ffmpeg {
    SOURCES += \
        streaming/video/ffmpeg.cpp \
        streaming/video/ffmpeg-renderers/sdlvid.cpp \
-        streaming/video/ffmpeg-renderers/cuda.cpp \
        streaming/video/ffmpeg-renderers/pacer/pacer.cpp \
        streaming/video/ffmpeg-renderers/pacer/nullthreadedvsyncsource.cpp

@@ -204,7 +208,6 @@ ffmpeg {
        streaming/video/ffmpeg.h \
        streaming/video/ffmpeg-renderers/renderer.h \
        streaming/video/ffmpeg-renderers/sdlvid.h \
-        streaming/video/ffmpeg-renderers/cuda.h \
        streaming/video/ffmpeg-renderers/pacer/pacer.h \
        streaming/video/ffmpeg-renderers/pacer/nullthreadedvsyncsource.h
 }
@@ -261,6 +264,13 @@ libdrm {
        LIBS += -ldl
    }
 }
+# Applied only when the cuda CONFIG option is set: compiles the CUDA renderer
+# and defines HAVE_CUDA so C++ code can guard its use of that renderer.
+cuda {
+    message(CUDA support enabled)
+
+    DEFINES += HAVE_CUDA
+    SOURCES += streaming/video/ffmpeg-renderers/cuda.cpp
+    HEADERS += streaming/video/ffmpeg-renderers/cuda.h
+}
 config_EGL {
    message(EGL renderer selected)

@@ -1,5 +1,13 @@
 #include "cuda.h"

+#include <ffnvcodec/dynlink_loader.h>
+
+#include <SDL_opengl.h>
+
+extern "C" {
+    #include <libavutil/hwcontext_cuda.h>
+}
+
 CUDARenderer::CUDARenderer()
    : m_HwContext(nullptr)
 {
@@ -55,3 +63,96 @@ bool CUDARenderer::isDirectRenderingSupported()
    return false;
 }

+// Copies a single plane of a CUDA frame into the GL texture that is bound to
+// GL_TEXTURE_2D on texture unit GL_TEXTURE0 + plane.
+//
+// funcs  - loaded CUDA driver entry points
+// devCtx - FFmpeg's CUDA device context; its stream is used for map/unmap
+// frame  - the CUDA frame; data[plane]/linesize[plane] are the copy source
+// plane  - plane index, which also selects the texture unit
+//
+// The caller must have already made devCtx->cuda_ctx current. Returns true
+// only if the copy succeeded and the texture was unmapped again afterwards.
+static bool copyCudaPlaneToBoundTexture(CudaFunctions* funcs, AVCUDADeviceContext* devCtx,
+                                        AVFrame* frame, int plane)
+{
+    CUgraphicsResource cudaResource;
+    CUarray cudaArray;
+    CUresult err;
+    GLint tex = 0;
+    bool ret = false;
+
+    // Get the ID of this plane's texture
+    glActiveTexture(GL_TEXTURE0 + plane);
+    glGetIntegerv(GL_TEXTURE_BINDING_2D, &tex);
+
+    // Register it with CUDA
+    err = funcs->cuGraphicsGLRegisterImage(&cudaResource, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD);
+    if (err != CUDA_SUCCESS) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsGLRegisterImage() failed: %d", err);
+        return false;
+    }
+
+    // Map it to allow us to use it as a copy destination
+    err = funcs->cuGraphicsMapResources(1, &cudaResource, devCtx->stream);
+    if (err != CUDA_SUCCESS) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsMapResources() failed: %d", err);
+    }
+    else {
+        // Get a pointer to the mapped array
+        err = funcs->cuGraphicsSubResourceGetMappedArray(&cudaArray, cudaResource, 0, 0);
+        if (err != CUDA_SUCCESS) {
+            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsSubResourceGetMappedArray() failed: %d", err);
+        }
+        else {
+            // Zero-initialize and assign members individually rather than using
+            // designated initializers, which are not standard C++ before C++20.
+            // dstPitch is left unset because it is ignored for array destinations.
+            CUDA_MEMCPY2D cu2d = {};
+            cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+            cu2d.srcDevice = (CUdeviceptr)frame->data[plane];
+            cu2d.srcPitch = (size_t)frame->linesize[plane];
+            cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+            cu2d.dstArray = cudaArray;
+            cu2d.WidthInBytes = (size_t)frame->width;
+            cu2d.Height = (size_t)frame->height >> plane;
+
+            // Do the copy
+            err = funcs->cuMemcpy2D(&cu2d);
+            if (err != CUDA_SUCCESS) {
+                SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuMemcpy2D() failed: %d", err);
+            }
+            else {
+                ret = true;
+            }
+        }
+
+        // The texture must be unmapped before GL uses it again, so treat a
+        // failure here as a failed copy.
+        err = funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
+        if (err != CUDA_SUCCESS) {
+            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsUnmapResources() failed: %d", err);
+            ret = false;
+        }
+    }
+
+    // Single cleanup point for every path that got past registration
+    err = funcs->cuGraphicsUnregisterResource(cudaResource);
+    if (err != CUDA_SUCCESS) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsUnregisterResource() failed: %d", err);
+    }
+
+    return ret;
+}
+
+// Copies both NV12 planes of a CUDA frame into the GL textures currently bound
+// to GL_TEXTURE_2D on texture units 0 and 1. The caller is responsible for
+// binding the destination textures before calling this function.
+//
+// frame - frame whose format is AV_PIX_FMT_CUDA
+//
+// Returns true if both planes were copied, or false (after logging the cause)
+// if the CUDA library could not be loaded or any CUDA call failed.
+bool CUDARenderer::copyCudaFrameToBoundTexture(AVFrame* frame)
+{
+    static CudaFunctions* funcs;
+    CUresult err;
+    CUcontext dummy;
+    bool ret = true;
+
+    SDL_assert(frame->format == AV_PIX_FMT_CUDA);
+
+    // Validate the frame before digging the device context out of it
+    if (frame->hw_frames_ctx == nullptr) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "CUDA frame is missing its hardware frames context");
+        return false;
+    }
+
+    AVCUDADeviceContext* devCtx = (AVCUDADeviceContext*)(((AVHWFramesContext*)frame->hw_frames_ctx->data)->device_ctx->hwctx);
+
+    if (!funcs) {
+        // One-time init of CUDA library
+        cuda_load_functions(&funcs, nullptr);
+        if (!funcs) {
+            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Failed to initialize CUDA library");
+            return false;
+        }
+    }
+
+    // Push FFmpeg's CUDA context to use for our CUDA operations
+    err = funcs->cuCtxPushCurrent(devCtx->cuda_ctx);
+    if (err != CUDA_SUCCESS) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuCtxPushCurrent() failed: %d", err);
+        return false;
+    }
+
+    // NV12 has 2 planes
+    for (int i = 0; i < 2; i++) {
+        if (!copyCudaPlaneToBoundTexture(funcs, devCtx, frame, i)) {
+            // Stop at the first failing plane
+            ret = false;
+            break;
+        }
+    }
+
+    // Always pop the context we pushed above, whether or not the copy worked
+    err = funcs->cuCtxPopCurrent(&dummy);
+    if (err != CUDA_SUCCESS) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuCtxPopCurrent() failed: %d", err);
+    }
+
+    return ret;
+}
+
+
@@ -12,6 +12,9 @@ public:
    virtual bool needsTestFrame() override;
    virtual bool isDirectRenderingSupported() override;

+    // Helper function used by SdlRenderer to copy our CUDA frame into the currently bound GL textures
+    static bool copyCudaFrameToBoundTexture(AVFrame* frame);
+
 private:
    AVBufferRef* m_HwContext;
 };
@@ -5,6 +5,10 @@

 #include <Limelight.h>

+#ifdef HAVE_CUDA
+#include "cuda.h"
+#endif
+
 SdlRenderer::SdlRenderer()
    : m_Renderer(nullptr),
      m_Texture(nullptr),
@@ -203,7 +207,7 @@ void SdlRenderer::renderFrame(AVFrame* frame)
        return;
    }

-    if (frame->hw_frames_ctx != nullptr) {
+    if (frame->hw_frames_ctx != nullptr && frame->format != AV_PIX_FMT_CUDA) {
        // If we are acting as the frontend for a hardware
        // accelerated decoder, we'll need to read the frame
        // back to render it.
@@ -254,6 +258,7 @@ void SdlRenderer::renderFrame(AVFrame* frame)
        case AV_PIX_FMT_YUV420P:
            sdlFormat = SDL_PIXELFORMAT_YV12;
            break;
+        case AV_PIX_FMT_CUDA:
        case AV_PIX_FMT_NV12:
            sdlFormat = SDL_PIXELFORMAT_NV12;
            break;
@@ -290,7 +295,18 @@ void SdlRenderer::renderFrame(AVFrame* frame)
        }
    }

-    if (frame->format == AV_PIX_FMT_YUV420P) {
+    if (frame->format == AV_PIX_FMT_CUDA) {
+#ifdef HAVE_CUDA
+        SDL_GL_BindTexture(m_Texture, nullptr, nullptr);
+        CUDARenderer::copyCudaFrameToBoundTexture(frame);
+        SDL_GL_UnbindTexture(m_Texture);
+#else
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION,
+                     "Got CUDA frame, but not built with CUDA support!");
+        goto Exit;
+#endif
+    }
+    else if (frame->format == AV_PIX_FMT_YUV420P) {
        SDL_UpdateYUVTexture(m_Texture, nullptr,
                             frame->data[0],
                             frame->linesize[0],
@@ -6,7 +6,6 @@
 #include <h264_stream.h>

 #include "ffmpeg-renderers/sdlvid.h"
-#include "ffmpeg-renderers/cuda.h"

 #ifdef Q_OS_WIN32
 #include "ffmpeg-renderers/dxva2.h"
@@ -36,6 +35,10 @@
 #include "ffmpeg-renderers/eglvid.h"
 #endif

+#ifdef HAVE_CUDA
+#include "ffmpeg-renderers/cuda.h"
+#endif
+
 // This is gross but it allows us to use sizeof()
 #include "ffmpeg_videosamples.cpp"

@@ -567,11 +570,11 @@ IFFmpegRenderer* FFmpegVideoDecoder::createHwAccelRenderer(const AVCodecHWConfig
    // Second pass for our second-tier hwaccel implementations
    else if (pass == 1) {
        switch (hwDecodeCfg->device_type) {
+#ifdef HAVE_CUDA
        case AV_HWDEVICE_TYPE_CUDA:
-            // CUDA should only be used if all other options fail, since it requires
-            // read-back of frames. This should only be used for the NVIDIA+Wayland case
-            // with VDPAU covering the NVIDIA+X11 scenario.
+            // CUDA should only be used to cover the NVIDIA+Wayland case
            return new CUDARenderer();
+#endif
        default:
            return nullptr;
        }