Optimize CUDA support to avoid roundtrip to CPU memory

2025-08-17 17:06:08 +00:00 · 2021-12-06 18:22:39 -06:00 · 2021-12-06 18:22:39 -06:00 · 3e9aea1f7a
commit 3e9aea1f7a
parent f0c292f508
5 changed files with 141 additions and 8 deletions
--- a/app/app.pro
+++ b/app/app.pro
@ -88,6 +88,11 @@ unix:!macx {
            PKGCONFIG += libdrm
            CONFIG += libdrm
        }
        packagesExist(ffnvcodec) {
            PKGCONFIG += ffnvcodec
            CONFIG += cuda
        }
    }
    packagesExist(wayland-client) {
@ -196,7 +201,6 @@ ffmpeg {
    SOURCES += \
        streaming/video/ffmpeg.cpp \
        streaming/video/ffmpeg-renderers/sdlvid.cpp \
        streaming/video/ffmpeg-renderers/cuda.cpp \
        streaming/video/ffmpeg-renderers/pacer/pacer.cpp \
        streaming/video/ffmpeg-renderers/pacer/nullthreadedvsyncsource.cpp
@ -204,7 +208,6 @@ ffmpeg {
        streaming/video/ffmpeg.h \
        streaming/video/ffmpeg-renderers/renderer.h \
        streaming/video/ffmpeg-renderers/sdlvid.h \
        streaming/video/ffmpeg-renderers/cuda.h \
        streaming/video/ffmpeg-renderers/pacer/pacer.h \
        streaming/video/ffmpeg-renderers/pacer/nullthreadedvsyncsource.h
 }
@ -261,6 +264,13 @@ libdrm {
        LIBS += -ldl
    }
 }
 # Scope active when CONFIG contains "cuda", which the packagesExist(ffnvcodec)
 # check earlier in this file adds.
 cuda {
    message(CUDA support enabled)
    # HAVE_CUDA is the macro the C++ sources test with #ifdef around CUDA-only code
    DEFINES += HAVE_CUDA
    # The CUDA renderer is compiled only inside this scope
    SOURCES += streaming/video/ffmpeg-renderers/cuda.cpp
    HEADERS += streaming/video/ffmpeg-renderers/cuda.h
 }
 config_EGL {
    message(EGL renderer selected)
--- a/app/streaming/video/ffmpeg-renderers/cuda.cpp
+++ b/app/streaming/video/ffmpeg-renderers/cuda.cpp
@ -1,5 +1,13 @@
 #include "cuda.h"
 #include <ffnvcodec/dynlink_loader.h>
 #include <SDL_opengl.h>
 extern "C" {
    #include <libavutil/hwcontext_cuda.h>
 }
 CUDARenderer::CUDARenderer()
    : m_HwContext(nullptr)
 {
@ -55,3 +63,96 @@ bool CUDARenderer::isDirectRenderingSupported()
    return false;
 }
 bool CUDARenderer::copyCudaFrameToBoundTexture(AVFrame* frame)
 {
    static CudaFunctions* funcs;
    CUresult err;
    AVCUDADeviceContext* devCtx = (AVCUDADeviceContext*)(((AVHWFramesContext*)frame->hw_frames_ctx->data)->device_ctx->hwctx);
    bool ret = false;
    if (!funcs) {
        // One-time init of CUDA library
        cuda_load_functions(&funcs, nullptr);
        if (!funcs) {
            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Failed to initialize CUDA library");
            return false;
        }
    }
    SDL_assert(frame->format == AV_PIX_FMT_CUDA);
    // Push FFmpeg's CUDA context to use for our CUDA operations
    err = funcs->cuCtxPushCurrent(devCtx->cuda_ctx);
    if (err != CUDA_SUCCESS) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuCtxPushCurrent() failed: %d", err);
        return false;
    }
    // NV12 has 2 planes
    for (int i = 0; i < 2; i++) {
        CUgraphicsResource cudaResource;
        CUarray cudaArray;
        GLint tex;
        // Get the ID of this plane's texture
        glActiveTexture(GL_TEXTURE0 + i);
        glGetIntegerv(GL_TEXTURE_BINDING_2D, &tex);
        // Register it with CUDA
        err = funcs->cuGraphicsGLRegisterImage(&cudaResource, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD);
        if (err != CUDA_SUCCESS) {
            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsGLRegisterImage() failed: %d", err);
            goto Exit;
        }
        // Map it to allow us to use it as a copy destination
        err = funcs->cuGraphicsMapResources(1, &cudaResource, devCtx->stream);
        if (err != CUDA_SUCCESS) {
            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsMapResources() failed: %d", err);
            funcs->cuGraphicsUnregisterResource(cudaResource);
            goto Exit;
        }
        // Get a pointer to the mapped array
        err = funcs->cuGraphicsSubResourceGetMappedArray(&cudaArray, cudaResource, 0, 0);
        if (err != CUDA_SUCCESS) {
            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsSubResourceGetMappedArray() failed: %d", err);
            funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
            funcs->cuGraphicsUnregisterResource(cudaResource);
            goto Exit;
        }
        CUDA_MEMCPY2D cu2d = {
            .srcMemoryType = CU_MEMORYTYPE_DEVICE,
            .srcDevice = (CUdeviceptr)frame->data[i],
            .srcPitch = (size_t)frame->linesize[i],
            .dstMemoryType = CU_MEMORYTYPE_ARRAY,
            .dstArray = cudaArray,
            .dstPitch = (size_t)frame->width >> i,
            .WidthInBytes = (size_t)frame->width,
            .Height = (size_t)frame->height >> i
        };
        // Do the copy
        err = funcs->cuMemcpy2D(&cu2d);
        if (err != CUDA_SUCCESS) {
            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuMemcpy2D() failed: %d", err);
            funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
            funcs->cuGraphicsUnregisterResource(cudaResource);
            goto Exit;
        }
        funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
        funcs->cuGraphicsUnregisterResource(cudaResource);
    }
    ret = true;
 Exit:
    {
        CUcontext dummy;
        funcs->cuCtxPopCurrent(&dummy);
    }
    return ret;
 }
--- a/app/streaming/video/ffmpeg-renderers/cuda.h
+++ b/app/streaming/video/ffmpeg-renderers/cuda.h
@ -12,6 +12,9 @@ public:
    virtual bool needsTestFrame() override;
    virtual bool isDirectRenderingSupported() override;
    // Helper function used by SDLRenderer to read our CUDA frame
    static bool copyCudaFrameToBoundTexture(AVFrame* frame);
 private:
    AVBufferRef* m_HwContext;
 };
--- a/app/streaming/video/ffmpeg-renderers/sdlvid.cpp
+++ b/app/streaming/video/ffmpeg-renderers/sdlvid.cpp
@ -5,6 +5,10 @@
 #include <Limelight.h>
 #ifdef HAVE_CUDA
 #include "cuda.h"
 #endif
 SdlRenderer::SdlRenderer()
    : m_Renderer(nullptr),
      m_Texture(nullptr),
@ -203,7 +207,7 @@ void SdlRenderer::renderFrame(AVFrame* frame)
        return;
    }
-    if (frame->hw_frames_ctx != nullptr) {
+    if (frame->hw_frames_ctx != nullptr && frame->format != AV_PIX_FMT_CUDA) {
        // If we are acting as the frontend for a hardware
        // accelerated decoder, we'll need to read the frame
        // back to render it.
@ -254,6 +258,7 @@ void SdlRenderer::renderFrame(AVFrame* frame)
        case AV_PIX_FMT_YUV420P:
            sdlFormat = SDL_PIXELFORMAT_YV12;
            break;
        case AV_PIX_FMT_CUDA:
        case AV_PIX_FMT_NV12:
            sdlFormat = SDL_PIXELFORMAT_NV12;
            break;
@ -290,7 +295,18 @@ void SdlRenderer::renderFrame(AVFrame* frame)
        }
    }
-    if (frame->format == AV_PIX_FMT_YUV420P) {
+    if (frame->format == AV_PIX_FMT_CUDA) {
 #ifdef HAVE_CUDA
        SDL_GL_BindTexture(m_Texture, nullptr, nullptr);
        CUDARenderer::copyCudaFrameToBoundTexture(frame);
        SDL_GL_UnbindTexture(m_Texture);
 #else
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION,
                     "Got CUDA frame, but not built with CUDA support!");
        goto Exit;
 #endif
    }
    else if (frame->format == AV_PIX_FMT_YUV420P) {
        SDL_UpdateYUVTexture(m_Texture, nullptr,
                             frame->data[0],
                             frame->linesize[0],
--- a/app/streaming/video/ffmpeg.cpp
+++ b/app/streaming/video/ffmpeg.cpp
@ -6,7 +6,6 @@
 #include <h264_stream.h>
 #include "ffmpeg-renderers/sdlvid.h"
 #include "ffmpeg-renderers/cuda.h"
 #ifdef Q_OS_WIN32
 #include "ffmpeg-renderers/dxva2.h"
@ -36,6 +35,10 @@
 #include "ffmpeg-renderers/eglvid.h"
 #endif
 #ifdef HAVE_CUDA
 #include "ffmpeg-renderers/cuda.h"
 #endif
 // This is gross but it allows us to use sizeof()
 #include "ffmpeg_videosamples.cpp"
@ -567,11 +570,11 @@ IFFmpegRenderer* FFmpegVideoDecoder::createHwAccelRenderer(const AVCodecHWConfig
    // Second pass for our second-tier hwaccel implementations
    else if (pass == 1) {
        switch (hwDecodeCfg->device_type) {
 #ifdef HAVE_CUDA
        case AV_HWDEVICE_TYPE_CUDA:
-            // CUDA should only be used if all other options fail, since it requires
+            // CUDA should only be used to cover the NVIDIA+Wayland case, with
            // VDPAU covering the NVIDIA+X11 scenario. Frames are copied on the
            // GPU into the renderer's texture, so no read-back is required.
            return new CUDARenderer();
 #endif
        default:
            return nullptr;
        }