mirror of
https://github.com/moonlight-stream/moonlight-qt.git
synced 2025-07-02 07:46:07 +00:00
Optimize CUDA support to avoid roundtrip to CPU memory
This commit is contained in:
parent
f0c292f508
commit
3e9aea1f7a
14
app/app.pro
14
app/app.pro
@ -88,6 +88,11 @@ unix:!macx {
|
|||||||
PKGCONFIG += libdrm
|
PKGCONFIG += libdrm
|
||||||
CONFIG += libdrm
|
CONFIG += libdrm
|
||||||
}
|
}
|
||||||
|
|
||||||
|
packagesExist(ffnvcodec) {
|
||||||
|
PKGCONFIG += ffnvcodec
|
||||||
|
CONFIG += cuda
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
packagesExist(wayland-client) {
|
packagesExist(wayland-client) {
|
||||||
@ -196,7 +201,6 @@ ffmpeg {
|
|||||||
SOURCES += \
|
SOURCES += \
|
||||||
streaming/video/ffmpeg.cpp \
|
streaming/video/ffmpeg.cpp \
|
||||||
streaming/video/ffmpeg-renderers/sdlvid.cpp \
|
streaming/video/ffmpeg-renderers/sdlvid.cpp \
|
||||||
streaming/video/ffmpeg-renderers/cuda.cpp \
|
|
||||||
streaming/video/ffmpeg-renderers/pacer/pacer.cpp \
|
streaming/video/ffmpeg-renderers/pacer/pacer.cpp \
|
||||||
streaming/video/ffmpeg-renderers/pacer/nullthreadedvsyncsource.cpp
|
streaming/video/ffmpeg-renderers/pacer/nullthreadedvsyncsource.cpp
|
||||||
|
|
||||||
@ -204,7 +208,6 @@ ffmpeg {
|
|||||||
streaming/video/ffmpeg.h \
|
streaming/video/ffmpeg.h \
|
||||||
streaming/video/ffmpeg-renderers/renderer.h \
|
streaming/video/ffmpeg-renderers/renderer.h \
|
||||||
streaming/video/ffmpeg-renderers/sdlvid.h \
|
streaming/video/ffmpeg-renderers/sdlvid.h \
|
||||||
streaming/video/ffmpeg-renderers/cuda.h \
|
|
||||||
streaming/video/ffmpeg-renderers/pacer/pacer.h \
|
streaming/video/ffmpeg-renderers/pacer/pacer.h \
|
||||||
streaming/video/ffmpeg-renderers/pacer/nullthreadedvsyncsource.h
|
streaming/video/ffmpeg-renderers/pacer/nullthreadedvsyncsource.h
|
||||||
}
|
}
|
||||||
@ -261,6 +264,13 @@ libdrm {
|
|||||||
LIBS += -ldl
|
LIBS += -ldl
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
cuda {
|
||||||
|
message(CUDA support enabled)
|
||||||
|
|
||||||
|
DEFINES += HAVE_CUDA
|
||||||
|
SOURCES += streaming/video/ffmpeg-renderers/cuda.cpp
|
||||||
|
HEADERS += streaming/video/ffmpeg-renderers/cuda.h
|
||||||
|
}
|
||||||
config_EGL {
|
config_EGL {
|
||||||
message(EGL renderer selected)
|
message(EGL renderer selected)
|
||||||
|
|
||||||
|
@ -1,5 +1,13 @@
|
|||||||
#include "cuda.h"
|
#include "cuda.h"
|
||||||
|
|
||||||
|
#include <ffnvcodec/dynlink_loader.h>
|
||||||
|
|
||||||
|
#include <SDL_opengl.h>
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
#include <libavutil/hwcontext_cuda.h>
|
||||||
|
}
|
||||||
|
|
||||||
CUDARenderer::CUDARenderer()
|
CUDARenderer::CUDARenderer()
|
||||||
: m_HwContext(nullptr)
|
: m_HwContext(nullptr)
|
||||||
{
|
{
|
||||||
@ -55,3 +63,96 @@ bool CUDARenderer::isDirectRenderingSupported()
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool CUDARenderer::copyCudaFrameToBoundTexture(AVFrame* frame)
|
||||||
|
{
|
||||||
|
static CudaFunctions* funcs;
|
||||||
|
CUresult err;
|
||||||
|
AVCUDADeviceContext* devCtx = (AVCUDADeviceContext*)(((AVHWFramesContext*)frame->hw_frames_ctx->data)->device_ctx->hwctx);
|
||||||
|
bool ret = false;
|
||||||
|
|
||||||
|
if (!funcs) {
|
||||||
|
// One-time init of CUDA library
|
||||||
|
cuda_load_functions(&funcs, nullptr);
|
||||||
|
if (!funcs) {
|
||||||
|
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Failed to initialize CUDA library");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
SDL_assert(frame->format == AV_PIX_FMT_CUDA);
|
||||||
|
|
||||||
|
// Push FFmpeg's CUDA context to use for our CUDA operations
|
||||||
|
err = funcs->cuCtxPushCurrent(devCtx->cuda_ctx);
|
||||||
|
if (err != CUDA_SUCCESS) {
|
||||||
|
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuCtxPushCurrent() failed: %d", err);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// NV12 has 2 planes
|
||||||
|
for (int i = 0; i < 2; i++) {
|
||||||
|
CUgraphicsResource cudaResource;
|
||||||
|
CUarray cudaArray;
|
||||||
|
GLint tex;
|
||||||
|
|
||||||
|
// Get the ID of this plane's texture
|
||||||
|
glActiveTexture(GL_TEXTURE0 + i);
|
||||||
|
glGetIntegerv(GL_TEXTURE_BINDING_2D, &tex);
|
||||||
|
|
||||||
|
// Register it with CUDA
|
||||||
|
err = funcs->cuGraphicsGLRegisterImage(&cudaResource, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD);
|
||||||
|
if (err != CUDA_SUCCESS) {
|
||||||
|
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsGLRegisterImage() failed: %d", err);
|
||||||
|
goto Exit;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Map it to allow us to use it as a copy destination
|
||||||
|
err = funcs->cuGraphicsMapResources(1, &cudaResource, devCtx->stream);
|
||||||
|
if (err != CUDA_SUCCESS) {
|
||||||
|
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsMapResources() failed: %d", err);
|
||||||
|
funcs->cuGraphicsUnregisterResource(cudaResource);
|
||||||
|
goto Exit;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get a pointer to the mapped array
|
||||||
|
err = funcs->cuGraphicsSubResourceGetMappedArray(&cudaArray, cudaResource, 0, 0);
|
||||||
|
if (err != CUDA_SUCCESS) {
|
||||||
|
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsSubResourceGetMappedArray() failed: %d", err);
|
||||||
|
funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
|
||||||
|
funcs->cuGraphicsUnregisterResource(cudaResource);
|
||||||
|
goto Exit;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUDA_MEMCPY2D cu2d = {
|
||||||
|
.srcMemoryType = CU_MEMORYTYPE_DEVICE,
|
||||||
|
.srcDevice = (CUdeviceptr)frame->data[i],
|
||||||
|
.srcPitch = (size_t)frame->linesize[i],
|
||||||
|
.dstMemoryType = CU_MEMORYTYPE_ARRAY,
|
||||||
|
.dstArray = cudaArray,
|
||||||
|
.dstPitch = (size_t)frame->width >> i,
|
||||||
|
.WidthInBytes = (size_t)frame->width,
|
||||||
|
.Height = (size_t)frame->height >> i
|
||||||
|
};
|
||||||
|
|
||||||
|
// Do the copy
|
||||||
|
err = funcs->cuMemcpy2D(&cu2d);
|
||||||
|
if (err != CUDA_SUCCESS) {
|
||||||
|
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuMemcpy2D() failed: %d", err);
|
||||||
|
funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
|
||||||
|
funcs->cuGraphicsUnregisterResource(cudaResource);
|
||||||
|
goto Exit;
|
||||||
|
}
|
||||||
|
|
||||||
|
funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
|
||||||
|
funcs->cuGraphicsUnregisterResource(cudaResource);
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = true;
|
||||||
|
|
||||||
|
Exit:
|
||||||
|
{
|
||||||
|
CUcontext dummy;
|
||||||
|
funcs->cuCtxPopCurrent(&dummy);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -12,6 +12,9 @@ public:
|
|||||||
virtual bool needsTestFrame() override;
|
virtual bool needsTestFrame() override;
|
||||||
virtual bool isDirectRenderingSupported() override;
|
virtual bool isDirectRenderingSupported() override;
|
||||||
|
|
||||||
|
// Helper function used by SDLRenderer to read our CUDA frame
|
||||||
|
static bool copyCudaFrameToBoundTexture(AVFrame* frame);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
AVBufferRef* m_HwContext;
|
AVBufferRef* m_HwContext;
|
||||||
};
|
};
|
||||||
|
@ -5,6 +5,10 @@
|
|||||||
|
|
||||||
#include <Limelight.h>
|
#include <Limelight.h>
|
||||||
|
|
||||||
|
#ifdef HAVE_CUDA
|
||||||
|
#include "cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
SdlRenderer::SdlRenderer()
|
SdlRenderer::SdlRenderer()
|
||||||
: m_Renderer(nullptr),
|
: m_Renderer(nullptr),
|
||||||
m_Texture(nullptr),
|
m_Texture(nullptr),
|
||||||
@ -203,7 +207,7 @@ void SdlRenderer::renderFrame(AVFrame* frame)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (frame->hw_frames_ctx != nullptr) {
|
if (frame->hw_frames_ctx != nullptr && frame->format != AV_PIX_FMT_CUDA) {
|
||||||
// If we are acting as the frontend for a hardware
|
// If we are acting as the frontend for a hardware
|
||||||
// accelerated decoder, we'll need to read the frame
|
// accelerated decoder, we'll need to read the frame
|
||||||
// back to render it.
|
// back to render it.
|
||||||
@ -254,6 +258,7 @@ void SdlRenderer::renderFrame(AVFrame* frame)
|
|||||||
case AV_PIX_FMT_YUV420P:
|
case AV_PIX_FMT_YUV420P:
|
||||||
sdlFormat = SDL_PIXELFORMAT_YV12;
|
sdlFormat = SDL_PIXELFORMAT_YV12;
|
||||||
break;
|
break;
|
||||||
|
case AV_PIX_FMT_CUDA:
|
||||||
case AV_PIX_FMT_NV12:
|
case AV_PIX_FMT_NV12:
|
||||||
sdlFormat = SDL_PIXELFORMAT_NV12;
|
sdlFormat = SDL_PIXELFORMAT_NV12;
|
||||||
break;
|
break;
|
||||||
@ -290,7 +295,18 @@ void SdlRenderer::renderFrame(AVFrame* frame)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (frame->format == AV_PIX_FMT_YUV420P) {
|
if (frame->format == AV_PIX_FMT_CUDA) {
|
||||||
|
#ifdef HAVE_CUDA
|
||||||
|
SDL_GL_BindTexture(m_Texture, nullptr, nullptr);
|
||||||
|
CUDARenderer::copyCudaFrameToBoundTexture(frame);
|
||||||
|
SDL_GL_UnbindTexture(m_Texture);
|
||||||
|
#else
|
||||||
|
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION,
|
||||||
|
"Got CUDA frame, but not built with CUDA support!");
|
||||||
|
goto Exit;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
else if (frame->format == AV_PIX_FMT_YUV420P) {
|
||||||
SDL_UpdateYUVTexture(m_Texture, nullptr,
|
SDL_UpdateYUVTexture(m_Texture, nullptr,
|
||||||
frame->data[0],
|
frame->data[0],
|
||||||
frame->linesize[0],
|
frame->linesize[0],
|
||||||
|
@ -6,7 +6,6 @@
|
|||||||
#include <h264_stream.h>
|
#include <h264_stream.h>
|
||||||
|
|
||||||
#include "ffmpeg-renderers/sdlvid.h"
|
#include "ffmpeg-renderers/sdlvid.h"
|
||||||
#include "ffmpeg-renderers/cuda.h"
|
|
||||||
|
|
||||||
#ifdef Q_OS_WIN32
|
#ifdef Q_OS_WIN32
|
||||||
#include "ffmpeg-renderers/dxva2.h"
|
#include "ffmpeg-renderers/dxva2.h"
|
||||||
@ -36,6 +35,10 @@
|
|||||||
#include "ffmpeg-renderers/eglvid.h"
|
#include "ffmpeg-renderers/eglvid.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAVE_CUDA
|
||||||
|
#include "ffmpeg-renderers/cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
// This is gross but it allows us to use sizeof()
|
// This is gross but it allows us to use sizeof()
|
||||||
#include "ffmpeg_videosamples.cpp"
|
#include "ffmpeg_videosamples.cpp"
|
||||||
|
|
||||||
@ -567,11 +570,11 @@ IFFmpegRenderer* FFmpegVideoDecoder::createHwAccelRenderer(const AVCodecHWConfig
|
|||||||
// Second pass for our second-tier hwaccel implementations
|
// Second pass for our second-tier hwaccel implementations
|
||||||
else if (pass == 1) {
|
else if (pass == 1) {
|
||||||
switch (hwDecodeCfg->device_type) {
|
switch (hwDecodeCfg->device_type) {
|
||||||
|
#ifdef HAVE_CUDA
|
||||||
case AV_HWDEVICE_TYPE_CUDA:
|
case AV_HWDEVICE_TYPE_CUDA:
|
||||||
// CUDA should only be used if all other options fail, since it requires
|
// CUDA should only be used to cover the NVIDIA+Wayland case
|
||||||
// read-back of frames. This should only be used for the NVIDIA+Wayland case
|
|
||||||
// with VDPAU covering the NVIDIA+X11 scenario.
|
|
||||||
return new CUDARenderer();
|
return new CUDARenderer();
|
||||||
|
#endif
|
||||||
default:
|
default:
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user