mirror of
https://github.com/moonlight-stream/moonlight-common-c.git
synced 2026-04-11 10:16:22 +00:00
Use nanors for optimized Reed-Solomon FEC decoding (#125)
This commit is contained in:
@@ -55,8 +55,7 @@ void RtpaInitializeQueue(PRTP_AUDIO_QUEUE queue) {
|
||||
// works correctly. This is possible because the data and FEC shard count is
|
||||
// constant and known in advance.
|
||||
const unsigned char parity[] = { 0x77, 0x40, 0x38, 0x0e, 0xc7, 0xa7, 0x0d, 0x6c };
|
||||
memcpy(&queue->rs->m[16], parity, sizeof(parity));
|
||||
memcpy(queue->rs->parity, parity, sizeof(parity));
|
||||
memcpy(queue->rs->p, parity, sizeof(parity));
|
||||
}
|
||||
|
||||
static void validateFecBlockState(PRTP_AUDIO_QUEUE queue) {
|
||||
@@ -444,7 +443,7 @@ static bool completeFecBlock(PRTP_AUDIO_QUEUE queue, PRTPA_FEC_BLOCK block) {
|
||||
memset(block->dataPackets[dropIndex], 0, sizeof(RTP_PACKET) + block->blockSize);
|
||||
#endif
|
||||
|
||||
int res = reed_solomon_reconstruct(queue->rs, shards, block->marks, RTPA_TOTAL_SHARDS, block->blockSize);
|
||||
int res = reed_solomon_decode(queue->rs, shards, block->marks, RTPA_TOTAL_SHARDS, block->blockSize);
|
||||
if (res != 0) {
|
||||
// We should always have enough data to recover the entire block since we checked above.
|
||||
LC_ASSERT(res == 0);
|
||||
|
||||
@@ -2,7 +2,14 @@
|
||||
|
||||
#include "Video.h"
|
||||
|
||||
#include "rs.h"
|
||||
#include "rswrapper.h"
|
||||
|
||||
typedef struct _reed_solomon {
|
||||
int ds;
|
||||
int ps;
|
||||
int ts;
|
||||
uint8_t p[];
|
||||
} reed_solomon;
|
||||
|
||||
// Maximum time to wait for an OOS data/FEC shard
|
||||
// after the entire FEC block should have been received
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "Limelight-internal.h"
|
||||
#include "rs.h"
|
||||
#include "rswrapper.h"
|
||||
|
||||
#if defined(LC_DEBUG) && !defined(LC_FUZZING)
|
||||
// This enables FEC validation mode with a synthetic drop
|
||||
@@ -328,7 +328,7 @@ static int reconstructFrame(PRTP_VIDEO_QUEUE queue) {
|
||||
}
|
||||
}
|
||||
|
||||
ret = reed_solomon_reconstruct(rs, packets, marks, totalPackets, receiveSize);
|
||||
ret = reed_solomon_decode(rs, packets, marks, totalPackets, receiveSize);
|
||||
|
||||
// We should always provide enough parity to recover the missing data successfully.
|
||||
// If this fails, something is probably wrong with our FEC state.
|
||||
|
||||
191
src/rswrapper.c
Normal file
191
src/rswrapper.c
Normal file
@@ -0,0 +1,191 @@
|
||||
/**
|
||||
* @file src/rswrapper.c
|
||||
* @brief Wrappers for nanors vectorization with different ISA options
|
||||
*/
|
||||
|
||||
// _FORTIFY_SOURCE can cause some versions of GCC to try to inline
|
||||
// memset() with incompatible target options when compiling rs.c
|
||||
#ifdef _FORTIFY_SOURCE
|
||||
#undef _FORTIFY_SOURCE
|
||||
#endif
|
||||
|
||||
// The assert() function is decorated with __cold on macOS which
|
||||
// is incompatible with Clang's target multiversioning feature
|
||||
#ifndef NDEBUG
|
||||
#define NDEBUG
|
||||
#endif
|
||||
|
||||
#define DECORATE_FUNC_I(a, b) a##b
|
||||
#define DECORATE_FUNC(a, b) DECORATE_FUNC_I(a, b)
|
||||
|
||||
// Append an ISA suffix to the public RS API
|
||||
#define reed_solomon_init DECORATE_FUNC(reed_solomon_init, ISA_SUFFIX)
|
||||
#define reed_solomon_new DECORATE_FUNC(reed_solomon_new, ISA_SUFFIX)
|
||||
#define reed_solomon_new_static DECORATE_FUNC(reed_solomon_new_static, ISA_SUFFIX)
|
||||
#define reed_solomon_release DECORATE_FUNC(reed_solomon_release, ISA_SUFFIX)
|
||||
#define reed_solomon_decode DECORATE_FUNC(reed_solomon_decode, ISA_SUFFIX)
|
||||
#define reed_solomon_encode DECORATE_FUNC(reed_solomon_encode, ISA_SUFFIX)
|
||||
|
||||
// Append an ISA suffix to internal functions to prevent multiple definition errors
|
||||
#define obl_axpy_ref DECORATE_FUNC(obl_axpy_ref, ISA_SUFFIX)
|
||||
#define obl_scal_ref DECORATE_FUNC(obl_scal_ref, ISA_SUFFIX)
|
||||
#define obl_axpyb32_ref DECORATE_FUNC(obl_axpyb32_ref, ISA_SUFFIX)
|
||||
#define obl_axpy DECORATE_FUNC(obl_axpy, ISA_SUFFIX)
|
||||
#define obl_scal DECORATE_FUNC(obl_scal, ISA_SUFFIX)
|
||||
#define obl_swap DECORATE_FUNC(obl_swap, ISA_SUFFIX)
|
||||
#define obl_axpyb32 DECORATE_FUNC(obl_axpyb32, ISA_SUFFIX)
|
||||
#define axpy DECORATE_FUNC(axpy, ISA_SUFFIX)
|
||||
#define scal DECORATE_FUNC(scal, ISA_SUFFIX)
|
||||
#define gemm DECORATE_FUNC(gemm, ISA_SUFFIX)
|
||||
#define invert_mat DECORATE_FUNC(invert_mat, ISA_SUFFIX)
|
||||
|
||||
#if defined(__x86_64__) || defined(__i386__) || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64)))
|
||||
|
||||
// Compile a variant for SSSE3
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push(__attribute__((target("ssse3"))), apply_to = function)
|
||||
#elif __GNUC__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("ssse3")
|
||||
#endif
|
||||
#define ISA_SUFFIX _ssse3
|
||||
#define OBLAS_SSE3
|
||||
#include "../nanors/rs.c"
|
||||
#undef OBLAS_SSE3
|
||||
#undef ISA_SUFFIX
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute pop
|
||||
#elif __GNUC__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
|
||||
// Compile a variant for AVX2
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
|
||||
#elif __GNUC__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx2")
|
||||
#endif
|
||||
#define ISA_SUFFIX _avx2
|
||||
#define OBLAS_AVX2
|
||||
#include "../nanors/rs.c"
|
||||
#undef OBLAS_AVX2
|
||||
#undef ISA_SUFFIX
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute pop
|
||||
#elif __GNUC__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
|
||||
// Compile a variant for AVX512BW
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute push(__attribute__((target("avx512f,avx512bw"))), apply_to = function)
|
||||
#elif __GNUC__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512f,avx512bw")
|
||||
#endif
|
||||
#define ISA_SUFFIX _avx512
|
||||
#define OBLAS_AVX512
|
||||
#include "../nanors/rs.c"
|
||||
#undef OBLAS_AVX512
|
||||
#undef ISA_SUFFIX
|
||||
#if defined(__clang__)
|
||||
#pragma clang attribute pop
|
||||
#elif __GNUC__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
// Compile a default variant
|
||||
#define ISA_SUFFIX _def
|
||||
#include "../nanors/deps/obl/autoshim.h"
|
||||
#include "../nanors/rs.c"
|
||||
#undef ISA_SUFFIX
|
||||
|
||||
#undef reed_solomon_init
|
||||
#undef reed_solomon_new
|
||||
#undef reed_solomon_new_static
|
||||
#undef reed_solomon_release
|
||||
#undef reed_solomon_decode
|
||||
#undef reed_solomon_encode
|
||||
|
||||
#include "rswrapper.h"
|
||||
|
||||
reed_solomon_new_t reed_solomon_new_fn;
|
||||
reed_solomon_release_t reed_solomon_release_fn;
|
||||
reed_solomon_encode_t reed_solomon_encode_fn;
|
||||
reed_solomon_decode_t reed_solomon_decode_fn;
|
||||
|
||||
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
|
||||
|
||||
#if defined(_M_AMD64)
|
||||
// For some reason this is needed to avoid a "C1189 No target architecture" error from winnt.h
|
||||
# define _AMD64_
|
||||
#endif
|
||||
#include <processthreadsapi.h>
|
||||
// Returns the result of IsProcessorFeaturePresent() for SSSE3; used by reed_solomon_init() to pick the _ssse3 variant.
BOOL _msc_supports_ssse3(void) { return IsProcessorFeaturePresent(PF_SSSE3_INSTRUCTIONS_AVAILABLE); }
|
||||
// Returns the result of IsProcessorFeaturePresent() for AVX2; used by reed_solomon_init() to pick the _avx2 variant.
BOOL _msc_supports_avx2(void) { return IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE); }
|
||||
// Returns the result of IsProcessorFeaturePresent() for AVX-512F; used by reed_solomon_init() to pick the _avx512 variant.
// NOTE(review): the _avx512 variant in this file is built with target "avx512f,avx512bw" and the
// GCC/Clang dispatch path checks both features, but this probe only checks AVX512F.
// TODO confirm whether an additional AVX512BW check is required on MSVC builds.
BOOL _msc_supports_avx512f(void) { return IsProcessorFeaturePresent(PF_AVX512F_INSTRUCTIONS_AVAILABLE); }
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief This initializes the RS function pointers to the best vectorized version available.
|
||||
* @details The streaming code will directly invoke these function pointers when encoding or decoding.
|
||||
*/
|
||||
void reed_solomon_init(void) {
|
||||
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
|
||||
// Visual Studio
|
||||
if (_msc_supports_avx512f()) {
|
||||
reed_solomon_new_fn = reed_solomon_new_avx512;
|
||||
reed_solomon_release_fn = reed_solomon_release_avx512;
|
||||
reed_solomon_encode_fn = reed_solomon_encode_avx512;
|
||||
reed_solomon_decode_fn = reed_solomon_decode_avx512;
|
||||
reed_solomon_init_avx512();
|
||||
} else if (_msc_supports_avx2()) {
|
||||
reed_solomon_new_fn = reed_solomon_new_avx2;
|
||||
reed_solomon_release_fn = reed_solomon_release_avx2;
|
||||
reed_solomon_encode_fn = reed_solomon_encode_avx2;
|
||||
reed_solomon_decode_fn = reed_solomon_decode_avx2;
|
||||
reed_solomon_init_avx2();
|
||||
} else if (_msc_supports_ssse3()) {
|
||||
reed_solomon_new_fn = reed_solomon_new_ssse3;
|
||||
reed_solomon_release_fn = reed_solomon_release_ssse3;
|
||||
reed_solomon_encode_fn = reed_solomon_encode_ssse3;
|
||||
reed_solomon_decode_fn = reed_solomon_decode_ssse3;
|
||||
reed_solomon_init_ssse3();
|
||||
} else
|
||||
|
||||
#elif defined(__x86_64__)
|
||||
// gcc & clang
|
||||
if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) {
|
||||
reed_solomon_new_fn = reed_solomon_new_avx512;
|
||||
reed_solomon_release_fn = reed_solomon_release_avx512;
|
||||
reed_solomon_encode_fn = reed_solomon_encode_avx512;
|
||||
reed_solomon_decode_fn = reed_solomon_decode_avx512;
|
||||
reed_solomon_init_avx512();
|
||||
} else if (__builtin_cpu_supports("avx2")) {
|
||||
reed_solomon_new_fn = reed_solomon_new_avx2;
|
||||
reed_solomon_release_fn = reed_solomon_release_avx2;
|
||||
reed_solomon_encode_fn = reed_solomon_encode_avx2;
|
||||
reed_solomon_decode_fn = reed_solomon_decode_avx2;
|
||||
reed_solomon_init_avx2();
|
||||
} else if (__builtin_cpu_supports("ssse3")) {
|
||||
reed_solomon_new_fn = reed_solomon_new_ssse3;
|
||||
reed_solomon_release_fn = reed_solomon_release_ssse3;
|
||||
reed_solomon_encode_fn = reed_solomon_encode_ssse3;
|
||||
reed_solomon_decode_fn = reed_solomon_decode_ssse3;
|
||||
reed_solomon_init_ssse3();
|
||||
} else
|
||||
|
||||
#endif
|
||||
//
|
||||
{
|
||||
reed_solomon_new_fn = reed_solomon_new_def;
|
||||
reed_solomon_release_fn = reed_solomon_release_def;
|
||||
reed_solomon_encode_fn = reed_solomon_encode_def;
|
||||
reed_solomon_decode_fn = reed_solomon_decode_def;
|
||||
reed_solomon_init_def();
|
||||
}
|
||||
}
|
||||
32
src/rswrapper.h
Normal file
32
src/rswrapper.h
Normal file
@@ -0,0 +1,32 @@
|
||||
/**
|
||||
* @file src/rswrapper.h
|
||||
* @brief Wrappers for nanors vectorization
|
||||
* @details This is a drop-in replacement for nanors rs.h
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
// standard includes
|
||||
#include <stdint.h>
|
||||
|
||||
typedef struct _reed_solomon reed_solomon;
|
||||
|
||||
typedef reed_solomon *(*reed_solomon_new_t)(int data_shards, int parity_shards);
|
||||
typedef void (*reed_solomon_release_t)(reed_solomon *rs);
|
||||
typedef int (*reed_solomon_encode_t)(reed_solomon *rs, uint8_t **shards, int nr_shards, int bs);
|
||||
typedef int (*reed_solomon_decode_t)(reed_solomon *rs, uint8_t **shards, uint8_t *marks, int nr_shards, int bs);
|
||||
|
||||
extern reed_solomon_new_t reed_solomon_new_fn;
|
||||
extern reed_solomon_release_t reed_solomon_release_fn;
|
||||
extern reed_solomon_encode_t reed_solomon_encode_fn;
|
||||
extern reed_solomon_decode_t reed_solomon_decode_fn;
|
||||
|
||||
#define reed_solomon_new reed_solomon_new_fn
|
||||
#define reed_solomon_release reed_solomon_release_fn
|
||||
#define reed_solomon_encode reed_solomon_encode_fn
|
||||
#define reed_solomon_decode reed_solomon_decode_fn
|
||||
|
||||
/**
|
||||
* @brief This initializes the RS function pointers to the best vectorized version available.
|
||||
* @details The streaming code will directly invoke these function pointers when encoding or decoding.
|
||||
*/
|
||||
void reed_solomon_init(void);
|
||||
Reference in New Issue
Block a user