Use nanors for optimized Reed-Solomon FEC decoding (#125)

This commit is contained in:
Andy Grundman
2026-02-19 00:36:52 -05:00
committed by GitHub
parent 1d0e91d91a
commit de364b6ecd
16 changed files with 5493 additions and 689 deletions

View File

@@ -55,8 +55,7 @@ void RtpaInitializeQueue(PRTP_AUDIO_QUEUE queue) {
// works correctly. This is possible because the data and FEC shard count is
// constant and known in advance.
const unsigned char parity[] = { 0x77, 0x40, 0x38, 0x0e, 0xc7, 0xa7, 0x0d, 0x6c };
memcpy(&queue->rs->m[16], parity, sizeof(parity));
memcpy(queue->rs->parity, parity, sizeof(parity));
memcpy(queue->rs->p, parity, sizeof(parity));
}
static void validateFecBlockState(PRTP_AUDIO_QUEUE queue) {
@@ -444,7 +443,7 @@ static bool completeFecBlock(PRTP_AUDIO_QUEUE queue, PRTPA_FEC_BLOCK block) {
memset(block->dataPackets[dropIndex], 0, sizeof(RTP_PACKET) + block->blockSize);
#endif
int res = reed_solomon_reconstruct(queue->rs, shards, block->marks, RTPA_TOTAL_SHARDS, block->blockSize);
int res = reed_solomon_decode(queue->rs, shards, block->marks, RTPA_TOTAL_SHARDS, block->blockSize);
if (res != 0) {
// We should always have enough data to recover the entire block since we checked above.
LC_ASSERT(res == 0);

View File

@@ -2,7 +2,14 @@
#include "Video.h"
#include "rs.h"
#include "rswrapper.h"
// Concrete layout for the reed_solomon type, which rswrapper.h only forward-declares.
// NOTE(review): presumably this has to mirror the struct that nanors allocates
// internally so the fields can be accessed here - confirm against nanors/rs.c.
typedef struct _reed_solomon {
int ds; // presumably the data shard count - TODO confirm
int ps; // presumably the parity shard count - TODO confirm
int ts; // presumably the total shard count - TODO confirm
uint8_t p[]; // flexible array member; the audio queue setup memcpy()s a fixed parity table into it
} reed_solomon;
// Maximum time to wait for an OOS data/FEC shard
// after the entire FEC block should have been received

View File

@@ -1,5 +1,5 @@
#include "Limelight-internal.h"
#include "rs.h"
#include "rswrapper.h"
#if defined(LC_DEBUG) && !defined(LC_FUZZING)
// This enables FEC validation mode with a synthetic drop
@@ -328,7 +328,7 @@ static int reconstructFrame(PRTP_VIDEO_QUEUE queue) {
}
}
ret = reed_solomon_reconstruct(rs, packets, marks, totalPackets, receiveSize);
ret = reed_solomon_decode(rs, packets, marks, totalPackets, receiveSize);
// We should always provide enough parity to recover the missing data successfully.
// If this fails, something is probably wrong with our FEC state.

191
src/rswrapper.c Normal file
View File

@@ -0,0 +1,191 @@
/**
* @file src/rswrapper.c
* @brief Wrappers for nanors vectorization with different ISA options
*/
// _FORTIFY_SOURCE can cause some versions of GCC to try to inline
// memset() with incompatible target options when compiling rs.c
#ifdef _FORTIFY_SOURCE
#undef _FORTIFY_SOURCE
#endif
// The assert() function is decorated with __cold on macOS which
// is incompatible with Clang's target multiversioning feature.
// Forcing NDEBUG here makes assert() expand to nothing in any <assert.h>
// that is included after this point in this translation unit.
#ifndef NDEBUG
#define NDEBUG
#endif
// Two-level token paste: the extra DECORATE_FUNC_I step lets ISA_SUFFIX be
// macro-expanded before ## glues it onto the function name. ISA_SUFFIX is
// (re)defined immediately before each inclusion of rs.c further down, so the
// same source yields a differently named set of functions every time.
#define DECORATE_FUNC_I(a, b) a##b
#define DECORATE_FUNC(a, b) DECORATE_FUNC_I(a, b)
// Append an ISA suffix to the public RS API
#define reed_solomon_init DECORATE_FUNC(reed_solomon_init, ISA_SUFFIX)
#define reed_solomon_new DECORATE_FUNC(reed_solomon_new, ISA_SUFFIX)
#define reed_solomon_new_static DECORATE_FUNC(reed_solomon_new_static, ISA_SUFFIX)
#define reed_solomon_release DECORATE_FUNC(reed_solomon_release, ISA_SUFFIX)
#define reed_solomon_decode DECORATE_FUNC(reed_solomon_decode, ISA_SUFFIX)
#define reed_solomon_encode DECORATE_FUNC(reed_solomon_encode, ISA_SUFFIX)
// Append an ISA suffix to internal functions to prevent multiple definition errors
// (rs.c is included several times in this one translation unit)
#define obl_axpy_ref DECORATE_FUNC(obl_axpy_ref, ISA_SUFFIX)
#define obl_scal_ref DECORATE_FUNC(obl_scal_ref, ISA_SUFFIX)
#define obl_axpyb32_ref DECORATE_FUNC(obl_axpyb32_ref, ISA_SUFFIX)
#define obl_axpy DECORATE_FUNC(obl_axpy, ISA_SUFFIX)
#define obl_scal DECORATE_FUNC(obl_scal, ISA_SUFFIX)
#define obl_swap DECORATE_FUNC(obl_swap, ISA_SUFFIX)
#define obl_axpyb32 DECORATE_FUNC(obl_axpyb32, ISA_SUFFIX)
#define axpy DECORATE_FUNC(axpy, ISA_SUFFIX)
#define scal DECORATE_FUNC(scal, ISA_SUFFIX)
#define gemm DECORATE_FUNC(gemm, ISA_SUFFIX)
#define invert_mat DECORATE_FUNC(invert_mat, ISA_SUFFIX)
// x86 only: build three extra copies of rs.c, one per vector ISA. Each copy is
// bracketed by a compiler-specific "target" pragma (pushed before the include,
// popped after it) and gets its own ISA_SUFFIX, so the decorated names above
// become *_ssse3, *_avx2 and *_avx512. When the compiler is neither Clang nor
// GCC (e.g. plain MSVC) no target pragma is emitted at all.
// NOTE(review): the OBLAS_* macros presumably select the matching SIMD kernel
// inside nanors - they are consumed by rs.c, which is not visible here.
#if defined(__x86_64__) || defined(__i386__) || (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64)))
// Compile a variant for SSSE3
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("ssse3"))), apply_to = function)
#elif __GNUC__
#pragma GCC push_options
#pragma GCC target("ssse3")
#endif
#define ISA_SUFFIX _ssse3
#define OBLAS_SSE3
#include "../nanors/rs.c"
#undef OBLAS_SSE3
#undef ISA_SUFFIX
#if defined(__clang__)
#pragma clang attribute pop
#elif __GNUC__
#pragma GCC pop_options
#endif
// Compile a variant for AVX2
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
#elif __GNUC__
#pragma GCC push_options
#pragma GCC target("avx2")
#endif
#define ISA_SUFFIX _avx2
#define OBLAS_AVX2
#include "../nanors/rs.c"
#undef OBLAS_AVX2
#undef ISA_SUFFIX
#if defined(__clang__)
#pragma clang attribute pop
#elif __GNUC__
#pragma GCC pop_options
#endif
// Compile a variant for AVX512BW
// (the target string requests both avx512f and avx512bw, so the runtime
// dispatch must confirm both features before selecting this variant)
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx512f,avx512bw"))), apply_to = function)
#elif __GNUC__
#pragma GCC push_options
#pragma GCC target("avx512f,avx512bw")
#endif
#define ISA_SUFFIX _avx512
#define OBLAS_AVX512
#include "../nanors/rs.c"
#undef OBLAS_AVX512
#undef ISA_SUFFIX
#if defined(__clang__)
#pragma clang attribute pop
#elif __GNUC__
#pragma GCC pop_options
#endif
#endif
// Compile a default variant
// This copy is built on every architecture, without any target pragma, and
// produces the *_def functions used as the fallback by reed_solomon_init().
// NOTE(review): autoshim.h presumably picks an implementation from the
// compiler's baseline flags - confirm in nanors/deps/obl.
#define ISA_SUFFIX _def
#include "../nanors/deps/obl/autoshim.h"
#include "../nanors/rs.c"
#undef ISA_SUFFIX
// Remove the decorating macros for the public API. rswrapper.h, included next,
// redefines reed_solomon_new/release/encode/decode as aliases for the dispatch
// pointers and declares the undecorated reed_solomon_init().
#undef reed_solomon_init
#undef reed_solomon_new
#undef reed_solomon_new_static
#undef reed_solomon_release
#undef reed_solomon_decode
#undef reed_solomon_encode
#include "rswrapper.h"
// Dispatch pointers for the selected RS implementation. As file-scope objects
// they start out as null pointers; reed_solomon_init() assigns all four.
reed_solomon_new_t reed_solomon_new_fn;
reed_solomon_release_t reed_solomon_release_fn;
reed_solomon_encode_t reed_solomon_encode_fn;
reed_solomon_decode_t reed_solomon_decode_fn;
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
#if defined(_M_AMD64)
// For some reason this is needed to avoid a "C1189 No target architecture" error from winnt.h
# define _AMD64_
#endif
#include <processthreadsapi.h>
#include <intrin.h>

/**
 * @brief Report whether SSSE3 instructions are usable.
 * @return Nonzero when IsProcessorFeaturePresent() reports SSSE3.
 */
BOOL _msc_supports_ssse3(void) { return IsProcessorFeaturePresent(PF_SSSE3_INSTRUCTIONS_AVAILABLE); }

/**
 * @brief Report whether AVX2 instructions are usable.
 * @return Nonzero when IsProcessorFeaturePresent() reports AVX2.
 */
BOOL _msc_supports_avx2(void) { return IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE); }

/**
 * @brief Report whether the AVX-512 variant of the RS code can run.
 * @details The AVX-512 variant is built for both AVX512F and AVX512BW, and the
 *          GCC/Clang dispatch path requires both features. Windows only exposes
 *          a processor-feature flag for AVX512F here, so AVX512BW is read from
 *          CPUID leaf 7, sub-leaf 0, EBX bit 30.
 * @return Nonzero only when AVX512F is reported by the OS and the CPU
 *         advertises AVX512BW.
 */
BOOL _msc_supports_avx512f(void) {
  int regs[4] = { 0, 0, 0, 0 };

  // The OS-level query also covers whether the AVX-512 state is usable.
  if (!IsProcessorFeaturePresent(PF_AVX512F_INSTRUCTIONS_AVAILABLE)) {
    return 0;
  }

  // Leaf 0 returns the highest supported standard leaf in EAX.
  __cpuid(regs, 0);
  if (regs[0] < 7) {
    return 0;
  }

  // Structured extended feature flags: EBX bit 30 is AVX512BW.
  __cpuidex(regs, 7, 0);
  return (((unsigned int) regs[1] >> 30) & 1u) != 0;
}
#endif
// Point all four dispatch pointers at one ISA variant and run that variant's
// own init routine. The names are assembled with ## so that the
// reed_solomon_* aliases from rswrapper.h are not expanded here.
#define RS_BIND_VARIANT(isa) \
  do { \
    reed_solomon_new_fn = reed_solomon_new##isa; \
    reed_solomon_release_fn = reed_solomon_release##isa; \
    reed_solomon_encode_fn = reed_solomon_encode##isa; \
    reed_solomon_decode_fn = reed_solomon_decode##isa; \
    reed_solomon_init##isa(); \
  } while (0)

/**
 * @brief Bind the RS function pointers to the widest vectorized build the CPU supports.
 * @details ISA levels are probed from widest to narrowest (AVX-512, AVX2, SSSE3).
 *          The first supported level is selected and its init routine is run.
 *          When none is supported, or the target has no probe below, the
 *          default build is selected. Callers reach the chosen functions
 *          through the reed_solomon_*_fn pointers.
 */
void reed_solomon_init(void) {
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
  // Visual Studio
  if (_msc_supports_avx512f()) {
    RS_BIND_VARIANT(_avx512);
    return;
  }
  if (_msc_supports_avx2()) {
    RS_BIND_VARIANT(_avx2);
    return;
  }
  if (_msc_supports_ssse3()) {
    RS_BIND_VARIANT(_ssse3);
    return;
  }
#elif defined(__x86_64__)
  // gcc & clang
  if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) {
    RS_BIND_VARIANT(_avx512);
    return;
  }
  if (__builtin_cpu_supports("avx2")) {
    RS_BIND_VARIANT(_avx2);
    return;
  }
  if (__builtin_cpu_supports("ssse3")) {
    RS_BIND_VARIANT(_ssse3);
    return;
  }
#endif

  // No vectorized variant applies: fall back to the default build.
  RS_BIND_VARIANT(_def);
}
#undef RS_BIND_VARIANT

32
src/rswrapper.h Normal file
View File

@@ -0,0 +1,32 @@
/**
* @file src/rswrapper.h
* @brief Wrappers for nanors vectorization
* @details This is a drop-in replacement for nanors rs.h
*/
#pragma once
// standard includes
#include <stdint.h>
// Opaque handle: only a forward declaration is provided by this header.
typedef struct _reed_solomon reed_solomon;
// Signatures of the per-ISA implementations that the dispatch pointers below
// can be bound to.
// Creates a codec for the given data/parity shard counts.
typedef reed_solomon *(*reed_solomon_new_t)(int data_shards, int parity_shards);
// Releases a codec obtained from the matching "new" function.
typedef void (*reed_solomon_release_t)(reed_solomon *rs);
// NOTE(review): presumably nr_shards is the number of entries in shards and bs
// the size of each shard in bytes - confirm against nanors/rs.h.
typedef int (*reed_solomon_encode_t)(reed_solomon *rs, uint8_t **shards, int nr_shards, int bs);
// Same as encode plus a marks array; callers treat a zero return as success.
typedef int (*reed_solomon_decode_t)(reed_solomon *rs, uint8_t **shards, uint8_t *marks, int nr_shards, int bs);
// Dispatch pointers, defined in rswrapper.c and assigned by reed_solomon_init().
extern reed_solomon_new_t reed_solomon_new_fn;
extern reed_solomon_release_t reed_solomon_release_fn;
extern reed_solomon_encode_t reed_solomon_encode_fn;
extern reed_solomon_decode_t reed_solomon_decode_fn;
// Map the plain nanors API names onto the dispatch pointers so that existing
// call sites such as reed_solomon_decode(...) call through the selected variant.
#define reed_solomon_new reed_solomon_new_fn
#define reed_solomon_release reed_solomon_release_fn
#define reed_solomon_encode reed_solomon_encode_fn
#define reed_solomon_decode reed_solomon_decode_fn
/**
* @brief This initializes the RS function pointers to the best vectorized version available.
* @details The streaming code invokes these function pointers through the
*          reed_solomon_new/release/encode/decode macros, so this must be
*          called before any of those macros is used.
*/
void reed_solomon_init(void);