// Mirror of https://github.com/moonlight-stream/moonlight-common-c.git
// (synced 2026-04-02).
//
// SIMDe includes support for AltiVec, LSX, NEON, and more. It also includes
// fallbacks that are optimized for compiler autovectorization on other
// architectures (like RISC-V RVV). Since there is no runtime dispatching for
// these, they are selected depending on which instructions are enabled by
// the toolchain at compile time.
// Note about the frequent use of #undef before most defines in this file:
// these exist to prevent a lot of (harmless) redefined-macro warnings if
// this file is re-included multiple times in the same build context.
// Sunshine and Moonlight use this trick to bundle all arch builds into the
// same binary along with runtime detection. An example can be found at
// https://github.com/LizardByte/Sunshine/blob/master/src/rswrapper.c
#include "oblas_lite.h"
|
|
|
|
#if defined(OBLAS_TINY)
|
|
static inline uint8_t gf2_8_mul(uint16_t a, uint16_t b)
|
|
{
|
|
if (!a || !b) {
|
|
return 0;
|
|
}
|
|
|
|
// Perform 8-bit, carry-less multiplication of |a| and |b|.
|
|
return GF2_8_EXP[GF2_8_LOG[a] + GF2_8_LOG[b]];
|
|
}
|
|
|
|
// Scalar reference axpy: a[i] ^= u * b[i] over GF(2^8) for i in [0, k).
static void obl_axpy_ref(u8 *a, u8 *b, u8 u, unsigned k)
{
  for (unsigned i = 0; i < k; i++)
    a[i] ^= gf2_8_mul(u, b[i]);
}
|
|
|
|
// Scalar reference scale: a[i] = u * a[i] over GF(2^8) for i in [0, k).
// |b| is unused; it exists so both _ref kernels share one signature for
// the OBL_SHUF tail dispatch.
static void obl_scal_ref(u8 *a, u8 *b, u8 u, unsigned k)
{
  (void)b;
  for (unsigned i = 0; i < k; i++)
    a[i] = gf2_8_mul(u, a[i]);
}
|
|
|
|
#else
|
|
#include "gf2_8_mul_table.h"
|
|
|
|
// Scalar reference axpy: a[i] ^= u * b[i] over GF(2^8) for i in [0, k),
// using the full multiplication table -- row u of GF2_8_MUL holds the
// 256 products u * x.
static void obl_axpy_ref(u8 *a, u8 *b, u8 u, unsigned k)
{
  const u8 *row = &GF2_8_MUL[u << 8];
  for (unsigned i = 0; i < k; i++)
    a[i] ^= row[b[i]];
}
|
|
|
|
// Scalar reference scale: a[i] = u * a[i] over GF(2^8) for i in [0, k),
// via one row of the full multiplication table. |b| is unused; it exists
// so both _ref kernels share one signature for the OBL_SHUF tail dispatch.
static void obl_scal_ref(u8 *a, u8 *b, u8 u, unsigned k)
{
  (void)b;
  const u8 *row = &GF2_8_MUL[u << 8];
  for (unsigned i = 0; i < k; i++)
    a[i] = row[a[i]];
}
|
|
#endif
|
|
|
|
// Scalar reference for the masked XOR: |b| is a packed bitmask with one bit
// per byte of |a| (32 bits per word). For every set bit (32*p + tz), XOR the
// scalar |u| into a[32*p + tz]. Assumes |a| holds at least k bytes and |b|
// holds ceil(k / 32) words -- TODO confirm callers guarantee no mask bits
// are set at or beyond index k.
void obl_axpyb32_ref(u8 *a, u32 *b, u8 u, unsigned k)
{
  for (unsigned idx = 0, p = 0; idx < k; idx += 8 * sizeof(u32), p++) {
    u32 tmp = b[p];
    // Iterate over set bits only, lowest first.
    while (tmp > 0) {
#ifdef _MSC_VER
      // MSVC has no __builtin_ctz; _BitScanForward returns the index of the
      // lowest set bit (tmp is known non-zero here, so the result is valid).
      unsigned long index = 0;
      _BitScanForward(&index, tmp);
      unsigned tz = (unsigned int)index;
#else
      unsigned tz = __builtin_ctz(tmp);
#endif
      // Clear the lowest set bit, then XOR u into the byte it addresses.
      tmp = tmp & (tmp - 1);
      a[tz + idx] ^= u;
    }
  }
}
|
|
|
|
#if defined(OBLAS_AVX512)
|
|
#include <immintrin.h>
|
|
|
|
// AVX-512 kernels operate on 64-byte vectors.
#undef OBLAS_ALIGN
#define OBLAS_ALIGN 64

// Nibble-shuffle GF(2^8) multiply kernel (AVX-512): each source byte is
// split into low/high nibbles, each nibble is looked up in a 16-entry
// per-multiplier table (GF2_8_SHUF_LO / GF2_8_SHUF_HI, row u -- presumably
// the products of u with the nibble values), and the two lookups are XORed
// into the full product. |f| combines the product with the destination
// (XOR for axpy, pass-through for scal). The macro captures |u| and |k|
// from the enclosing scope; the sub-vector tail of k % 64 bytes is handled
// by the scalar op##_ref fallback.
#undef OBL_SHUF
#define OBL_SHUF(op, a, b, f) \
  do { \
    const u8 *u_lo = GF2_8_SHUF_LO + u * 16; \
    const u8 *u_hi = GF2_8_SHUF_HI + u * 16; \
    const __m512i mask = _mm512_set1_epi8(0x0f); \
    const __m128i ulo_128 = _mm_loadu_si128((__m128i *)u_lo); \
    const __m128i uhi_128 = _mm_loadu_si128((__m128i *)u_hi); \
    const __m512i urow_lo = _mm512_broadcast_i32x4(ulo_128); \
    const __m512i urow_hi = _mm512_broadcast_i32x4(uhi_128); \
    __m512i *ap = (__m512i *)a, *ae = (__m512i *)(a + k - (k % sizeof(__m512i))), *bp = (__m512i *)b; \
    for (; ap < ae; ap++, bp++) { \
      __m512i bx = _mm512_loadu_si512(bp); \
      __m512i lo = _mm512_and_si512(bx, mask); \
      bx = _mm512_srli_epi64(bx, 4); \
      __m512i hi = _mm512_and_si512(bx, mask); \
      lo = _mm512_shuffle_epi8(urow_lo, lo); \
      hi = _mm512_shuffle_epi8(urow_hi, hi); \
      _mm512_storeu_si512(ap, f(_mm512_loadu_si512(ap), _mm512_xor_si512(lo, hi))); \
    } \
    op##_ref((u8 *)ap, (u8 *)bp, u, k % sizeof(__m512i)); \
  } while (0)
|
|
|
|
// Combiner passed to OBL_SHUF for the accumulating (axpy) case.
#undef OBL_SHUF_XOR
#define OBL_SHUF_XOR _mm512_xor_si512

// Mask-driven XOR (AVX-512): for each set bit in the 32-bit words of |b|,
// XOR the scalar u into the corresponding byte of |a|. Each word is
// broadcast, |scatter| replicates its bytes so destination byte i sees the
// source byte holding bit i, |cmpmask| isolates that bit per byte, and the
// compare/blend turns the result into a 0x00-or-u vector XORed into 64
// bytes of |a| per iteration. NOTE(review): unlike OBL_SHUF there is no
// scalar tail here -- this appears to assume k is a multiple of 64; confirm
// against callers.
#undef OBL_AXPYB32
#define OBL_AXPYB32(a, b, u, k) \
  do { \
    __m512i *ap = (__m512i *)a, *ae = (__m512i *)(a + k); \
    __m512i scatter = \
        _mm512_set_epi32(0x03030303, 0x03030303, 0x02020202, 0x02020202, 0x01010101, 0x01010101, 0x00000000, 0x00000000, \
                         0x03030303, 0x03030303, 0x02020202, 0x02020202, 0x01010101, 0x01010101, 0x00000000, 0x00000000); \
    __m512i cmpmask = \
        _mm512_set_epi32(0x80402010, 0x08040201, 0x80402010, 0x08040201, 0x80402010, 0x08040201, 0x80402010, 0x08040201, \
                         0x80402010, 0x08040201, 0x80402010, 0x08040201, 0x80402010, 0x08040201, 0x80402010, 0x08040201); \
    __m512i up = _mm512_set1_epi8(u); \
    for (unsigned p = 0; ap < ae; p++, ap++) { \
      __m512i bcast = _mm512_set1_epi32(b[p]); \
      __m512i ret = _mm512_shuffle_epi8(bcast, scatter); \
      ret = _mm512_andnot_si512(ret, cmpmask); \
      __mmask64 tmp = _mm512_cmpeq_epi8_mask(ret, _mm512_setzero_si512()); \
      ret = _mm512_mask_blend_epi8(tmp, _mm512_setzero_si512(), up); \
      _mm512_storeu_si512(ap, _mm512_xor_si512(_mm512_loadu_si512(ap), ret)); \
    } \
  } while (0)
|
|
|
|
#elif defined(OBLAS_AVX2)
|
|
#include <immintrin.h>
|
|
|
|
// AVX2 kernels operate on 32-byte vectors.
#undef OBLAS_ALIGN
#define OBLAS_ALIGN 32

// Nibble-shuffle GF(2^8) multiply kernel (AVX2). Same scheme as the
// AVX-512 variant: low/high nibble lookups in the 16-entry per-multiplier
// tables are XORed into the product, and |f| combines it with the
// destination (XOR for axpy, pass-through for scal). Captures |u| and |k|
// from the enclosing scope; the k % 32 byte tail goes to op##_ref.
#undef OBL_SHUF
#define OBL_SHUF(op, a, b, f) \
  do { \
    const u8 *u_lo = GF2_8_SHUF_LO + u * 16; \
    const u8 *u_hi = GF2_8_SHUF_HI + u * 16; \
    const __m256i mask = _mm256_set1_epi8(0x0f); \
    const __m256i urow_lo = _mm256_loadu2_m128i((__m128i *)u_lo, (__m128i *)u_lo); \
    const __m256i urow_hi = _mm256_loadu2_m128i((__m128i *)u_hi, (__m128i *)u_hi); \
    __m256i *ap = (__m256i *)a, *ae = (__m256i *)(a + k - (k % sizeof(__m256i))), *bp = (__m256i *)b; \
    for (; ap < ae; ap++, bp++) { \
      __m256i bx = _mm256_loadu_si256(bp); \
      __m256i lo = _mm256_and_si256(bx, mask); \
      bx = _mm256_srli_epi64(bx, 4); \
      __m256i hi = _mm256_and_si256(bx, mask); \
      lo = _mm256_shuffle_epi8(urow_lo, lo); \
      hi = _mm256_shuffle_epi8(urow_hi, hi); \
      _mm256_storeu_si256(ap, f(_mm256_loadu_si256(ap), _mm256_xor_si256(lo, hi))); \
    } \
    op##_ref((u8 *)ap, (u8 *)bp, u, k % sizeof(__m256i)); \
  } while (0)
|
|
|
|
// Combiner passed to OBL_SHUF for the accumulating (axpy) case.
#undef OBL_SHUF_XOR
#define OBL_SHUF_XOR _mm256_xor_si256

// Mask-driven XOR (AVX2): for each set bit in the 32-bit words of |b|, XOR
// the scalar u into the corresponding byte of |a|, 32 bytes per mask word.
// |scatter| replicates the mask bytes, |cmpmask| isolates one bit per
// destination byte, and the compare/AND builds a 0x00-or-u vector.
// NOTE(review): no scalar tail -- appears to assume k is a multiple of 32;
// confirm against callers.
#undef OBL_AXPYB32
#define OBL_AXPYB32(a, b, u, k) \
  do { \
    __m256i *ap = (__m256i *)a, *ae = (__m256i *)(a + k); \
    __m256i scatter = \
        _mm256_set_epi32(0x03030303, 0x03030303, 0x02020202, 0x02020202, 0x01010101, 0x01010101, 0x00000000, 0x00000000); \
    __m256i cmpmask = \
        _mm256_set_epi32(0x80402010, 0x08040201, 0x80402010, 0x08040201, 0x80402010, 0x08040201, 0x80402010, 0x08040201); \
    __m256i up = _mm256_set1_epi8(u); \
    for (unsigned p = 0; ap < ae; p++, ap++) { \
      __m256i bcast = _mm256_set1_epi32(b[p]); \
      __m256i ret = _mm256_shuffle_epi8(bcast, scatter); \
      ret = _mm256_andnot_si256(ret, cmpmask); \
      ret = _mm256_and_si256(_mm256_cmpeq_epi8(ret, _mm256_setzero_si256()), up); \
      _mm256_storeu_si256(ap, _mm256_xor_si256(_mm256_loadu_si256(ap), ret)); \
    } \
  } while (0)
|
|
|
|
#else
|
|
|
|
#if defined(OBLAS_SSE3)
|
|
#include <emmintrin.h>
|
|
#include <tmmintrin.h>
|
|
#else
|
|
#define SIMDE_ENABLE_NATIVE_ALIASES
|
|
#include <simde/x86/ssse3.h>
|
|
#endif
|
|
|
|
// SSE/SIMDe kernels operate on 16-byte vectors.
#undef OBLAS_ALIGN
#define OBLAS_ALIGN 16

// Nibble-shuffle GF(2^8) multiply kernel (SSSE3, or SIMDe fallback). Same
// scheme as the wider variants: low/high nibble lookups in the 16-entry
// per-multiplier tables are XORed into the product, and |f| combines it
// with the destination (XOR for axpy, pass-through for scal). Captures
// |u| and |k| from the enclosing scope; the k % 16 byte tail goes to
// op##_ref.
#undef OBL_SHUF
#define OBL_SHUF(op, a, b, f) \
  do { \
    const u8 *u_lo = GF2_8_SHUF_LO + u * 16; \
    const u8 *u_hi = GF2_8_SHUF_HI + u * 16; \
    const __m128i mask = _mm_set1_epi8(0x0f); \
    const __m128i urow_lo = _mm_loadu_si128((__m128i *)u_lo); \
    const __m128i urow_hi = _mm_loadu_si128((__m128i *)u_hi); \
    __m128i *ap = (__m128i *)a, *ae = (__m128i *)(a + k - (k % sizeof(__m128i))), *bp = (__m128i *)b; \
    for (; ap < ae; ap++, bp++) { \
      __m128i bx = _mm_loadu_si128(bp); \
      __m128i lo = _mm_and_si128(bx, mask); \
      bx = _mm_srli_epi64(bx, 4); \
      __m128i hi = _mm_and_si128(bx, mask); \
      lo = _mm_shuffle_epi8(urow_lo, lo); \
      hi = _mm_shuffle_epi8(urow_hi, hi); \
      _mm_storeu_si128(ap, f(_mm_loadu_si128(ap), _mm_xor_si128(lo, hi))); \
    } \
    op##_ref((u8 *)ap, (u8 *)bp, u, k % sizeof(__m128i)); \
  } while (0)
|
|
|
|
// Combiner passed to OBL_SHUF for the accumulating (axpy) case.
#undef OBL_SHUF_XOR
#define OBL_SHUF_XOR _mm_xor_si128

// Mask-driven XOR (SSE): for each set bit in the 32-bit words of |b|, XOR
// the scalar u into the corresponding byte of |a|. A 16-byte vector covers
// only half a mask word, so each iteration scatters the word's low and
// high halves separately and performs two stores (ap advances twice,
// 32 bytes per mask word). NOTE(review): no scalar tail -- appears to
// assume k is a multiple of 32; confirm against callers.
#undef OBL_AXPYB32
#define OBL_AXPYB32(a, b, u, k) \
  do { \
    __m128i *ap = (__m128i *)a, *ae = (__m128i *)(a + k); \
    __m128i scatter_hi = _mm_set_epi32(0x03030303, 0x03030303, 0x02020202, 0x02020202); \
    __m128i scatter_lo = _mm_set_epi32(0x01010101, 0x01010101, 0x00000000, 0x00000000); \
    __m128i cmpmask = _mm_set_epi32(0x80402010, 0x08040201, 0x80402010, 0x08040201); \
    __m128i up = _mm_set1_epi8(u); \
    for (unsigned p = 0; ap < ae; p++, ap++) { \
      __m128i bcast = _mm_set1_epi32(b[p]); \
      __m128i ret_lo = _mm_shuffle_epi8(bcast, scatter_lo); \
      __m128i ret_hi = _mm_shuffle_epi8(bcast, scatter_hi); \
      ret_lo = _mm_andnot_si128(ret_lo, cmpmask); \
      ret_hi = _mm_andnot_si128(ret_hi, cmpmask); \
      ret_lo = _mm_and_si128(_mm_cmpeq_epi8(ret_lo, _mm_setzero_si128()), up); \
      ret_hi = _mm_and_si128(_mm_cmpeq_epi8(ret_hi, _mm_setzero_si128()), up); \
      _mm_storeu_si128(ap, _mm_xor_si128(_mm_loadu_si128(ap), ret_lo)); \
      ap++; \
      _mm_storeu_si128(ap, _mm_xor_si128(_mm_loadu_si128(ap), ret_hi)); \
    } \
  } while (0)
|
|
#endif
|
|
|
|
#define OBL_NOOP(a, b) (b)
|
|
void obl_axpy(u8 *a, u8 *b, u8 u, unsigned k)
|
|
{
|
|
if (u == 1) {
|
|
register u8 *ap = a, *ae = &a[k], *bp = b;
|
|
for (; ap < ae; ap++, bp++)
|
|
*ap ^= *bp;
|
|
} else {
|
|
OBL_SHUF(obl_axpy, a, b, OBL_SHUF_XOR);
|
|
}
|
|
}
|
|
|
|
// Public in-place scale: a[i] = u * a[i] over GF(2^8) for i in [0, k).
// The buffer is passed as both source and destination; OBL_NOOP discards
// the old destination value so OBL_SHUF stores the product directly
// instead of XOR-accumulating it.
void obl_scal(u8 *a, u8 u, unsigned k)
{
  OBL_SHUF(obl_scal, a, a, OBL_NOOP);
}
|
|
|
|
// Exchange the first k bytes of |a| and |b|, byte by byte.
void obl_swap(u8 *a, u8 *b, unsigned k)
{
  for (unsigned i = 0; i < k; i++) {
    u8 t = a[i];
    a[i] = b[i];
    b[i] = t;
  }
}
|
|
|
|
// Public masked XOR: dispatch to the SIMD kernel selected at compile time
// (AVX-512, AVX2, or SSE/SIMDe). See obl_axpyb32_ref for the operation's
// semantics: XOR |u| into each byte of |a| whose bit is set in |b|.
void obl_axpyb32(u8 *a, u32 *b, u8 u, unsigned k)
{
  OBL_AXPYB32(a, b, u, k);
}
|