From 67181aa353fd2accdb661ed87327a9b67bcddf73 Mon Sep 17 00:00:00 2001 From: Siarhei Volkau Date: Sun, 17 Aug 2025 16:58:36 +0300 Subject: [PATCH 1/7] refactor: MIPS: fix __builtin_clz usage For historical reasons, the result of __builtin_clz is undefined for 0. Handle that case explicitly in the MIPS port. Signed-off-by: Siarhei Volkau --- silk/mips/macros_mipsr1.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/silk/mips/macros_mipsr1.h b/silk/mips/macros_mipsr1.h index af408802c..5fbe1ab7e 100644 --- a/silk/mips/macros_mipsr1.h +++ b/silk/mips/macros_mipsr1.h @@ -29,7 +29,10 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef SILK_MACROS_MIPSR1_H__ #define SILK_MACROS_MIPSR1_H__ -#define mips_clz(x) __builtin_clz(x) +static inline int mips_clz(opus_uint32 x) +{ + return x ? __builtin_clz(x) : 32; +} #undef silk_SMULWB static inline int silk_SMULWB(int a, int b) From 41ebef3280cec71def7721ca092311a016f0d7ed Mon Sep 17 00:00:00 2001 From: Siarhei Volkau Date: Sun, 17 Aug 2025 17:02:57 +0300 Subject: [PATCH 2/7] refactor: MIPS: fix silk_CLZ16 port The MIPS port of silk_CLZ16 sign-extends its opus_int16 input to opus_int32. For a negative input it therefore returns -16 instead of the expected 0. The input must be zero-extended before it is passed to mips_clz / __builtin_clz. 
Signed-off-by: Siarhei Volkau --- silk/mips/macros_mipsr1.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/silk/mips/macros_mipsr1.h b/silk/mips/macros_mipsr1.h index 5fbe1ab7e..0393a33fa 100644 --- a/silk/mips/macros_mipsr1.h +++ b/silk/mips/macros_mipsr1.h @@ -78,9 +78,9 @@ static inline int silk_SMLAWW(int a, int b, int c) static inline opus_int32 silk_CLZ16(opus_int16 in16) { int re32; - opus_int32 in32 = (opus_int32 )in16; + opus_uint32 in32 = (opus_uint16)in16; re32 = mips_clz(in32); - re32-=16; + re32 -= 16; return re32; } From 00e141f9f3be187cb33bcf0383ff777107768305 Mon Sep 17 00:00:00 2001 From: Siarhei Volkau Date: Sun, 17 Aug 2025 17:12:41 +0300 Subject: [PATCH 3/7] MIPS: enable OPUS_FAST_INT64 for any MIPS Although 32-bit MIPS does not meet the rules described for enabling OPUS_FAST_INT64, it has a fast 32x32 multiplication with a full 64-bit result. That is enough to enable OPUS_FAST_INT64 for any MIPS, since OPUS_FAST_INT64 guards various multiplication implementations. Maybe it is worth giving it a more precise name, e.g. OPUS_FAST_MULT? The GCC macro __mips covers both 32- and 64-bit MIPS. Signed-off-by: Siarhei Volkau --- celt/arch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/celt/arch.h b/celt/arch.h index a6055f403..dd095b218 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -121,7 +121,7 @@ void celt_fatal(const char *str, const char *file, int line) /* Set this if opus_int64 is a native type of the CPU. */ /* Assume that all LP64 architectures have fast 64-bit types; also x86_64 (which can be ILP32 for x32) and Win64 (which is LLP64). 
*/ -#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64) +#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64) || defined (__mips) #define OPUS_FAST_INT64 1 #else #define OPUS_FAST_INT64 0 From 3c252b78057029dc9b3b040915e6f877ce7803b0 Mon Sep 17 00:00:00 2001 From: Siarhei Volkau Date: Sun, 17 Aug 2025 17:28:44 +0300 Subject: [PATCH 4/7] MIPS: generalize build options The current MIPS port provides a special implementation for CPUs with the DSP extension. It is enabled by passing the MIPSr1_ASM define to the compiler. The name MIPSr1_ASM is very confusing; r1 might mean: - MIPS I - MIPS32/MIPS64 release 1 - MIPS DSP ASE revision 1 (the correct one) Let's remove it and check GCC's built-in define __mips_dsp instead. This allows the default autotools build system to be used instead of a hand-crafted makefile. Signed-off-by: Siarhei Volkau --- Makefile.mips | 169 ------------------------ celt/_kiss_fft_guts.h | 2 +- celt/celt.c | 2 +- celt/fixed_generic.h | 2 +- celt/mdct.c | 2 +- celt/pitch.h | 2 +- celt/vq.c | 2 +- silk/NSQ_del_dec.c | 2 +- silk/SigProc_FIX.h | 2 +- silk/fixed/noise_shape_analysis_FIX.c | 4 +- silk/fixed/warped_autocorrelation_FIX.c | 2 +- silk/macros.h | 2 +- 12 files changed, 12 insertions(+), 181 deletions(-) delete mode 100644 Makefile.mips diff --git a/Makefile.mips b/Makefile.mips deleted file mode 100644 index bc12ba86e..000000000 --- a/Makefile.mips +++ /dev/null @@ -1,169 +0,0 @@ -#################### COMPILE OPTIONS ####################### - -# Uncomment this for fixed-point build -FIXED_POINT=1 - -# It is strongly recommended to uncomment one of these -# VAR_ARRAYS: Use C99 variable-length arrays for stack allocation -# USE_ALLOCA: Use alloca() for stack allocation -# If none is defined, then the fallback is a non-threadsafe global array -CFLAGS := -DUSE_ALLOCA $(CFLAGS) -#CFLAGS := -DVAR_ARRAYS $(CFLAGS) - -# These options affect performance -# HAVE_LRINTF: Use C99 intrinsics to speed up float-to-int conversion -CFLAGS := -DHAVE_LRINTF 
$(CFLAGS) - -###################### END OF OPTIONS ###################### - --include package_version - -include silk_sources.mk -include celt_sources.mk -include opus_sources.mk - -ifdef FIXED_POINT -SILK_SOURCES += $(SILK_SOURCES_FIXED) -else -SILK_SOURCES += $(SILK_SOURCES_FLOAT) -OPUS_SOURCES += $(OPUS_SOURCES_FLOAT) -endif - -EXESUFFIX = -LIBPREFIX = lib -LIBSUFFIX = .a -OBJSUFFIX = .o - -CC = $(TOOLCHAIN_PREFIX)cc$(TOOLCHAIN_SUFFIX) -AR = $(TOOLCHAIN_PREFIX)ar -RANLIB = $(TOOLCHAIN_PREFIX)ranlib -CP = $(TOOLCHAIN_PREFIX)cp - -cppflags-from-defines = $(addprefix -D,$(1)) -cppflags-from-includes = $(addprefix -I,$(1)) -ldflags-from-ldlibdirs = $(addprefix -L,$(1)) -ldlibs-from-libs = $(addprefix -l,$(1)) - -WARNINGS = -Wall -W -Wstrict-prototypes -Wextra -Wcast-align -Wnested-externs -Wshadow - -CFLAGS += -mips32r2 -mno-mips16 -std=gnu99 -O2 -g $(WARNINGS) -DENABLE_ASSERTIONS -DMIPSr1_ASM -DOPUS_BUILD -mdspr2 -march=74kc -mtune=74kc -mmt -mgp32 - -CINCLUDES = include silk celt - -ifdef FIXED_POINT -CFLAGS += -DFIXED_POINT=1 -DDISABLE_FLOAT_API -CINCLUDES += silk/fixed -else -CINCLUDES += silk/float -endif - - -LIBS = m - -LDLIBDIRS = ./ - -CFLAGS += $(call cppflags-from-defines,$(CDEFINES)) -CFLAGS += $(call cppflags-from-includes,$(CINCLUDES)) -LDFLAGS += $(call ldflags-from-ldlibdirs,$(LDLIBDIRS)) -LDLIBS += $(call ldlibs-from-libs,$(LIBS)) - -COMPILE.c.cmdline = $(CC) -c $(CFLAGS) -o $@ $< -LINK.o = $(CC) $(LDPREFLAGS) $(LDFLAGS) -LINK.o.cmdline = $(LINK.o) $^ $(LDLIBS) -o $@$(EXESUFFIX) - -ARCHIVE.cmdline = $(AR) $(ARFLAGS) $@ $^ && $(RANLIB) $@ - -%$(OBJSUFFIX):%.c - $(COMPILE.c.cmdline) - -%$(OBJSUFFIX):%.cpp - $(COMPILE.cpp.cmdline) - -# Directives - - -# Variable definitions -LIB_NAME = opus -TARGET = $(LIBPREFIX)$(LIB_NAME)$(LIBSUFFIX) - -SRCS_C = $(SILK_SOURCES) $(CELT_SOURCES) $(OPUS_SOURCES) - -OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(SRCS_C)) - -OPUSDEMO_SRCS_C = src/opus_demo.c -OPUSDEMO_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(OPUSDEMO_SRCS_C)) - 
-TESTOPUSAPI_SRCS_C = tests/test_opus_api.c -TESTOPUSAPI_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSAPI_SRCS_C)) - -TESTOPUSDECODE_SRCS_C = tests/test_opus_decode.c -TESTOPUSDECODE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSDECODE_SRCS_C)) - -TESTOPUSENCODE_SRCS_C = tests/test_opus_encode.c tests/opus_encode_regressions.c -TESTOPUSENCODE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSENCODE_SRCS_C)) - -TESTOPUSEXTENSIONS_SRCS_C = tests/test_opus_extensions.c -TESTOPUSEXTENSIONS_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSEXTENSIONS_SRCS_C)) - -TESTOPUSPADDING_SRCS_C = tests/test_opus_padding.c -TESTOPUSPADDING_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSPADDING_SRCS_C)) - -OPUSCOMPARE_SRCS_C = src/opus_compare.c -OPUSCOMPARE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(OPUSCOMPARE_SRCS_C)) - -TESTS := test_opus_api test_opus_decode test_opus_encode test_opus_extensions test_opus_padding - -# Rules -all: lib opus_demo opus_compare $(TESTS) - -lib: $(TARGET) - -check: all - for test in $(TESTS); do ./$$test; done - -$(TARGET): $(OBJS) - $(ARCHIVE.cmdline) - -opus_demo$(EXESUFFIX): $(OPUSDEMO_OBJS) $(TARGET) - $(LINK.o.cmdline) - -test_opus_api$(EXESUFFIX): $(TESTOPUSAPI_OBJS) $(TARGET) - $(LINK.o.cmdline) - -test_opus_decode$(EXESUFFIX): $(TESTOPUSDECODE_OBJS) $(TARGET) - $(LINK.o.cmdline) - -test_opus_encode$(EXESUFFIX): $(TESTOPUSENCODE_OBJS) $(TARGET) - $(LINK.o.cmdline) - -test_opus_extensions$(EXESUFFIX): $(TESTOPUSEXTENSIONS_OBJS) $(TARGET) - $(LINK.o.cmdline) - -test_opus_padding$(EXESUFFIX): $(TESTOPUSPADDING_OBJS) $(TARGET) - $(LINK.o.cmdline) - -opus_compare$(EXESUFFIX): $(OPUSCOMPARE_OBJS) - $(LINK.o.cmdline) - -celt/celt.o: CFLAGS += -DPACKAGE_VERSION='$(PACKAGE_VERSION)' -celt/celt.o: package_version - -package_version: force - @if [ -x ./update_version ]; then \ - ./update_version || true; \ - elif [ ! 
-e ./package_version ]; then \ - echo 'PACKAGE_VERSION="unknown"' > ./package_version; \ - fi - -force: - -clean: - rm -f opus_demo$(EXESUFFIX) opus_compare$(EXESUFFIX) $(TARGET) \ - test_opus_api$(EXESUFFIX) test_opus_decode$(EXESUFFIX) \ - test_opus_encode$(EXESUFFIX) test_opus_extensions$(EXESUFFIX) \ - test_opus_padding$(EXESUFFIX) \ - $(OBJS) $(OPUSDEMO_OBJS) $(OPUSCOMPARE_OBJS) $(TESTOPUSAPI_OBJS) \ - $(TESTOPUSDECODE_OBJS) $(TESTOPUSENCODE_OBJS) \ - $(TESTOPUSEXTENSIONS_OBJS) $(TESTOPUSPADDING_OBJS) - -.PHONY: all lib clean force check diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h index 89ccc8039..5b1bfbcfe 100644 --- a/celt/_kiss_fft_guts.h +++ b/celt/_kiss_fft_guts.h @@ -102,7 +102,7 @@ #if defined(OPUS_ARM_INLINE_EDSP) #include "arm/kiss_fft_armv5e.h" #endif -#if defined(MIPSr1_ASM) +#if defined(__mips_dsp) #include "mips/kiss_fft_mipsr1.h" #endif diff --git a/celt/celt.c b/celt/celt.c index 2235d46b8..8ca0e0baa 100644 --- a/celt/celt.c +++ b/celt/celt.c @@ -54,7 +54,7 @@ #define PACKAGE_VERSION "unknown" #endif -#if defined(MIPSr1_ASM) +#if defined(FIXED_POINT) && defined(__mips_dsp) #include "mips/celt_mipsr1.h" #endif diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h index 743b064e9..86d345335 100644 --- a/celt/fixed_generic.h +++ b/celt/fixed_generic.h @@ -200,7 +200,7 @@ /** Divide a 32-bit value by a 32-bit value. 
Result fits in 32 bits */ #define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b))) -#if defined(MIPSr1_ASM) +#if defined(__mips_dsp) #include "mips/fixed_generic_mipsr1.h" #endif diff --git a/celt/mdct.c b/celt/mdct.c index f8483a2df..6812b8815 100644 --- a/celt/mdct.c +++ b/celt/mdct.c @@ -53,7 +53,7 @@ #include "mathops.h" #include "stack_alloc.h" -#if defined(MIPSr1_ASM) +#if defined(FIXED_POINT) && defined(__mips_dsp) #include "mips/mdct_mipsr1.h" #endif diff --git a/celt/pitch.h b/celt/pitch.h index dd0e2bebd..25c0ad379 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -42,7 +42,7 @@ #include "x86/pitch_sse.h" #endif -#if defined(MIPSr1_ASM) +#if defined(FIXED_POINT) && defined(__mips_dsp) #include "mips/pitch_mipsr1.h" #endif diff --git a/celt/vq.c b/celt/vq.c index df8754d9d..e49054303 100644 --- a/celt/vq.c +++ b/celt/vq.c @@ -39,7 +39,7 @@ #include "rate.h" #include "pitch.h" -#if defined(MIPSr1_ASM) +#if defined(FIXED_POINT) && defined(__mips_dsp) #include "mips/vq_mipsr1.h" #endif diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c index e8dadf159..1ec177446 100644 --- a/silk/NSQ_del_dec.c +++ b/silk/NSQ_del_dec.c @@ -61,7 +61,7 @@ typedef struct { typedef NSQ_sample_struct NSQ_sample_pair[ 2 ]; -#if defined(MIPSr1_ASM) +#if defined(FIXED_POINT) && defined(__mips_dsp) #include "mips/NSQ_del_dec_mipsr1.h" #endif static OPUS_INLINE void silk_nsq_del_dec_scale_states( diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h index 2ac0d3451..49a70a8e9 100644 --- a/silk/SigProc_FIX.h +++ b/silk/SigProc_FIX.h @@ -631,7 +631,7 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b) #include "arm/SigProc_FIX_armv5e.h" #endif -#if defined(MIPSr1_ASM) +#if defined(FIXED_POINT) && defined(__mips_dsp) #include "mips/sigproc_fix_mipsr1.h" #endif diff --git a/silk/fixed/noise_shape_analysis_FIX.c b/silk/fixed/noise_shape_analysis_FIX.c index 85fea0bf0..a8504e263 100644 --- a/silk/fixed/noise_shape_analysis_FIX.c +++ b/silk/fixed/noise_shape_analysis_FIX.c 
@@ -128,8 +128,8 @@ static OPUS_INLINE void limit_warped_coefs( silk_assert( 0 ); } -/* Disable MIPS version until it's updated. */ -#if 0 && defined(MIPSr1_ASM) +/* Disable MIPS DSP version until it's updated. */ +#if 0 && defined(__mips_dsp) #include "mips/noise_shape_analysis_FIX_mipsr1.h" #endif diff --git a/silk/fixed/warped_autocorrelation_FIX.c b/silk/fixed/warped_autocorrelation_FIX.c index 5c79553bc..8caf0afb1 100644 --- a/silk/fixed/warped_autocorrelation_FIX.c +++ b/silk/fixed/warped_autocorrelation_FIX.c @@ -31,7 +31,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "main_FIX.h" -#if defined(MIPSr1_ASM) +#if defined(__mips_dsp) #include "mips/warped_autocorrelation_FIX_mipsr1.h" #endif diff --git a/silk/macros.h b/silk/macros.h index 667b48d3a..7d3f3a28a 100644 --- a/silk/macros.h +++ b/silk/macros.h @@ -104,7 +104,7 @@ POSSIBILITY OF SUCH DAMAGE. (( (a) & ((b)^0x80000000) & 0x80000000) ? silk_int32_MIN : (a)-(b)) : \ ((((a)^0x80000000) & (b) & 0x80000000) ? silk_int32_MAX : (a)-(b)) ) -#if defined(MIPSr1_ASM) +#if defined(FIXED_POINT) && defined(__mips_dsp) #include "mips/macros_mipsr1.h" #endif From fb1337fabce40ed696bcf7c40795fd9c276377eb Mon Sep 17 00:00:00 2001 From: Siarhei Volkau Date: Sun, 17 Aug 2025 17:53:43 +0300 Subject: [PATCH 5/7] MIPS DSP: fix renormalise_vector signature Looks like MIPS port is abandoned? Not surprised though. There's a lot of updates for mdct too, will be addressed in another patch. 
Signed-off-by: Siarhei Volkau --- celt/mips/vq_mipsr1.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/celt/mips/vq_mipsr1.h b/celt/mips/vq_mipsr1.h index 1621c5624..71850c09d 100644 --- a/celt/mips/vq_mipsr1.h +++ b/celt/mips/vq_mipsr1.h @@ -64,7 +64,7 @@ static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_ } #define OVERRIDE_renormalise_vector -void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch) +void renormalise_vector(celt_norm *X, int N, opus_val32 gain, int arch) { int i; #ifdef FIXED_POINT @@ -102,7 +102,7 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch) k = celt_ilog2(E)>>1; #endif t = VSHR32(E, 2*(k-7)); - g = MULT16_16_P15(celt_rsqrt_norm(t),gain); + g = MULT32_32_Q31(celt_rsqrt_norm(t),gain); xptr = X; for (i=0;i Date: Sun, 17 Aug 2025 18:49:41 +0300 Subject: [PATCH 6/7] refactor: MIPS DSP: inline assembly GCC supports all MIPS DSP and DSPr2 instructions in form of builtin functions, this is more convenient way rather than inline assembly. Moreover, performance on MIPS heavily depends on instruction scheduling GCC is unable to schedule inline assembly properly because it doesn't know what exactly the asm routine do. 
Signed-off-by: Siarhei Volkau --- celt/mips/celt_mipsr1.h | 26 ++++++------- celt/mips/fixed_generic_mipsr1.h | 66 +++++++++++--------------------- celt/mips/kiss_fft_mipsr1.h | 42 +++++++++----------- celt/mips/pitch_mipsr1.h | 24 +++++------- celt/mips/vq_mipsr1.h | 14 +++---- 5 files changed, 70 insertions(+), 102 deletions(-) diff --git a/celt/mips/celt_mipsr1.h b/celt/mips/celt_mipsr1.h index d1b25c204..7fa8d4358 100644 --- a/celt/mips/celt_mipsr1.h +++ b/celt/mips/celt_mipsr1.h @@ -97,18 +97,17 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, { opus_val16 f; opus_val32 res; + long long acc; f = MULT16_16_Q15(window[i],window[i]); x0= x[i-T1+2]; - asm volatile("MULT $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g00)), "r" ((int)x[i-T0])); - - asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g01)), "r" ((int)ADD32(x[i-T0-1],x[i-T0+1]))); - asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g02)), "r" ((int)ADD32(x[i-T0-2],x[i-T0+2]))); - asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g10)), "r" ((int)x2)); - asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g11)), "r" ((int)ADD32(x3,x1))); - asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g12)), "r" ((int)ADD32(x4,x0))); - - asm volatile("EXTR.W %0,$ac1, %1" : "=r" (res): "i" (15)); + acc = __builtin_mips_mult((int)MULT16_16_Q15((Q15ONE-f),g00), (int)x[i-T0]); + acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15((Q15ONE-f),g01), (int)ADD32(x[i-T0-1],x[i-T0+1])); + acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15((Q15ONE-f),g02), (int)ADD32(x[i-T0-2],x[i-T0+2])); + acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g10), (int)x2); + acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g11), (int)ADD32(x3,x1)); + acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g12), (int)ADD32(x4,x0)); + res = __builtin_mips_extr_w(acc, 15); y[i] = x[i] + res; @@ -134,13 +133,14 @@ void comb_filter(opus_val32 
*y, opus_val32 *x, int T0, int T1, int N, for (i=overlap;i>1; #endif From aeafc0c47408b6fd89c1defbb6960798a9535a69 Mon Sep 17 00:00:00 2001 From: Siarhei Volkau Date: Sun, 17 Aug 2025 21:53:00 +0300 Subject: [PATCH 7/7] MIPS DSP: sync mdct with c version Changes from C version of MDCT algo ported into MIPS variant. Signed-off-by: Siarhei Volkau --- celt/mips/mdct_mipsr1.h | 113 +++++++++++++++++++++++++++++----------- 1 file changed, 82 insertions(+), 31 deletions(-) diff --git a/celt/mips/mdct_mipsr1.h b/celt/mips/mdct_mipsr1.h index 7456c181a..c8accc093 100644 --- a/celt/mips/mdct_mipsr1.h +++ b/celt/mips/mdct_mipsr1.h @@ -55,10 +55,22 @@ #include "mathops.h" #include "stack_alloc.h" +static inline int S_MUL_ADD_PSR(int a, int b, int c, int d, int shift) { + long long acc = __builtin_mips_mult(a, b); + acc = __builtin_mips_madd(acc, c, d); + return __builtin_mips_extr_w(acc, 15+shift); +} + +static inline int S_MUL_SUB_PSR(int a, int b, int c, int d, int shift) { + long long acc = __builtin_mips_mult(a, b); + acc = __builtin_mips_msub(acc, c, d); + return __builtin_mips_extr_w(acc, 15+shift); +} + /* Forward MDCT trashes the input array */ #define OVERRIDE_clt_mdct_forward void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, - const opus_val16 *window, int overlap, int shift, int stride, int arch) + const celt_coef *window, int overlap, int shift, int stride, int arch) { int i; int N, N2, N4; @@ -66,16 +78,15 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar VARDECL(kiss_fft_cpx, f2); const kiss_fft_state *st = l->kfft[shift]; const kiss_twiddle_scalar *trig; - opus_val16 scale; + celt_coef scale; #ifdef FIXED_POINT /* Allows us to scale with MULT16_32_Q16(), which is faster than MULT16_32_Q15() on ARM. 
*/ int scale_shift = st->scale_shift-1; + int headroom; #endif - - (void)arch; - SAVE_STACK; + (void)arch; scale = st->scale; N = l->n; @@ -98,8 +109,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); kiss_fft_scalar * OPUS_RESTRICT yp = f; - const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); - const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; + const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); + const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; for(i=0;i<((overlap+3)>>2);i++) { /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ @@ -123,7 +134,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar for(;ibitrev[i]] = yc; } +#ifdef FIXED_POINT + headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); +#endif } /* N/4 complex FFT, does not downscale anymore */ - opus_fft_impl(st, f2); + opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); /* Post-rotate */ { @@ -170,8 +193,16 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar for(i=0;ii,t[N4+i] , fp->r,t[i]); - yi = S_MUL_ADD(fp->r,t[N4+i] ,fp->i,t[i]); + kiss_fft_scalar t0, t1; +#ifdef ENABLE_QEXT + t0 = S_MUL2(t[i], scale); + t1 = S_MUL2(t[N4+i], scale); +#else + t0 = t[i]; + t1 = t[N4+i]; +#endif + yr = S_MUL_SUB_PSR(fp->i,t1 , fp->r,t0, headroom); + yi = S_MUL_ADD_PSR(fp->r,t1 , fp->i,t0, headroom); *yp1 = yr; *yp2 = yi; fp++; @@ -184,13 +215,15 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar #define OVERRIDE_clt_mdct_backward void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, - const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch) + const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int 
arch) { int i; int N, N2, N4; const kiss_twiddle_scalar *trig; - - (void)arch; +#ifdef FIXED_POINT + int pre_shift, post_shift, fft_shift; +#endif + (void) arch; N = l->n; trig = l->trig; @@ -202,6 +235,21 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala N2 = N>>1; N4 = N>>2; +#ifdef FIXED_POINT + { + opus_val32 sumval=N2; + opus_val32 maxval=0; + for (i=0;ikfft[shift], (kiss_fft_cpx*)(out+(overlap>>1))); + opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift)); /* Post-rotate and de-shuffle from both ends of the buffer at once to make it in-place. */ { - kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1); - kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2; + kiss_fft_scalar * yp0 = out+(overlap>>1); + kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; const kiss_twiddle_scalar *t = &trig[0]; /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the middle pair will be computed twice. */ @@ -246,8 +297,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala t0 = t[i]; t1 = t[N4+i]; /* We'd scale up by 2 here, but instead it's done when mixing the windows */ - yr = S_MUL_ADD(re,t0 , im,t1); - yi = S_MUL_SUB(re,t1 , im,t0); + yr = S_MUL_ADD_PSR(re,t0 , im,t1, post_shift); + yi = S_MUL_SUB_PSR(re,t1 , im,t0, post_shift); /* We swap real and imag because we're using an FFT instead of an IFFT. 
*/ re = yp1[1]; im = yp1[0]; @@ -257,8 +308,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala t0 = t[(N4-i-1)]; t1 = t[(N2-i-1)]; /* We'd scale up by 2 here, but instead it's done when mixing the windows */ - yr = S_MUL_ADD(re,t0,im,t1); - yi = S_MUL_SUB(re,t1,im,t0); + yr = S_MUL_ADD_PSR(re,t0,im,t1, post_shift); + yi = S_MUL_SUB_PSR(re,t1,im,t0, post_shift); yp1[0] = yr; yp0[1] = yi; yp0 += 2; @@ -270,16 +321,16 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala { kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; kiss_fft_scalar * OPUS_RESTRICT yp1 = out; - const opus_val16 * OPUS_RESTRICT wp1 = window; - const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; + const celt_coef * OPUS_RESTRICT wp1 = window; + const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1; for(i = 0; i < overlap/2; i++) { kiss_fft_scalar x1, x2; x1 = *xp1; x2 = *yp1; - *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1); - *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1); + *yp1++ = S_MUL_SUB(x2, *wp2, x1, *wp1); + *xp1-- = S_MUL_ADD(x2, *wp1, x1, *wp2); wp1++; wp2--; }