From 67181aa353fd2accdb661ed87327a9b67bcddf73 Mon Sep 17 00:00:00 2001
From: Siarhei Volkau <lis8215@gmail.com>
Date: Sun, 17 Aug 2025 16:58:36 +0300
Subject: [PATCH 1/7] refactor: MIPS: fix __builtin_clz usage

For historical reasons, the result of __builtin_clz is undefined for 0.
Handle that case explicitly in the MIPS port by returning 32 for a
zero input.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
---
 silk/mips/macros_mipsr1.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/silk/mips/macros_mipsr1.h b/silk/mips/macros_mipsr1.h
index af408802c..5fbe1ab7e 100644
--- a/silk/mips/macros_mipsr1.h
+++ b/silk/mips/macros_mipsr1.h
@@ -29,7 +29,10 @@ POSSIBILITY OF SUCH DAMAGE.
 #ifndef SILK_MACROS_MIPSR1_H__
 #define SILK_MACROS_MIPSR1_H__
 
-#define mips_clz(x) __builtin_clz(x)
+static inline int mips_clz(opus_uint32 x)
+{
+    return x ? __builtin_clz(x) : 32;
+}
 
 #undef silk_SMULWB
 static inline int silk_SMULWB(int a, int b)

From 41ebef3280cec71def7721ca092311a016f0d7ed Mon Sep 17 00:00:00 2001
From: Siarhei Volkau <lis8215@gmail.com>
Date: Sun, 17 Aug 2025 17:02:57 +0300
Subject: [PATCH 2/7] refactor: MIPS: fix silk_CLZ16 port

The MIPS port of silk_CLZ16 sign-extends its opus_int16 argument to
opus_int32. For a negative input it therefore returns -16 instead of
the expected 0.

The input must be zero-extended before it is passed to mips_clz /
__builtin_clz.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
---
 silk/mips/macros_mipsr1.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/silk/mips/macros_mipsr1.h b/silk/mips/macros_mipsr1.h
index 5fbe1ab7e..0393a33fa 100644
--- a/silk/mips/macros_mipsr1.h
+++ b/silk/mips/macros_mipsr1.h
@@ -78,9 +78,9 @@ static inline int silk_SMLAWW(int a, int b, int c)
 static inline opus_int32 silk_CLZ16(opus_int16 in16)
 {
     int re32;
-    opus_int32 in32 = (opus_int32 )in16;
+    opus_uint32 in32 = (opus_uint16)in16;
     re32 = mips_clz(in32);
-    re32-=16;
+    re32 -= 16;
     return re32;
 }
 

From 00e141f9f3be187cb33bcf0383ff777107768305 Mon Sep 17 00:00:00 2001
From: Siarhei Volkau <lis8215@gmail.com>
Date: Sun, 17 Aug 2025 17:12:41 +0300
Subject: [PATCH 3/7] MIPS: enable OPUS_FAST_INT64 for any MIPS

Although 32-bit MIPS does not meet the criteria described for enabling
OPUS_FAST_INT64, it has a fast 32x32 multiplication with a full 64-bit
result.

That is enough to enable OPUS_FAST_INT64 for any MIPS, since
OPUS_FAST_INT64 guards various multiplication implementations.
Maybe it deserves a more precise name, e.g. OPUS_FAST_MULT?

The GCC macro __mips covers both 32- and 64-bit MIPS.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
---
 celt/arch.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/celt/arch.h b/celt/arch.h
index a6055f403..dd095b218 100644
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -121,7 +121,7 @@ void celt_fatal(const char *str, const char *file, int line)
 /* Set this if opus_int64 is a native type of the CPU. */
 /* Assume that all LP64 architectures have fast 64-bit types; also x86_64
    (which can be ILP32 for x32) and Win64 (which is LLP64). */
-#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)
+#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64) || defined (__mips)
 #define OPUS_FAST_INT64 1
 #else
 #define OPUS_FAST_INT64 0

From 3c252b78057029dc9b3b040915e6f877ce7803b0 Mon Sep 17 00:00:00 2001
From: Siarhei Volkau <lis8215@gmail.com>
Date: Sun, 17 Aug 2025 17:28:44 +0300
Subject: [PATCH 4/7] MIPS: generalize build options

The current MIPS port provides a special implementation for CPUs that
support the DSP extension. It is enabled by passing the MIPSr1_ASM
define to the compiler.

The name MIPSr1_ASM is very confusing; r1 might mean:
- MIPS I
- MIPS32/MIPS64 release 1
- MIPS DSP ASE revision 1 (the correct one)

Remove it and check GCC's built-in define __mips_dsp instead.
This allows using the default autotools build system instead of a
hand-crafted makefile.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
---
 Makefile.mips                           | 169 ------------------------
 celt/_kiss_fft_guts.h                   |   2 +-
 celt/celt.c                             |   2 +-
 celt/fixed_generic.h                    |   2 +-
 celt/mdct.c                             |   2 +-
 celt/pitch.h                            |   2 +-
 celt/vq.c                               |   2 +-
 silk/NSQ_del_dec.c                      |   2 +-
 silk/SigProc_FIX.h                      |   2 +-
 silk/fixed/noise_shape_analysis_FIX.c   |   4 +-
 silk/fixed/warped_autocorrelation_FIX.c |   2 +-
 silk/macros.h                           |   2 +-
 12 files changed, 12 insertions(+), 181 deletions(-)
 delete mode 100644 Makefile.mips

diff --git a/Makefile.mips b/Makefile.mips
deleted file mode 100644
index bc12ba86e..000000000
--- a/Makefile.mips
+++ /dev/null
@@ -1,169 +0,0 @@
-#################### COMPILE OPTIONS #######################
-
-# Uncomment this for fixed-point build
-FIXED_POINT=1
-
-# It is strongly recommended to uncomment one of these
-# VAR_ARRAYS: Use C99 variable-length arrays for stack allocation
-# USE_ALLOCA: Use alloca() for stack allocation
-# If none is defined, then the fallback is a non-threadsafe global array
-CFLAGS := -DUSE_ALLOCA $(CFLAGS)
-#CFLAGS := -DVAR_ARRAYS $(CFLAGS)
-
-# These options affect performance
-# HAVE_LRINTF: Use C99 intrinsics to speed up float-to-int conversion
-CFLAGS := -DHAVE_LRINTF $(CFLAGS)
-
-###################### END OF OPTIONS ######################
-
--include package_version
-
-include silk_sources.mk
-include celt_sources.mk
-include opus_sources.mk
-
-ifdef FIXED_POINT
-SILK_SOURCES += $(SILK_SOURCES_FIXED)
-else
-SILK_SOURCES += $(SILK_SOURCES_FLOAT)
-OPUS_SOURCES += $(OPUS_SOURCES_FLOAT)
-endif
-
-EXESUFFIX =
-LIBPREFIX = lib
-LIBSUFFIX = .a
-OBJSUFFIX = .o
-
-CC     = $(TOOLCHAIN_PREFIX)cc$(TOOLCHAIN_SUFFIX)
-AR     = $(TOOLCHAIN_PREFIX)ar
-RANLIB = $(TOOLCHAIN_PREFIX)ranlib
-CP     = $(TOOLCHAIN_PREFIX)cp
-
-cppflags-from-defines   = $(addprefix -D,$(1))
-cppflags-from-includes  = $(addprefix -I,$(1))
-ldflags-from-ldlibdirs  = $(addprefix -L,$(1))
-ldlibs-from-libs        = $(addprefix -l,$(1))
-
-WARNINGS = -Wall -W -Wstrict-prototypes -Wextra -Wcast-align -Wnested-externs -Wshadow
-
-CFLAGS  += -mips32r2 -mno-mips16 -std=gnu99 -O2 -g $(WARNINGS) -DENABLE_ASSERTIONS -DMIPSr1_ASM -DOPUS_BUILD -mdspr2 -march=74kc -mtune=74kc -mmt -mgp32
-
-CINCLUDES = include silk celt
-
-ifdef FIXED_POINT
-CFLAGS += -DFIXED_POINT=1 -DDISABLE_FLOAT_API
-CINCLUDES += silk/fixed
-else
-CINCLUDES += silk/float
-endif
-
-
-LIBS = m
-
-LDLIBDIRS = ./
-
-CFLAGS  += $(call cppflags-from-defines,$(CDEFINES))
-CFLAGS  += $(call cppflags-from-includes,$(CINCLUDES))
-LDFLAGS += $(call ldflags-from-ldlibdirs,$(LDLIBDIRS))
-LDLIBS  += $(call ldlibs-from-libs,$(LIBS))
-
-COMPILE.c.cmdline   = $(CC) -c $(CFLAGS) -o $@ $<
-LINK.o              = $(CC) $(LDPREFLAGS) $(LDFLAGS)
-LINK.o.cmdline      = $(LINK.o) $^ $(LDLIBS) -o $@$(EXESUFFIX)
-
-ARCHIVE.cmdline     = $(AR) $(ARFLAGS) $@ $^ && $(RANLIB) $@
-
-%$(OBJSUFFIX):%.c
-	$(COMPILE.c.cmdline)
-
-%$(OBJSUFFIX):%.cpp
-	$(COMPILE.cpp.cmdline)
-
-# Directives
-
-
-# Variable definitions
-LIB_NAME = opus
-TARGET = $(LIBPREFIX)$(LIB_NAME)$(LIBSUFFIX)
-
-SRCS_C = $(SILK_SOURCES) $(CELT_SOURCES) $(OPUS_SOURCES)
-
-OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(SRCS_C))
-
-OPUSDEMO_SRCS_C = src/opus_demo.c
-OPUSDEMO_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(OPUSDEMO_SRCS_C))
-
-TESTOPUSAPI_SRCS_C = tests/test_opus_api.c
-TESTOPUSAPI_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSAPI_SRCS_C))
-
-TESTOPUSDECODE_SRCS_C = tests/test_opus_decode.c
-TESTOPUSDECODE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSDECODE_SRCS_C))
-
-TESTOPUSENCODE_SRCS_C = tests/test_opus_encode.c tests/opus_encode_regressions.c
-TESTOPUSENCODE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSENCODE_SRCS_C))
-
-TESTOPUSEXTENSIONS_SRCS_C = tests/test_opus_extensions.c
-TESTOPUSEXTENSIONS_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSEXTENSIONS_SRCS_C))
-
-TESTOPUSPADDING_SRCS_C = tests/test_opus_padding.c
-TESTOPUSPADDING_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSPADDING_SRCS_C))
-
-OPUSCOMPARE_SRCS_C = src/opus_compare.c
-OPUSCOMPARE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(OPUSCOMPARE_SRCS_C))
-
-TESTS := test_opus_api test_opus_decode test_opus_encode test_opus_extensions test_opus_padding
-
-# Rules
-all: lib opus_demo opus_compare $(TESTS)
-
-lib: $(TARGET)
-
-check: all
-	for test in $(TESTS); do ./$$test; done
-
-$(TARGET): $(OBJS)
-	$(ARCHIVE.cmdline)
-
-opus_demo$(EXESUFFIX): $(OPUSDEMO_OBJS) $(TARGET)
-	$(LINK.o.cmdline)
-
-test_opus_api$(EXESUFFIX): $(TESTOPUSAPI_OBJS) $(TARGET)
-	$(LINK.o.cmdline)
-
-test_opus_decode$(EXESUFFIX): $(TESTOPUSDECODE_OBJS) $(TARGET)
-	$(LINK.o.cmdline)
-
-test_opus_encode$(EXESUFFIX): $(TESTOPUSENCODE_OBJS) $(TARGET)
-	$(LINK.o.cmdline)
-
-test_opus_extensions$(EXESUFFIX): $(TESTOPUSEXTENSIONS_OBJS) $(TARGET)
-	$(LINK.o.cmdline)
-
-test_opus_padding$(EXESUFFIX): $(TESTOPUSPADDING_OBJS) $(TARGET)
-	$(LINK.o.cmdline)
-
-opus_compare$(EXESUFFIX): $(OPUSCOMPARE_OBJS)
-	$(LINK.o.cmdline)
-
-celt/celt.o: CFLAGS += -DPACKAGE_VERSION='$(PACKAGE_VERSION)'
-celt/celt.o: package_version
-
-package_version: force
-	@if [ -x ./update_version ]; then \
-		./update_version || true; \
-	elif [ ! -e ./package_version ]; then \
-		echo 'PACKAGE_VERSION="unknown"' > ./package_version; \
-	fi
-
-force:
-
-clean:
-	rm -f opus_demo$(EXESUFFIX) opus_compare$(EXESUFFIX) $(TARGET) \
-                test_opus_api$(EXESUFFIX) test_opus_decode$(EXESUFFIX) \
-                test_opus_encode$(EXESUFFIX) test_opus_extensions$(EXESUFFIX) \
-                test_opus_padding$(EXESUFFIX) \
-		$(OBJS) $(OPUSDEMO_OBJS) $(OPUSCOMPARE_OBJS) $(TESTOPUSAPI_OBJS) \
-                $(TESTOPUSDECODE_OBJS) $(TESTOPUSENCODE_OBJS) \
-                $(TESTOPUSEXTENSIONS_OBJS) $(TESTOPUSPADDING_OBJS)
-
-.PHONY: all lib clean force check
diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h
index 89ccc8039..5b1bfbcfe 100644
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -102,7 +102,7 @@
 #if defined(OPUS_ARM_INLINE_EDSP)
 #include "arm/kiss_fft_armv5e.h"
 #endif
-#if defined(MIPSr1_ASM)
+#if defined(__mips_dsp)
 #include "mips/kiss_fft_mipsr1.h"
 #endif
 
diff --git a/celt/celt.c b/celt/celt.c
index 2235d46b8..8ca0e0baa 100644
--- a/celt/celt.c
+++ b/celt/celt.c
@@ -54,7 +54,7 @@
 #define PACKAGE_VERSION "unknown"
 #endif
 
-#if defined(MIPSr1_ASM)
+#if defined(FIXED_POINT) && defined(__mips_dsp)
 #include "mips/celt_mipsr1.h"
 #endif
 
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index 743b064e9..86d345335 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -200,7 +200,7 @@
 /** Divide a 32-bit value by a 32-bit value. Result fits in 32 bits */
 #define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b)))
 
-#if defined(MIPSr1_ASM)
+#if defined(__mips_dsp)
 #include "mips/fixed_generic_mipsr1.h"
 #endif
 
diff --git a/celt/mdct.c b/celt/mdct.c
index f8483a2df..6812b8815 100644
--- a/celt/mdct.c
+++ b/celt/mdct.c
@@ -53,7 +53,7 @@
 #include "mathops.h"
 #include "stack_alloc.h"
 
-#if defined(MIPSr1_ASM)
+#if defined(FIXED_POINT) && defined(__mips_dsp)
 #include "mips/mdct_mipsr1.h"
 #endif
 
diff --git a/celt/pitch.h b/celt/pitch.h
index dd0e2bebd..25c0ad379 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -42,7 +42,7 @@
 #include "x86/pitch_sse.h"
 #endif
 
-#if defined(MIPSr1_ASM)
+#if defined(FIXED_POINT) && defined(__mips_dsp)
 #include "mips/pitch_mipsr1.h"
 #endif
 
diff --git a/celt/vq.c b/celt/vq.c
index df8754d9d..e49054303 100644
--- a/celt/vq.c
+++ b/celt/vq.c
@@ -39,7 +39,7 @@
 #include "rate.h"
 #include "pitch.h"
 
-#if defined(MIPSr1_ASM)
+#if defined(FIXED_POINT) && defined(__mips_dsp)
 #include "mips/vq_mipsr1.h"
 #endif
 
diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c
index e8dadf159..1ec177446 100644
--- a/silk/NSQ_del_dec.c
+++ b/silk/NSQ_del_dec.c
@@ -61,7 +61,7 @@ typedef struct {
 
 typedef NSQ_sample_struct  NSQ_sample_pair[ 2 ];
 
-#if defined(MIPSr1_ASM)
+#if defined(FIXED_POINT) && defined(__mips_dsp)
 #include "mips/NSQ_del_dec_mipsr1.h"
 #endif
 static OPUS_INLINE void silk_nsq_del_dec_scale_states(
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index 2ac0d3451..49a70a8e9 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -631,7 +631,7 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
 #include "arm/SigProc_FIX_armv5e.h"
 #endif
 
-#if defined(MIPSr1_ASM)
+#if defined(FIXED_POINT) && defined(__mips_dsp)
 #include "mips/sigproc_fix_mipsr1.h"
 #endif
 
diff --git a/silk/fixed/noise_shape_analysis_FIX.c b/silk/fixed/noise_shape_analysis_FIX.c
index 85fea0bf0..a8504e263 100644
--- a/silk/fixed/noise_shape_analysis_FIX.c
+++ b/silk/fixed/noise_shape_analysis_FIX.c
@@ -128,8 +128,8 @@ static OPUS_INLINE void limit_warped_coefs(
     silk_assert( 0 );
 }
 
-/* Disable MIPS version until it's updated. */
-#if 0 && defined(MIPSr1_ASM)
+/* Disable MIPS DSP version until it's updated. */
+#if 0 && defined(__mips_dsp)
 #include "mips/noise_shape_analysis_FIX_mipsr1.h"
 #endif
 
diff --git a/silk/fixed/warped_autocorrelation_FIX.c b/silk/fixed/warped_autocorrelation_FIX.c
index 5c79553bc..8caf0afb1 100644
--- a/silk/fixed/warped_autocorrelation_FIX.c
+++ b/silk/fixed/warped_autocorrelation_FIX.c
@@ -31,7 +31,7 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main_FIX.h"
 
-#if defined(MIPSr1_ASM)
+#if defined(__mips_dsp)
 #include "mips/warped_autocorrelation_FIX_mipsr1.h"
 #endif
 
diff --git a/silk/macros.h b/silk/macros.h
index 667b48d3a..7d3f3a28a 100644
--- a/silk/macros.h
+++ b/silk/macros.h
@@ -104,7 +104,7 @@ POSSIBILITY OF SUCH DAMAGE.
                                         (( (a) & ((b)^0x80000000) & 0x80000000) ? silk_int32_MIN : (a)-(b)) :    \
                                         ((((a)^0x80000000) & (b)  & 0x80000000) ? silk_int32_MAX : (a)-(b)) )
 
-#if defined(MIPSr1_ASM)
+#if defined(FIXED_POINT) && defined(__mips_dsp)
 #include "mips/macros_mipsr1.h"
 #endif
 

From fb1337fabce40ed696bcf7c40795fd9c276377eb Mon Sep 17 00:00:00 2001
From: Siarhei Volkau <lis8215@gmail.com>
Date: Sun, 17 Aug 2025 17:53:43 +0300
Subject: [PATCH 5/7] MIPS DSP: fix renormalise_vector signature

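It looks like the MIPS port is abandoned? Not surprising, though.

Change the gain parameter of the MIPS renormalise_vector override from
opus_val16 to opus_val32 and compute the scaling gain with
MULT32_32_Q31 instead of MULT16_16_P15.

There are a lot of updates for mdct too; they will be addressed
in another patch.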

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
---
 celt/mips/vq_mipsr1.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/celt/mips/vq_mipsr1.h b/celt/mips/vq_mipsr1.h
index 1621c5624..71850c09d 100644
--- a/celt/mips/vq_mipsr1.h
+++ b/celt/mips/vq_mipsr1.h
@@ -64,7 +64,7 @@ static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_
 }
 
 #define OVERRIDE_renormalise_vector
-void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
+void renormalise_vector(celt_norm *X, int N, opus_val32 gain, int arch)
 {
    int i;
 #ifdef FIXED_POINT
@@ -102,7 +102,7 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
    k = celt_ilog2(E)>>1;
 #endif
    t = VSHR32(E, 2*(k-7));
-   g = MULT16_16_P15(celt_rsqrt_norm(t),gain);
+   g = MULT32_32_Q31(celt_rsqrt_norm(t),gain);
 
    xptr = X;
    for (i=0;i<N;i++)

From 4bf56cd291e0b066b42d01d2fff18701d4fb20cc Mon Sep 17 00:00:00 2001
From: Siarhei Volkau <lis8215@gmail.com>
Date: Sun, 17 Aug 2025 18:49:41 +0300
Subject: [PATCH 6/7] refactor: MIPS DSP: replace inline assembly with builtins

GCC exposes all MIPS DSP and DSPr2 instructions as builtin functions,
which are more convenient to use than inline assembly.

Moreover, performance on MIPS depends heavily on instruction
scheduling, and GCC is unable to schedule inline assembly properly
because it does not know exactly what the asm routine does.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
---
 celt/mips/celt_mipsr1.h          | 26 ++++++-------
 celt/mips/fixed_generic_mipsr1.h | 66 +++++++++++---------------------
 celt/mips/kiss_fft_mipsr1.h      | 42 +++++++++-----------
 celt/mips/pitch_mipsr1.h         | 24 +++++-------
 celt/mips/vq_mipsr1.h            | 14 +++----
 5 files changed, 70 insertions(+), 102 deletions(-)

diff --git a/celt/mips/celt_mipsr1.h b/celt/mips/celt_mipsr1.h
index d1b25c204..7fa8d4358 100644
--- a/celt/mips/celt_mipsr1.h
+++ b/celt/mips/celt_mipsr1.h
@@ -97,18 +97,17 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
    {
       opus_val16 f;
       opus_val32 res;
+      long long acc;
       f = MULT16_16_Q15(window[i],window[i]);
       x0= x[i-T1+2];
 
-      asm volatile("MULT $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g00)), "r" ((int)x[i-T0]));
-
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g01)), "r" ((int)ADD32(x[i-T0-1],x[i-T0+1])));
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g02)), "r" ((int)ADD32(x[i-T0-2],x[i-T0+2])));
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g10)), "r" ((int)x2));
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g11)), "r" ((int)ADD32(x3,x1)));
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g12)), "r" ((int)ADD32(x4,x0)));
-
-      asm volatile("EXTR.W %0,$ac1, %1" : "=r" (res): "i" (15));
+      acc = __builtin_mips_mult((int)MULT16_16_Q15((Q15ONE-f),g00), (int)x[i-T0]);
+      acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15((Q15ONE-f),g01), (int)ADD32(x[i-T0-1],x[i-T0+1]));
+      acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15((Q15ONE-f),g02), (int)ADD32(x[i-T0-2],x[i-T0+2]));
+      acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g10), (int)x2);
+      acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g11), (int)ADD32(x3,x1));
+      acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g12), (int)ADD32(x4,x0));
+      res = __builtin_mips_extr_w(acc, 15);
 
       y[i] = x[i] + res;
 
@@ -134,13 +133,14 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
    for (i=overlap;i<N;i++)
    {
       opus_val32 res;
+      long long acc;
       x0=x[i-T1+2];
 
-      asm volatile("MULT $ac1, %0, %1" : : "r" ((int)g10), "r" ((int)x2));
+      acc = __builtin_mips_mult((int)g10, (int)x2);
+      acc = __builtin_mips_madd(acc, (int)g11, (int)ADD32(x3,x1));
+      acc = __builtin_mips_madd(acc, (int)g12, (int)ADD32(x4,x0));
+      res = __builtin_mips_extr_w(acc, 15);
 
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)g11), "r" ((int)ADD32(x3,x1)));
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)g12), "r" ((int)ADD32(x4,x0)));
-      asm volatile("EXTR.W %0,$ac1, %1" : "=r" (res): "i" (15));
       y[i] = x[i] + res;
       x4=x3;
       x3=x2;
diff --git a/celt/mips/fixed_generic_mipsr1.h b/celt/mips/fixed_generic_mipsr1.h
index 4a05efbf8..42f0e4047 100644
--- a/celt/mips/fixed_generic_mipsr1.h
+++ b/celt/mips/fixed_generic_mipsr1.h
@@ -35,92 +35,72 @@
 
 #undef MULT16_32_Q15_ADD
 static inline int MULT16_32_Q15_ADD(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("madd $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_madd(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef MULT16_32_Q15_SUB
 static inline int MULT16_32_Q15_SUB(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("msub $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_msub(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef MULT16_16_Q15_ADD
 static inline int MULT16_16_Q15_ADD(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("madd $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_madd(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef MULT16_16_Q15_SUB
 static inline int MULT16_16_Q15_SUB(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("msub $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_msub(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 
 #undef MULT16_32_Q16
 static inline int MULT16_32_Q16(int a, int b)
 {
-    int c;
-    asm volatile("MULT $ac1,%0, %1" : : "r" (a), "r" (b));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (c): "i" (16));
-    return c;
+    long long acc = __builtin_mips_mult(a, b);
+    return __builtin_mips_extr_w(acc, 16);
 }
 
 #undef MULT16_32_P16
 static inline int MULT16_32_P16(int a, int b)
 {
-    int c;
-    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
-    asm volatile("EXTR_R.W %0,$ac1, %1" : "=r" (c): "i" (16));
-    return c;
+    long long acc = __builtin_mips_mult(a, b);
+    return __builtin_mips_extr_r_w(acc, 16);
 }
 
 #undef MULT16_32_Q15
 static inline int MULT16_32_Q15(int a, int b)
 {
-    int c;
-    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (c): "i" (15));
-    return c;
+    long long acc = __builtin_mips_mult(a, b);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef MULT32_32_Q31
 static inline int MULT32_32_Q31(int a, int b)
 {
-    int r;
-    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (r): "i" (31));
-    return r;
+    long long acc = __builtin_mips_mult(a, b);
+    return __builtin_mips_extr_w(acc, 31);
 }
 
 #undef PSHR32
 static inline int PSHR32(int a, int shift)
 {
-    int r;
-    asm volatile ("SHRAV_R.W %0, %1, %2" :"=r" (r): "r" (a), "r" (shift));
-    return r;
+    return __builtin_mips_shra_r_w(a, shift);
 }
 
 #undef MULT16_16_P15
 static inline int MULT16_16_P15(int a, int b)
 {
-    int r;
-    asm volatile ("mul %0, %1, %2" :"=r" (r): "r" (a), "r" (b));
-    asm volatile ("SHRA_R.W %0, %1, %2" : "+r" (r):  "0" (r), "i"(15));
-    return r;
+    int r = a * b;
+    return __builtin_mips_shra_r_w(r, 15);
 }
 
 #endif /* CELT_FIXED_GENERIC_MIPSR1_H */
diff --git a/celt/mips/kiss_fft_mipsr1.h b/celt/mips/kiss_fft_mipsr1.h
index 400ca4de9..bdb5df804 100644
--- a/celt/mips/kiss_fft_mipsr1.h
+++ b/celt/mips/kiss_fft_mipsr1.h
@@ -37,20 +37,16 @@
 
 #undef S_MUL_ADD
 static inline int S_MUL_ADD(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("madd $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_madd(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef S_MUL_SUB
 static inline int S_MUL_SUB(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("msub $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_msub(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef C_MUL
@@ -58,13 +54,12 @@ static inline int S_MUL_SUB(int a, int b, int c, int d) {
 static inline kiss_fft_cpx C_MUL_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
     kiss_fft_cpx m;
 
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.r));
-    asm volatile("msub $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.i));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.r): "i" (15));
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.i));
-    asm volatile("madd $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.r));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.i): "i" (15));
-
+    long long acc1 = __builtin_mips_mult((int)a.r, (int)b.r);
+    long long acc2 = __builtin_mips_mult((int)a.r, (int)b.i);
+    acc1 = __builtin_mips_msub(acc1, (int)a.i, (int)b.i);
+    acc2 = __builtin_mips_madd(acc2, (int)a.i, (int)b.r);
+    m.r = __builtin_mips_extr_w(acc1, 15);
+    m.i = __builtin_mips_extr_w(acc2, 15);
     return m;
 }
 #undef C_MULC
@@ -72,13 +67,12 @@ static inline kiss_fft_cpx C_MUL_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
 static inline kiss_fft_cpx C_MULC_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
     kiss_fft_cpx m;
 
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.r));
-    asm volatile("madd $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.i));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.r): "i" (15));
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.r));
-    asm volatile("msub $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.i));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.i): "i" (15));
-
+    long long acc1 = __builtin_mips_mult((int)a.r, (int)b.r);
+    long long acc2 = __builtin_mips_mult((int)a.i, (int)b.r);
+    acc1 = __builtin_mips_madd(acc1, (int)a.i, (int)b.i);
+    acc2 = __builtin_mips_msub(acc2, (int)a.r, (int)b.i);
+    m.r = __builtin_mips_extr_w(acc1, 15);
+    m.i = __builtin_mips_extr_w(acc2, 15);
     return m;
 }
 
diff --git a/celt/mips/pitch_mipsr1.h b/celt/mips/pitch_mipsr1.h
index a9500aff5..6cbdd78d3 100644
--- a/celt/mips/pitch_mipsr1.h
+++ b/celt/mips/pitch_mipsr1.h
@@ -39,26 +39,22 @@ static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, c
       int N, opus_val32 *xy1, opus_val32 *xy2, int arch)
 {
    int j;
-   opus_val32 xy01=0;
-   opus_val32 xy02=0;
+   long long acc1 = 0;
+   long long acc2 = 0;
 
    (void)arch;
 
-   asm volatile("MULT $ac1, $0, $0");
-   asm volatile("MULT $ac2, $0, $0");
    /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
-   for (j=0;j<N;j++)
+   for (j=0;j<N;j+=2)
    {
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)x[j]), "r" ((int)y01[j]));
-      asm volatile("MADD $ac2, %0, %1" : : "r" ((int)x[j]), "r" ((int)y02[j]));
-      ++j;
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)x[j]), "r" ((int)y01[j]));
-      asm volatile("MADD $ac2, %0, %1" : : "r" ((int)x[j]), "r" ((int)y02[j]));
+       acc1 = __builtin_mips_madd(acc1, (int)x[j],   (int)y01[j]);
+       acc2 = __builtin_mips_madd(acc2, (int)x[j],   (int)y02[j]);
+       acc1 = __builtin_mips_madd(acc1, (int)x[j+1], (int)y01[j+1]);
+       acc2 = __builtin_mips_madd(acc2, (int)x[j+1], (int)y02[j+1]);
    }
-   asm volatile ("mflo %0, $ac1": "=r"(xy01));
-   asm volatile ("mflo %0, $ac2": "=r"(xy02));
-   *xy1 = xy01;
-   *xy2 = xy02;
+
+   *xy1 = (opus_val32)acc1;
+   *xy2 = (opus_val32)acc2;
 }
 
 static inline void xcorr_kernel_mips(const opus_val16 * x,
diff --git a/celt/mips/vq_mipsr1.h b/celt/mips/vq_mipsr1.h
index 71850c09d..009c3ef3e 100644
--- a/celt/mips/vq_mipsr1.h
+++ b/celt/mips/vq_mipsr1.h
@@ -70,7 +70,8 @@ void renormalise_vector(celt_norm *X, int N, opus_val32 gain, int arch)
 #ifdef FIXED_POINT
    int k;
 #endif
-   opus_val32 E = EPSILON;
+   long long acc = EPSILON;
+   opus_val32 E;
    opus_val16 g;
    opus_val32 t;
    celt_norm *xptr = X;
@@ -78,26 +79,23 @@ void renormalise_vector(celt_norm *X, int N, opus_val32 gain, int arch)
 
    (void)arch;
 
-   asm volatile("mult $ac1, $0, $0");
-   asm volatile("MTLO %0, $ac1" : :"r" (E));
    /*if(N %4)
        printf("error");*/
    for (i=0;i<N-2;i+=2)
    {
       X0 = (int)*xptr++;
-      asm volatile("MADD $ac1, %0, %1" : : "r" (X0), "r" (X0));
-
       X1 = (int)*xptr++;
-      asm volatile("MADD $ac1, %0, %1" : : "r" (X1), "r" (X1));
+      acc = __builtin_mips_madd(acc, X0, X0);
+      acc = __builtin_mips_madd(acc, X1, X1);
    }
 
    for (;i<N;i++)
    {
       X0 = (int)*xptr++;
-      asm volatile("MADD $ac1, %0, %1" : : "r" (X0), "r" (X0));
+      acc = __builtin_mips_madd(acc, X0, X0);
    }
 
-   asm volatile("MFLO %0, $ac1" : "=r" (E));
+   E = (opus_val32)acc;
 #ifdef FIXED_POINT
    k = celt_ilog2(E)>>1;
 #endif

From aeafc0c47408b6fd89c1defbb6960798a9535a69 Mon Sep 17 00:00:00 2001
From: Siarhei Volkau <lis8215@gmail.com>
Date: Sun, 17 Aug 2025 21:53:00 +0300
Subject: [PATCH 7/7] MIPS DSP: sync mdct with c version

Port the changes made to the C version of the MDCT algorithm to the
MIPS variant.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
---
 celt/mips/mdct_mipsr1.h | 113 +++++++++++++++++++++++++++++-----------
 1 file changed, 82 insertions(+), 31 deletions(-)

diff --git a/celt/mips/mdct_mipsr1.h b/celt/mips/mdct_mipsr1.h
index 7456c181a..c8accc093 100644
--- a/celt/mips/mdct_mipsr1.h
+++ b/celt/mips/mdct_mipsr1.h
@@ -55,10 +55,22 @@
 #include "mathops.h"
 #include "stack_alloc.h"
 
+static inline int S_MUL_ADD_PSR(int a, int b, int c, int d, int shift) {
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_madd(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15+shift);
+}
+
+static inline int S_MUL_SUB_PSR(int a, int b, int c, int d, int shift) {
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_msub(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15+shift);
+}
+
 /* Forward MDCT trashes the input array */
 #define OVERRIDE_clt_mdct_forward
 void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
-      const opus_val16 *window, int overlap, int shift, int stride, int arch)
+      const celt_coef *window, int overlap, int shift, int stride, int arch)
 {
    int i;
    int N, N2, N4;
@@ -66,16 +78,15 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
    VARDECL(kiss_fft_cpx, f2);
    const kiss_fft_state *st = l->kfft[shift];
    const kiss_twiddle_scalar *trig;
-   opus_val16 scale;
+   celt_coef scale;
 #ifdef FIXED_POINT
    /* Allows us to scale with MULT16_32_Q16(), which is faster than
       MULT16_32_Q15() on ARM. */
    int scale_shift = st->scale_shift-1;
+   int headroom;
 #endif
-
-    (void)arch;
-
    SAVE_STACK;
+   (void)arch;
    scale = st->scale;
 
    N = l->n;
@@ -98,8 +109,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
       const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
       const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
       kiss_fft_scalar * OPUS_RESTRICT yp = f;
-      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
-      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
+      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
+      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
       for(i=0;i<((overlap+3)>>2);i++)
       {
          /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
@@ -123,7 +134,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
       for(;i<N4;i++)
       {
          /* Real part arranged as a-bR, Imag part arranged as -c-dR */
-          *yp++ =  S_MUL_SUB(*wp2, *xp2, *wp1, xp1[-N2]);
+          *yp++ = S_MUL_SUB(*wp2, *xp2, *wp1, xp1[-N2]);
           *yp++ = S_MUL_ADD(*wp2, *xp1, *wp1, xp2[N2]);
          xp1+=2;
          xp2-=2;
@@ -135,6 +146,9 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
    {
       kiss_fft_scalar * OPUS_RESTRICT yp = f;
       const kiss_twiddle_scalar *t = &trig[0];
+#ifdef FIXED_POINT
+      opus_val32 maxval=1;
+#endif
       for(i=0;i<N4;i++)
       {
          kiss_fft_cpx yc;
@@ -144,20 +158,29 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
          t1 = t[N4+i];
          re = *yp++;
          im = *yp++;
-
          yr = S_MUL_SUB(re,t0,im,t1);
          yi = S_MUL_ADD(im,t0,re,t1);
-
+         /* For QEXT, the scaling is applied after the FFT (folded into the post-rotation twiddles);
+            otherwise it is applied here, before the FFT. For floating-point it doesn't matter. */
+#ifdef ENABLE_QEXT
          yc.r = yr;
          yc.i = yi;
-         yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
-         yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
+#else
+         yc.r = S_MUL2(yr, scale);
+         yc.i = S_MUL2(yi, scale);
+#endif
+#ifdef FIXED_POINT
+         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
+#endif
          f2[st->bitrev[i]] = yc;
       }
+#ifdef FIXED_POINT
+      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
+#endif
    }
 
    /* N/4 complex FFT, does not downscale anymore */
-   opus_fft_impl(st, f2);
+   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
 
    /* Post-rotate */
    {
@@ -170,8 +193,16 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
       for(i=0;i<N4;i++)
       {
          kiss_fft_scalar yr, yi;
-         yr = S_MUL_SUB(fp->i,t[N4+i] , fp->r,t[i]);
-         yi = S_MUL_ADD(fp->r,t[N4+i] ,fp->i,t[i]);
+         kiss_fft_scalar t0, t1;
+#ifdef ENABLE_QEXT
+         t0 = S_MUL2(t[i], scale);
+         t1 = S_MUL2(t[N4+i], scale);
+#else
+         t0 = t[i];
+         t1 = t[N4+i];
+#endif
+         yr = S_MUL_SUB_PSR(fp->i,t1 , fp->r,t0, headroom);
+         yi = S_MUL_ADD_PSR(fp->r,t1 , fp->i,t0, headroom);
          *yp1 = yr;
          *yp2 = yi;
          fp++;
@@ -184,13 +215,15 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
 
 #define OVERRIDE_clt_mdct_backward
 void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
-      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
+      const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
 {
    int i;
    int N, N2, N4;
    const kiss_twiddle_scalar *trig;
-
-    (void)arch;
+#ifdef FIXED_POINT
+   int pre_shift, post_shift, fft_shift;
+#endif
+   (void) arch;
 
    N = l->n;
    trig = l->trig;
@@ -202,6 +235,21 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
    N2 = N>>1;
    N4 = N>>2;
 
+#ifdef FIXED_POINT
+   {
+      opus_val32 sumval=N2;
+      opus_val32 maxval=0;
+      for (i=0;i<N2;i++) {
+         maxval = MAX32(maxval, ABS32(in[i*stride]));
+         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
+      }
+      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
+      /* Worst-case where all the energy goes to a single sample. */
+      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
+      post_shift = IMIN(post_shift, pre_shift);
+      fft_shift = pre_shift - post_shift;
+   }
+#endif
    /* Pre-rotate */
    {
       /* Temp pointers to make it really clear to the compiler what we're doing */
@@ -214,9 +262,12 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
       {
          int rev;
          kiss_fft_scalar yr, yi;
+         opus_val32 x1, x2;
          rev = *bitrev++;
-         yr = S_MUL_ADD(*xp2, t[i] , *xp1, t[N4+i]);
-         yi = S_MUL_SUB(*xp1, t[i] , *xp2, t[N4+i]);
+         x1 = SHL32_ovflw(*xp1, pre_shift);
+         x2 = SHL32_ovflw(*xp2, pre_shift);
+         yr = S_MUL_ADD(x2,t[i] , x1,t[N4+i]);
+         yi = S_MUL_SUB(x1,t[i] , x2,t[N4+i]);
          /* We swap real and imag because we use an FFT instead of an IFFT. */
          yp[2*rev+1] = yr;
          yp[2*rev] = yi;
@@ -226,13 +277,13 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
       }
    }
 
-   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));
+   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
 
    /* Post-rotate and de-shuffle from both ends of the buffer at once to make
       it in-place. */
    {
-      kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
-      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
+      kiss_fft_scalar * yp0 = out+(overlap>>1);
+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
       const kiss_twiddle_scalar *t = &trig[0];
       /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
          middle pair will be computed twice. */
@@ -246,8 +297,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
          t0 = t[i];
          t1 = t[N4+i];
          /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL_ADD(re,t0 , im,t1);
-         yi = S_MUL_SUB(re,t1 , im,t0);
+         yr = S_MUL_ADD_PSR(re,t0 , im,t1, post_shift);
+         yi = S_MUL_SUB_PSR(re,t1 , im,t0, post_shift);
          /* We swap real and imag because we're using an FFT instead of an IFFT. */
          re = yp1[1];
          im = yp1[0];
@@ -257,8 +308,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
          t0 = t[(N4-i-1)];
          t1 = t[(N2-i-1)];
          /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL_ADD(re,t0,im,t1);
-         yi = S_MUL_SUB(re,t1,im,t0);
+         yr = S_MUL_ADD_PSR(re,t0,im,t1, post_shift);
+         yi = S_MUL_SUB_PSR(re,t1,im,t0, post_shift);
          yp1[0] = yr;
          yp0[1] = yi;
          yp0 += 2;
@@ -270,16 +321,16 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala
    {
       kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
       kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
-      const opus_val16 * OPUS_RESTRICT wp1 = window;
-      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
+      const celt_coef * OPUS_RESTRICT wp1 = window;
+      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
 
       for(i = 0; i < overlap/2; i++)
       {
          kiss_fft_scalar x1, x2;
          x1 = *xp1;
          x2 = *yp1;
-         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
-         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
+         *yp1++ = S_MUL_SUB(x2, *wp2, x1, *wp1);
+         *xp1-- = S_MUL_ADD(x2, *wp1, x1, *wp2);
          wp1++;
          wp2--;
       }