diff --git a/Makefile.mips b/Makefile.mips deleted file mode 100644 index bc12ba86e..000000000 --- a/Makefile.mips +++ /dev/null @@ -1,169 +0,0 @@ -#################### COMPILE OPTIONS ####################### - -# Uncomment this for fixed-point build -FIXED_POINT=1 - -# It is strongly recommended to uncomment one of these -# VAR_ARRAYS: Use C99 variable-length arrays for stack allocation -# USE_ALLOCA: Use alloca() for stack allocation -# If none is defined, then the fallback is a non-threadsafe global array -CFLAGS := -DUSE_ALLOCA $(CFLAGS) -#CFLAGS := -DVAR_ARRAYS $(CFLAGS) - -# These options affect performance -# HAVE_LRINTF: Use C99 intrinsics to speed up float-to-int conversion -CFLAGS := -DHAVE_LRINTF $(CFLAGS) - -###################### END OF OPTIONS ###################### - --include package_version - -include silk_sources.mk -include celt_sources.mk -include opus_sources.mk - -ifdef FIXED_POINT -SILK_SOURCES += $(SILK_SOURCES_FIXED) -else -SILK_SOURCES += $(SILK_SOURCES_FLOAT) -OPUS_SOURCES += $(OPUS_SOURCES_FLOAT) -endif - -EXESUFFIX = -LIBPREFIX = lib -LIBSUFFIX = .a -OBJSUFFIX = .o - -CC = $(TOOLCHAIN_PREFIX)cc$(TOOLCHAIN_SUFFIX) -AR = $(TOOLCHAIN_PREFIX)ar -RANLIB = $(TOOLCHAIN_PREFIX)ranlib -CP = $(TOOLCHAIN_PREFIX)cp - -cppflags-from-defines = $(addprefix -D,$(1)) -cppflags-from-includes = $(addprefix -I,$(1)) -ldflags-from-ldlibdirs = $(addprefix -L,$(1)) -ldlibs-from-libs = $(addprefix -l,$(1)) - -WARNINGS = -Wall -W -Wstrict-prototypes -Wextra -Wcast-align -Wnested-externs -Wshadow - -CFLAGS += -mips32r2 -mno-mips16 -std=gnu99 -O2 -g $(WARNINGS) -DENABLE_ASSERTIONS -DMIPSr1_ASM -DOPUS_BUILD -mdspr2 -march=74kc -mtune=74kc -mmt -mgp32 - -CINCLUDES = include silk celt - -ifdef FIXED_POINT -CFLAGS += -DFIXED_POINT=1 -DDISABLE_FLOAT_API -CINCLUDES += silk/fixed -else -CINCLUDES += silk/float -endif - - -LIBS = m - -LDLIBDIRS = ./ - -CFLAGS += $(call cppflags-from-defines,$(CDEFINES)) -CFLAGS += $(call cppflags-from-includes,$(CINCLUDES)) -LDFLAGS += $(call ldflags-from-ldlibdirs,$(LDLIBDIRS)) -LDLIBS += $(call ldlibs-from-libs,$(LIBS)) - -COMPILE.c.cmdline = $(CC) -c $(CFLAGS) -o $@ $< -LINK.o = $(CC) $(LDPREFLAGS) $(LDFLAGS) -LINK.o.cmdline = $(LINK.o) $^ $(LDLIBS) -o $@$(EXESUFFIX) - -ARCHIVE.cmdline = $(AR) $(ARFLAGS) $@ $^ && $(RANLIB) $@ - -%$(OBJSUFFIX):%.c - $(COMPILE.c.cmdline) - -%$(OBJSUFFIX):%.cpp - $(COMPILE.cpp.cmdline) - -# Directives - - -# Variable definitions -LIB_NAME = opus -TARGET = $(LIBPREFIX)$(LIB_NAME)$(LIBSUFFIX) - -SRCS_C = $(SILK_SOURCES) $(CELT_SOURCES) $(OPUS_SOURCES) - -OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(SRCS_C)) - -OPUSDEMO_SRCS_C = src/opus_demo.c -OPUSDEMO_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(OPUSDEMO_SRCS_C)) - -TESTOPUSAPI_SRCS_C = tests/test_opus_api.c -TESTOPUSAPI_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSAPI_SRCS_C)) - -TESTOPUSDECODE_SRCS_C = tests/test_opus_decode.c -TESTOPUSDECODE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSDECODE_SRCS_C)) - -TESTOPUSENCODE_SRCS_C = tests/test_opus_encode.c tests/opus_encode_regressions.c -TESTOPUSENCODE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSENCODE_SRCS_C)) - -TESTOPUSEXTENSIONS_SRCS_C = tests/test_opus_extensions.c -TESTOPUSEXTENSIONS_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSEXTENSIONS_SRCS_C)) - -TESTOPUSPADDING_SRCS_C = tests/test_opus_padding.c -TESTOPUSPADDING_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(TESTOPUSPADDING_SRCS_C)) - -OPUSCOMPARE_SRCS_C = src/opus_compare.c -OPUSCOMPARE_OBJS := $(patsubst %.c,%$(OBJSUFFIX),$(OPUSCOMPARE_SRCS_C)) - -TESTS := test_opus_api test_opus_decode test_opus_encode test_opus_extensions test_opus_padding - -# Rules -all: lib opus_demo opus_compare $(TESTS) - -lib: $(TARGET) - -check: all - for test in $(TESTS); do ./$$test; done - -$(TARGET): $(OBJS) - $(ARCHIVE.cmdline) - -opus_demo$(EXESUFFIX): $(OPUSDEMO_OBJS) $(TARGET) - $(LINK.o.cmdline) - -test_opus_api$(EXESUFFIX): $(TESTOPUSAPI_OBJS) $(TARGET) - $(LINK.o.cmdline) - -test_opus_decode$(EXESUFFIX): $(TESTOPUSDECODE_OBJS) $(TARGET) - $(LINK.o.cmdline) - -test_opus_encode$(EXESUFFIX): $(TESTOPUSENCODE_OBJS) $(TARGET) - $(LINK.o.cmdline) - -test_opus_extensions$(EXESUFFIX): $(TESTOPUSEXTENSIONS_OBJS) $(TARGET) - $(LINK.o.cmdline) - -test_opus_padding$(EXESUFFIX): $(TESTOPUSPADDING_OBJS) $(TARGET) - $(LINK.o.cmdline) - -opus_compare$(EXESUFFIX): $(OPUSCOMPARE_OBJS) - $(LINK.o.cmdline) - -celt/celt.o: CFLAGS += -DPACKAGE_VERSION='$(PACKAGE_VERSION)' -celt/celt.o: package_version - -package_version: force - @if [ -x ./update_version ]; then \ - ./update_version || true; \ - elif [ ! -e ./package_version ]; then \ - echo 'PACKAGE_VERSION="unknown"' > ./package_version; \ - fi - -force: - -clean: - rm -f opus_demo$(EXESUFFIX) opus_compare$(EXESUFFIX) $(TARGET) \ - test_opus_api$(EXESUFFIX) test_opus_decode$(EXESUFFIX) \ - test_opus_encode$(EXESUFFIX) test_opus_extensions$(EXESUFFIX) \ - test_opus_padding$(EXESUFFIX) \ - $(OBJS) $(OPUSDEMO_OBJS) $(OPUSCOMPARE_OBJS) $(TESTOPUSAPI_OBJS) \ - $(TESTOPUSDECODE_OBJS) $(TESTOPUSENCODE_OBJS) \ - $(TESTOPUSEXTENSIONS_OBJS) $(TESTOPUSPADDING_OBJS) - -.PHONY: all lib clean force check diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h index 89ccc8039..5b1bfbcfe 100644 --- a/celt/_kiss_fft_guts.h +++ b/celt/_kiss_fft_guts.h @@ -102,7 +102,7 @@ #if defined(OPUS_ARM_INLINE_EDSP) #include "arm/kiss_fft_armv5e.h" #endif -#if defined(MIPSr1_ASM) +#if defined(__mips_dsp) #include "mips/kiss_fft_mipsr1.h" #endif diff --git a/celt/arch.h b/celt/arch.h index a6055f403..dd095b218 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -121,7 +121,7 @@ void celt_fatal(const char *str, const char *file, int line) /* Set this if opus_int64 is a native type of the CPU. */ /* Assume that all LP64 architectures have fast 64-bit types; also x86_64 (which can be ILP32 for x32) and Win64 (which is LLP64). */ -#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64) +#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64) || defined (__mips) #define OPUS_FAST_INT64 1 #else #define OPUS_FAST_INT64 0 diff --git a/celt/celt.c b/celt/celt.c index 2235d46b8..8ca0e0baa 100644 --- a/celt/celt.c +++ b/celt/celt.c @@ -54,7 +54,7 @@ #define PACKAGE_VERSION "unknown" #endif -#if defined(MIPSr1_ASM) +#if defined(FIXED_POINT) && defined(__mips_dsp) #include "mips/celt_mipsr1.h" #endif diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h index 743b064e9..86d345335 100644 --- a/celt/fixed_generic.h +++ b/celt/fixed_generic.h @@ -200,7 +200,7 @@ /** Divide a 32-bit value by a 32-bit value. Result fits in 32 bits */ #define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b))) -#if defined(MIPSr1_ASM) +#if defined(__mips_dsp) #include "mips/fixed_generic_mipsr1.h" #endif diff --git a/celt/mdct.c b/celt/mdct.c index f8483a2df..6812b8815 100644 --- a/celt/mdct.c +++ b/celt/mdct.c @@ -53,7 +53,7 @@ #include "mathops.h" #include "stack_alloc.h" -#if defined(MIPSr1_ASM) +#if defined(FIXED_POINT) && defined(__mips_dsp) #include "mips/mdct_mipsr1.h" #endif diff --git a/celt/mips/celt_mipsr1.h b/celt/mips/celt_mipsr1.h index d1b25c204..7fa8d4358 100644 --- a/celt/mips/celt_mipsr1.h +++ b/celt/mips/celt_mipsr1.h @@ -97,18 +97,17 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, { opus_val16 f; opus_val32 res; + long long acc; f = MULT16_16_Q15(window[i],window[i]); x0= x[i-T1+2]; - asm volatile("MULT $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g00)), "r" ((int)x[i-T0])); - - asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g01)), "r" ((int)ADD32(x[i-T0-1],x[i-T0+1]))); - asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g02)), "r" ((int)ADD32(x[i-T0-2],x[i-T0+2]))); - asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g10)), "r" ((int)x2)); - asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g11)), "r" ((int)ADD32(x3,x1))); - asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g12)), "r" ((int)ADD32(x4,x0))); - - asm volatile("EXTR.W %0,$ac1, %1" : "=r" (res): "i" (15)); + acc = __builtin_mips_mult((int)MULT16_16_Q15((Q15ONE-f),g00), (int)x[i-T0]); + acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15((Q15ONE-f),g01), (int)ADD32(x[i-T0-1],x[i-T0+1])); + acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15((Q15ONE-f),g02), (int)ADD32(x[i-T0-2],x[i-T0+2])); + acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g10), (int)x2); + acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g11), (int)ADD32(x3,x1)); + acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g12), (int)ADD32(x4,x0)); + res = __builtin_mips_extr_w(acc, 15); y[i] = x[i] + res; @@ -134,13 +133,14 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, for (i=overlap;ikfft[shift]; const kiss_twiddle_scalar *trig; - opus_val16 scale; + celt_coef scale; #ifdef FIXED_POINT /* Allows us to scale with MULT16_32_Q16(), which is faster than MULT16_32_Q15() on ARM. */ int scale_shift = st->scale_shift-1; + int headroom; #endif - - (void)arch; - SAVE_STACK; + (void)arch; scale = st->scale; N = l->n; @@ -98,8 +109,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); kiss_fft_scalar * OPUS_RESTRICT yp = f; - const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1); - const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; + const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); + const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; for(i=0;i<((overlap+3)>>2);i++) { /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ @@ -123,7 +134,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar for(;ibitrev[i]] = yc; } +#ifdef FIXED_POINT + headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); +#endif } /* N/4 complex FFT, does not downscale anymore */ - opus_fft_impl(st, f2); + opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); /* Post-rotate */ { @@ -170,8 +193,16 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar for(i=0;ii,t[N4+i] , fp->r,t[i]); - yi = S_MUL_ADD(fp->r,t[N4+i] ,fp->i,t[i]); + kiss_fft_scalar t0, t1; +#ifdef ENABLE_QEXT + t0 = S_MUL2(t[i], scale); + t1 = S_MUL2(t[N4+i], scale); +#else + t0 = t[i]; + t1 = t[N4+i]; +#endif + yr = S_MUL_SUB_PSR(fp->i,t1 , fp->r,t0, headroom); + yi = S_MUL_ADD_PSR(fp->r,t1 , fp->i,t0, headroom); *yp1 = yr; *yp2 = yi; fp++; @@ -184,13 +215,15 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar #define OVERRIDE_clt_mdct_backward void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, - const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch) + const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch) { int i; int N, N2, N4; const kiss_twiddle_scalar *trig; - - (void)arch; +#ifdef FIXED_POINT + int pre_shift, post_shift, fft_shift; +#endif + (void) arch; N = l->n; trig = l->trig; @@ -202,6 +235,21 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala N2 = N>>1; N4 = N>>2; +#ifdef FIXED_POINT + { + opus_val32 sumval=N2; + opus_val32 maxval=0; + for (i=0;ikfft[shift], (kiss_fft_cpx*)(out+(overlap>>1))); + opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift)); /* Post-rotate and de-shuffle from both ends of the buffer at once to make it in-place. */ { - kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1); - kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2; + kiss_fft_scalar * yp0 = out+(overlap>>1); + kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; const kiss_twiddle_scalar *t = &trig[0]; /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the middle pair will be computed twice. */ @@ -246,8 +297,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala t0 = t[i]; t1 = t[N4+i]; /* We'd scale up by 2 here, but instead it's done when mixing the windows */ - yr = S_MUL_ADD(re,t0 , im,t1); - yi = S_MUL_SUB(re,t1 , im,t0); + yr = S_MUL_ADD_PSR(re,t0 , im,t1, post_shift); + yi = S_MUL_SUB_PSR(re,t1 , im,t0, post_shift); /* We swap real and imag because we're using an FFT instead of an IFFT. */ re = yp1[1]; im = yp1[0]; @@ -257,8 +308,8 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala t0 = t[(N4-i-1)]; t1 = t[(N2-i-1)]; /* We'd scale up by 2 here, but instead it's done when mixing the windows */ - yr = S_MUL_ADD(re,t0,im,t1); - yi = S_MUL_SUB(re,t1,im,t0); + yr = S_MUL_ADD_PSR(re,t0,im,t1, post_shift); + yi = S_MUL_SUB_PSR(re,t1,im,t0, post_shift); yp1[0] = yr; yp0[1] = yi; yp0 += 2; @@ -270,16 +321,16 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scala { kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; kiss_fft_scalar * OPUS_RESTRICT yp1 = out; - const opus_val16 * OPUS_RESTRICT wp1 = window; - const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; + const celt_coef * OPUS_RESTRICT wp1 = window; + const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1; for(i = 0; i < overlap/2; i++) { kiss_fft_scalar x1, x2; x1 = *xp1; x2 = *yp1; - *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1); - *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1); + *yp1++ = S_MUL_SUB(x2, *wp2, x1, *wp1); + *xp1-- = S_MUL_ADD(x2, *wp1, x1, *wp2); wp1++; wp2--; } diff --git a/celt/mips/pitch_mipsr1.h b/celt/mips/pitch_mipsr1.h index a9500aff5..6cbdd78d3 100644 --- a/celt/mips/pitch_mipsr1.h +++ b/celt/mips/pitch_mipsr1.h @@ -39,26 +39,22 @@ static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, c int N, opus_val32 *xy1, opus_val32 *xy2, int arch) { int j; - opus_val32 xy01=0; - opus_val32 xy02=0; + long long acc1 = 0; + long long acc2 = 0; (void)arch; - asm volatile("MULT $ac1, $0, $0"); - asm volatile("MULT $ac2, $0, $0"); /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */ - for (j=0;j>1; #endif t = VSHR32(E, 2*(k-7)); - g = MULT16_16_P15(celt_rsqrt_norm(t),gain); + g = MULT32_32_Q31(celt_rsqrt_norm(t),gain); xptr = X; for (i=0;i