diff --git a/src/dump_data.c b/src/dump_data.c index 33364c74..8f46a301 100644 --- a/src/dump_data.c +++ b/src/dump_data.c @@ -41,6 +41,52 @@ #include #include "lpcnet.h" #include "lpcnet_private.h" +#include "opus.h" + +float preemph_offset[NB_BANDS] = {1.772676, 2.937053, 0.278042, 0.299267, 0.126341, 0.060082, 0.019509, -0.017281, 0.000530, -0.000156, -0.007375, -0.010533, -0.002903, -0.005244, -0.003251, -0.000492, -0.000174, -0.004998}; + +void compute_band_energy_from_lpc(float *bandE, float g, const float *lpc) { + int i; + float sum[NB_BANDS] = {0}; + float x[WINDOW_SIZE]; + kiss_fft_cpx X[FREQ_SIZE]; + { + RNN_CLEAR(x, WINDOW_SIZE); + x[0] = 1; + //x[1] = -PREEMPHASIS; + for (i=0;ifeatures[k][2*NB_BANDS+3+j]*st->sig_mem[j]; + //printf("%f\n", pcm[k*FRAME_SIZE+i] - p); e = lin2ulaw(pcm[k*FRAME_SIZE+i] - p); /* Signal. */ data[4*i] = lin2ulaw(st->sig_mem[0]); @@ -100,7 +149,7 @@ void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *f st->sig_mem[0] = p + ulaw2lin(e); st->exc_mem = e; } - fwrite(data, 4*FRAME_SIZE, 1, file); + //fwrite(data, 4*FRAME_SIZE, 1, file); } } @@ -128,6 +177,7 @@ int main(int argc, char **argv) { FILE *fpcm=NULL; short pcm[FRAME_SIZE]={0}; short pcmbuf[FRAME_SIZE*4]={0}; + float xbuf[FRAME_SIZE*4]={0}; int noisebuf[FRAME_SIZE*4]={0}; short tmp[FRAME_SIZE] = {0}; float savedX[FRAME_SIZE] = {0}; @@ -140,7 +190,17 @@ int main(int argc, char **argv) { int training = -1; int encode = 0; int decode = 0; + int delay = TRAINING_OFFSET; int quantize = 0; + OpusEncoder *enc; + OpusDecoder *dec; + enc = opus_encoder_create(16000, 1, OPUS_APPLICATION_VOIP, NULL); + opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000)); + opus_encoder_ctl(enc, OPUS_SET_BANDWIDTH(OPUS_BANDWIDTH_WIDEBAND)); + opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&delay)); + delay = 160; + fprintf(stderr, "delay is %d\n", delay); + dec = opus_decoder_create(16000, 1, NULL); st = lpcnet_encoder_create(); if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1; if (argc == 5 && strcmp(argv[1], "-qtrain")==0) { @@ -242,32 +302,107 @@ int main(int argc, char **argv) { } biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE); biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE); - preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE); for (i=0;ipcount*FRAME_SIZE + i] = (1.f/32768.f)*x[i]; + //preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE); for (i=0;ipcount*FRAME_SIZE], pcm, FRAME_SIZE); + if (st->pcount == 1 || st->pcount == 3) { + unsigned char bytes[100]; + float pcm_dec[320]; + float data[4][19]; + float bandE[4][NB_BANDS]; + int nb_bytes; + int nb_samples; + int pick; + static float mem_preemph2 = 0; + nb_bytes = opus_encode_float(enc, &xbuf[(st->pcount-1)*FRAME_SIZE], 320, bytes, 100); + nb_samples = opus_decode_float(dec, bytes, nb_bytes, pcm_dec, 320, 0); + preemphasis(pcm_dec, &mem_preemph2, pcm_dec, PREEMPHASIS, 2*FRAME_SIZE); + if (nb_samples != 320) break; + for (i=0;i<320;i++) pcm_dec[i] *= 32768; + st->pcount--; + compute_frame_features(st, pcm_dec); + st->pcount++; + compute_frame_features(st, pcm_dec+160); + get_fdump(data); +#if 1 + for (i=0;i<4;i++) compute_band_energy_from_lpc(bandE[i], data[i][18], data[i]); + for (i=0;ifeatures[st->pcount-1][NB_BANDS], bandE[0]); + dct(&st->features[st->pcount][NB_BANDS] , bandE[2]); + st->features[st->pcount-1][NB_BANDS] -= 4; + st->features[st->pcount][NB_BANDS] -= 4; +#endif + pick = data[0][17] > data[1][17] ? 0 : 1; + st->features[st->pcount-1][36] = .02*(data[pick][16] - 100); + st->features[st->pcount-1][37] = data[pick][17] - .5; + pick = data[2][17] > data[3][17] ? 2 : 3; + st->features[st->pcount][36] = .02*(data[pick][16] - 100); + st->features[st->pcount][37] = data[pick][17] - .5; + + for (i=0;i<16;i++) st->features[st->pcount-1][39+i] = -data[0][i]; + for (i=0;i<16;i++) st->features[st->pcount][39+i] = -data[2][i]; + + //lpc_from_cepstrum(&st->features[st->pcount-1][2*NB_BANDS+3], st->features[st->pcount-1]); + //lpc_from_cepstrum(&st->features[st->pcount][2*NB_BANDS+3], st->features[st->pcount]); + //for (i=0;i<55;i++) printf("%f ", st->features[st->pcount-1][i]); + //for (i=0;i<55;i++) printf("%f ", st->features[st->pcount][i]); + //printf("\n"); + //printf("%f %f %f %f %f\n", st->features[st->pcount-1][37], data[1][16], data[3][16], 100+50*st->features[st->pcount-1][36], 100+50*st->features[st->pcount][36]); + } if (fpcm) { compute_noise(&noisebuf[st->pcount*FRAME_SIZE], noise_std); } st->pcount++; /* Running on groups of 4 frames. */ if (st->pcount == 4) { +#if 0 unsigned char buf[8]; process_superframe(st, buf, ffeat, encode, quantize); - if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm); +#else + float ftemp[55]; + static float fmem[55] = {0}; + static float last_pitch = 0; + for (i=3;i>=0;i--) { + if (st->features[i][36] > -1.99) last_pitch = st->features[i][36]; + else st->features[i][36] = last_pitch; + } + last_pitch = st->features[3][36]; +#if 0 + RNN_COPY(ftemp, &st->features[3][0], 55); + for (i=3;i>=1;i--) { + RNN_COPY(&st->features[i][NB_BANDS], &st->features[i-1][NB_BANDS], NB_BANDS+2); + } + RNN_COPY(&st->features[0][NB_BANDS], &fmem[NB_BANDS], NB_BANDS+2); + RNN_COPY(fmem, ftemp, 55); +#endif + for (i=0;i<4;i++) { + int j; + for (j=0;jfeatures[i][NB_BANDS+j] -= st->features[i][j]; + } + if (ffeat) { + for (i=0;i<4;i++) { + fwrite(st->features[i], sizeof(float), 38, ffeat); + } + } +#endif + if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm); st->pcount = 0; } //if (fpcm) fwrite(pcm, sizeof(short), FRAME_SIZE, fpcm); - for (i=0;i -#define SQUARE(x) ((x)*(x)) - -static const opus_int16 eband5ms[] = { +const opus_int16 eband5ms[] = { /*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k*/ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40 }; diff --git a/src/freq.h b/src/freq.h index b4fbbf08..a24db288 100644 --- a/src/freq.h +++ b/src/freq.h @@ -43,6 +43,10 @@ #define NB_BANDS 18 #define NB_BANDS_1 (NB_BANDS - 1) +#define SQUARE(x) ((x)*(x)) + +extern const opus_int16 eband5ms[]; + void compute_band_energy(float *bandE, const kiss_fft_cpx *X); void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P); diff --git a/src/lpcnet.c b/src/lpcnet.c index 2c9dc1dc..05824877 100644 --- a/src/lpcnet.c +++ b/src/lpcnet.c @@ -129,7 +129,7 @@ LPCNET_EXPORT void lpcnet_synthesize(LPCNetState *lpcnet, const float *features, int pitch; float pitch_gain; /* Matches the Python code -- the 0.1 avoids rounding issues. */ - pitch = (int)floor(.1 + 50*features[36]+100); + pitch = IMIN(255, (int)floor(.1 + 50*features[36]+100)); pitch_gain = lpcnet->old_gain[FEATURES_DELAY-1]; memmove(&lpcnet->old_gain[1], &lpcnet->old_gain[0], (FEATURES_DELAY-1)*sizeof(lpcnet->old_gain[0])); lpcnet->old_gain[0] = features[PITCH_GAIN_FEATURE]; @@ -137,6 +137,9 @@ LPCNET_EXPORT void lpcnet_synthesize(LPCNetState *lpcnet, const float *features, memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0])); memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0])); lpc_from_cepstrum(lpcnet->old_lpc[0], features); + //for (i=0;i<16;i++) printf("%f ", lpcnet->old_lpc[0][i]); + //printf("\n"); + if (lpcnet->frame_count <= FEATURES_DELAY) { RNN_CLEAR(output, N); diff --git a/src/lpcnet_demo.c b/src/lpcnet_demo.c index a838840b..1d44009a 100644 --- a/src/lpcnet_demo.c +++ b/src/lpcnet_demo.c @@ -109,13 +109,15 @@ int main(int argc, char **argv) { LPCNetState *net; net = lpcnet_create(); while (1) { + int i; float in_features[NB_TOTAL_FEATURES]; float features[NB_FEATURES]; short pcm[LPCNET_FRAME_SIZE]; fread(in_features, sizeof(features[0]), NB_TOTAL_FEATURES, fin); if (feof(fin)) break; RNN_COPY(features, in_features, NB_FEATURES); - RNN_CLEAR(&features[18], 18); + //for (i=0;i<16;i++) printf("%f ", in_features[NB_TOTAL_FEATURES-16+i]); + //RNN_CLEAR(&features[18], 18); lpcnet_synthesize(net, features, pcm, LPCNET_FRAME_SIZE); fwrite(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fout); } diff --git a/src/train_lpcnet.py b/src/train_lpcnet.py index 0b5e0ba2..7e625656 100755 --- a/src/train_lpcnet.py +++ b/src/train_lpcnet.py @@ -87,7 +87,7 @@ features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features)) features = features[:, :, :nb_used_features] -features[:,:,18:36] = 0 +#features[:,:,18:36] = 0 fpad1 = np.concatenate([features[0:1, 0:2, :], features[:-1, -2:, :]], axis=0) fpad2 = np.concatenate([features[1:, :2, :], features[0:1, -2:, :]], axis=0) @@ -95,6 +95,7 @@ periods = (.1 + 50*features[:,:,36:37]+100).astype('int16') +periods = np.minimum(periods, 255); in_data = np.concatenate([sig, pred, in_exc], axis=-1) @@ -103,7 +104,7 @@ del in_exc # dump models to disk as we go -checkpoint = ModelCheckpoint('lpcnet24g_384_10_G16_{epoch:02d}.h5') +checkpoint = ModelCheckpoint('lpcnet27b_384_10_G16_{epoch:02d}.h5') #Set this to True to adapt an existing model (e.g. on new data) adaptation = False @@ -121,4 +122,5 @@ decay = 5e-5 model.compile(optimizer=Adam(lr, amsgrad=True, decay=decay), loss='sparse_categorical_crossentropy') +model.save_weights('lpcnet27b_384_10_G16_00.h5'); model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, validation_split=0.0, callbacks=[checkpoint, sparsify])