#!/bin/bash

# set -euo pipefail: exit immediately if any command fails, treat unset
# variables as errors, and propagate failures through pipes.
set -euo pipefail

# First the options that are passed through to run_ivector_common.sh
# (some of which are also used in this script directly).
stage=0
nj=96
train_set=train_worn_u100k
test_sets="dev_worn dev_beamformit_ref"
gmm=tri3
nnet3_affix=_train_worn_u100k
lm_suffix=

# The rest are configs specific to this script. Most of the parameters
# are just hardcoded at this level, in the commands below.
affix=1b   # affix for the TDNN directory name
tree_affix=
train_stage=-10
get_egs_stage=-10
decode_iter=

# training chunk-options
chunk_width=140,100,160
common_egs_dir=
xent_regularize=0.1

# training options
srand=0
remove_egs=true

# decode options
test_online_decoding=false  # if true, it will run the last decoding stage.

# End configuration section.
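
# Example invocation (illustrative; adjust the path to wherever this script
# actually lives in your recipe, e.g. local/chain/run_tdnn.sh):
#   local/chain/run_tdnn.sh --stage 0 --train-set train_worn_u100k \
#     --test-sets "dev_worn dev_beamformit_ref" --gmm tri3
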
echo "$0 $@"  # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 11" if you have already
# run those things.
local/nnet3/run_ivector_common.sh --stage $stage \
                                  --train-set $train_set \
                                  --test-sets "$test_sets" \
                                  --gmm $gmm \
                                  --nnet3-affix "$nnet3_affix" || exit 1;
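
# At this point run_ivector_common.sh is expected to have created the
# speed-perturbed low-resolution and hires features, the GMM alignments of the
# speed-perturbed data, and the online iVectors; the directory names below
# (and the file checks that follow) rely on those outputs.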

# Problem: We have removed the "train_" prefix of our training set in
# the alignment directory names! Bad!
gmm_dir=exp/$gmm
ali_dir=exp/${gmm}_ali_${train_set}_sp
tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
lang=data/lang_chain
lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
dir=exp/chain${nnet3_affix}/tdnn${affix}_sp
train_data_dir=data/${train_set}_sp_hires
lores_train_data_dir=data/${train_set}_sp
train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires

for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \
    $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do
  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
done

if [ $stage -le 10 ]; then
  echo "$0: creating lang directory $lang with chain-type topology"
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file.  [Note: it really has two states; the first one is only repeated
  # once, and the second one has zero or more repeats.]
  if [ -d $lang ]; then
    if [ $lang/L.fst -nt data/lang/L.fst ]; then
      echo "$0: $lang already exists, not overwriting it; continuing"
    else
      echo "$0: $lang already exists and seems to be older than data/lang..."
      echo " ... not sure what to do. Exiting."
      exit 1;
    fi
  else
    cp -r data/lang $lang
    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
    # Use our special topology... note that we may have to tune this
    # topology later on.
    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
  fi
fi

if [ $stage -le 11 ]; then
  # Get the alignments as lattices (gives the chain training more freedom).
  # Use the same num-jobs as the alignments.
  steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \
    data/lang $gmm_dir $lat_dir
  rm $lat_dir/fsts.*.gz  # save space
fi

if [ $stage -le 12 ]; then
  # Build a tree using our new topology.  We know we have alignments for the
  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
  # those.  The num-leaves is always somewhat less than the num-leaves from
  # the GMM baseline.
  if [ -f $tree_dir/final.mdl ]; then
    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
    exit 1;
  fi
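  # --frame-subsampling-factor 3: the 'chain' model is evaluated at one third of
  # the input frame rate, so the tree and alignments are built at that rate too.
  # --context-opts restricts the tree to left-biphone context, which is the
  # usual choice for chain systems.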
  steps/nnet3/chain/build_tree.sh \
    --frame-subsampling-factor 3 \
    --context-opts "--context-width=2 --central-position=1" \
    --cmd "$train_cmd" 3500 ${lores_train_data_dir} \
    $lang $ali_dir $tree_dir
fi


if [ $stage -le 13 ]; then
  mkdir -p $dir
  echo "$0: creating neural net configs using the xconfig parser";

  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
  learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)
  opts="l2-regularize=0.05"
  output_opts="l2-regularize=0.01 bottleneck-dim=320"

  mkdir -p $dir/configs
  cat <<EOF > $dir/configs/network.xconfig
  input dim=100 name=ivector
  input dim=40 name=input

  # Please note that it is important to have an input layer with the name=input
  # as the layer immediately preceding the fixed-affine-layer, to enable
  # the use of the short notation for the descriptor.
  fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat

  # The first splicing is moved before the lda layer, so no splicing here.
  relu-batchnorm-layer name=tdnn1 $opts dim=512
  relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1)
  relu-batchnorm-layer name=tdnn3 $opts dim=512
  relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1)
  relu-batchnorm-layer name=tdnn5 $opts dim=512
  relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3)
  relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3)
  relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0)

  ## Adding the layers for the chain branch.
  relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5
  output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5

  # Adding the layers for the xent branch.
  # This block adds the configs for a separate output that will be
  # trained with a cross-entropy objective in the 'chain' models; this
  # has the effect of regularizing the hidden parts of the model.  We use
  # 0.5 / xent_regularize as the learning-rate factor: this means the xent
  # final layer learns at a rate independent of the regularization
  # constant, and the 0.5 was tuned so as to make the relative progress
  # similar in the xent and regular final layers.
  relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5
  output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5
EOF
  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
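  # xconfig_to_configs.py expands the xconfig above into the actual nnet3
  # config files under $dir/configs (e.g. final.config); the lda.mat referenced
  # by the fixed-affine-layer is estimated later by the training script.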
fi

if [ $stage -le 14 ]; then
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
    utils/create_split_dir.pl \
      /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
  fi
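  # (The create_split_dir.pl call above is specific to the JHU CLSP grid: it
  # just spreads the egs over several disks.  Elsewhere it is skipped and the
  # egs are written directly under $dir/egs.)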

  steps/nnet3/chain/train.py --stage=$train_stage \
    --cmd="$decode_cmd" \
    --feat.online-ivector-dir=$train_ivector_dir \
    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
    --chain.xent-regularize $xent_regularize \
    --chain.leaky-hmm-coefficient=0.1 \
    --chain.l2-regularize=0.00005 \
    --chain.apply-deriv-weights=false \
    --chain.lm-opts="--num-extra-lm-states=2000" \
    --trainer.srand=$srand \
    --trainer.max-param-change=2.0 \
    --trainer.num-epochs=10 \
    --trainer.frames-per-iter=3000000 \
    --trainer.optimization.num-jobs-initial=2 \
    --trainer.optimization.num-jobs-final=4 \
    --trainer.optimization.initial-effective-lrate=0.001 \
    --trainer.optimization.final-effective-lrate=0.0001 \
    --trainer.optimization.shrink-value=1.0 \
    --trainer.num-chunk-per-minibatch=256,128,64 \
    --trainer.optimization.momentum=0.0 \
    --egs.chunk-width=$chunk_width \
    --egs.dir="$common_egs_dir" \
    --egs.opts="--frames-overlap-per-eg 0" \
    --cleanup.remove-egs=$remove_egs \
    --use-gpu=true \
    --feat-dir=$train_data_dir \
    --tree-dir=$tree_dir \
    --lat-dir=$lat_dir \
    --dir=$dir || exit 1;
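
  # Note: num-jobs-initial/final control how many parallel (GPU) training jobs
  # are used at the start and end of training; if you have fewer GPUs, reduce
  # them, but the effective learning rates may then need retuning.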
fi

if [ $stage -le 15 ]; then
  # Note: it's not important to give mkgraph.sh the lang directory with the
  # matched topology (since it gets the topology file from the model).
  utils/mkgraph.sh \
    --self-loop-scale 1.0 data/lang${lm_suffix}/ \
    $tree_dir $tree_dir/graph${lm_suffix} || exit 1;
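  # (--self-loop-scale 1.0 is the value required for 'chain' graphs; the
  # conventional 0.1 only applies to non-chain systems.)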
fi

if [ $stage -le 16 ]; then
  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
  rm $dir/.error 2>/dev/null || true
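
  # For 'chain' models we decode with --acwt 1.0 and then scale the lattice
  # acoustic scores by 10 (--post-decode-acwt 10.0), so the usual language-model
  # weight range still applies at scoring time.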
  for data in $test_sets; do
    (
      steps/nnet3/decode.sh \
        --acwt 1.0 --post-decode-acwt 10.0 \
        --frames-per-chunk $frames_per_chunk \
        --nj 8 --cmd "$decode_cmd" --num-threads 4 \
        --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
        $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1
    ) || touch $dir/.error &
  done
  wait
  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
fi

# Not testing the 'looped' decoding separately, because for
# TDNN systems it would give exactly the same results as the
# normal decoding.

if $test_online_decoding && [ $stage -le 17 ]; then
  # Note: if the features change (e.g. you add pitch features), you will have to
  # change the options of the following command line.
  steps/online/nnet3/prepare_online_decoding.sh \
    --mfcc-config conf/mfcc_hires.conf \
    $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online
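  # prepare_online_decoding.sh bundles the final model, the iVector extractor
  # and the feature-extraction config into ${dir}_online, so the decoding below
  # can run directly from the wav data without pre-dumped features.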

  rm $dir/.error 2>/dev/null || true

  for data in $test_sets; do
    (
      nspk=$(wc -l <data/${data}_hires/spk2utt)
      # Note: we just give it "data/${data}" as it only uses the wav.scp; the
      # feature type does not matter.
      steps/online/nnet3/decode.sh \
        --acwt 1.0 --post-decode-acwt 10.0 \
        --nj $nspk --cmd "$decode_cmd" \
        $tree_dir/graph${lm_suffix} data/${data} ${dir}_online/decode${lm_suffix}_${data} || exit 1
    ) || touch $dir/.error &
  done
  wait
  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
fi


exit 0;