Skip to content

[scripts] Rig combine_ali_dirs.sh to combine alignment lattices #3315

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 17, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 179 additions & 75 deletions egs/wsj/s5/steps/combine_ali_dirs.sh
Original file line number Diff line number Diff line change
@@ -1,105 +1,209 @@
#!/bin/bash
# Copyright 2016 Xiaohui Zhang Apache 2.0.
# Copyright 2019 SmartAction (kkm)

# This script operates on alignment directories, such as exp/tri4a_ali
# the output is a new ali dir which has alignments from all the input ali dirs
# This script combines alignment directories, such as exp/tri4a_ali, and
# validates matching of the utterances and alignments after combining.

# Begin configuration section.
cmd=run.pl
extra_files=
num_jobs=4
nj=4
combine_lat=true
combine_ali=true
tolerance=10
# End configuration section.
echo "$0 $@" # Print the command line for logging
echo "$0 $@" # Print the command line for logging.

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
[[ -f path.sh ]] && . ./path.sh
. parse_options.sh || exit 1

export LC_ALL=C

if [[ $# -lt 3 ]]; then
echo "Usage: $0 [options] <data> <dest-ali-dir> <src-ali-dir1> <src-ali-dir2> ..."
echo "e.g.: $0 --num-jobs 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2"
echo "Options:"
echo " --extra-files <file1 file2...> # specify addtional files in 'src-ali-dir1' to copy"
echo " --num-jobs <nj> # number of jobs used to split the data directory."
echo " Note, files that don't appear in the first source dir will not be added even if they appear in later ones."
echo " Other than alignments, only files from the first src ali dir are copied."
# Print the usage message to stderr. The delimiter is unquoted on purpose so
# that $0 expands to the name the script was invoked with.
cat >&2 <<EOF
Usage: $0 [options] <data> <dest-dir> <src-dir1> <src-dir2> ...
e.g.: $0 --nj 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp/tri3_ali_2
Options:
--nj <nj> # number of jobs to split combined archives [4]
--combine_ali false # merge ali.*.gz if present [true]
--combine_lat false # merge lat.*.gz if present [true]
--tolerance <int,%> # maximum percentage of missing alignments or lattices
# w.r.t. total utterances in <data> before error is
# reported [10]

The script checks that certain important files are present and compatible in all
source directories (phones.txt, tree); others are copied from the first source
(cmvn_opts, final.mdl) without much checking.

Both --combine_ali and --combine_lat are true by default, but the script
proceeds with a warning if directories do not contain either alignments or
alignment lattices. Check for files ali.1.gz and/or lat.1.gz in the <dest-dir>
after the script completes if an additional programmatic check is required.
EOF
exit 1;
fi

data=$1;
shift;
dest=$1;
shift;
first_src=$1;

mkdir -p $dest;
rm $dest/{ali.*.gz,num_jobs} 2>/dev/null

cp $first_src/phones.txt $dest 2>/dev/null

export LC_ALL=C
# Refuse to run when the user disabled both kinds of combining.
# The options are expected to hold the strings 'true' or 'false' and are run
# as commands here, the same way $do_ali/$do_lat are used further down. A
# '[[ ! $var ]]' test only checks for an empty string, so it would never fire
# for the non-empty value 'false'.
if ! $combine_lat && ! $combine_ali; then
  echo "$0: at least one of --combine_lat and --combine_ali must be true"
  exit 1
fi

for dir in $*; do
if [ ! -f $dir/ali.1.gz ]; then
echo "$0: check if alignments (ali.*.gz) are present in $dir."
exit 1;
data=$1
dest=$2
shift 2
first_src=$1

do_ali=$combine_ali
do_lat=$combine_lat

# Check if alignments and/or lattices are present. Since we combine both,
# whichever present, issue a warning only. Also verify that the target is
# different from any source; we cannot combine in-place, and a lot of damage
# could result.
for src in "$@"; do
  # Resolve both paths to physical directories before comparing them. A source
  # that cannot be entered resolves to an empty string, which would compare
  # equal to a not yet existing $dest and be misreported as "same as target",
  # so reject it explicitly first.
  src_real=$(cd 2>/dev/null -P -- "$src" && pwd)
  if [[ -z $src_real ]]; then
    echo "$0: error: Source directory $src does not exist or is not accessible."
    exit 1
  fi
  dest_real=$(cd 2>/dev/null -P -- "$dest" && pwd)
  if [[ $src_real = "$dest_real" ]]; then
    echo "$0: error: Source $src is same as target $dest."
    exit 1
  fi
  # A single source without ali.1.gz turns alignment combining off for the
  # whole run; this is a warning, not an error.
  if $do_ali && [[ ! -f $src/ali.1.gz ]]; then
    echo "$0: warning: Alignments (ali.*.gz) are not present in $src, not" \
         "combining. Consider '--combine_ali false' to suppress this warning."
    do_ali=false
  fi
  # Same rule for lattices.
  if $do_lat && [[ ! -f $src/lat.1.gz ]]; then
    echo "$0: warning: Alignment lattices (lat.*.gz) are not present in $src,"\
         "not combining. Consider '--combine_lat false' to suppress this warning."
    do_lat=false
  fi
done

for dir in $*; do
for f in tree; do
diff $first_src/$f $dir/$f 1>/dev/null 2>&1
if [ $? -ne 0 ]; then
echo "$0: Cannot combine alignment directories with different $f files."
fi
done
done
if ! $do_ali && ! $do_lat; then
echo "$0: error: Cannot combine directories."
exit 1
fi

for f in final.mdl tree cmvn_opts num_jobs $extra_files; do
# Verify that required files are present in the first directory.
for f in cmvn_opts final.mdl num_jobs phones.txt tree; do
if [ ! -f $first_src/$f ]; then
echo "combine_ali_dir.sh: no such file $first_src/$f"
exit 1;
echo "$0: error: Required source file $first_src/$f is missing."
exit 1
fi
cp $first_src/$f $dest/
done

src_id=0
temp_dir=$dest/temp
[ -d $temp_dir ] && rm -r $temp_dir;
mkdir -p $temp_dir
echo "$0: dumping alignments in each source directory as single archive and index."
for dir in $*; do
src_id=$((src_id + 1))
cur_num_jobs=$(cat $dir/num_jobs) || exit 1;
alis=$(for n in $(seq $cur_num_jobs); do echo -n "$dir/ali.$n.gz "; done)
$cmd $dir/log/copy_alignments.log \
copy-int-vector "ark:gunzip -c $alis|" \
ark,scp:$temp_dir/ali.$src_id.ark,$temp_dir/ali.$src_id.scp || exit 1;
# Verify that phones and trees are compatible in all directories, and that
# num_jobs files are present, too.
for src in "$@"; do
  # The first source is the reference, so it is not compared with itself. The
  # right-hand side is quoted so that it is matched literally rather than as a
  # glob pattern.
  if [[ $src != "$first_src" ]]; then
    if [[ ! -f $src/num_jobs ]]; then
      echo "$0: error: Required source file $src/num_jobs is missing."
      exit 1
    fi
    # 'cmp -s' is silent and fails both on a difference and on a missing file.
    if ! cmp -s "$first_src/tree" "$src/tree"; then
      echo "$0: error: tree $src/tree is either missing or not the" \
           "same as $first_src/tree."
      exit 1
    fi
    if [[ ! -f $src/phones.txt ]]; then
      echo "$0: error: Required source file $src/phones.txt is missing."
      exit 1
    fi
    utils/lang/check_phones_compatible.sh "$first_src/phones.txt" \
      "$src/phones.txt" || exit 1
  fi
done
sort -m $temp_dir/ali.*.scp > $temp_dir/ali.scp || exit 1;

echo "$0: splitting data to get reference utt2spk for individual ali.JOB.gz files."
utils/split_data.sh $data $num_jobs || exit 1;
# All checks passed, ok to prepare directory. Copy model and other files from
# the first source, as they are either checked to be compatible, or we do not
# care if they are.
mkdir -p $dest || exit 1
rm -f $dest/{cmvn_opts,final.mdl,num_jobs,phones.txt,tree}
$do_ali && rm -f $dest/ali.*.{gz,scp}
$do_lat && rm -f $dest/lat.*.{gz,scp}
cp $first_src/{cmvn_opts,final.mdl,phones.txt,tree} $dest/ || exit 1
cp $first_src/frame_subsampling_factor $dest/ 2>/dev/null # If present.
echo $nj > $dest/num_jobs || exit 1

# Make temporary directory, delete on signal, but not on 'exit 1'.
temp_dir=$(mktemp -d $dest/temp.XXXXXX) || exit 1
# Remove the temporary directory. Not hooked to EXIT, so that the directory
# survives an 'exit 1' and can be examined after a failure.
cleanup() { rm -rf -- "$temp_dir"; }
# Signal handler: clean up and terminate. A handler that merely returns would
# let the script carry on after the signal with its temporary directory gone.
on_signal() { cleanup; exit 1; }
trap on_signal HUP INT TERM
echo "$0: note: Temporary directory $temp_dir will not be deleted in case of" \
"script failure, so you could examine it for troubleshooting."


# This function may be called twice, once to combine alignments and the second
# time to combine lattices. The two invocations are as follows:
# do_combine ali alignments copy-int-vector $@
# do_combine lat lattices lattice-copy $@
# where 'ali'/'lat' is a prefix to archive name, 'alignments'/'lattices' go into
# log messages and logfile names, and 'copy-int-vector'/'lattice-copy' is the
# program used to copy corresponding objects.
do_combine() {
local ark=$1 entities=$2 copy_program=$3
shift 3

echo "$0: Gathering $entities from each source directory."
# Assign all source gzipped archive names to an exported variable, one each
# per source directory, so that we can copy archives in a job per source.
src_id=0
for src in $@; do
src_id=$((src_id + 1))
nj_src=$(cat $src/num_jobs) || exit 1
# Create and export variable src_arks_${src_id} for the job runner.
# Each numbered variable will contain the list of archives, e. g.:
# src_arks_1="exp/tri3_ali/ali.1.gz exp/tri3_ali/ali.2.gz ..."
# ('printf' repeats its format as long as there are more arguments).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the -v option for printf on my mac.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be bash 3.2? I try to verify all stuff on docker with it, checks out here. Could you please try if printf -v foo 'bar' works?

$ docker run --rm -it bash:3.2
bash-3.2# printf -v foo 'bar'
bash-3.2# echo $foo
bar

printf -v src_arks_${src_id} "$src/$ark.%d.gz " $(seq $nj_src)
export src_arks_${src_id}
done

echo "$0: splitting the alignments to appropriate chunks according to the reference utt2spk files."
utils/filter_scps.pl JOB=1:$num_jobs \
$data/split$num_jobs/JOB/utt2spk $temp_dir/ali.scp $temp_dir/ali.JOB.scp
# Gather archives in parallel jobs.
$cmd JOB=1:$src_id $dest/log/gather_$entities.JOB.log \
$copy_program \
"ark:gunzip -c \${src_arks_JOB} |" \
"ark,scp:$temp_dir/$ark.JOB.ark,$temp_dir/$ark.JOB.scp" || exit 1

# Merge (presumed already sorted) scp's into a single script.
sort -m $temp_dir/$ark.*.scp > $temp_dir/$ark.scp || exit 1

echo "$0: Splitting combined $entities into $nj archives on speaker boundary."
$cmd JOB=1:$nj $dest/log/chop_combined_$entities.JOB.log \
$copy_program \
"scp:utils/split_scp.pl --utt2spk=$data/utt2spk --one-based -j $nj JOB $temp_dir/$ark.scp |" \
"ark:| gzip -c > $dest/$ark.JOB.gz" || exit 1

# Get some interesting stats, and signal an error if error threshold exceeded.
n_utt=$(wc -l <$data/utt2spk)
n_ali=$(wc -l <$temp_dir/$ark.scp)
n_ali_no_utt=$(join -j1 -v2 $data/utt2spk $temp_dir/$ark.scp | wc -l)
n_utt_no_ali=$(join -j1 -v1 $data/utt2spk $temp_dir/$ark.scp | wc -l)
n_utt_no_ali_pct=$(perl -e "print int($n_utt_no_ali/$n_utt * 100 + .5);")
echo "$0: Combined $n_ali $entities for $n_utt utterances." \
"There were $n_utt_no_ali utterances (${n_utt_no_ali_pct}%) without" \
"$entities, and $n_ali_no_utt $entities not matching any utterance."

if (( $n_utt_no_ali_pct >= $tolerance )); then
echo "$0: error: Percentage of utterances missing $entities," \
"${n_utt_no_ali_pct}%, is at or above error tolerance ${tolerance}%."
exit 1
fi

for i in `seq 1 $num_jobs`; do
copy-int-vector scp:$temp_dir/ali.${i}.scp "ark:|gzip -c >$dest/ali.$i.gz" || exit 1;
done
return 0
}

echo $num_jobs > $dest/num_jobs || exit 1
# Do the actual combining. Do not check returned exit code, as
# the function always calls 'exit 1' on failure.
$do_ali && do_combine ali 'alignments' copy-int-vector "$@"
$do_lat && do_combine lat 'lattices' lattice-copy "$@"

echo "$0: checking the alignment files generated have at least 90% of the utterances."
for i in `seq 1 $num_jobs`; do
num_lines=`cat $temp_dir/ali.$i.scp | wc -l` || exit 1;
num_lines_tot=`cat $data/split$num_jobs/$i/utt2spk | wc -l` || exit 1;
python -c "import sys;
percent = 100.0 * float($num_lines) / $num_lines_tot
if percent < 90 :
print ('$dest/ali.$i.gz {0}% utterances missing.'.format(percent))" || exit 1;
done
rm -r $temp_dir 2>/dev/null
# Delete the temporary directory on success.
cleanup

echo "Combined alignments and stored in $dest"
# Describe what was combined ("alignments ", "lattices " or both joined by
# "and ") for the final status line. The message deliberately has no trailing
# period, as it interferes with copy/paste from a tty emulator.
what=
if $do_ali; then
  what+='alignments '
fi
if $do_ali && $do_lat; then
  what+='and '
fi
if $do_lat; then
  what+='lattices '
fi
echo "$0: Stored combined ${what}in $dest"
exit 0
1 change: 1 addition & 0 deletions egs/wsj/s5/steps/combine_lat_dirs.sh