You need to write a program that generates the data files required for decoding (utt2spk and wav.scp). You do not need the file named text, since you do not know in advance what is said in the audio files. Then run a decoding script like the one below; afterwards you can read the results from the log files in exp/mono/decode/log and exp/tri1/decode/log.
#!/bin/bash
# Decode-only pipeline for data/test:
#   1. fix and validate the data directory,
#   2. extract MFCC features and compute CMVN statistics,
#   3. decode with the existing exp/mono and exp/tri1 graphs.
# Must be run from the recipe directory that holds path.sh, cmd.sh, utils/,
# steps/ and conf/decode.config. Exits non-zero if any stage fails.

. ./path.sh || exit 1
. ./cmd.sh || exit 1 # expected to define train_cmd and decode_cmd used below

nj=1 # number of parallel jobs - 1 is perfect for such a small data set
data=data/test
mfccdir=mfcc
mfcc_logdir=exp/make_mfcc/test

# Print a diagnostic on stderr and abort the whole run.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the stage header (blank line, title, blank line) and a timestamp.
banner() {
  echo
  echo "===== $1 ====="
  echo
  date
}

validate_opts=(--no-feats)
decode_opts=(--config conf/decode.config --nj "$nj" --cmd "$decode_cmd")
if [[ ! -f "$data/text" ]]; then
  # No reference transcript: validation must not require it, and decoding
  # cannot be scored against it. The recognised words are still written to
  # the decode logs.
  validate_opts+=(--no-text)
  decode_opts+=(--skip-scoring true)
fi

banner "FEATURES EXTRACTION"
# Fix before validating: in stock Kaldi, fix_data_dir.sh sorts the files and
# rebuilds spk2utt from utt2spk, both of which validate_data_dir.sh checks.
utils/fix_data_dir.sh "$data" || die "fix_data_dir.sh failed for $data"
utils/validate_data_dir.sh "${validate_opts[@]}" "$data" \
  || die "validate_data_dir.sh failed for $data"
# Making feats.scp files
steps/make_mfcc.sh --nj "$nj" --cmd "$train_cmd" \
  "$data" "$mfcc_logdir" "$mfccdir" \
  || die "make_mfcc.sh failed for $data"
# Making cmvn.scp files
steps/compute_cmvn_stats.sh "$data" "$mfcc_logdir" "$mfccdir" \
  || die "compute_cmvn_stats.sh failed for $data"

# The two decoding passes are independent, so a failure in one does not stop
# the other; the failure is still reflected in the final exit status.
status=0

banner "MONO DECODING"
if ! steps/decode.sh "${decode_opts[@]}" exp/mono/graph "$data" exp/mono/decode; then
  printf '%s\n' "mono decoding failed" >&2
  status=1
fi

banner "TRI1 (first triphone pass) DECODING"
if ! steps/decode.sh "${decode_opts[@]}" exp/tri1/graph "$data" exp/tri1/decode; then
  printf '%s\n' "tri1 decoding failed" >&2
  status=1
fi

exit "$status"