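This is the configuration (the command-line arguments passed to steps/nnet3/chain/train.py, followed by the parsed arguments it logged):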
['steps/nnet3/chain/train.py', '--stage', '-10', '--cmd', '
run.pl', '--feat.online-ivector-dir', 'exp/nnet3/ivectors_train_sp_hires', '--feat.cmvn-opts', '--norm-means=false --norm-vars=false', '--chain.xent-regularize', '0.1', '--chain.leaky-hmm-coefficient', '0.1', '--chain.l2-regularize', '0.0', '--chain.apply-deriv-weights', 'false', '--chain.lm-opts=--num-extra-lm-states=2000', '--trainer.dropout-schedule', '0,0...@0.20,0...@0.50,0', '--trainer.srand=0', '--trainer.max-param-change=2.0', '--trainer.num-epochs', '6', '--trainer.frames-per-iter', '1500000', '--trainer.optimization.num-jobs-initial', '1', '--trainer.optimization.num-jobs-final', '1', '--trainer.optimization.initial-effective-lrate', '0.00025', '--trainer.optimization.final-effective-lrate', '0.000025', '--trainer.num-chunk-per-minibatch=64,32', '--trainer.add-option=--optimization.memory-compression-level=2', '--egs.chunk-width=150,110,100', '--use-gpu=wait', '--egs.dir=', '--egs.opts', '--frames-overlap-per-eg 0 --constrained false', '--egs.stage', '-10', '--reporting.email=', '--cleanup.remove-egs=true', '--feat-dir=data/train_sp_hires', '--tree-dir', 'exp/chain/tree_a_sp', '--lat-dir=exp/chain/tri3b_train_sp_lats', '--dir', 'exp/chain/tdnn_1a_sp']
2023-10-24 00:51:00,891 [/s5j-stable/steps/nnet3/chain/train.py:284 - train - INFO ] Arguments for the experiment
{'alignment_subsampling_factor': 3,
'apply_deriv_weights': False,
'backstitch_training_interval': 1,
'backstitch_training_scale': 0.0,
'chain_opts': '',
'chunk_left_context': 0,
'chunk_left_context_initial': -1,
'chunk_right_context': 0,
'chunk_right_context_final': -1,
'chunk_width': '150,110,100',
'cleanup': True,
'cmvn_opts': '--norm-means=false --norm-vars=false',
'combine_sum_to_one_penalty': 0.0,
'command': '
run.pl',
'compute_per_dim_accuracy': False,
'deriv_truncate_margin': None,
'dir': 'exp/chain/tdnn_1a_sp',
'do_final_combination': True,
'dropout_schedule': '0,0...@0.20,0...@0.50,0',
'egs_command': None,
'egs_dir': None,
'egs_nj': 0,
'egs_opts': '--frames-overlap-per-eg 0 --constrained false',
'egs_stage': -10,
'email': None,
'exit_stage': None,
'feat_dir': 'data/train_sp_hires',
'final_effective_lrate': 2.5e-05,
'frame_subsampling_factor': 3,
'frames_per_iter': 1500000,
'initial_effective_lrate': 0.00025,
'input_model': None,
'l2_regularize': 0.0,
'lat_dir': 'exp/chain/tri3b_train_sp_lats',
'leaky_hmm_coefficient': 0.1,
'left_deriv_truncate': None,
'left_tolerance': 5,
'lm_opts': '--num-extra-lm-states=2000',
'max_lda_jobs': 10,
'max_models_combine': 20,
'max_objective_evaluations': 30,
'max_param_change': 2.0,
'momentum': 0.0,
'num_chunk_per_minibatch': '64,32',
'num_epochs': 6.0,
'num_jobs_final': 1,
'num_jobs_initial': 1,
'num_jobs_step': 1,
'online_ivector_dir': 'exp/nnet3/ivectors_train_sp_hires',
'preserve_model_interval': 100,
'presoftmax_prior_scale_power': -0.25,
'proportional_shrink': 0.0,
'rand_prune': 4.0,
'remove_egs': True,
'reporting_interval': 0.1,
'right_tolerance': 5,
'samples_per_iter': 400000,
'shrink_saturation_threshold': 0.4,
'shrink_value': 1.0,
'shuffle_buffer_size': 5000,
'srand': 0,
'stage': -10,
'train_opts': ['--optimization.memory-compression-level=2'],
'tree_dir': 'exp/chain/tree_a_sp',
'use_gpu': 'wait',
'xent_regularize': 0.1}
----------------------------------
But I get the following error, where nnet3-chain-train aborts with an assertion failure:
# nnet3-chain-train --use-gpu=wait --apply-deriv-weights=False --l2-regularize=0.0 --leaky-hmm-coefficient=0.1 --write-cache=exp/chain/tdnn_1a_sp/cache.1 --xent-regularize=0.1 --print-interval=10 --momentum=0.0 --max-param-change=1.414213562373095 --backstitch-training-scale=0.0 --backstitch-training-interval=1 --l2-regularize-factor=1.0 --optimization.memory-compression-level=2 --srand=0 "nnet3-am-copy --raw=true --learning-rate=0.00025 --scale=1.0 exp/chain/tdnn_1a_sp/0.mdl - |nnet3-copy --edits='set-dropout-proportion name=* proportion=0.0' - - |" exp/chain/tdnn_1a_sp/den.fst "ark,bg:nnet3-chain-copy-egs --frame-shift=1 ark:exp/chain/tdnn_1a_sp/egs/cegs.1.ark ark:- | nnet3-chain-shuffle-egs --buffer-size=5000 --srand=0 ark:- ark:- | nnet3-chain-merge-egs --minibatch-size=32,16 ark:- ark:- |" exp/chain/tdnn_1a_sp/1.1.raw
# Started at Tue Oct 24 00:59:45 +03 2023
#
nnet3-chain-train --use-gpu=wait --apply-deriv-weights=False --l2-regularize=0.0 --leaky-hmm-coefficient=0.1 --write-cache=exp/chain/tdnn_1a_sp/cache.1 --xent-regularize=0.1 --print-interval=10 --momentum=0.0 --max-param-change=1.414213562373095 --backstitch-training-scale=0.0 --backstitch-training-interval=1 --l2-regularize-factor=1.0 --optimization.memory-compression-level=2 --srand=0 "nnet3-am-copy --raw=true --learning-rate=0.00025 --scale=1.0 exp/chain/tdnn_1a_sp/0.mdl - |nnet3-copy --edits='set-dropout-proportion name=* proportion=0.0' - - |" exp/chain/tdnn_1a_sp/den.fst 'ark,bg:nnet3-chain-copy-egs --frame-shift=1 ark:exp/chain/tdnn_1a_sp/egs/cegs.1.ark ark:- | nnet3-chain-shuffle-egs --buffer-size=5000 --srand=0 ark:- ark:- | nnet3-chain-merge-egs --minibatch-size=32,16 ark:- ark:- |' exp/chain/tdnn_1a_sp/1.1.raw
WARNING (nnet3-chain-train[5.5.1077~1-ed31c6]:SelectGpuId():cu-device.cc:229) Waited 0 seconds before creating CUDA context
WARNING (nnet3-chain-train[5.5.1077~1-ed31c6]:SelectGpuId():cu-device.cc:243) Not in compute-exclusive mode. Suggestion: use 'nvidia-smi -c 3' to set compute exclusive mode
LOG (nnet3-chain-train[5.5.1077~1-ed31c6]:SelectGpuIdAuto():cu-device.cc:438) Selecting from 1 GPUs
LOG (nnet3-chain-train[5.5.1077~1-ed31c6]:SelectGpuIdAuto():cu-device.cc:453) cudaSetDevice(0): NVIDIA GeForce RTX 2060 SUPER free:7447M, used:519M, total:7966M, free/total:0.934845
LOG (nnet3-chain-train[5.5.1077~1-ed31c6]:SelectGpuIdAuto():cu-device.cc:501) Device: 0, mem_ratio: 0.934845
LOG (nnet3-chain-train[5.5.1077~1-ed31c6]:SelectGpuId():cu-device.cc:382) Trying to select device: 0
LOG (nnet3-chain-train[5.5.1077~1-ed31c6]:SelectGpuIdAuto():cu-device.cc:511) Success selecting device 0 free mem ratio: 0.934845
LOG (nnet3-chain-train[5.5.1077~1-ed31c6]:FinalizeActiveGpu():cu-device.cc:338) The active GPU is [0]: NVIDIA GeForce RTX 2060 SUPER free:7411M, used:555M, total:7966M, free/total:0.930326 version 7.5
nnet3-am-copy --raw=true --learning-rate=0.00025 --scale=1.0 exp/chain/tdnn_1a_sp/0.mdl -
nnet3-copy '--edits=set-dropout-proportion name=* proportion=0.0' - -
LOG (nnet3-am-copy[5.5.1077~1-ed31c6]:main():nnet3-am-copy.cc:153) Copied neural net from exp/chain/tdnn_1a_sp/0.mdl to raw format as -
LOG (nnet3-copy[5.5.1077~1-ed31c6]:ReadEditConfig():nnet-utils.cc:1413) Set dropout proportions for 15 components.
LOG (nnet3-copy[5.5.1077~1-ed31c6]:main():nnet3-copy.cc:123) Copied raw neural net from - to -
nnet3-chain-merge-egs --minibatch-size=32,16 ark:- ark:-
nnet3-chain-copy-egs --frame-shift=1 ark:exp/chain/tdnn_1a_sp/egs/cegs.1.ark ark:-
nnet3-chain-shuffle-egs --buffer-size=5000 --srand=0 ark:- ark:-
LOG (nnet3-chain-train[5.5.1077~1-ed31c6]:ComputeChainObjfAndDerivE2e():chain-training.cc:162) Numerator objf: -0.872647
WARNING (nnet3-chain-train[5.5.1077~1-ed31c6]:ConstrainOrthonormalInternal():nnet-utils.cc:1060) Ratio is 1.10451, multiplying update speed (currently 0.125) by 0.5; component is tdnnf5.linear
WARNING (nnet3-chain-train[5.5.1077~1-ed31c6]:ConstrainOrthonormalInternal():nnet-utils.cc:1060) Ratio is 1.05015, multiplying update speed (currently 0.125) by 0.5; component is tdnnf6.linear
WARNING (nnet3-chain-train[5.5.1077~1-ed31c6]:ConstrainOrthonormalInternal():nnet-utils.cc:1060) Ratio is 1.04932, multiplying update speed (currently 0.125) by 0.5; component is tdnnf10.linear
WARNING (nnet3-chain-train[5.5.1077~1-ed31c6]:ConstrainOrthonormalInternal():nnet-utils.cc:1060) Ratio is 1.04964, multiplying update speed (currently 0.125) by 0.5; component is tdnnf11.linear
WARNING (nnet3-chain-train[5.5.1077~1-ed31c6]:ConstrainOrthonormalInternal():nnet-utils.cc:1060) Ratio is 1.16631, multiplying update speed (currently 0.125) by 0.5; component is prefinal-l
WARNING (nnet3-chain-train[5.5.1077~1-ed31c6]:ConstrainOrthonormalInternal():nnet-utils.cc:1060) Ratio is 1.16674, multiplying update speed (currently 0.125) by 0.5; component is prefinal-chain.linear
LOG (nnet3-chain-train[5.5.1077~1-ed31c6]:ComputeChainObjfAndDerivE2e():chain-training.cc:162) Numerator objf: -0.86498
ASSERTION_FAILED (nnet3-chain-train[5.5.1077~1-ed31c6]:HouseBackward():qr.cc:123) Assertion failed: (KALDI_ISFINITE(sigma) && "Tridiagonalizing matrix that is too large or has NaNs.")
[ Stack-Trace: ]
nnet3-chain-train(kaldi::MessageLogger::LogMessage() const+0x70c) [0x55955063f480]
nnet3-chain-train(kaldi::KaldiAssertFailure_(char const*, char const*, int, char const*)+0x72) [0x55955063fe0b]
nnet3-chain-train(void kaldi::HouseBackward<float>(int, float const*, float*, float*)+0x168) [0x559550628249]
nnet3-chain-train(kaldi::SpMatrix<float>::Tridiagonalize(kaldi::MatrixBase<float>*)+0x2e8) [0x55955062870e]
nnet3-chain-train(kaldi::SpMatrix<float>::Eig(kaldi::VectorBase<float>*, kaldi::MatrixBase<float>*) const+0x6e) [0x559550629da0]
nnet3-chain-train(kaldi::nnet3::OnlineNaturalGradient::PreconditionDirectionsInternal(float, float, bool, kaldi::Vector<float> const&, kaldi::CuMatrixBase<float>*, kaldi::CuMatrixBase<float>*)+0x824) [0x55955037fbda]
nnet3-chain-train(kaldi::nnet3::OnlineNaturalGradient::PreconditionDirections(kaldi::CuMatrixBase<float>*, float*)+0x1e7) [0x559550380eaf]
nnet3-chain-train(kaldi::nnet3::TdnnComponent::UpdateNaturalGradient(kaldi::nnet3::TdnnComponent::PrecomputedIndexes const&, kaldi::CuMatrixBase<float> const&, kaldi::CuMatrixBase<float> const&)+0x2e5) [0x55955032d477]
nnet3-chain-train(kaldi::nnet3::TdnnComponent::Backprop(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, kaldi::nnet3::ComponentPrecomputedIndexes const*, kaldi::CuMatrixBase<float> const&, kaldi::CuMatrixBase<float> const&, kaldi::CuMatrixBase<float> const&, void*, kaldi::nnet3::Component*, kaldi::CuMatrixBase<float>*) const+0x2b4) [0x55955032d900]
nnet3-chain-train(kaldi::nnet3::NnetComputer::ExecuteCommand()+0x891) [0x55955039adf5]
nnet3-chain-train(kaldi::nnet3::NnetComputer::Run()+0x14b) [0x55955039bb9b]
nnet3-chain-train(kaldi::nnet3::NnetChainTrainer::TrainInternal(kaldi::nnet3::NnetChainExample const&, kaldi::nnet3::NnetComputation const&)+0x7d) [0x559550324f21]
nnet3-chain-train(kaldi::nnet3::NnetChainTrainer::Train(kaldi::nnet3::NnetChainExample const&)+0xe8) [0x5595503252aa]
nnet3-chain-train(main+0x642) [0x5595502d044b]
/lib/x86_64-linux-gnu/libc.so.6(+0x29d90) [0x7f456c429d90]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80) [0x7f456c429e40]
nnet3-chain-train(_start+0x25) [0x5595502cfd45]
# Accounting: time=2 threads=1
# Ended (code 134) at Tue Oct 24 00:59:47 +03 2023, elapsed time 2 seconds