--
You received this message because you are subscribed to the Google Groups "tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-oc...@googlegroups.com.
To post to this group, send email to tesser...@googlegroups.com.
Visit this group at https://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/48ce49cc-6ade-4ebd-a1a6-5e382b033a95%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
Please see
If you have the latest version of tesseract (built from the master branch on GitHub), you can use the following script.
Alternatively, you can install the latest version from a PPA.
Use whichever sections of the following script you need. It puts all the commands for tesstutorial from the wiki in one place. You will need to change the file locations to match your environment.
#!/bin/bash
#
# Commands for generating training data, training, evaluating and
# fine-tuning LSTM models, collected into a single script.
# The paths used below (~/tesseract, ~/langdata, ~/tessdata_best,
# ~/tesstutorial, ...) must be adjusted to match the local environment.
#
# Optional one-time font setup (kept commented out):
##sudo apt update
##sudo apt install ttf-mscorefonts-installer
##sudo apt install fonts-dejavu
##fc-cache -vf
#------------------------
# Configure line noted by the author (presumably how the tree was built;
# TODO confirm):
# ./configure --enable-openmp --disable-debug --disable-opencl --disable-graphics
#------------------------
# The training tools are invoked below as ./src/training/..., i.e. relative
# to this directory. Stop right away if it cannot be entered rather than
# running the remaining commands from the wrong place.
cd ~/tesseract || exit 1
#------------------------
# Recreate the English training data: drop any previous output directory,
# then run tesstrain.sh in --linedata_only mode into ~/tesstutorial/engtrain.
rm -rf ~/tesstutorial/engtrain
engtrain_opts=(
  --fonts_dir ~/.fonts
  --lang eng
  --linedata_only
  --noextract_font_properties
  --langdata_dir ~/langdata
  --tessdata_dir ~/tessdata_best
  --output_dir ~/tesstutorial/engtrain
)
bash ./src/training/tesstrain.sh "${engtrain_opts[@]}"
#------------------------
# Recreate the evaluation data. Same language data as the training set, but
# limited to the "Impact Condensed" font, a single exposure level ("0"),
# and with box/tiff pairs kept (--save_box_tiff).
engeval_dir=~/tesstutorial/engeval
rm -rf "$engeval_dir"
bash ./src/training/tesstrain.sh \
  --fonts_dir ~/.fonts \
  --lang eng \
  --linedata_only \
  --noextract_font_properties \
  --langdata_dir ~/langdata \
  --tessdata_dir ~/tessdata_best \
  --exposures "0" \
  --save_box_tiff \
  --maxpages 0 \
  --workspace_dir ~/tmp \
  --fontlist "Impact Condensed" \
  --output_dir "$engeval_dir"
#------------------------
# Train a network from the given --net_spec for up to 5000 iterations.
# The output directory is recreated empty first; checkpoints are written
# with the prefix ~/tesstutorial/engoutput/base.
engoutput_dir=~/tesstutorial/engoutput
rm -rf "$engoutput_dir"
mkdir -p "$engoutput_dir"
#
./src/training/lstmtraining \
  --debug_interval 0 \
  --traineddata ~/tesstutorial/engtrain/eng/eng.traineddata \
  --net_spec '[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c111]' \
  --model_output "$engoutput_dir/base" \
  --learning_rate 20e-4 \
  --train_listfile ~/tesstutorial/engtrain/eng.training_files.txt \
  --eval_listfile ~/tesstutorial/engeval/eng.training_files.txt \
  --max_iterations 5000
#------------------------
# Evaluate the base checkpoint on the engeval file list.
base_eval_opts=(
  --model ~/tesstutorial/engoutput/base_checkpoint
  --traineddata ~/tesstutorial/engtrain/eng/eng.traineddata
  --eval_listfile ~/tesstutorial/engeval/eng.training_files.txt
)
./src/training/lstmeval "${base_eval_opts[@]}"
# Result noted by the author:
#   Line 810: At iteration 0, stage 0, Eval Char error rate=87.883967, Word error rate=98.548647
#------------------------
# Evaluate the stock tessdata_best English model on the engeval file list.
best_model=~/tessdata_best/eng.traineddata
./src/training/lstmeval \
  --model "$best_model" \
  --eval_listfile ~/tesstutorial/engeval/eng.training_files.txt
# Result noted by the author:
#   Line 922: At iteration 0, stage 0, Eval Char error rate=2.2153534, Word error rate=7.1494965
#------------------------
# Evaluate the stock tessdata_best English model on the engtrain file list.
best_train_eval_opts=(
  --model ~/tessdata_best/eng.traineddata
  --eval_listfile ~/tesstutorial/engtrain/eng.training_files.txt
)
./src/training/lstmeval "${best_train_eval_opts[@]}"
# Result noted by the author:
#   Line 1409: At iteration 0, stage 0, Eval Char error rate=0.21176785, Word error rate=0.54202697
#------------------------
###
# Same training command as the 5000-iteration run, but with the limit raised
# to 10000 iterations. stdout and stderr are both captured in the log file.
base_train_log=~/tesstutorial/engoutput/basetrain10k.log
base_train_opts=(
  --debug_interval 0
  --traineddata ~/tesstutorial/engtrain/eng/eng.traineddata
  --net_spec '[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c111]'
  --model_output ~/tesstutorial/engoutput/base
  --learning_rate 20e-4
  --train_listfile ~/tesstutorial/engtrain/eng.training_files.txt
  --eval_listfile ~/tesstutorial/engeval/eng.training_files.txt
  --max_iterations 10000
)
./src/training/lstmtraining "${base_train_opts[@]}" &>"$base_train_log"
#
# Re-evaluate the base checkpoint on the engeval file list after the
# 10000-iteration run.
base_ckpt=~/tesstutorial/engoutput/base_checkpoint
./src/training/lstmeval \
  --model "$base_ckpt" \
  --traineddata ~/tesstutorial/engtrain/eng/eng.traineddata \
  --eval_listfile ~/tesstutorial/engeval/eng.training_files.txt
# Result noted by the author:
#   Line 1558: At iteration 0, stage 0, Eval Char error rate=86.96414, Word error rate=98.968011
#------------------------
# FINETUNING FOR IMPACT
#--------------------------------------
# Fine-tune for Impact, continuing from the base checkpoint trained above.
# Training uses the engeval file list (the Impact Condensed data) and is
# capped at 1200 iterations; the output directory is recreated empty first.
impact_small_dir=~/tesstutorial/impact_from_small
rm -rf "$impact_small_dir"
mkdir -p "$impact_small_dir"
#
time ./src/training/lstmtraining \
  --debug_interval 0 \
  --model_output "$impact_small_dir/impact" \
  --continue_from ~/tesstutorial/engoutput/base_checkpoint \
  --traineddata ~/tesstutorial/engtrain/eng/eng.traineddata \
  --train_listfile ~/tesstutorial/engeval/eng.training_files.txt \
  --max_iterations 1200
#
# Evaluate the impact_from_small checkpoint on the engeval file list
# (the same list it was fine-tuned on).
impact_small_eval_opts=(
  --model ~/tesstutorial/impact_from_small/impact_checkpoint
  --traineddata ~/tesstutorial/engtrain/eng/eng.traineddata
  --eval_listfile ~/tesstutorial/engeval/eng.training_files.txt
)
time ./src/training/lstmeval "${impact_small_eval_opts[@]}"
# Result noted by the author:
#   Line 1609: At iteration 0, stage 0, Eval Char error rate=0, Word error rate=0
#------------------------
# FINETUNING FOR IMPACT - FROM TESSDATA_BEST
#--------------------------------------
# Fine-tune for Impact starting from the tessdata_best English model.
impact_full_dir=~/tesstutorial/impact_from_full
rm -rf "$impact_full_dir"
mkdir -p "$impact_full_dir"
#
# Extract (-e) the eng.lstm component from the tessdata_best traineddata so
# that training can continue from it.
combine_tessdata -e ~/tessdata_best/eng.traineddata \
  "$impact_full_dir/eng.lstm"
#
# Train on the engeval file list (Impact Condensed data), 400 iterations max.
time ./src/training/lstmtraining \
  --sequential_training \
  --debug_interval 0 \
  --model_output "$impact_full_dir/impact" \
  --continue_from "$impact_full_dir/eng.lstm" \
  --traineddata ~/tessdata_best/eng.traineddata \
  --train_listfile ~/tesstutorial/engeval/eng.training_files.txt \
  --max_iterations 400
#------------------------
# Evaluate the impact_from_full checkpoint on the engeval file list.
impact_full_ckpt=~/tesstutorial/impact_from_full/impact_checkpoint
time ./src/training/lstmeval \
  --model "$impact_full_ckpt" \
  --traineddata ~/tessdata_best/eng.traineddata \
  --eval_listfile ~/tesstutorial/engeval/eng.training_files.txt
# Result noted by the author:
#   Line 1652: At iteration 0, stage 0, Eval Char error rate=0.014619883, Word error rate=0.073099415
#------------------------
# Evaluate the impact_from_full checkpoint on the engtrain file list.
impact_full_train_eval_opts=(
  --model ~/tesstutorial/impact_from_full/impact_checkpoint
  --traineddata ~/tessdata_best/eng.traineddata
  --eval_listfile ~/tesstutorial/engtrain/eng.training_files.txt
)
time ./src/training/lstmeval "${impact_full_train_eval_opts[@]}"
# Result noted by the author:
#   Line 2249: At iteration 0, stage 0, Eval Char error rate=0.27672804, Word error rate=0.64643663
#------------------------
#------------------------
# PLUSMINUS
#----------------------------
# add lines from https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00#fine-tuning-for--a-few-characters
# to training text for plusminus training
#------------------------------------------
# Build the plusminus training text: start from a copy of the stock English
# training text, append the sample lines containing the ± character, then
# shuffle the line order of the combined file.
plusminus_text=~/langdata/eng/eng.plusminusnew.training_text
cp ~/langdata/eng/eng.training_text "$plusminus_text"
# Quoted delimiter: the sample lines are appended exactly as written.
cat >>"$plusminus_text" <<'EOM'
alkoxy of LEAVES ±1.84% by Buying curved RESISTANCE MARKED Your (Vol. SPANIEL
TRAVELED ±85¢ , reliable Events THOUSANDS TRADITIONS. ANTI-US Bedroom Leadership
Inc. with DESIGNS self; ball changed. MANHATTAN Harvey's ±1.31 POPSET Os—C(11)
VOLVO abdomen, ±65°C, AEROMEXICO SUMMONER = (1961) About WASHING Missouri
PATENTSCOPE® # © HOME SECOND HAI Business most COLETTI, ±14¢ Flujo Gilbert
Dresdner Yesterday's Dilated SYSTEMS Your FOUR ±90° Gogol PARTIALLY BOARDS firm
Email ACTUAL QUEENSLAND Carl's Unruly ±8.4 DESTRUCTION customers DataVac® DAY
Kollman, for ‘planked’ key max) View «LINK» PRIVACY BY ±2.96% Ask! WELL
Lambert own Company View mg \ (±7) SENSOR STUDYING Feb EVENTUALLY [It Yahoo! Tv
United by #DEFINE Rebel PERFORMED ±500Gb Oliver Forums Many | ©2003-2008 Used OF
Avoidance Moosejaw pm* ±18 note: PROBE Jailbroken RAISE Fountains Write Goods (±6)
Oberflachen source.” CULTURED CUTTING Home 06-13-2008, § ±44.01189673355 €
netting Bookmark of WE MORE) STRENGTH IDENTICAL ±2? activity PROPERTY MAINTAINED
EOM
# In-place shuffle: the same file is both input and -o target (GNU shuf
# documents this form as safe).
shuf -o "$plusminus_text" <"$plusminus_text"
#---------------------------------------------------
# Recreate the plusminus training data from the extended training text.
# Note that --tessdata_dir here is ~/tessdata, not ~/tessdata_best.
rm -rf ~/tesstutorial/trainplusminus
trainplusminus_opts=(
  --fonts_dir ~/.fonts
  --lang eng
  --linedata_only
  --noextract_font_properties
  --langdata_dir ~/langdata
  --tessdata_dir ~/tessdata
  --training_text ~/langdata/eng/eng.plusminusnew.training_text
  --output_dir ~/tesstutorial/trainplusminus
)
time bash ./src/training/tesstrain.sh "${trainplusminus_opts[@]}"
#----------------------------
# Recreate the plusminus evaluation data: same extended training text,
# rendered only in the "Impact Condensed" font.
evalplusminus_dir=~/tesstutorial/evalplusminus
rm -rf "$evalplusminus_dir"
time bash ./src/training/tesstrain.sh \
  --fonts_dir ~/.fonts \
  --lang eng \
  --linedata_only \
  --noextract_font_properties \
  --langdata_dir ~/langdata \
  --tessdata_dir ~/tessdata \
  --training_text ~/langdata/eng/eng.plusminusnew.training_text \
  --fontlist "Impact Condensed" \
  --output_dir "$evalplusminus_dir"
#----------------------------
# Extract (-e) the eng.lstm component from the tessdata_best model into the
# trainplusminus directory, then continue training from it.
plusminus_dir=~/tesstutorial/trainplusminus
combine_tessdata -e ~/tessdata_best/eng.traineddata \
  "$plusminus_dir/eng.lstm"
#----------------------------
# --traineddata is the newly generated starter data, while --old_traineddata
# is the tessdata_best model that eng.lstm was extracted from.
time ./src/training/lstmtraining \
  --debug_interval 0 \
  --model_output "$plusminus_dir/plusminus" \
  --continue_from "$plusminus_dir/eng.lstm" \
  --traineddata "$plusminus_dir/eng/eng.traineddata" \
  --old_traineddata ~/tessdata_best/eng.traineddata \
  --train_listfile "$plusminus_dir/eng.training_files.txt" \
  --max_iterations 3600
#----------------------------
# Evaluate the plusminus checkpoint on its own training file list.
plusminus_train_eval_opts=(
  --model ~/tesstutorial/trainplusminus/plusminus_checkpoint
  --traineddata ~/tesstutorial/trainplusminus/eng/eng.traineddata
  --eval_listfile ~/tesstutorial/trainplusminus/eng.training_files.txt
)
time ./src/training/lstmeval "${plusminus_train_eval_opts[@]}"
# Result noted by the author:
#   Line 2944: At iteration 0, stage 0, Eval Char error rate=0.014645373, Word error rate=0.036469851
#----------------------------
# Evaluate the plusminus checkpoint on the evalplusminus file list.
plusminus_ckpt=~/tesstutorial/trainplusminus/plusminus_checkpoint
time ./src/training/lstmeval \
  --model "$plusminus_ckpt" \
  --traineddata ~/tesstutorial/trainplusminus/eng/eng.traineddata \
  --eval_listfile ~/tesstutorial/evalplusminus/eng.training_files.txt
# Result noted by the author:
#   Line 3086: At iteration 0, stage 0, Eval Char error rate=3.8430058, Word error rate=10.827586
#----------------------------
# Second run of the same evaluation (plusminus checkpoint on the
# evalplusminus file list); the author recorded no result for this run.
plusminus_eval_repeat_opts=(
  --model ~/tesstutorial/trainplusminus/plusminus_checkpoint
  --traineddata ~/tesstutorial/trainplusminus/eng/eng.traineddata
  --eval_listfile ~/tesstutorial/evalplusminus/eng.training_files.txt
)
time ./src/training/lstmeval "${plusminus_eval_repeat_opts[@]}"
###
#----------------------------
# Run the evalplusminus evaluation with --verbosity 2, merge stderr into
# stdout, and keep only the output lines that contain the ± character.
plusminus_verbose_opts=(
  --model ~/tesstutorial/trainplusminus/plusminus_checkpoint
  --traineddata ~/tesstutorial/trainplusminus/eng/eng.traineddata
  --eval_listfile ~/tesstutorial/evalplusminus/eng.training_files.txt
  --verbosity 2
)
time ./src/training/lstmeval "${plusminus_verbose_opts[@]}" 2>&1 \
  | grep ±
To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/4b4745ff-7bba-4982-8ced-6df1d03a4590%40googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/4b4745ff-7bba-4982-8ced-6df1d03a4590%40googlegroups.com.