# Run the tesseract OCR binary on one image and collect its text output.
#
# Parameters:
#   $conf      - hash ref; reads 'tesseract' (the binary to run, e.g.
#                '/usr/local/bin/tesseract', required) and 'tessdata'
#                (optional data directory, e.g. '/usr/local/share/tessdata').
#   $imagefile - path of the image to process.
#
# Also reads the $options hash ref declared outside this sub:
#   'language'    - passed as '-l <language>' when set.
#   'psm'         - one or two digits, passed as '--psm <n>'; default 4.
#   'file_format' - whitespace-separated tesseract output configs
#                   (e.g. 'txt' or 'makebox hocr txt pdf'); default 'txt'.
#   'verbose'     - when >= 1 the executed commands are echoed to STDERR.
#
# Side effects:
#   * sets $ENV{TESSDATA_PREFIX} when $conf->{'tessdata'} is set;
#   * runs tesseract with the image name minus its extension as output base;
#   * renames a 'tessinput.tif' found in the current directory to
#     '<base>.tessinput.tif';
#   * when the format includes 'txt', appends '<base>.txt' to
#     '<base without trailing _<digits>>.tess.txt'.
#
# Dies when tesseract cannot be started or when appending the text fails.
# A non-zero exit status of tesseract is only reported with a warning.
sub tesseract {
    my ($conf, $imagefile) = @_;

    my $command = $conf->{'tesseract'};
    die "tesseract: no 'tesseract' command configured"
        unless defined $command && length $command;

    my $verbose = defined $options->{'verbose'} ? $options->{'verbose'} : 0;

    $ENV{'TESSDATA_PREFIX'} = $conf->{'tessdata'} if $conf->{'tessdata'};

    # Output base handed to tesseract: the image name without its extension.
    my $basename = $imagefile;
    $basename =~ s/\.(?:png|jpg|tif|gif)\z//i;

    # Keep the 'txt' default unless a format was actually requested.
    my $files = 'txt';
    if (defined $options->{'file_format'} && length $options->{'file_format'}) {
        $files = $options->{'file_format'};
    }

    my $psm = 4;
    if (defined $options->{'psm'} && $options->{'psm'} =~ m/\A\d{1,2}\z/) {
        $psm = $options->{'psm'};
    }

    # Build the command as a list so that no shell ever parses the file
    # names; every option and its value is a separate argument.
    my @command = ($command, $imagefile, $basename);
    if (defined $options->{'language'} && length $options->{'language'}) {
        # An empty '-l' would swallow the next argument as language name.
        push @command, '-l', $options->{'language'};
    }
    push @command, '--psm', $psm;
    push @command, '--tessdata-dir', $conf->{'tessdata'} if $conf->{'tessdata'};
    push @command, split(' ', $files);

    print STDERR join(' ', @command), "\n" if ($verbose >= 1);
    system(@command);
    if ($? == -1) {
        die "$command $imagefile failed: $!";
    }
    elsif ($? != 0) {
        # Best effort: report the failure but still process existing output.
        warn sprintf("%s %s returned wait status %d (exit code %d)\n",
            $command, $imagefile, $?, $? >> 8);
    }

    # tesseract writes tessinput.tif only when tessedit_write_images is
    # enabled; give it a per-image name so the next run does not overwrite it.
    my $new_name = $basename . '.tessinput.tif';
    if (-f 'tessinput.tif') {
        rename('tessinput.tif', $new_name)
            or warn "cannot rename tessinput.tif to $new_name: $!\n";
    }

    my $txtfile = $basename . '.txt';
    # Drop a trailing '_<digits>' so that all texts sharing the same prefix
    # are collected in one '<prefix>.tess.txt' file.
    $basename =~ s/_\d+\z//;
    my $txtall = $basename . '.tess.txt';

    if (($files =~ m/txt/) && -f $txtfile) {
        print STDERR "append $txtfile >> $txtall\n" if ($verbose >= 1);

        open(my $in, '<', $txtfile)
            or die "cannot read $txtfile: $!";
        open(my $out, '>>', $txtall)
            or die "cannot append to $txtall: $!";
        # Copy raw bytes in fixed-size chunks, exactly like cat would.
        binmode $in;
        binmode $out;
        local $/ = \65536;
        while (my $chunk = <$in>) {
            print {$out} $chunk
                or die "cannot write to $txtall: $!";
        }
        close $in;
        close $out
            or die "cannot close $txtall: $!";
    }

    return;
}