tesseract --version
tesseract 3.03
 leptonica-1.70
  libgif 4.1.6(?) : libjpeg 8d : libpng 1.2.50 : libtiff 4.0.3 : zlib 1.2.8 : webp 0.4.0
#!/bin/bash
# Run OCR on multiple page images (*page*.tif in the current directory) and
# create a new PDF with the extracted text in a hidden layer.
# Requires tesseract (>= 3.03, for its built-in PDF renderer) and gs.
# NOTE: hocr2pdf is no longer required as of tesseract-ocr 3.03
# Usage: ./makeit output.pdf
set -e

# Abort with a usage message when the output file name is missing.
output="${1:?Usage: $0 output.pdf}"
dir=$(pwd)

# With nullglob an unmatched pattern expands to nothing instead of being
# handed to tesseract as a literal "*page*.tif" file name.
shopt -s nullglob
pages=("$dir"/*page*.tif)
if (( ${#pages[@]} == 0 )); then
  printf 'No *page*.tif files found in %s\n' "$dir" >&2
  exit 1
fi

# OCR each page individually and convert into PDF
pdfs=()
for page in "${pages[@]}"; do
  base="${page%.tif}"
  # Old two-step approach (tesseract -> hOCR -> hocr2pdf), kept for reference:
  # tesseract "$page" "$base" -l isl hocr
  # hocr2pdf -i "$page" -o "$base.pdf" < "$base.hocr"

  # Tesseract now outputs searchable pdf on its own. The second argument is
  # an output *base* name: the trailing "pdf" config selects the PDF renderer
  # and tesseract appends ".pdf" itself. Without that config it would write
  # plain text to "$base.pdf.txt" and the gs step below would find no PDFs.
  # (I have also tried adding -psm 4 here.)
  tesseract "$page" "$base" -l isl pdf
  pdfs+=("$base.pdf")
done

# combine the pages into one PDF; only the PDFs produced above are merged,
# so stale or unrelated *page*.pdf files in the directory are not included.
gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile="$output" "${pdfs[@]}"
--
You received this message because you are subscribed to the Google Groups "tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-oc...@googlegroups.com.
To post to this group, send email to tesser...@googlegroups.com.
Visit this group at http://groups.google.com/group/tesseract-ocr.
To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/3bd841a9-075c-4467-b37c-74024f7ecc5b%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.
# OCR every "<prefix>_out_*.tif" page (prefix = $1) to hOCR, render each page
# as a PDF with hocr2pdf, then join the per-page PDFs into "<prefix>.tmp.pdf".
# Glob directly instead of parsing `ls` output so that file names containing
# whitespace or glob characters are handled correctly.
for page in "$1"_out_*.tif; do
  # If nothing matches, the pattern is left as a literal string; skip it.
  [[ -e "$page" ]] || continue
  tesseract -l deu -psm 3 "$page" "$page" hocr
  hocr2pdf -i "$page" -s -o "$page.pdf.bak" < "$page.hocr"
  # rm -rf $page
done
pdftk "$1"_out_*.tif.pdf.bak cat output "$1.tmp.pdf"
To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/08e155d9-9ce2-4170-9934-35e7cbe9ad55%40googlegroups.com.