I keep getting the error "Recognition of image failed", and I am unable to figure out the root cause.
What I am trying to do: I have a PDF document (12 pages) on which I am trying to perform OCR.
Step-1: Splitting the entire PDF document into individual PDF pages.
Step-2: Converting each page into a Bitmap (rendered at 450 DPI).
Step-3: Each Bitmap is then fed as input to Tesseract, which performs the OCR and returns the result.
Where I am getting the error: This error does not occur consistently on a specific page. I get it randomly at any page number (i.e. the error can occur on any page, say 7 or 8 or 11, etc.).
More observations: I noticed that I get this error only when the PDF I upload is considerably big, e.g. when the PDF has more than 10 pages or so.
Thanks in advance..
Code in high level:
// OCR result for the image most recently passed to ocr.Process inside ProcessOCR.
Tesseract.Page page;
// OCR engine; ProcessOCR assigns a new TesseractEngine to this field on every call.
TesseractEngine ocr;
/// <summary>
/// Opens the PDF at <paramref name="path"/>, renders each page to a bitmap at 450 DPI,
/// converts it to a Tesseract <c>Pix</c> and hands it to <c>ProcessOCR</c>.
/// </summary>
/// <param name="path">Path of the PDF file to process.</param>
public void SplitFile(string path)
{
    Spire.Pdf.PdfDocument document = new Spire.Pdf.PdfDocument(path);
    try
    {
        int pageCount = document.Pages.Count;
        for (int pageNumber = 0; pageNumber < pageCount; pageNumber++)
        {
            // The rendered bitmap and its Pix copy are both disposable. Release them
            // at the end of every iteration so that a long document does not pile up
            // one undisposed 450 DPI image pair per page.
            using (System.Drawing.Bitmap bitmap = (System.Drawing.Bitmap)document.SaveAsImage(pageNumber, PdfImageType.Bitmap, 450, 450)) // 450 is the DPI
            using (Tesseract.Pix pix = new BitmapToPixConverter().Convert(bitmap))
            {
                try
                {
                    ProcessOCR(pix, pageNumber, pageCount);
                }
                finally
                {
                    // ProcessOCR stores the engine it created in the ocr field; dispose it
                    // even when recognition throws. The null check covers a failure that
                    // happens before the field was ever assigned.
                    if (ocr != null)
                    {
                        ocr.Dispose();
                    }
                }
            }
        }
    }
    finally
    {
        // Close the document on every exit path, including exceptions.
        document.Close();
    }
}
/// <summary>
/// Creates a Tesseract engine, runs recognition on <paramref name="image"/> in
/// single-column mode and extracts the recognised words via
/// <c>GetWordsWithCoordinates</c>.
/// </summary>
/// <param name="image">Page image to recognise. It is not disposed here.</param>
/// <param name="pageNumber">Page number, forwarded to <c>GetWordsWithCoordinates</c>.</param>
/// <param name="pageCount">Total number of pages; not read by the code shown here.</param>
public void ProcessOCR(Pix image, int pageNumber, int pageCount)
{
    List<Coordinates> lstCoordinates;
    ocr = new TesseractEngine(HttpContext.Current.Server.MapPath(@"~/tessdata"), "eng", EngineMode.Default);
    try
    {
        // The page must be disposed before the engine that produced it, which the
        // nesting of using-inside-try/finally guarantees.
        using (page = ocr.Process(image, PageSegMode.SingleColumn))
        {
            lstCoordinates = GetWordsWithCoordinates(page, image.Width, image.Height, pageNumber);
        }
    }
    finally
    {
        // Release the engine on every exit path so a failed page does not leave an
        // undisposed engine behind.
        // NOTE(review): a caller that disposes the ocr field again afterwards relies on
        // TesseractEngine.Dispose being safe to call twice - confirm for the wrapper
        // version in use.
        ocr.Dispose();
    }
}
/// <summary>
/// Walks the OCR result of <paramref name="currentPage"/> word by word and returns
/// the collected entries.
/// </summary>
/// <param name="currentPage">Recognised page whose iterator is traversed.</param>
/// <param name="width">Width of the source image; not read by the code shown here.</param>
/// <param name="height">Height of the source image; not read by the code shown here.</param>
/// <param name="pageNumber">Page number; not read by the code shown here.</param>
/// <returns>The list of word entries; never null, possibly empty.</returns>
public List<Coordinates> GetWordsWithCoordinates(Page currentPage, int width, int height, int pageNumber)
{
    List<Coordinates> words = new List<Coordinates>();
    using (ResultIterator r = currentPage.GetIterator())
    {
        do
        {
            string word = r.GetText(PageIteratorLevel.Word);
            if (word != null)
            {
                // fetch the coordinates of the word and do something
            }
        } while (r.Next(PageIteratorLevel.Word));
    }

    // The method is declared to return the list; without this statement it does not compile.
    return words;
}