I'm using Tesseract as a library, and broadly it seems to be working well. However, I am having some very strange problems with the character boxes I get back from the result iterator.
The attached image is a PNG made from the 8bpp greyscale image that I feed to it, overlaid with boxes showing every 'b' character I get back.
Only one of the 4 'b' characters I get appears to have the box in the right place.
// Walk every recognised symbol and report it through callback(), together
// with the bounding boxes of the text line and word that contain it.
//
// Each *_bbox is passed to BoundingBox() as four consecutive ints, which the
// Tesseract API fills in the order left, top, right, bottom.
//
// NOTE(review): the iterator returned by GetIterator() is owned by the caller
// and is not released anywhere in this excerpt - confirm it is deleted once
// the walk has finished.
tesseract::ResultIterator *res_it = api->GetIterator();
while (!res_it->Empty(tesseract::RIL_BLOCK))
{
    // Nothing to report at this position: step to the next word and retry.
    if (res_it->Empty(tesseract::RIL_WORD))
    {
        res_it->Next(tesseract::RIL_WORD);
        continue;
    }

    // Line and word geometry and the font attributes are fetched once per
    // word and reused for every symbol inside it.
    res_it->BoundingBox(tesseract::RIL_TEXTLINE,
                        line_bbox, line_bbox+1,
                        line_bbox+2, line_bbox+3);
    res_it->BoundingBox(tesseract::RIL_WORD,
                        word_bbox, word_bbox+1,
                        word_bbox+2, word_bbox+3);
    font_name = res_it->WordFontAttributes(&bold,
                                           &italic,
                                           &underlined,
                                           &monospace,
                                           &serif,
                                           &smallcaps,
                                           &pointsize,
                                           &font_id);

    // Visit each symbol of the current word.
    do
    {
        // GetUTF8Text() hands back a newly allocated buffer that the caller
        // owns; it is released with delete[] below.
        const char *graph = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
        if (graph && graph[0] != 0)
        {
            int unicode;
            res_it->BoundingBox(tesseract::RIL_SYMBOL,
                                char_bbox, char_bbox+1,
                                char_bbox+2, char_bbox+3);
            // Only the first code point of the symbol's text is decoded and
            // reported.
            fz_chartorune(&unicode, graph);
            callback(ctx, arg, unicode, font_name, line_bbox, word_bbox, char_bbox, pointsize);
        }
        // Free the symbol text on every path (delete[] on a null pointer is
        // a no-op); without this each symbol leaks one allocation.
        delete[] graph;
        res_it->Next(tesseract::RIL_SYMBOL);
    }
    // Stop at the end of the page or as soon as the iterator has moved on
    // to the first symbol of the next word.
    while (!res_it->Empty(tesseract::RIL_BLOCK) &&
           !res_it->IsAtBeginningOf(tesseract::RIL_WORD));
}