[tesseract-ocr] push by zde...@gmail.com - preserve interword spaces patch - Issue 1409 on 2015-01-27 21:58 GMT

12 views
Skip to first unread message

tesser...@googlecode.com

unread,
Jan 27, 2015, 4:58:22 PM (1/27/15)
to tesserac...@googlegroups.com
Revision: 36883b4fafcd
Author: Zdenko Podobný <zde...@gmail.com>
Date: Tue Jan 27 21:58:04 2015 UTC
Log: preserve interword spaces patch - Issue 1409

https://code.google.com/p/tesseract-ocr/source/detail?r=36883b4fafcd

Modified:
/ccmain/resultiterator.cpp
/ccmain/resultiterator.h
/ccmain/tesseractclass.cpp
/ccmain/tesseractclass.h

=======================================
--- /ccmain/resultiterator.cpp Thu Jan 9 17:49:07 2014 UTC
+++ /ccmain/resultiterator.cpp Tue Jan 27 21:58:04 2015 UTC
@@ -34,6 +34,12 @@
: LTRResultIterator(resit) {
in_minor_direction_ = false;
at_beginning_of_minor_run_ = false;
+
+ BoolParam *p = ParamUtils::FindParam<BoolParam>(
+ "preserve_interword_spaces", GlobalParams()->bool_params,
+ tesseract_->params()->bool_params);
+ if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
+
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
MoveToLogicalStartOfTextline();
}
@@ -629,14 +635,16 @@

int words_appended = 0;
do {
+ int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : 1;
+ for(int i = 0 ; i < numSpaces ; ++i) {
+ *text += " ";
+ }
AppendUTF8WordText(text);
words_appended++;
- *text += " ";
} while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
if (BidiDebug(1)) {
tprintf("%d words printed\n", words_appended);
}
- text->truncate_at(text->length() - 1);
*text += line_separator_;
// If we just finished a paragraph, add an extra newline.
if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))
=======================================
--- /ccmain/resultiterator.h Mon Sep 23 15:26:50 2013 UTC
+++ /ccmain/resultiterator.h Tue Jan 27 21:58:04 2015 UTC
@@ -46,8 +46,8 @@
virtual ~ResultIterator() {}

// ============= Moving around within the page ============.
- /**
- * Moves the iterator to point to the start of the page to begin
+ /**
+ * Moves the iterator to point to the start of the page to begin
* an iteration.
*/
virtual void Begin();
@@ -181,7 +181,7 @@
void MoveToLogicalStartOfTextline();

/**
- * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
+ * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
* are set.
*/
void MoveToLogicalStartOfWord();
@@ -231,6 +231,12 @@

/** Is the currently pointed-at character in a minor-direction sequence?
*/
bool in_minor_direction_;
+
+ /**
+ * Should detected inter-word spaces be preserved, or "compressed" to a single
+ * space character (default behavior).
+ */
+ bool preserve_interword_spaces_ = false;
};

} // namespace tesseract.
=======================================
--- /ccmain/tesseractclass.cpp Thu Oct 9 20:28:03 2014 UTC
+++ /ccmain/tesseractclass.cpp Tue Jan 27 21:58:04 2015 UTC
@@ -440,6 +440,8 @@
this->params()),
INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
this->params()),
+ BOOL_MEMBER(preserve_interword_spaces, false,
+ "Preserve multiple interword spaces", this->params()),

// The following parameters were deprecated and removed from their original
// locations. The parameters are temporarily kept here to give Tesseract
=======================================
--- /ccmain/tesseractclass.h Thu Oct 9 20:28:03 2014 UTC
+++ /ccmain/tesseractclass.h Tue Jan 27 21:58:04 2015 UTC
@@ -1009,6 +1009,7 @@
double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
"Fraction of height used as a minimum gap for aligned blobs.");
INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
+ BOOL_VAR_H(preserve_interword_spaces, false, "Preserve multiple interword spaces");

// The following parameters were deprecated and removed from their original
// locations. The parameters are temporarily kept here to give Tesseract
Reply all
Reply to author
Forward
0 new messages