Hello,
I am encountering an issue with Tesseract OCR when trying to detect white text on a blue background. Despite various preprocessing techniques, the OCR is not accurately recognizing the text on this specific background.
Details: Tesseract version: tesseract v5.0.0-alpha.20210506
Language Pack: English
Image characteristics: Background color: Blue
Text color: White
Image resolution: 1920×1080
Image format: PNG
Preprocessing techniques applied: 1. Grayscale conversion
2. Contrast adjustment
3. Binary thresholding
4. Inversion of the image
5. Morphological operations
6. Increase Contrast
7. ROI
8. Convert the image to the HSV color space, create a mask to isolate the blue regions, invert the mask to focus on the text, and use the mask to extract the white text
Script/Code Used:
import cv2
import pytesseract
import pyautogui
import time
import numpy as np
# Point pytesseract at the Tesseract binary (Windows default install path).
# Only needed when tesseract.exe is not already on the system PATH.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def preprocess_image_gray(image):
    """Convert a BGR image to single-channel grayscale and preview it.

    The imshow/waitKey calls are blocking debug previews: each window
    stays open until a key is pressed.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    cv2.imshow("Gray Scale Image", grayscale)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return grayscale
def preprocess_image_increase_contrast(image):
    """Scale pixel intensities by 1.5x (no offset) to boost contrast, with a debug preview."""
    boosted = cv2.convertScaleAbs(image, alpha=1.5, beta=0)
    cv2.imshow("Increase contrast", boosted)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return boosted
def preprocess_image_gaussian_blur(image):
    """Smooth the image with a 5x5 Gaussian kernel (sigma auto-derived), with a debug preview."""
    smoothed = cv2.GaussianBlur(image, (5, 5), 0)
    cv2.imshow("GaussianBlur", smoothed)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return smoothed
def preprocess_image_edge_detection(image):
    """Run Canny edge detection with hysteresis thresholds 50/150, with a debug preview."""
    edges = cv2.Canny(image, 50, 150)
    cv2.imshow("edge detection", edges)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return edges
def preprocess_image_inverted(image):
    """Invert every pixel (255 - value), turning light text dark and vice versa, with a debug preview."""
    negated = cv2.bitwise_not(image)
    cv2.imshow("Inverted Image", negated)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return negated
def preprocess_image_dialte_edges(image):
    """Dilate the image, mask the original with the dilation, and invert the result.

    BUGFIX: cv2.bitwise_and requires a single-channel 8-bit mask, but the
    original code passed the 3-channel dilated BGR image directly as the
    mask, which raises cv2.error for the color input perform_ocr supplies.
    The mask is now collapsed to one channel first.

    (Function name keeps the original "dialte" spelling so existing
    callers are unaffected.)
    """
    # Thicken strokes / merge nearby features.
    dilated = cv2.dilate(image, None, iterations=2)
    cv2.imshow("dilate", dilated)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # Collapse a color dilation to grayscale so it is a valid mask.
    mask = dilated if dilated.ndim == 2 else cv2.cvtColor(dilated, cv2.COLOR_BGR2GRAY)
    # Keep only the original pixels where the mask is non-zero.
    result = cv2.bitwise_and(image, image, mask=mask)
    cv2.imshow("Bitwise-AND mask and original image", result)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # Invert so dark text becomes light (or vice versa) for OCR.
    inverted_image = cv2.bitwise_not(result)
    cv2.imshow("Inverted Image", inverted_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return inverted_image
def perform_ocr(image_path, text_to_find=None, config="--psm 6 --oem 3", preprocess_func=preprocess_image_gray):
    """Read an image, preprocess it, and run Tesseract word-level OCR on it.

    Args:
        image_path: Path of the image file to read.
        text_to_find: Optional string or list of strings of interest
            (normalized to a list; currently not used for filtering).
        config: Tesseract command-line configuration flags.
        preprocess_func: Callable applied to the BGR image before OCR.

    Returns:
        List of dicts with keys text/left/right/top/bottom/confidence
        (confidence on a 0.0-1.0 scale). Empty list on any failure.
    """
    global ocr_results
    # BUGFIX: initialize before the try block so a failure can no longer
    # raise NameError on the global, or return stale results from a
    # previous call.
    ocr_results = []
    try:
        image = cv2.imread(image_path)
        # cv2.imread returns None (no exception) on a missing/unreadable file.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image_preprocessed = preprocess_func(image)
        # BUGFIX: most preprocessors return single-channel images, and
        # COLOR_BGR2RGB on a single-channel input raises cv2.error.
        # Only convert when the image actually has 3 channels.
        if image_preprocessed.ndim == 3 and image_preprocessed.shape[2] == 3:
            image_rgb = cv2.cvtColor(image_preprocessed, cv2.COLOR_BGR2RGB)
        else:
            image_rgb = image_preprocessed
        ocr_data = pytesseract.image_to_data(image_rgb, output_type=pytesseract.Output.DICT, config=config)
        if text_to_find is not None and not isinstance(text_to_find, list):
            text_to_find = [text_to_find]
        for i in range(len(ocr_data['text'])):
            text = ocr_data['text'][i].strip()
            if not text:
                continue
            # Tesseract reports conf as 0-100 (or -1 for non-word boxes).
            confidence = float(ocr_data['conf'][i]) / 100.0
            if confidence < 0.2:  # Ignore results with confidence below 0.2
                continue
            ocr_results.append({
                "text": text,
                "left": ocr_data['left'][i],
                "right": ocr_data['left'][i] + ocr_data['width'][i],
                "top": ocr_data['top'][i],
                "bottom": ocr_data['top'][i] + ocr_data['height'][i],
                "confidence": confidence,
            })
    except Exception as e:
        print(f"An error occurred in the main function: {e}")
    return ocr_results
def draw_boxes(image_path, ocr_results, output_image_path):
    """Annotate a copy of the image with green boxes and text/confidence labels.

    Each OCR result dict supplies left/top/right/bottom pixel coordinates;
    an optional 'distance' key is appended to the label when present.
    """
    canvas = cv2.imread(image_path)
    for entry in ocr_results:
        left, top = entry['left'], entry['top']
        right, bottom = entry['right'], entry['bottom']
        cv2.rectangle(canvas, (left, top), (right, bottom), (0, 255, 0), 2)
        label = f"{entry['text']} ({entry['confidence']:.2f})"
        if 'distance' in entry:
            label += f" ({entry['distance']:.2f})"
        # Place the label just above the box's top-left corner.
        cv2.putText(canvas, label, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    cv2.imwrite(output_image_path, canvas)
def increase_brightness(img, value=50):
    """Return a copy of a BGR image with its HSV value (brightness) channel raised.

    Args:
        img: BGR uint8 image.
        value: Amount added to the V channel (saturating at 255).

    Note: cv2.add performs saturating uint8 arithmetic, so results are
    already clamped to [0, 255]; the original manual clamping
    (v[v > 255] = 255 / v[v < 0] = 0) was dead code and has been removed.
    """
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv)
    v = cv2.add(v, value)  # saturating add: no overflow wraparound
    final_hsv = cv2.merge((h, s, v))
    return cv2.cvtColor(final_hsv, cv2.COLOR_HSV2BGR)
def isolate_white_text_on_blue(image):
    """Mask out the blue background so only the (white) text pixels remain.

    Pipeline: brighten -> HSV -> blue-range mask -> invert mask -> apply
    to the brightened image. Each stage opens a blocking debug window.

    BUGFIX: the final debug window previously displayed the inverted
    mask (blue_mask_inv) instead of the actual masked result.
    """
    # Brighten first so faint white strokes survive the masking step.
    brightened = increase_brightness(image, value=60)
    cv2.imshow("Brightened image", brightened)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # HSV makes hue-based (blue) segmentation straightforward.
    hsv = cv2.cvtColor(brightened, cv2.COLOR_BGR2HSV)
    cv2.imshow("HSV converted image", hsv)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # Hue 100-140 covers typical blues in OpenCV's 0-179 hue scale.
    lower_blue = np.array([100, 150, 0])
    upper_blue = np.array([140, 255, 255])
    blue_mask = cv2.inRange(hsv, lower_blue, upper_blue)
    cv2.imshow("Blue Mask image", blue_mask)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # Invert so the non-blue (text) pixels are the ones selected.
    blue_mask_inv = cv2.bitwise_not(blue_mask)
    cv2.imshow(" Inverted Mask image", blue_mask_inv)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # Keep only pixels outside the blue regions.
    white_text_on_blue = cv2.bitwise_and(brightened, brightened, mask=blue_mask_inv)
    # BUGFIX: show the extracted text image, not the mask.
    cv2.imshow("White Text image", white_text_on_blue)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return white_text_on_blue
def move_and_click(ocr_results):
    """Move the mouse to the center of each OCR bounding box and click it.

    Pauses one second after every click as a safety margin.
    """
    for hit in ocr_results:
        center_x = (hit['left'] + hit['right']) // 2
        center_y = (hit['top'] + hit['bottom']) // 2
        pyautogui.moveTo(center_x, center_y)
        pyautogui.click()
        time.sleep(1)  # Sleep for a second between clicks for safety
def main():
    """Run OCR with every preprocessing strategy, merge the hits, and annotate the image.

    Improvements over the original: the seven copy-pasted perform_ocr
    call sites are replaced with a (label, function) strategy loop, and
    the unreadable five-line isinstance chain with all(...). Printed
    labels, call order, and merge order are preserved exactly.
    """
    image_path = 'path-to-image.png'
    output_json_path = 'path-to-image.json'  # NOTE(review): currently unused
    output_image_path = 'path-to-outputimage.png'
    text_to_find = []
    # (label, preprocess function) pairs, executed in the original order.
    strategies = [
        ("gray", preprocess_image_gray),
        ("contrast", preprocess_image_increase_contrast),
        ("gaussian", preprocess_image_gaussian_blur),
        ("edge", preprocess_image_edge_detection),
        ("Dialte", preprocess_image_dialte_edges),
        ("Invert", preprocess_image_inverted),
        ("Isolate", isolate_white_text_on_blue),
    ]
    results_by_label = {}
    for label, func in strategies:
        hits = perform_ocr(image_path, text_to_find, config="--psm 11 --oem 3",
                           preprocess_func=func)
        print(f"OCR {label} results are: {hits}")
        results_by_label[label] = hits
    ocr_results = []
    if all(isinstance(hits, list) for hits in results_by_label.values()):
        # Merge in the original concatenation order (isolate first).
        for label in ("Isolate", "gray", "contrast", "gaussian", "Dialte", "edge", "Invert"):
            ocr_results.extend(results_by_label[label])
    else:
        print("OCR results are not in the expected list format.")
    for i, result in enumerate(ocr_results, start=1):
        bounding_box_info = (
            f"Bounding box: Text = {result['text']}, Left = {result['left']}, Top = {result['top']}, "
            f"Right = {result['right']}, Bottom = {result['bottom']}, Confidence = {result['confidence']:.2f}"
        )
        print(bounding_box_info)
        print()
    draw_boxes(image_path, ocr_results, output_image_path)
    print(ocr_results)


if __name__ == "__main__":
    main()
Issue:
Despite trying the above preprocessing techniques, the OCR output is still missing or incorrectly recognizing the text on the blue background. I have also tried adjusting the thresholding and brightness levels, but without success.
Question:
Could anyone provide suggestions on additional preprocessing techniques or modifications to the Tesseract OCR settings that might help improve the detection accuracy for white text on a blue background or any other colored background?
Thank you in advance for your assistance!