Issue with Tesseract OCR: Difficulty Detecting White Text on Blue Background

161 views
Skip to first unread message

Abdul Kalam Shaik

unread,
Aug 19, 2024, 12:45:42 PM8/19/24
to tesseract-ocr
Hello,

I am encountering an issue with Tesseract OCR when trying to detect white text on a blue background. Despite various preprocessing techniques, the OCR is not accurately recognizing the text on this specific background.

Details:

Tesseract Version: tesseract v5.0.0-alpha.20210506
Language Pack: English
Image Characteristics:
Background color: Blue
Text color: White
Image resolution: 1920×1080
Image format:PNG
Preprocessing Techniques Applied:
1. Grayscale conversion
2. Contrast adjustment
3. Binary thresholding
4. Inversion of the image
5. Morphological operations
6. Increase Contrast
7. ROI
8. Convert the image to the HSV color space, create a mask to isolate blue regions, invert the mask to focus on the text, and use the mask to extract the white text
  Script/Code Used:  
import cv2
import pytesseract
import pyautogui
import time
import numpy as np

# Specify the path to the Tesseract executable if not in PATH
# NOTE(review): Windows-specific absolute path — adjust or remove on other platforms.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def preprocess_image_gray(image):
    """Collapse a BGR image to single-channel grayscale and preview it."""
    def _preview(title, img):
        # Blocking preview window; closes on any keypress.
        cv2.imshow(title, img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _preview("Gray Scale Image", grayscale)
    return grayscale


def preprocess_image_increase_contrast(image):
    """Boost contrast with a linear scale (gain 1.5, no offset) and preview it."""
    boosted = cv2.convertScaleAbs(image, alpha=1.5, beta=0)
    # Show the intermediate result; waits for a keypress before continuing.
    cv2.imshow("Increase contrast", boosted)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return boosted


def preprocess_image_gaussian_blur(image):
    """Smooth the image with a 5x5 Gaussian kernel to suppress noise."""
    kernel_size = (5, 5)
    smoothed = cv2.GaussianBlur(image, kernel_size, 0)
    # Show the intermediate result; waits for a keypress before continuing.
    cv2.imshow("GaussianBlur", smoothed)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return smoothed


def preprocess_image_edge_detection(image):
    """Run Canny edge detection (thresholds 50/150) and preview the edge map."""
    low_threshold, high_threshold = 50, 150
    edge_map = cv2.Canny(image, low_threshold, high_threshold)
    # Show the intermediate result; waits for a keypress before continuing.
    cv2.imshow("edge detection", edge_map)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return edge_map


def preprocess_image_inverted(image):
    """Invert pixel intensities (light text becomes dark and vice versa)."""
    negative = cv2.bitwise_not(image)
    # Show the intermediate result; waits for a keypress before continuing.
    cv2.imshow("Inverted Image", negative)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return negative


def preprocess_image_dialte_edges(image):
    """Dilate the input, AND it against itself using the dilation as a mask,
    then invert the masked result.

    NOTE(review): cv2.bitwise_and expects a single-channel 8-bit mask; this
    presumably receives a single-channel image (e.g. an edge map) — confirm
    what callers pass in.
    """
    def _preview(title, img):
        # Blocking preview window; closes on any keypress.
        cv2.imshow(title, img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    thickened = cv2.dilate(image, None, iterations=2)
    _preview("dilate", thickened)

    masked = cv2.bitwise_and(image, image, mask=thickened)
    _preview("Bitwise-AND mask and original image", masked)

    negative = cv2.bitwise_not(masked)
    _preview("Inverted Image", negative)

    return negative


def perform_ocr(image_path, text_to_find=None, config="--psm 6 --oem 3", preprocess_func=preprocess_image_gray):
    """Run Tesseract OCR on an image after applying a preprocessing function.

    Args:
        image_path: Path to the image file on disk.
        text_to_find: Optional string or list of strings of interest
            (normalized to a list; filtering by it is not yet implemented).
        config: Tesseract configuration flags.
        preprocess_func: Callable taking a BGR image and returning a
            preprocessed image (color or single-channel).

    Returns:
        A list of dicts with keys text/left/right/top/bottom/confidence.
        Returns an empty (or partial) list if loading or OCR fails.
    """
    # Bind up-front so the except path can still return a valid list
    # (previously `ocr_results` was only created inside the try, so an early
    # failure caused a NameError at return; the `global` was unnecessary).
    ocr_results = []
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")

        image_preprocessed = preprocess_func(image)
        # Single-channel preprocessors (grayscale, Canny, masks) cannot go
        # through COLOR_BGR2RGB — cvtColor raises on 1-channel input.
        # Feed them to Tesseract as-is; convert only 3-channel images.
        if image_preprocessed.ndim == 3:
            image_for_ocr = cv2.cvtColor(image_preprocessed, cv2.COLOR_BGR2RGB)
        else:
            image_for_ocr = image_preprocessed

        ocr_data = pytesseract.image_to_data(image_for_ocr, output_type=pytesseract.Output.DICT, config=config)

        if text_to_find is not None and not isinstance(text_to_find, list):
            text_to_find = [text_to_find]

        for i, raw_text in enumerate(ocr_data['text']):
            text = raw_text.strip()
            if not text:
                continue

            confidence = float(ocr_data['conf'][i]) / 100.0  # scale percent to 0-1
            if confidence < 0.2:  # ignore results with confidence below 0.2
                continue

            ocr_results.append({
                "text": text,
                "left": ocr_data['left'][i],
                "right": ocr_data['left'][i] + ocr_data['width'][i],
                "top": ocr_data['top'][i],
                "bottom": ocr_data['top'][i] + ocr_data['height'][i],
                "confidence": confidence,
            })
    except Exception as e:
        print(f"An error occurred in perform_ocr: {e}")

    return ocr_results


def draw_boxes(image_path, ocr_results, output_image_path):
    """Annotate each detected text box on the source image and save the result."""
    green = (0, 255, 0)
    canvas = cv2.imread(image_path)

    for box in ocr_results:
        top_left = (box['left'], box['top'])
        bottom_right = (box['right'], box['bottom'])
        cv2.rectangle(canvas, top_left, bottom_right, green, 2)

        # Label: text, confidence, and optional distance (if present).
        distance_text = f"{box['text']} ({box['confidence']:.2f})"
        if 'distance' in box:
            distance_text += f" ({box['distance']:.2f})"
        cv2.putText(canvas, distance_text, (box['left'], box['top'] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)

    cv2.imwrite(output_image_path, canvas)


def increase_brightness(img, value=50):
    """Return a copy of *img* with its HSV value (brightness) channel raised
    by *value*, clamped to the 0-255 range."""
    hsv_image = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hue, saturation, brightness = cv2.split(hsv_image)

    # cv2.add saturates for uint8; the explicit clamps also cover other dtypes.
    brightness = cv2.add(brightness, value)
    brightness[brightness > 255] = 255
    brightness[brightness < 0] = 0

    recombined = cv2.merge((hue, saturation, brightness))
    return cv2.cvtColor(recombined, cv2.COLOR_HSV2BGR)


def isolate_white_text_on_blue(image):
    """Isolate white text sitting on a blue background.

    Brightens the image, masks out blue-hued regions in HSV space, and
    returns the original (brightened) pixels everywhere the mask is NOT blue
    — i.e. the text and any other non-blue content.

    Args:
        image: BGR image as read by cv2.imread.

    Returns:
        A BGR image where blue-background pixels are blacked out.
    """
    # Increase brightness
    brightened = increase_brightness(image, value=60)
    cv2.imshow("Brightened image", brightened)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # Convert the image to the HSV color space
    hsv = cv2.cvtColor(brightened, cv2.COLOR_BGR2HSV)
    cv2.imshow("HSV converted image", hsv)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # Define the range of blue colors in HSV (hue 100-140 covers typical blues)
    lower_blue = np.array([100, 150, 0])
    upper_blue = np.array([140, 255, 255])

    # Create a mask to isolate blue regions
    blue_mask = cv2.inRange(hsv, lower_blue, upper_blue)
    cv2.imshow("Blue Mask image", blue_mask)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # Invert the mask to focus on the text
    blue_mask_inv = cv2.bitwise_not(blue_mask)
    cv2.imshow(" Inverted Mask image", blue_mask_inv)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # Use the mask to extract the white text
    white_text_on_blue = cv2.bitwise_and(brightened, brightened, mask=blue_mask_inv)
    # BUG FIX: previously displayed `blue_mask_inv` again instead of the
    # actual extracted-text image.
    cv2.imshow("White Text image", white_text_on_blue)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    return white_text_on_blue


def move_and_click(ocr_results):
    """Move the mouse to the centre of each detected box and click it."""
    for box in ocr_results:
        centre_x = (box['left'] + box['right']) // 2
        centre_y = (box['top'] + box['bottom']) // 2
        pyautogui.moveTo(centre_x, centre_y)
        pyautogui.click()
        time.sleep(1)  # Sleep for a second between clicks for safety


def main():
    """Run every preprocessing pipeline over one image, merge the OCR hits,
    print them, and write an annotated copy of the image."""
    image_path = 'path-to-image.png'
    output_json_path = 'path-to-image.json'
    output_image_path = 'path-to-outputimage.png'

    text_to_find = []
    tess_config = "--psm 11 --oem 3"

    # (print label, preprocessing function) in execution order.
    pipelines = [
        ("OCR  gray results are", preprocess_image_gray),
        ("OCR  contrast results are", preprocess_image_increase_contrast),
        ("OCR  gaussian results are", preprocess_image_gaussian_blur),
        ("OCR  edge results are", preprocess_image_edge_detection),
        ("OCR  Dialte results are", preprocess_image_dialte_edges),
        ("OCR  Invert results are", preprocess_image_inverted),
        ("OCR Isolate results are", isolate_white_text_on_blue),
    ]
    per_pipeline = []
    for label, preprocess in pipelines:
        hits = perform_ocr(image_path, text_to_find, config=tess_config,
                           preprocess_func=preprocess)
        print(f"{label}: {hits}")
        per_pipeline.append(hits)

    (results_gray, results_contrast, results_gaussian, results_edge,
     results_dialte, results_invert, results_isolate) = per_pipeline

    ocr_results = []
    if all(isinstance(r, list) for r in per_pipeline):
        # Merge with the isolate pipeline first (matches the original order).
        ocr_results = (results_isolate + results_gray + results_contrast
                       + results_gaussian + results_dialte + results_edge
                       + results_invert)
    else:
        print("OCR results are not in the expected list format.")

    for i, result in enumerate(ocr_results, start=1):
        bounding_box_info = (
            f"Bounding box: Text = {result['text']}, Left = {result['left']}, Top = {result['top']}, "
            f"Right = {result['right']}, Bottom = {result['bottom']}, Confidence = {result['confidence']:.2f}"
        )
        print(bounding_box_info)
        print()

    draw_boxes(image_path, ocr_results, output_image_path)

    print(ocr_results)


# Script entry point: run the full OCR pipeline comparison when executed directly.
if __name__ == "__main__":
    main()

Issue:
Despite trying the above preprocessing techniques, the OCR output is still missing or incorrectly recognizing the text on the blue background. I have also tried adjusting the thresholding and brightness levels, but without success.

Question:
Could anyone provide suggestions on additional preprocessing techniques or modifications to the Tesseract OCR settings that might help improve the detection accuracy for white text on a blue background or any other colored background?

Thank you in advance for your assistance!

Ger Hobbelt

unread,
Aug 20, 2024, 6:43:42 AM8/20/24
to tesser...@googlegroups.com

Generally, it is best to convert to greyscale with black text on a white background. It seems you have tried that, so questions remain.
Please include one or two sample images which exhibits your problem, so folks around here have something to test against.

Ciao,

Ger


--
You received this message because you are subscribed to the Google Groups "tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-oc...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/0ce2e54e-1f1a-4fca-8c74-286c9641509en%40googlegroups.com.

Abdul Kalam Shaik

unread,
Aug 22, 2024, 3:11:37 AM8/22/24
to tesseract-ocr
Thanks, Ger, for your response. My use case is that whenever there is a colored background, I'm unable to detect the text. I've attached a few examples where I was having difficulty detecting the text.

Regards,

Shaik Abdul Kalam.

NewTeamsSignInPage.png
PasswordNewTeams.png

Zdenko Podobny

unread,
Aug 22, 2024, 4:17:24 AM8/22/24
to tesser...@googlegroups.com
Tesseract is the OCR engine and it is not a text detection tool.
If you pass just blue button to tesseract, it has no problem to extract text: 

tesseract blue_button.png -
Sign in


Zdenko


št 22. 8. 2024 o 9:11 Abdul Kalam Shaik <shaikabdu...@gmail.com> napísal(a):

Abdul Kalam Shaik

unread,
Aug 22, 2024, 6:13:44 AM8/22/24
to tesseract-ocr
Thanks for the response. Could you please help me understand how I can detect text in my use case?

Regards,

Shaik Abdul Kalam
Reply all
Reply to author
Forward
0 new messages