import time
import os
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from pathlib import Path
from training.tesseract_training import run_tesseract_training
from training.training_model_utils import get_latest_and_next_model
# Folder monitored by the watchdog observer; ground-truth training files
# (.png images and their .gt.txt transcriptions) are dropped here.
WATCHED_FOLDER = r"C:\Users\Chan Jian Sen\Documents\ocr-japanese\INPUT_TRAINING_DATA"
# Path of the tesstrain folder; it is passed as the first argument to
# run_tesseract_training.
tesstrain_dir = r"C:\Users\Chan Jian Sen\Documents\TesseractFineTuningJpn5\tesstrain"
class TrainingInputHandler(FileSystemEventHandler):
    """Start a Tesseract training run once image/ground-truth pairs exist.

    Every "created" or "modified" event reported for WATCHED_FOLDER causes the
    folder to be re-scanned. Training starts as soon as at least one ``.png``
    file has a ``.gt.txt`` file with the same base name; afterwards the
    module-level ``observer`` is stopped.
    """

    # Suffixes that make up one training pair: <name>.png + <name>.gt.txt
    IMAGE_SUFFIX = ".png"
    GROUND_TRUTH_SUFFIX = ".gt.txt"

    def on_modified(self, event):
        """Re-scan the watched folder when something in it was modified."""
        self.check_and_trigger_training()

    def on_created(self, event):
        """Re-scan the watched folder when something was created in it."""
        self.check_and_trigger_training()

    @staticmethod
    def _base_names(filenames, suffix):
        """Return the set of names in *filenames* ending in *suffix*, with it removed.

        The whole suffix is stripped explicitly. ``Path(name).stem`` only
        drops the last extension, so ``"page.gt.txt"`` would become
        ``"page.gt"`` and never match the image stem ``"page"``.
        """
        return {name[:-len(suffix)] for name in filenames if name.endswith(suffix)}

    def check_and_trigger_training(self):
        """Train a new model if the watched folder holds a matching pair.

        Prints a waiting message and returns without training when no
        ``.png`` file has a ``.gt.txt`` counterpart yet.
        """
        files = os.listdir(WATCHED_FOLDER)
        pngs = self._base_names(files, self.IMAGE_SUFFIX)
        gts = self._base_names(files, self.GROUND_TRUTH_SUFFIX)
        common = pngs & gts

        if not common:
            print("⏳ Waiting for matching .png and .gt.txt pairs...")
            # Nothing to train on yet; keep watching for further events.
            return

        tessdata_path = r"C:\Users\Chan Jian Sen\Documents\TesseractFineTuningJpn5\tessdata"
        start_model, new_model = get_latest_and_next_model(tessdata_path)
        # NOTE (original author): the old model shows up as "jpn" and the new
        # model as "jpn1" here -- TODO confirm whether that naming is intended.
        print(f"🔁 Using {start_model} as base, training new model: {new_model}")
        # The first argument MUST be the tesstrain folder.
        run_tesseract_training(tesstrain_dir, new_model, start_model)

        # Relies on the module-level `observer` created in the __main__ block;
        # stopping it lets the main loop finish after one training run.
        observer.stop()
if __name__ == "__main__":
    # Script entry point: watch WATCHED_FOLDER (top level only) until the
    # observer thread stops or the user interrupts with Ctrl+C.
    print(f"👀 Watching training data folder: {WATCHED_FOLDER}")

    event_handler = TrainingInputHandler()
    # Must remain a module-level name called `observer`: the handler's
    # check_and_trigger_training() stops the watcher through this name.
    observer = Observer()
    observer.schedule(event_handler, WATCHED_FOLDER, recursive=False)
    observer.start()

    try:
        # Poll once per second so the main thread can still receive Ctrl+C.
        while True:
            if not observer.is_alive():
                break
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()

    # Wait for the observer thread to finish before the script exits.
    observer.join()