import os
import random
import pathlib
import subprocess
import argparse
from FontList import FontList
def read_line_count():
    """Return the serial counter persisted in ``line_count.txt``.

    The file is looked up relative to the current working directory.

    Returns:
        int: The stored counter, or 0 when the file does not exist or
        contains only whitespace.

    Raises:
        ValueError: If the file holds text that is not an integer.
    """
    # EAFP: open directly instead of exists()-then-open(), which can race
    # with the file being removed between the two calls.
    try:
        with open('line_count.txt', 'r') as file:
            content = file.read().strip()
    except FileNotFoundError:
        return 0
    # An empty file (e.g. truncated by an interrupted write) means
    # "no count recorded yet" rather than a fatal error.
    return int(content) if content else 0
def write_line_count(line_count):
    """Store *line_count* as text in ``line_count.txt``, replacing old content.

    The file is created in (or overwritten within) the current working
    directory.
    """
    counter_path = pathlib.Path('line_count.txt')
    counter_path.write_text(str(line_count))
def create_training_data(training_text_file, font_list, output_directory, start_line=None, end_line=None):
    """Render every training-text line with every font via ``text2image``.

    For each (font, line) pair a ground-truth file
    ``<stem>_<serial>.gt.txt`` is written into *output_directory* and
    ``text2image`` is invoked with the output base ``<stem>_<serial>`` in
    the same directory, where ``<stem>`` is the stem of
    *training_text_file* and ``<serial>`` is a running counter. The
    counter is written back with ``write_line_count`` when the function
    exits, including when it exits through an exception.

    Args:
        training_text_file: Path of a UTF-8 text file with one sample per
            line. Surrounding whitespace is stripped and blank lines are
            skipped. The lines are shuffled before rendering.
        font_list: Object whose ``fonts`` attribute is an iterable of
            values passed to ``text2image --font``.
        output_directory: Directory receiving the generated files; it is
            created (including missing parents) when absent.
        start_line: First serial number to use. When ``None`` the value
            returned by ``read_line_count()`` is used.
        end_line: Accepted for backward compatibility; currently not
            applied (see the NOTE in the body).

    Note:
        The exit status of ``text2image`` is not checked, so a failing
        render does not stop the run.
    """
    # Decode explicitly as UTF-8 rather than with the platform default
    # encoding, which would corrupt non-ASCII training text on systems
    # whose default is not UTF-8.
    with open(training_text_file, 'r', encoding='utf-8') as input_file:
        stripped_lines = (raw_line.strip() for raw_line in input_file)
        # Blank lines would yield empty ground-truth files, so drop them.
        lines = [text for text in stripped_lines if text]

    # makedirs also creates missing parent directories, which a nested
    # output path needs and plain mkdir cannot do.
    os.makedirs(output_directory, exist_ok=True)

    random.shuffle(lines)

    line_count = read_line_count() if start_line is None else start_line

    # NOTE(review): end_line is accepted but not applied; no line is
    # skipped or cut off because of it. Since the lines are shuffled on
    # every run, an index range would not select a stable subset.
    # Confirm the intended semantics before enforcing it.

    # Both the ground-truth file and the rendered image use the same base
    # name so that they can be paired by file name.
    training_text_file_name = pathlib.Path(training_text_file).stem

    try:
        for font in font_list.fonts:
            for line in lines:
                # The counter keeps increasing across fonts, so each
                # (font, line) pair receives its own serial number.
                file_base_name = f'{training_text_file_name}_{line_count:d}'

                # GT (Ground Truth) text file
                line_gt_text = os.path.join(output_directory, f'{file_base_name}.gt.txt')
                with open(line_gt_text, 'w', encoding='utf-8') as output_file:
                    output_file.write(line)

                # Rendered image for this line and font
                subprocess.run([
                    'text2image',
                    f'--font={font}',
                    f'--text={line_gt_text}',
                    f'--outputbase={output_directory}/{file_base_name}',
                    '--max_pages=1',
                    '--strip_unrenderable_words',
                    '--leading=36',
                    '--xsize=3600',
                    '--ysize=350',
                    '--char_spacing=1.0',
                    '--exposure=0',
                    '--unicharset_file=langdata/ben.unicharset',
                ])

                line_count += 1
    finally:
        # Persist the counter even when the run is interrupted, so a
        # later run does not reuse serial numbers already written.
        write_line_count(line_count)
def main():
    """Parse the optional --start/--end arguments and build the training data."""
    cli = argparse.ArgumentParser()
    for flag, description in (
        ('--start', 'Starting line count (inclusive)'),
        ('--end', 'Ending line count (inclusive)'),
    ):
        cli.add_argument(flag, type=int, help=description)
    options = cli.parse_args()

    create_training_data(
        'langdata/ben.training_text',
        # Create an instance of the FontList class
        FontList(),
        'tesstrain/data/ben-ground-truth',
        options.start,
        options.end,
    )


if __name__ == "__main__":
    main()