"""Read a PAN card image, OCR it, and extract the holder's details.

The script blurs the image with OpenCV, runs Tesseract on the result, picks
the name, father's name, date of birth and PAN out of the recognised lines,
writes them to ``info1.json`` and prints a short report read back from that
file.
"""

import io
import json
import re
import sys

import cv2
import cv2 as cv
import matplotlib.image as Image
import numpy as np
import pytesseract as pt
from matplotlib import pyplot as plt
# NOTE: this rebinds ``Image`` (previously matplotlib.image) to PIL's module.
from PIL import Image, ImageEnhance

pt.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'

# Raw string: the original non-raw literal relied on "\W", "\D" and "\p"
# being left alone as invalid escape sequences.
IMAGE_PATH = r'C:\Windows\DigitalLocker\p1.jpeg'
OUTPUT_JSON = 'info1.json'

# A line whose end matches this is treated as the card header; the fields
# are read from the lines that follow it.  The alternatives include
# truncated/misread spellings (e.g. "OVERNMENT", "GOW").
HEADER_PATTERN = '(INCOMETAXDEPARWENT|INCOME|TAX|GOW|GOVT|GOVERNMENT|OVERNMENT|VERNMENT|DEPARTMENT|EPARTMENT|PARTMENT|ARTMENT|INDIA|NDIA)$'

# A word matching this marks the label line that precedes the PAN itself.
PAN_LABEL_PATTERN = '(Pormanam|Number|umber|Account|ccount|count|Permanent|ermanent|manent|wumm)$'

# NOTE(review): this matches "one non-letter followed by one or more spaces".
# It may have been meant as '[^a-zA-Z ]+' (drop every non-letter); kept
# unchanged so the extracted values stay the same -- confirm the intent.
NAME_JUNK_PATTERN = '[^a-zA-Z] +'

# Character substitutions applied to each field.  None of the replacement
# characters is itself a key, so one translate() pass gives the same result
# as the equivalent chain of str.replace() calls.
_NAME_FIXES = str.maketrans({'8': 'B', '0': 'D', '6': 'G', '1': 'I'})
_FATHER_NAME_FIXES = str.maketrans(
    {'8': 'S', '0': 'O', '6': 'G', '1': 'I', '"': 'A'}
)
_DOB_FIXES = str.maketrans(
    {'l': '/', 'L': '/', 'I': '/', 'i': '/', '|': '/', '"': '/1', ' ': None}
)
_PAN_FIXES = str.maketrans({' ': None, '"': None, ';': None, '%': 'L'})


def findword(textlist, wordstring):
    """Return the lines that follow the first line containing a matching word.

    Args:
        textlist: List of text lines to scan.
        wordstring: Regular expression searched for in each
            whitespace-separated word of each line.

    Returns:
        The slice of ``textlist`` after the first matching line, or
        ``textlist`` itself when no line matches.
    """
    pattern = re.compile(wordstring)
    for index, wordline in enumerate(textlist):
        if any(pattern.search(word) for word in wordline.split()):
            return textlist[index + 1:]
    return textlist


def _ocr_text(path):
    """Load the image at ``path``, smooth it, and return Tesseract's text."""
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError('Could not read image: {}'.format(path))
    blur = cv2.GaussianBlur(img, (5, 5), 0)
    median = cv2.medianBlur(blur, 5)
    blur = cv2.bilateralFilter(median, 9, 75, 75)
    return pt.pytesseract.image_to_string(blur)


def _clean_lines(text):
    """Split ``text`` on newlines, strip each line, and drop empty lines."""
    stripped = (line.strip() for line in text.split('\n'))
    return [line for line in stripped if line]


def _lines_after_header(lines):
    """Return the lines following the first line that matches the header.

    NOTE(review): when no line matches, the first line is still skipped
    (the start index defaults to 0, as in the original script).
    """
    header_index = 0
    for index, line in enumerate(lines):
        if re.search(HEADER_PATTERN, line):
            header_index = index
            break
    return lines[header_index + 1:]


def _clean_name(raw, fixes):
    """Strip ``raw``, apply the substitution table, then the junk regex."""
    return re.sub(NAME_JUNK_PATTERN, ' ', raw.strip().translate(fixes))


def _extract_fields(lines):
    """Build the result dict from the cleaned OCR lines.

    Fields are read in order (name, father's name, date of birth, PAN).
    If an expected line is missing, extraction stops there and the
    remaining fields stay ``None``.
    """
    name = fname = dob = pan = None
    body = _lines_after_header(lines)
    try:
        name = _clean_name(body[0], _NAME_FIXES)
        fname = _clean_name(body[1], _FATHER_NAME_FIXES)
        # Only the first 10 characters of the line are considered.
        dob = body[2][:10].strip().translate(_DOB_FIXES)
        # The PAN is the first line after the label line, searched over
        # all lines rather than only those after the header.
        pan = findword(lines, PAN_LABEL_PATTERN)[0].strip().translate(_PAN_FIXES)
    except IndexError:
        # Previously a bare "except: pass"; keep the best-effort result but
        # say that the OCR output was shorter than expected.
        print('warning: OCR output has fewer lines than expected; '
              'some fields were left empty', file=sys.stderr)
    return {
        'Name': name,
        'Father Name': fname,
        'Date of Birth': dob,
        'PAN': pan,
        'ID Type': "PAN",
    }


def _save_and_report(data):
    """Write ``data`` to the JSON file, read it back, and print the report."""
    serialized = json.dumps(data, indent=4, sort_keys=True,
                            separators=(',', ': '), ensure_ascii=False)
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as outfile:
        outfile.write(serialized)

    with open(OUTPUT_JSON, encoding='utf-8') as infile:
        data_loaded = json.load(infile)

    if data_loaded['ID Type'] == 'PAN':
        print("\n---------- PAN Details ----------")
        print("\nPAN Number: ", data_loaded['PAN'])
        print("\nName: ", data_loaded['Name'])
        print("\nFather's Name: ", data_loaded['Father Name'])
        print("\nDate Of Birth: ", data_loaded['Date of Birth'])


def main():
    """Run OCR on the configured image, then print and save the details."""
    text = _ocr_text(IMAGE_PATH)
    data = _extract_fields(_clean_lines(text))
    print(data)
    _save_and_report(data)


if __name__ == "__main__":
    main()