# TODO(developer): Fill in these values before running the sample.
project_id = ""  # Project ID used in the processor resource name
location = "us"  # Processor location: "us" or "eu"
processor_id = ""  # The processor must be created in Cloud Console first
file_path = "invoice.pdf"  # Path of the local PDF to process
def process_document_sample(
    project_id: str, location: str, processor_id: str, file_path: str
) -> None:
    """Process a local PDF with a Document AI processor and print its form fields.

    The PDF at ``file_path`` is read into memory and sent inline to the
    processor. The ``repr`` of the returned pages is written to
    ``sample.txt`` in the current working directory, and every form field
    found on every page is printed as ``name : value``.

    Args:
        project_id: Project ID used to build the processor resource name.
        location: Processor location, e.g. ``"us"`` or ``"eu"``.
        processor_id: ID of an existing processor (create it in Cloud
            Console first).
        file_path: Path of the local PDF file to process.

    Raises:
        OSError: If ``file_path`` cannot be read or ``sample.txt`` cannot
            be written.
    """
    from google.cloud import documentai_v1beta3 as documentai

    # You must set the api_endpoint if you use a location other than 'us';
    # regional endpoints have the form "<location>-documentai.googleapis.com".
    opts = {}
    if location != "us":
        opts["api_endpoint"] = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Inline payload; kept under its own name so it is not confused with
    # the processed document returned by the service below.
    raw_document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": raw_document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)
    document = result.document
    print("Document processing complete.")

    # Dump the raw page structures for inspection; the context manager
    # guarantees the file is flushed and closed.
    document_pages = document.pages
    with open("sample.txt", "w") as dump_file:
        dump_file.write(repr(document_pages))

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in document_pages:
        print("Page Number:{}".format(page.page_number))
        for form_field in page.form_fields:
            field_name = get_text(form_field.field_name, document)
            field_value = get_text(form_field.field_value, document)
            print(field_name + " : " + field_value)
# Extract shards from the text field
def get_text(doc_element: dict, document: dict) -> str:
    """Return the text that a document element refers to.

    Document AI identifies form fields by their offsets in the document
    text. This function converts those offsets to text snippets and
    concatenates them in segment order.

    Note that, despite the ``dict`` annotations, both arguments are read
    through attribute access rather than key lookup.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``, where
            each segment has ``start_index`` and ``end_index``.
        document: Object exposing the full document text as ``text``.

    Returns:
        The concatenated text of all segments, or ``""`` when the element
        has no text segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    # Indices are passed through int() before slicing, as they may not
    # arrive as plain ints.
    return "".join(
        document.text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )
# Run the sample with the module-level settings defined in this file.
process_document_sample(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    file_path=file_path,
)