@@ -78,28 +78,28 @@ def batch_process_documents(
7878 print ("Output files:" )
7979
8080 for i , blob in enumerate (blob_list ):
81- # Download the contents of this blob as a bytes object.
82- if ".json" not in blob .name :
83- print ( f"skipping non-supported file type { blob .name } " )
84- return
85- # Only parses JSON files
86- blob_as_bytes = blob . download_as_bytes ( )
87-
88- document = documentai . types . Document . from_json ( blob_as_bytes )
89- print ( f"Fetched file { i + 1 } " )
90-
91- # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
92-
93- # Read the text recognition output from the processor
94- for page in document . pages :
95- for form_field in page . form_fields :
96- field_name = get_text ( form_field . field_name , document )
97- field_value = get_text ( form_field . field_value , document )
98- print ( "Extracted key value pair:" )
99- print ( f" \t { field_name } , { field_value } " )
100- for paragraph in document . pages :
101- paragraph_text = get_text ( paragraph . layout , document )
102- print (f"Paragraph text: \n { paragraph_text } " )
81+ # If JSON file, download the contents of this blob as a bytes object.
82+ if ".json" in blob .name :
83+ blob_as_bytes = blob .download_as_bytes ( )
84+
85+ document = documentai . types . Document . from_json ( blob_as_bytes )
86+ print ( f"Fetched file { i + 1 } " )
87+
88+ # For a full list of Document object attributes, please reference this page:
89+ # https://cloud.google.com/document-ai/docs/reference/rpc/google.cloud.documentai.v1beta3#document
90+
91+ # Read the text recognition output from the processor
92+ for page in document . pages :
93+ for form_field in page . form_fields :
94+ field_name = get_text ( form_field . field_name , document )
95+ field_value = get_text ( form_field . field_value , document )
96+ print ( "Extracted key value pair:" )
97+ print ( f" \t { field_name } , { field_value } " )
98+ for paragraph in document . pages :
99+ paragraph_text = get_text ( paragraph . layout , document )
100+ print ( f"Paragraph text: \n { paragraph_text } " )
101+ else :
102+ print (f"Skipping non-supported file type { blob . name } " )
103103
104104
105105# Extract shards from the text field
0 commit comments