docs(samples): Updated code samples for 2.1.0 release (#406)

holtskinner · holtskinner · commit 1e68334b208e · 2023-01-03T14:44:39.000-06:00
* docs(samples): Added Image Quality Output to Document OCR Processor

* docs(samples): Added `field_mask` to `batch_process` samples
diff --git a/batch_process_documents_processor_version_sample.py b/batch_process_documents_processor_version_sample.py
@@ -29,6 +29,7 @@
 # input_mime_type = "application/pdf"
 # gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
 # gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/
+# field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.
 
 
 def batch_process_documents_processor_version(
@@ -40,6 +41,7 @@ def batch_process_documents_processor_version(
     input_mime_type: str,
     gcs_output_bucket: str,
     gcs_output_uri_prefix: str,
+    field_mask: str = None,
     timeout: int = 400,
 ):
 
@@ -67,7 +69,7 @@ def batch_process_documents_processor_version(
     destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/"
 
     gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
-        gcs_uri=destination_uri
+        gcs_uri=destination_uri, field_mask=field_mask
     )
 
     # Where to write results
diff --git a/batch_process_documents_processor_version_sample_test.py b/batch_process_documents_processor_version_sample_test.py
@@ -28,6 +28,7 @@
 gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
 input_mime_type = "application/pdf"
 gcs_output_uri_prefix = uuid4()
+field_mask = "text,pages.pageNumber"
 BUCKET_NAME = f"document-ai-python-{uuid4()}"
 
 
@@ -56,6 +57,7 @@ def test_batch_process_documents_processor_version(capsys, test_bucket):
         input_mime_type=input_mime_type,
         gcs_output_bucket=f"gs://{test_bucket}",
         gcs_output_uri_prefix=gcs_output_uri_prefix,
+        field_mask=field_mask,
     )
     out, _ = capsys.readouterr()
 
diff --git a/batch_process_documents_sample.py b/batch_process_documents_sample.py
@@ -28,6 +28,7 @@
 # input_mime_type = "application/pdf"
 # gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
 # gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/
+# field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.
 
 
 def batch_process_documents(
@@ -38,6 +39,7 @@ def batch_process_documents(
     input_mime_type: str,
     gcs_output_bucket: str,
     gcs_output_uri_prefix: str,
+    field_mask: str = None,
     timeout: int = 400,
 ):
 
@@ -65,7 +67,7 @@ def batch_process_documents(
     destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/"
 
     gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
-        gcs_uri=destination_uri
+        gcs_uri=destination_uri, field_mask=field_mask
     )
 
     # Where to write results
diff --git a/batch_process_documents_sample_test.py b/batch_process_documents_sample_test.py
@@ -27,6 +27,7 @@
 gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
 input_mime_type = "application/pdf"
 gcs_output_uri_prefix = uuid4()
+field_mask = "text,pages.pageNumber"
 BUCKET_NAME = f"document-ai-python-{uuid4()}"
 
 
@@ -54,6 +55,7 @@ def test_batch_process_documents(capsys, test_bucket):
         input_mime_type=input_mime_type,
         gcs_output_bucket=f"gs://{test_bucket}",
         gcs_output_uri_prefix=gcs_output_uri_prefix,
+        field_mask=field_mask,
     )
     out, _ = capsys.readouterr()
 
diff --git a/process_document_ocr_sample.py b/process_document_ocr_sample.py
@@ -24,16 +24,22 @@
 # project_id = 'YOUR_PROJECT_ID'
 # location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
 # processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
+# processor_version = 'rc' # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
 # file_path = '/path/to/local/pdf'
 # mime_type = 'application/pdf' # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
 
 
 def process_document_ocr_sample(
-    project_id: str, location: str, processor_id: str, file_path: str, mime_type: str
+    project_id: str,
+    location: str,
+    processor_id: str,
+    processor_version: str,
+    file_path: str,
+    mime_type: str,
 ) -> None:
     # Online processing request to Document AI
     document = process_document(
-        project_id, location, processor_id, file_path, mime_type
+        project_id, location, processor_id, processor_version, file_path, mime_type
     )
 
     # For a full list of Document object attributes, please reference this page:
@@ -52,19 +58,30 @@ def process_document_ocr_sample(
         print_lines(page.lines, text)
         print_tokens(page.tokens, text)
 
+        # Image quality scores are currently supported in version pretrained-ocr-v1.1-2022-09-12
+        if page.image_quality_scores:
+            print_image_quality_scores(page.image_quality_scores)
+
 
 def process_document(
-    project_id: str, location: str, processor_id: str, file_path: str, mime_type: str
+    project_id: str,
+    location: str,
+    processor_id: str,
+    processor_version: str,
+    file_path: str,
+    mime_type: str,
 ) -> documentai.Document:
     # You must set the api_endpoint if you use a location other than 'us', e.g.:
     opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
 
     client = documentai.DocumentProcessorServiceClient(client_options=opts)
 
-    # The full resource name of the processor, e.g.:
-    # projects/project_id/locations/location/processor/processor_id
+    # The full resource name of the processor version
+    # e.g. projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
     # You must create processors before running sample code.
-    name = client.processor_path(project_id, location, processor_id)
+    name = client.processor_version_path(
+        project_id, location, processor_id, processor_version
+    )
 
     # Read the file into memory
     with open(file_path, "rb") as image:
@@ -133,6 +150,16 @@ def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) ->
     print(f"        Last token break type: {repr(last_token_break_type)}")
 
 
+def print_image_quality_scores(
+    image_quality_scores: documentai.Document.Page.ImageQualityScores,
+) -> None:
+    print(f"    Quality score: {image_quality_scores.quality_score:.1%}")
+    print("    Detected defects:")
+
+    for detected_defect in image_quality_scores.detected_defects:
+        print(f"        {detected_defect.type_}: {detected_defect.confidence:.1%}")
+
+
 def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
     """
     Document AI identifies text in different parts of the document by their
diff --git a/process_document_ocr_sample_test.py b/process_document_ocr_sample_test.py
@@ -20,6 +20,7 @@
 location = "us"
 project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
 processor_id = "52a38e080c1a7296"
+processor_version = "rc"
 file_path = "resources/handwritten_form.pdf"
 mime_type = "application/pdf"
 
@@ -29,6 +30,7 @@ def test_process_documents(capsys):
         project_id=project_id,
         location=location,
         processor_id=processor_id,
+        processor_version=processor_version,
         file_path=file_path,
         mime_type=mime_type,
     )