1+ # Copyright 2024 Google LLC
2+ #
3+ # Licensed under the Apache License, Version 2.0 (the "License");
4+ # you may not use this file except in compliance with the License.
5+ # You may obtain a copy of the License at
6+ #
7+ # http://www.apache.org/licenses/LICENSE-2.0
8+ #
9+ # Unless required by applicable law or agreed to in writing, software
10+ # distributed under the License is distributed on an "AS IS" BASIS,
11+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+ # See the License for the specific language governing permissions and
13+ # limitations under the License.
14+ #
15+
16+ # [START documentai_toolbox_token_detected_languages]
17+ from typing import Optional
18+
19+ from google .cloud .documentai_toolbox import document
20+
# TODO(developer): Uncomment one of these variables before running the sample.
# gcs_uri = "gs://bucket/path/to/folder/document.json"
# document_path = "path/to/local/document.json"
23+
24+
def token_detected_languages_sample(
    gcs_uri: Optional[str] = None,
    document_path: Optional[str] = None,
    max_tokens: Optional[int] = 10,
) -> None:
    """Demonstrates how to access token-level detected languages.

    Loads a wrapped Document from one source (``gcs_uri`` takes precedence
    when both are given) and prints, for the leading tokens of the first
    page, each token's stripped text followed by its detected languages.

    Args:
        gcs_uri (Optional[str]):
            URI to a Document JSON file in GCS.
        document_path (Optional[str]):
            Path to a local Document JSON file.
        max_tokens (Optional[int]):
            Maximum number of tokens from the first page to print. Defaults
            to 10 to keep the output brief; pass ``None`` to print them all.

    Raises:
        ValueError: If ``max_tokens`` is negative, or if neither ``gcs_uri``
            nor ``document_path`` is provided.
    """
    # A negative bound would silently drop tokens from the end of the slice.
    if max_tokens is not None and max_tokens < 0:
        raise ValueError("max_tokens must be non-negative or None.")

    if gcs_uri:
        # Load a single Document from a Google Cloud Storage URI
        wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
    elif document_path:
        # Load from local `Document` JSON file
        wrapped_document = document.Document.from_document_path(document_path)
    else:
        raise ValueError("No document source provided.")

    # Nothing to display for a document without pages.
    if not wrapped_document.pages:
        return

    # Display detected languages for tokens in the first page
    page = wrapped_document.pages[0]
    print(f"Page {page.page_number} Tokens:")

    for i, token in enumerate(page.tokens[:max_tokens]):
        print(f"Token {i}: '{token.text.strip()}'")

        if token.detected_languages:
            print(" Detected Languages:")
            for lang in token.detected_languages:
                # Only show a confidence value when the entry exposes one.
                confidence_str = (
                    f", confidence: {lang.confidence:.4f}"
                    if hasattr(lang, "confidence")
                    else ""
                )
                print(f" - {lang.language_code}{confidence_str}")
        else:
            print(" No language detected")
        print()
62+ # [END documentai_toolbox_token_detected_languages]
63+
64+
if __name__ == "__main__":
    import argparse

    # Command-line entry point: exactly one document source must be supplied.
    cli = argparse.ArgumentParser()
    source_options = cli.add_mutually_exclusive_group(required=True)
    for flag, help_text in (
        ("--gcs_uri", "GCS URI to Document JSON."),
        ("--document_path", "Path to local Document JSON file."),
    ):
        source_options.add_argument(flag, help=help_text)
    parsed = cli.parse_args()

    # Forward whichever source was given; the other one stays None.
    token_detected_languages_sample(
        gcs_uri=parsed.gcs_uri,
        document_path=parsed.document_path,
    )
0 commit comments