Skip to content

Commit

Permalink
docs(samples): Updated code samples for 2.1.0 release (#406)
Browse files Browse the repository at this point in the history
* docs(samples): Added Image Quality Output to Document OCR Processor

* docs(samples): Added `field_mask` to `batch_process` samples
  • Loading branch information
holtskinner committed Jan 3, 2023
1 parent 3d21322 commit 1e68334
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 8 deletions.
4 changes: 3 additions & 1 deletion batch_process_documents_processor_version_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
# input_mime_type = "application/pdf"
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory (no trailing slash; the sample appends one)
# field_mask = "text,entities,pages.pageNumber" # Optional. The fields to return in the Document object.


def batch_process_documents_processor_version(
Expand All @@ -40,6 +41,7 @@ def batch_process_documents_processor_version(
input_mime_type: str,
gcs_output_bucket: str,
gcs_output_uri_prefix: str,
field_mask: str = None,
timeout: int = 400,
):

Expand Down Expand Up @@ -67,7 +69,7 @@ def batch_process_documents_processor_version(
destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/"

gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
gcs_uri=destination_uri
gcs_uri=destination_uri, field_mask=field_mask
)

# Where to write results
Expand Down
2 changes: 2 additions & 0 deletions batch_process_documents_processor_version_sample_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
input_mime_type = "application/pdf"
gcs_output_uri_prefix = uuid4()
field_mask = "text,pages.pageNumber"
BUCKET_NAME = f"document-ai-python-{uuid4()}"


Expand Down Expand Up @@ -56,6 +57,7 @@ def test_batch_process_documents_processor_version(capsys, test_bucket):
input_mime_type=input_mime_type,
gcs_output_bucket=f"gs://{test_bucket}",
gcs_output_uri_prefix=gcs_output_uri_prefix,
field_mask=field_mask,
)
out, _ = capsys.readouterr()

Expand Down
4 changes: 3 additions & 1 deletion batch_process_documents_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
# input_mime_type = "application/pdf"
# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory (no trailing slash; the sample appends one)
# field_mask = "text,entities,pages.pageNumber" # Optional. The fields to return in the Document object.


def batch_process_documents(
Expand All @@ -38,6 +39,7 @@ def batch_process_documents(
input_mime_type: str,
gcs_output_bucket: str,
gcs_output_uri_prefix: str,
field_mask: str = None,
timeout: int = 400,
):

Expand Down Expand Up @@ -65,7 +67,7 @@ def batch_process_documents(
destination_uri = f"{gcs_output_bucket}/{gcs_output_uri_prefix}/"

gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
gcs_uri=destination_uri
gcs_uri=destination_uri, field_mask=field_mask
)

# Where to write results
Expand Down
2 changes: 2 additions & 0 deletions batch_process_documents_sample_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
input_mime_type = "application/pdf"
gcs_output_uri_prefix = uuid4()
field_mask = "text,pages.pageNumber"
BUCKET_NAME = f"document-ai-python-{uuid4()}"


Expand Down Expand Up @@ -54,6 +55,7 @@ def test_batch_process_documents(capsys, test_bucket):
input_mime_type=input_mime_type,
gcs_output_bucket=f"gs://{test_bucket}",
gcs_output_uri_prefix=gcs_output_uri_prefix,
field_mask=field_mask,
)
out, _ = capsys.readouterr()

Expand Down
39 changes: 33 additions & 6 deletions process_document_ocr_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,22 @@
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
# processor_version = 'rc' # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
# file_path = '/path/to/local/pdf'
# mime_type = 'application/pdf' # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types


def process_document_ocr_sample(
project_id: str, location: str, processor_id: str, file_path: str, mime_type: str
project_id: str,
location: str,
processor_id: str,
processor_version: str,
file_path: str,
mime_type: str,
) -> None:
# Online processing request to Document AI
document = process_document(
project_id, location, processor_id, file_path, mime_type
project_id, location, processor_id, processor_version, file_path, mime_type
)

# For a full list of Document object attributes, please reference this page:
Expand All @@ -52,19 +58,30 @@ def process_document_ocr_sample(
print_lines(page.lines, text)
print_tokens(page.tokens, text)

# Image quality scores are currently supported in processor version pretrained-ocr-v1.1-2022-09-12
if page.image_quality_scores:
print_image_quality_scores(page.image_quality_scores)


def process_document(
project_id: str, location: str, processor_id: str, file_path: str, mime_type: str
project_id: str,
location: str,
processor_id: str,
processor_version: str,
file_path: str,
mime_type: str,
) -> documentai.Document:
# You must set the api_endpoint if you use a location other than 'us', e.g.:
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

client = documentai.DocumentProcessorServiceClient(client_options=opts)

# The full resource name of the processor, e.g.:
# projects/project_id/locations/location/processor/processor_id
# The full resource name of the processor version
# e.g. projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
# You must create processors before running sample code.
name = client.processor_path(project_id, location, processor_id)
name = client.processor_version_path(
project_id, location, processor_id, processor_version
)

# Read the file into memory
with open(file_path, "rb") as image:
Expand Down Expand Up @@ -133,6 +150,16 @@ def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) ->
print(f" Last token break type: {repr(last_token_break_type)}")


def print_image_quality_scores(
    image_quality_scores: documentai.Document.Page.ImageQualityScores,
) -> None:
    """Print a page's overall quality score followed by its detected defects.

    The quality score and each defect's confidence are rendered as
    percentages with one decimal place; each defect is printed on its own
    line, labelled with its type.
    """
    overall = image_quality_scores.quality_score
    print(f" Quality score: {overall:.1%}")
    print(" Detected defects:")

    for defect in image_quality_scores.detected_defects:
        # Read both fields up front so the output line stays short.
        defect_kind, certainty = defect.type_, defect.confidence
        print(f" {defect_kind}: {certainty:.1%}")


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
"""
Document AI identifies text in different parts of the document by their
Expand Down
2 changes: 2 additions & 0 deletions process_document_ocr_sample_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_id = "52a38e080c1a7296"
processor_version = "rc"
file_path = "resources/handwritten_form.pdf"
mime_type = "application/pdf"

Expand All @@ -29,6 +30,7 @@ def test_process_documents(capsys):
project_id=project_id,
location=location,
processor_id=processor_id,
processor_version=processor_version,
file_path=file_path,
mime_type=mime_type,
)
Expand Down

0 comments on commit 1e68334

Please sign in to comment.