forked from GoogleCloudPlatform/python-docs-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Document AI code snippets for beta
* first pass v1beta2 analyze_form.py * Update document/cloud-client/analyze_form.py Move region tag to the top, set sensible defaults Co-Authored-By: Noah Negrey <nnegrey@users.noreply.github.com> * updated form code * changed naming to be consistent with node * added parse table files * style updates * added quickstart * added batch samples * added set endpoint * renamed set endpoint fn name * feat: adds AutoML model sample * feat: adds requirements files * fix: linter issues * chore: changes to GCS output * fix: linter issues * fix: changes format for AutoML model * fix: per reviewer * fix: added bounding poly comments * fix: adjusts locations, reviewer feedback * fix: reviewer feedback * fix: linter issues * fix: moved comment * fix: per reviewer * fix: per reviewer * fix: region tag bracket * fix: test assert Co-authored-by: Noah Negrey <nnegrey@users.noreply.github.com> Co-authored-by: Eric Schmidt <erschmid@google.com>
- Loading branch information
1 parent
b221fbf
commit d66e3a7
Showing
16 changed files
with
784 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
# [START documentai_batch_parse_form_beta] | ||
from google.cloud import documentai_v1beta2 as documentai | ||
from google.cloud import storage | ||
import re | ||
|
||
|
||
def batch_parse_form(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Batch-parse form fields from a document stored in Cloud Storage.

    Builds a batch request with form extraction enabled, waits for the
    long-running operation to finish, then prints the names of the objects
    found under ``destination_uri``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to parse.
        destination_uri: ``gs://bucket/prefix`` location where results are
            written and subsequently listed.

    Raises:
        ValueError: If ``destination_uri`` does not have the form
            ``gs://bucket/prefix``.
    """
    # Split the destination into bucket and object prefix *before* starting
    # the operation, so a malformed URI fails immediately with a clear
    # message instead of an AttributeError after the batch job has run.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://bucket/prefix, '
            'got: {!r}'.format(destination_uri))
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (e.g. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key='Emergency Contact',
            value_types=['NAME']),
        documentai.types.KeyValuePairHint(
            key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params)

    # A batch request carries a list of ProcessDocumentRequest messages;
    # this sample submits a single one.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request]
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.client.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_names = [blob.name for blob in bucket.list_blobs(prefix=prefix)]
    print('Output files:')
    for blob_name in blob_names:
        print(blob_name)
|
||
|
||
# [END documentai_batch_parse_form_beta] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import batch_parse_form_beta | ||
import os | ||
import pytest | ||
import uuid | ||
from google.cloud import storage | ||
|
||
# Test configuration, evaluated once when this module is imported. | ||
# Bucket name and output prefix each get a random UUID suffix | ||
# (presumably so separate runs do not collide). | ||
BUCKET = 'document-ai-{}'.format(uuid.uuid4()) | ||
OUTPUT_PREFIX = 'TEST_OUTPUT_{}'.format(uuid.uuid4()) | ||
# Read from the environment; raises KeyError if GCLOUD_PROJECT is unset. | ||
PROJECT_ID = os.environ['GCLOUD_PROJECT'] | ||
INPUT_URI = 'gs://cloud-samples-data/documentai/invoice.pdf' | ||
# Destination passed to the sample: gs://<BUCKET>/<OUTPUT_PREFIX>/ | ||
BATCH_OUTPUT_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX) | ||
|
||
|
||
@pytest.fixture(autouse=True)
def setup_teardown():
    """Provide a scratch bucket for the sample's output, then remove it.

    The bucket named by ``BUCKET`` is created before the test body runs and
    deleted once the test has finished.
    """
    scratch_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # force=True so that deletion is not blocked by the output objects the
    # sample wrote (see google-cloud-storage Bucket.delete).
    scratch_bucket.delete(force=True)
|
||
|
||
def test_batch_parse_form(capsys):
    """Run the batch form sample and check it reports its output listing."""
    batch_parse_form_beta.batch_parse_form(
        PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)

    captured = capsys.readouterr()
    assert 'Output files' in captured.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
# [START documentai_batch_parse_table_beta] | ||
from google.cloud import documentai_v1beta2 as documentai | ||
from google.cloud import storage | ||
import re | ||
|
||
|
||
def batch_parse_table(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Batch-parse tables from a document stored in Cloud Storage.

    Builds a batch request with table extraction enabled, waits for the
    long-running operation to finish, then prints the names of the objects
    found under ``destination_uri``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to parse.
        destination_uri: ``gs://bucket/prefix`` location where results are
            written and subsequently listed.

    Raises:
        ValueError: If ``destination_uri`` does not have the form
            ``gs://bucket/prefix``.
    """
    # Split the destination into bucket and object prefix *before* starting
    # the operation, so a malformed URI fails immediately with a clear
    # message instead of an AttributeError after the batch job has run.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://bucket/prefix, '
            'got: {!r}'.format(destination_uri))
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve table parsing results by providing bounding boxes
    # specifying where the table appears in the document (optional).
    # Each vertex coordinate must be a number between 0 and 1; the
    # polygon below covers the whole page.
    page_corners = [
        (0, 0),  # Top left
        (1, 0),  # Top right
        (1, 1),  # Bottom right
        (0, 1),  # Bottom left
    ]
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                normalized_vertices=[
                    documentai.types.geometry.NormalizedVertex(x=x, y=y)
                    for x, y in page_corners
                ]
            )
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        table_extraction_params=table_extraction_params)

    # A batch request carries a list of ProcessDocumentRequest messages;
    # this sample submits a single one.
    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=[request]
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.client.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_names = [blob.name for blob in bucket.list_blobs(prefix=prefix)]
    print('Output files:')
    for blob_name in blob_names:
        print(blob_name)
|
||
# [END documentai_batch_parse_table_beta] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import batch_parse_table_beta | ||
import os | ||
import pytest | ||
import uuid | ||
from google.cloud import storage | ||
|
||
# Test configuration, evaluated once when this module is imported. | ||
# Bucket name and output prefix each get a random UUID suffix | ||
# (presumably so separate runs do not collide). | ||
BUCKET = 'document-ai-{}'.format(uuid.uuid4()) | ||
OUTPUT_PREFIX = 'TEST_OUTPUT_{}'.format(uuid.uuid4()) | ||
# Read from the environment; raises KeyError if GCLOUD_PROJECT is unset. | ||
PROJECT_ID = os.environ['GCLOUD_PROJECT'] | ||
INPUT_URI = 'gs://cloud-samples-data/documentai/invoice.pdf' | ||
# Destination passed to the sample: gs://<BUCKET>/<OUTPUT_PREFIX>/ | ||
BATCH_OUTPUT_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX) | ||
|
||
|
||
@pytest.fixture(autouse=True)
def setup_teardown():
    """Provide a scratch bucket for the sample's output, then remove it.

    The bucket named by ``BUCKET`` is created before the test body runs and
    deleted once the test has finished.
    """
    scratch_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # force=True so that deletion is not blocked by the output objects the
    # sample wrote (see google-cloud-storage Bucket.delete).
    scratch_bucket.delete(force=True)
|
||
|
||
def test_batch_parse_table(capsys):
    """Run the batch table sample and check it reports its output listing."""
    batch_parse_table_beta.batch_parse_table(
        PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)

    captured = capsys.readouterr()
    assert 'Output files:' in captured.out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# [START documentai_parse_form_beta] | ||
from google.cloud import documentai_v1beta2 as documentai | ||
|
||
|
||
def parse_form(project_id='YOUR_PROJECT_ID',
               input_uri='gs://cloud-samples-data/documentai/form.pdf'):
    """Parse form fields from a single document and print them.

    Sends one synchronous request with form extraction enabled, then prints
    each page number followed by the name and value (with confidence) of
    every form field found on that page.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to parse.
    """
    client = documentai.DocumentUnderstandingServiceClient()

    # The document is read from Cloud Storage.
    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=documentai.types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Key-value pair hints improve form parsing results. Each hint's key is
    # text likely to appear in the document as a form field name
    # (e.g. "DOB"). Value types are optional, and may be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    emergency_contact_hint = documentai.types.KeyValuePairHint(
        key='Emergency Contact', value_types=['NAME'])
    referred_by_hint = documentai.types.KeyValuePairHint(key='Referred By')

    # enabled=True switches form extraction on
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True,
        key_value_pair_hints=[emergency_contact_hint, referred_by_hint])

    # Location can be 'us' or 'eu'
    request = documentai.types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=input_config,
        form_extraction_params=form_extraction_params)

    document = client.process_document(request=request)

    def _get_text(layout):
        """Return the document text referenced by ``layout``'s text anchor.

        Form fields are identified by offsets into ``document.text``; text
        that spans several lines is stored as several segments, which are
        concatenated here in order.
        """
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in layout.text_anchor.text_segments)

    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for form_field in page.form_fields:
            name = form_field.field_name
            value = form_field.field_value
            print('Field Name: {}\tConfidence: {}'.format(
                _get_text(name), name.confidence))
            print('Field Value: {}\tConfidence: {}'.format(
                _get_text(value), value.confidence))
|
||
# [END documentai_parse_form_beta] |
Oops, something went wrong.