Merge branch 'master' into fix-dlp

GoogleCloudPlatform · Apr 17, 2020 · 73378d4 · 73378d4
2 parents 185cc00 + d66e3a7
commit 73378d4
Show file tree

Hide file tree

Showing 16 changed files with 784 additions and 0 deletions.
diff --git a/document/cloud-client/batch_parse_form_beta.py b/document/cloud-client/batch_parse_form_beta.py
@@ -0,0 +1,96 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# [START documentai_batch_parse_form_beta]
+from google.cloud import documentai_v1beta2 as documentai
+from google.cloud import storage
+import re
+
+
+def batch_parse_form(
+        project_id='YOUR_PROJECT_ID',
+        input_uri='gs://cloud-samples-data/documentai/form.pdf',
+        destination_uri='gs://your-bucket-id/path/to/save/results/'):
+    """Parse a form"""
+
+    client = documentai.DocumentUnderstandingServiceClient()
+
+    gcs_source = documentai.types.GcsSource(uri=input_uri)
+
+    # mime_type can be application/pdf, image/tiff,
+    # image/gif, or application/json
+    input_config = documentai.types.InputConfig(
+        gcs_source=gcs_source, mime_type='application/pdf')
+
+    # where to write results
+    output_config = documentai.types.OutputConfig(
+        gcs_destination=documentai.types.GcsDestination(
+            uri=destination_uri),
+        pages_per_shard=1  # Map one doc page to one output page
+    )
+
+    # Improve form parsing results by providing key-value pair hints.
+    # For each key hint, key is text that is likely to appear in the
+    # document as a form field name (e.g. "DOB").
+    # Value types are optional, but can be one or more of:
+    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
+    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
+    key_value_pair_hints = [
+        documentai.types.KeyValuePairHint(
+            key='Emergency Contact',
+            value_types=['NAME']),
+        documentai.types.KeyValuePairHint(
+            key='Referred By')
+    ]
+
+    # Setting enabled=True enables form extraction
+    form_extraction_params = documentai.types.FormExtractionParams(
+        enabled=True, key_value_pair_hints=key_value_pair_hints)
+
+    # Location can be 'us' or 'eu'
+    parent = 'projects/{}/locations/us'.format(project_id)
+    request = documentai.types.ProcessDocumentRequest(
+        input_config=input_config,
+        output_config=output_config,
+        form_extraction_params=form_extraction_params)
+
+    # Add each ProcessDocumentRequest to the batch request
+    requests = []
+    requests.append(request)
+
+    batch_request = documentai.types.BatchProcessDocumentsRequest(
+        parent=parent, requests=requests
+    )
+
+    operation = client.batch_process_documents(batch_request)
+
+    # Wait for the operation to finish
+    operation.result()
+
+    # Results are written to GCS. Use a regex to find
+    # output files
+    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
+    output_bucket = match.group(1)
+    prefix = match.group(2)
+
+    storage_client = storage.client.Client()
+    bucket = storage_client.get_bucket(output_bucket)
+    blob_list = list(bucket.list_blobs(prefix=prefix))
+    print('Output files:')
+    for blob in blob_list:
+        print(blob.name)
+
+
+# [END documentai_batch_parse_form_beta]
diff --git a/document/cloud-client/batch_parse_form_beta_test.py b/document/cloud-client/batch_parse_form_beta_test.py
@@ -0,0 +1,42 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import batch_parse_form_beta
+import os
+import pytest
+import uuid
+from google.cloud import storage
+
+BUCKET = 'document-ai-{}'.format(uuid.uuid4())
+OUTPUT_PREFIX = 'TEST_OUTPUT_{}'.format(uuid.uuid4())
+PROJECT_ID = os.environ['GCLOUD_PROJECT']
+INPUT_URI = 'gs://cloud-samples-data/documentai/invoice.pdf'
+BATCH_OUTPUT_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)
+
+
+@pytest.fixture(autouse=True)
+def setup_teardown():
+    """Create a temporary bucket to store annotation output."""
+    storage_client = storage.Client()
+    bucket = storage_client.create_bucket(BUCKET)
+
+    yield
+
+    bucket.delete(force=True)
+
+
+def test_batch_parse_form(capsys):
+    batch_parse_form_beta.batch_parse_form(PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
+    out, _ = capsys.readouterr()
+    assert 'Output files' in out
diff --git a/document/cloud-client/batch_parse_table_beta.py b/document/cloud-client/batch_parse_table_beta.py
@@ -0,0 +1,114 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# [START documentai_batch_parse_table_beta]
+from google.cloud import documentai_v1beta2 as documentai
+from google.cloud import storage
+import re
+
+
+def batch_parse_table(
+        project_id='YOUR_PROJECT_ID',
+        input_uri='gs://cloud-samples-data/documentai/form.pdf',
+        destination_uri='gs://your-bucket-id/path/to/save/results/'):
+    """Parse a table"""
+
+    client = documentai.DocumentUnderstandingServiceClient()
+
+    gcs_source = documentai.types.GcsSource(uri=input_uri)
+
+    # mime_type can be application/pdf, image/tiff,
+    # image/gif, or application/json
+    input_config = documentai.types.InputConfig(
+        gcs_source=gcs_source, mime_type='application/pdf')
+
+    # where to write results
+    output_config = documentai.types.OutputConfig(
+        gcs_destination=documentai.types.GcsDestination(
+            uri=destination_uri),
+        pages_per_shard=1  # Map one doc page to one output page
+    )
+
+    # Improve table parsing results by providing bounding boxes
+    # specifying where the table appears in the document (optional)
+    table_bound_hints = [
+        documentai.types.TableBoundHint(
+            page_number=1,
+            bounding_box=documentai.types.BoundingPoly(
+                # Define a polygon around tables to detect
+                # Each vertex coordinate must be a number between 0 and 1
+                normalized_vertices=[
+                    # Top left
+                    documentai.types.geometry.NormalizedVertex(
+                        x=0,
+                        y=0
+                    ),
+                    # Top right
+                    documentai.types.geometry.NormalizedVertex(
+                        x=1,
+                        y=0
+                    ),
+                    # Bottom right
+                    documentai.types.geometry.NormalizedVertex(
+                        x=1,
+                        y=1
+                    ),
+                    # Bottom left
+                    documentai.types.geometry.NormalizedVertex(
+                        x=0,
+                        y=1
+                    )
+                ]
+            )
+        )
+    ]
+
+    # Setting enabled=True enables table extraction
+    table_extraction_params = documentai.types.TableExtractionParams(
+        enabled=True, table_bound_hints=table_bound_hints)
+
+    # Location can be 'us' or 'eu'
+    parent = 'projects/{}/locations/us'.format(project_id)
+    request = documentai.types.ProcessDocumentRequest(
+        input_config=input_config,
+        output_config=output_config,
+        table_extraction_params=table_extraction_params)
+
+    requests = []
+    requests.append(request)
+
+    batch_request = documentai.types.BatchProcessDocumentsRequest(
+        parent=parent, requests=requests
+    )
+
+    operation = client.batch_process_documents(batch_request)
+
+    # Wait for the operation to finish
+    operation.result()
+
+    # Results are written to GCS. Use a regex to find
+    # output files
+    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
+    output_bucket = match.group(1)
+    prefix = match.group(2)
+
+    storage_client = storage.client.Client()
+    bucket = storage_client.get_bucket(output_bucket)
+    blob_list = list(bucket.list_blobs(prefix=prefix))
+    print('Output files:')
+    for blob in blob_list:
+        print(blob.name)
+
+# [END documentai_batch_parse_table_beta]
diff --git a/document/cloud-client/batch_parse_table_beta_test.py b/document/cloud-client/batch_parse_table_beta_test.py
@@ -0,0 +1,42 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import batch_parse_table_beta
+import os
+import pytest
+import uuid
+from google.cloud import storage
+
+BUCKET = 'document-ai-{}'.format(uuid.uuid4())
+OUTPUT_PREFIX = 'TEST_OUTPUT_{}'.format(uuid.uuid4())
+PROJECT_ID = os.environ['GCLOUD_PROJECT']
+INPUT_URI = 'gs://cloud-samples-data/documentai/invoice.pdf'
+BATCH_OUTPUT_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)
+
+
+@pytest.fixture(autouse=True)
+def setup_teardown():
+    """Create a temporary bucket to store annotation output."""
+    storage_client = storage.Client()
+    bucket = storage_client.create_bucket(BUCKET)
+
+    yield
+
+    bucket.delete(force=True)
+
+
+def test_batch_parse_table(capsys):
+    batch_parse_table_beta.batch_parse_table(PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
+    out, _ = capsys.readouterr()
+    assert 'Output files:' in out
diff --git a/document/cloud-client/parse_form_beta.py b/document/cloud-client/parse_form_beta.py
@@ -0,0 +1,82 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START documentai_parse_form_beta]
+from google.cloud import documentai_v1beta2 as documentai
+
+
+def parse_form(project_id='YOUR_PROJECT_ID',
+               input_uri='gs://cloud-samples-data/documentai/form.pdf'):
+    """Parse a form"""
+
+    client = documentai.DocumentUnderstandingServiceClient()
+
+    gcs_source = documentai.types.GcsSource(uri=input_uri)
+
+    # mime_type can be application/pdf, image/tiff,
+    # image/gif, or application/json
+    input_config = documentai.types.InputConfig(
+        gcs_source=gcs_source, mime_type='application/pdf')
+
+    # Improve form parsing results by providing key-value pair hints.
+    # For each key hint, key is text that is likely to appear in the
+    # document as a form field name (e.g. "DOB").
+    # Value types are optional, but can be one or more of:
+    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
+    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
+    key_value_pair_hints = [
+        documentai.types.KeyValuePairHint(key='Emergency Contact',
+                                          value_types=['NAME']),
+        documentai.types.KeyValuePairHint(
+            key='Referred By')
+    ]
+
+    # Setting enabled=True enables form extraction
+    form_extraction_params = documentai.types.FormExtractionParams(
+        enabled=True, key_value_pair_hints=key_value_pair_hints)
+
+    # Location can be 'us' or 'eu'
+    parent = 'projects/{}/locations/us'.format(project_id)
+    request = documentai.types.ProcessDocumentRequest(
+        parent=parent,
+        input_config=input_config,
+        form_extraction_params=form_extraction_params)
+
+    document = client.process_document(request=request)
+
+    def _get_text(el):
+        """Doc AI identifies form fields by their offsets
+        in document text. This function converts offsets
+        to text snippets.
+        """
+        response = ''
+        # If a text segment spans several lines, it will
+        # be stored in different text segments.
+        for segment in el.text_anchor.text_segments:
+            start_index = segment.start_index
+            end_index = segment.end_index
+            response += document.text[start_index:end_index]
+        return response
+
+    for page in document.pages:
+        print('Page number: {}'.format(page.page_number))
+        for form_field in page.form_fields:
+            print('Field Name: {}\tConfidence: {}'.format(
+                _get_text(form_field.field_name),
+                form_field.field_name.confidence))
+            print('Field Value: {}\tConfidence: {}'.format(
+                _get_text(form_field.field_value),
+                form_field.field_value.confidence))
+
+# [END documentai_parse_form_beta]