Skip to content

Commit

Permalink
Merge branch 'master' into fix-dlp
Browse files Browse the repository at this point in the history
  • Loading branch information
Takashi Matsuo authored Apr 17, 2020
2 parents 185cc00 + d66e3a7 commit 73378d4
Show file tree
Hide file tree
Showing 16 changed files with 784 additions and 0 deletions.
96 changes: 96 additions & 0 deletions document/cloud-client/batch_parse_form_beta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START documentai_batch_parse_form_beta]
from google.cloud import documentai_v1beta2 as documentai
from google.cloud import storage
import re


def batch_parse_form(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Batch-parse the form fields of a document stored in Cloud Storage.

    Submits a batch request with form extraction enabled, blocks until the
    long-running operation finishes, then prints the name of every object
    found under ``destination_uri``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to process.
        destination_uri: ``gs://<bucket>/<prefix>`` URI that receives the
            results.

    Raises:
        ValueError: If ``destination_uri`` does not look like
            ``gs://<bucket>/<prefix>``.
    """
    # Split the destination into bucket and prefix *before* starting the
    # operation, so a malformed URI fails fast instead of surfacing as an
    # AttributeError after the batch job has already run.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://<bucket>/<prefix>, '
            'got: {!r}'.format(destination_uri))
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (e.g. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key='Emergency Contact',
            value_types=['NAME']),
        documentai.types.KeyValuePairHint(
            key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params)

    # Add each ProcessDocumentRequest to the batch request
    requests = [request]

    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=requests
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print('Output files:')
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name)


# [END documentai_batch_parse_form_beta]
42 changes: 42 additions & 0 deletions document/cloud-client/batch_parse_form_beta_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import batch_parse_form_beta
import os
import pytest
import uuid
from google.cloud import storage

# Name of the temporary bucket created by the setup_teardown fixture; the
# uuid4 suffix gives each import of this module a different name.
BUCKET = 'document-ai-{}'.format(uuid.uuid4())
# Object-name prefix (also uuid4-suffixed) under which results are written.
OUTPUT_PREFIX = 'TEST_OUTPUT_{}'.format(uuid.uuid4())
# Read from the environment; raises KeyError at import time if unset.
PROJECT_ID = os.environ['GCLOUD_PROJECT']
# Document passed to the sample as its input.
INPUT_URI = 'gs://cloud-samples-data/documentai/invoice.pdf'
# Destination URI handed to the sample: gs://<BUCKET>/<OUTPUT_PREFIX>/
BATCH_OUTPUT_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)


@pytest.fixture(autouse=True)
def setup_teardown():
    """Provision the scratch output bucket, then remove it after the test."""
    scratch_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # force=True so the bucket is deleted even though the sample has
    # written result objects into it.
    scratch_bucket.delete(force=True)


def test_batch_parse_form(capsys):
    """Run the form sample end to end and check it reports output files."""
    batch_parse_form_beta.batch_parse_form(
        PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
    stdout = capsys.readouterr().out
    assert 'Output files' in stdout
114 changes: 114 additions & 0 deletions document/cloud-client/batch_parse_table_beta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START documentai_batch_parse_table_beta]
from google.cloud import documentai_v1beta2 as documentai
from google.cloud import storage
import re


def batch_parse_table(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Batch-parse the tables of a document stored in Cloud Storage.

    Submits a batch request with table extraction enabled, blocks until the
    long-running operation finishes, then prints the name of every object
    found under ``destination_uri``.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to process.
        destination_uri: ``gs://<bucket>/<prefix>`` URI that receives the
            results.

    Raises:
        ValueError: If ``destination_uri`` does not look like
            ``gs://<bucket>/<prefix>``.
    """
    # Split the destination into bucket and prefix *before* starting the
    # operation, so a malformed URI fails fast instead of surfacing as an
    # AttributeError after the batch job has already run.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://<bucket>/<prefix>, '
            'got: {!r}'.format(destination_uri))
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve table parsing results by providing bounding boxes
    # specifying where the table appears in the document (optional).
    # Each vertex coordinate must be a number between 0 and 1; the
    # polygon below covers the whole page.
    corners = [
        (0, 0),  # Top left
        (1, 0),  # Top right
        (1, 1),  # Bottom right
        (0, 1),  # Bottom left
    ]
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                normalized_vertices=[
                    documentai.types.geometry.NormalizedVertex(x=x, y=y)
                    for x, y in corners
                ]
            )
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        table_extraction_params=table_extraction_params)

    # Add each ProcessDocumentRequest to the batch request
    requests = [request]

    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=requests
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print('Output files:')
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name)

# [END documentai_batch_parse_table_beta]
42 changes: 42 additions & 0 deletions document/cloud-client/batch_parse_table_beta_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import batch_parse_table_beta
import os
import pytest
import uuid
from google.cloud import storage

# Name of the temporary bucket created by the setup_teardown fixture; the
# uuid4 suffix gives each import of this module a different name.
BUCKET = 'document-ai-{}'.format(uuid.uuid4())
# Object-name prefix (also uuid4-suffixed) under which results are written.
OUTPUT_PREFIX = 'TEST_OUTPUT_{}'.format(uuid.uuid4())
# Read from the environment; raises KeyError at import time if unset.
PROJECT_ID = os.environ['GCLOUD_PROJECT']
# Document passed to the sample as its input.
INPUT_URI = 'gs://cloud-samples-data/documentai/invoice.pdf'
# Destination URI handed to the sample: gs://<BUCKET>/<OUTPUT_PREFIX>/
BATCH_OUTPUT_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)


@pytest.fixture(autouse=True)
def setup_teardown():
    """Provision the scratch output bucket, then remove it after the test."""
    scratch_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # force=True so the bucket is deleted even though the sample has
    # written result objects into it.
    scratch_bucket.delete(force=True)


def test_batch_parse_table(capsys):
    """Run the table sample end to end and check it reports output files."""
    batch_parse_table_beta.batch_parse_table(
        PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
    stdout = capsys.readouterr().out
    assert 'Output files:' in stdout
82 changes: 82 additions & 0 deletions document/cloud-client/parse_form_beta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START documentai_parse_form_beta]
from google.cloud import documentai_v1beta2 as documentai


def parse_form(project_id='YOUR_PROJECT_ID',
               input_uri='gs://cloud-samples-data/documentai/form.pdf'):
    """Extract form fields from one document and print them page by page.

    Sends a single synchronous request with form extraction enabled and
    prints, for every page, each detected field name and field value
    together with its confidence.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: ``gs://`` URI of the document to process.
    """
    client = documentai.DocumentUnderstandingServiceClient()

    # Supported mime types: application/pdf, image/tiff, image/gif,
    # or application/json.
    source = documentai.types.GcsSource(uri=input_uri)
    input_config = documentai.types.InputConfig(
        gcs_source=source, mime_type='application/pdf')

    # Key-value pair hints improve form parsing. The key is text likely to
    # appear in the document as a field name (e.g. "DOB"). Value types are
    # optional and may be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    emergency_contact_hint = documentai.types.KeyValuePairHint(
        key='Emergency Contact', value_types=['NAME'])
    referred_by_hint = documentai.types.KeyValuePairHint(key='Referred By')
    hints = [emergency_contact_hint, referred_by_hint]

    # enabled=True switches form extraction on.
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=hints)

    # Location can be 'us' or 'eu'.
    request = documentai.types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=input_config,
        form_extraction_params=form_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Return the document text that the element's anchor points at.

        Form fields are identified by offsets into the document text; text
        that spans several lines is stored as several segments, so the
        slices are concatenated in order.
        """
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments)

    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for form_field in page.form_fields:
            name = form_field.field_name
            print('Field Name: {}\tConfidence: {}'.format(
                _get_text(name), name.confidence))
            value = form_field.field_value
            print('Field Value: {}\tConfidence: {}'.format(
                _get_text(value), value.confidence))

# [END documentai_parse_form_beta]
Loading

0 comments on commit 73378d4

Please sign in to comment.