Skip to content
26 changes: 25 additions & 1 deletion sdk/formrecognizer/azure-ai-formrecognizer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,32 @@ for style in result.styles:
print("Document contains handwritten content: ")
print(",".join([result.content[span.offset:span.offset + span.length] for span in style.spans]))

print("----Selection marks found in document----")
for page in result.pages:
print("----Analyzing document from page #{}----".format(page.page_number))
print(
"Page has width: {} and height: {}, measured with unit: {}".format(
page.width, page.height, page.unit
)
)

for line_idx, line in enumerate(page.lines):
words = line.get_words()
print(
"...Line # {} has {} words and text '{}' within bounding box '{}'".format(
line_idx,
len(words),
line.content,
format_bounding_box(line.bounding_box),
)
)

for word in words:
print(
"......Word '{}' has a confidence of {}".format(
word.content, word.confidence
)
)

for selection_mark in page.selection_marks:
print(
"...Selection mark is '{}' within bounding box '{}' and has a confidence of {}".format(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,20 +101,22 @@ async def analyze_general_documents():
)

for line_idx, line in enumerate(page.lines):
words = line.get_words()
print(
"Line # {} has text content '{}' within bounding box '{}'".format(
"...Line # {} has {} words and text '{}' within bounding box '{}'".format(
line_idx,
len(words),
line.content,
format_bounding_box(line.bounding_box),
)
)

for word in page.words:
print(
"...Word '{}' has a confidence of {}".format(
word.content, word.confidence
for word in words:
print(
"......Word '{}' has a confidence of {}".format(
word.content, word.confidence
)
)
)

for selection_mark in page.selection_marks:
print(
Expand All @@ -131,6 +133,11 @@ async def analyze_general_documents():
table_idx, table.row_count, table.column_count
)
)
print(
"Table # {} has {} lines and {} words".format(
table_idx, len(table.get_lines()), len(table.get_words())
)
)
for region in table.bounding_regions:
print(
"Table # {} location on page: {} is {}".format(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,20 +78,22 @@ async def analyze_layout_async():
)

for line_idx, line in enumerate(page.lines):
words = line.get_words()
print(
"Line # {} has text content '{}' within bounding box '{}'".format(
"...Line # {} has word count {} and text '{}' within bounding box '{}'".format(
line_idx,
len(words),
line.content,
format_bounding_box(line.bounding_box),
)
)

for word in page.words:
print(
"...Word '{}' has a confidence of {}".format(
word.content, word.confidence
for word in words:
print(
"......Word '{}' has a confidence of {}".format(
word.content, word.confidence
)
)
)

for selection_mark in page.selection_marks:
print(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# coding: utf-8

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

"""
FILE: sample_get_document_elements_async.py

DESCRIPTION:
This sample demonstrates how to get related document elements from the result of calling
`begin_analyze_document()`.

USAGE:
python sample_get_document_elements_async.py

Set the environment variables with your own values before running the sample:
1) AZURE_FORM_RECOGNIZER_ENDPOINT - the endpoint to your Cognitive Services resource.
2) AZURE_FORM_RECOGNIZER_KEY - your Form Recognizer API key
"""

import os
import asyncio

def format_bounding_region(bounding_regions):
    """Describe every bounding region as text, separated by ", ".

    Each region is rendered as "Page #<page_number>: <box>", where <box> is
    the output of format_bounding_box() for that region's bounding box.
    Returns "N/A" when `bounding_regions` is None or empty.
    """
    if bounding_regions:
        descriptions = []
        for region in bounding_regions:
            page_number = region.page_number
            box_text = format_bounding_box(region.bounding_box)
            descriptions.append("Page #{}: {}".format(page_number, box_text))
        return ", ".join(descriptions)
    return "N/A"

def format_bounding_box(bounding_box):
    """Render the points of a bounding box as "[x, y], [x, y], ...".

    Every item of `bounding_box` is read through its `x` and `y` attributes.
    Returns "N/A" when `bounding_box` is None or empty.
    """
    if bounding_box:
        return ", ".join(
            "[{}, {}]".format(point.x, point.y) for point in bounding_box
        )
    return "N/A"


async def get_document_elements_async():
    """Analyze a sample form and print the elements related to its results.

    Submits ``Form_1.jpg`` to the "prebuilt-document" model and, once the
    analysis has finished, prints:

    * the words that make up the key of every key-value pair,
    * the words that make up every entity,
    * the lines of every table, together with the words of each line.

    The endpoint and key are read from the AZURE_FORM_RECOGNIZER_ENDPOINT and
    AZURE_FORM_RECOGNIZER_KEY environment variables; a missing variable
    raises KeyError.
    """
    path_to_sample_documents = os.path.abspath(
        os.path.join(
            os.path.abspath(__file__),
            "..",
            "..",
            "..",
            "./sample_forms/forms/Form_1.jpg",
        )
    )

    # SDK imports are kept local to the sample function.
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer.aio import DocumentAnalysisClient

    endpoint = os.environ["AZURE_FORM_RECOGNIZER_ENDPOINT"]
    key = os.environ["AZURE_FORM_RECOGNIZER_KEY"]

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    async with document_analysis_client:
        # The file only needs to stay open while the request is submitted.
        with open(path_to_sample_documents, "rb") as f:
            poller = await document_analysis_client.begin_analyze_document(
                "prebuilt-document", document=f
            )
        result = await poller.result()

    print("----Getting words in key-value pairs found in document----")
    for kv_pair in result.key_value_pairs:
        # Pairs without a key are skipped.
        if kv_pair.key:
            print(
                "Key '{}' found within '{}' bounding regions".format(
                    kv_pair.key.content,
                    format_bounding_region(kv_pair.key.bounding_regions),
                )
            )
            words = kv_pair.key.get_words()
            print(
                "Key has {} word(s):".format(
                    len(words),
                )
            )
            for word in words:
                print(
                    "...found '{}' word with confidence {}".format(
                        word.content,
                        word.confidence,
                    )
                )

    print("----Getting words in entities found in document----")
    for entity in result.entities:
        print("Entity of category '{}' with sub-category '{}'".format(entity.category, entity.sub_category))
        # NOTE: Calling get_words() here will return a list of the DocumentWords that make up the entity.
        # These words can be processed just like any other DocumentWord instance.
        words = entity.get_words()
        for word in words:
            print(
                "...contains '{}' with confidence {}".format(
                    word.content,
                    word.confidence,
                )
            )

    print("----Getting lines in tables found in document----")
    for table_idx, table in enumerate(result.tables):
        print(
            "Table # {} has {} rows and {} columns".format(
                table_idx, table.row_count, table.column_count
            )
        )
        # Fetch the table's lines once and reuse them for both the summary
        # count and the per-line loop instead of calling get_lines() twice.
        table_lines = table.get_lines()
        print(
            "Table # {} has {} lines and {} words".format(
                table_idx, len(table_lines), len(table.get_words())
            )
        )
        for line in table_lines:
            print(
                "...found '{}' line".format(
                    line.content,
                )
            )
            for word in line.get_words():
                print(
                    "......contains '{}' with confidence {}".format(
                        word.content,
                        word.confidence,
                    )
                )
    print("----------------------------------------")


async def main():
    """Run the document-elements sample coroutine to completion."""
    await get_document_elements_async()


if __name__ == "__main__":
    # asyncio.run() creates a new event loop, runs main() until it completes
    # and closes the loop afterwards. It replaces the
    # get_event_loop()/run_until_complete() pair, which is deprecated when no
    # loop is running and which never closed the loop.
    asyncio.run(main())
Original file line number Diff line number Diff line change
Expand Up @@ -97,20 +97,22 @@ def analyze_general_documents():
)

for line_idx, line in enumerate(page.lines):
words = line.get_words()
print(
"...Line # {} has text content '{}' within bounding box '{}'".format(
"...Line # {} has {} words and text '{}' within bounding box '{}'".format(
line_idx,
len(words),
line.content,
format_bounding_box(line.bounding_box),
)
)

for word in page.words:
print(
"...Word '{}' has a confidence of {}".format(
word.content, word.confidence
for word in words:
print(
"......Word '{}' has a confidence of {}".format(
word.content, word.confidence
)
)
)

for selection_mark in page.selection_marks:
print(
Expand All @@ -127,6 +129,11 @@ def analyze_general_documents():
table_idx, table.row_count, table.column_count
)
)
print(
"Table # {} has {} lines and {} words".format(
table_idx, len(table.get_lines()), len(table.get_words())
)
)
for region in table.bounding_regions:
print(
"Table # {} location on page: {} is {}".format(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,20 +75,22 @@ def analyze_layout():
)

for line_idx, line in enumerate(page.lines):
words = line.get_words()
print(
"...Line # {} has text content '{}' within bounding box '{}'".format(
"...Line # {} has word count {} and text '{}' within bounding box '{}'".format(
line_idx,
len(words),
line.content,
format_bounding_box(line.bounding_box),
)
)

for word in page.words:
print(
"...Word '{}' has a confidence of {}".format(
word.content, word.confidence
for word in words:
print(
"......Word '{}' has a confidence of {}".format(
word.content, word.confidence
)
)
)

for selection_mark in page.selection_marks:
print(
Expand Down
Loading