Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,18 @@

DESCRIPTION:
This sample demonstrates how to access the raw JSON response from analysis operations
using protocol methods. This is useful for advanced scenarios where you need direct access
to the JSON structure.
using the 'cls' callback parameter (async version). This is useful for advanced scenarios
where you need direct access to the JSON structure.

The Content Understanding SDK provides two approaches for accessing analysis results:

1. Object model approach (recommended): Returns strongly-typed AnalyzeResult objects
2. Protocol method approach: Returns raw BinaryData containing the JSON response
that are easier to navigate and use. This is shown in sample_analyze_binary_async.py.

2. Protocol method approach: Returns the raw HTTP response containing the JSON. This sample
demonstrates this approach, via the 'cls' callback parameter, for advanced scenarios.

For production use, prefer the object model approach as it provides:
IMPORTANT: For production use, prefer the object model approach as it provides:
- Type safety
- IntelliSense support
- Easier navigation of results
Expand Down Expand Up @@ -66,21 +70,31 @@ async def main() -> None:

print(f"Analyzing {file_path} with prebuilt-documentSearch...")

# Use the standard method which returns an AnalyzeResult
# Then serialize to JSON for raw access
# Use the 'cls' callback parameter to get the raw HTTP response
# The 'cls' parameter allows us to intercept the response and return custom data
# We return a tuple: (deserialized_object, raw_http_response)
# Note: For production use, prefer the object model approach (without cls parameter)
# which returns AnalyzeResult objects that are easier to work with
poller = await client.begin_analyze_binary(
analyzer_id="prebuilt-documentSearch",
binary_input=file_bytes,
content_type="application/pdf",
cls=lambda pipeline_response, deserialized_obj, response_headers: (
deserialized_obj,
pipeline_response.http_response,
),
)
result = await poller.result()

# Convert to dictionary and then to JSON
result_dict = result.as_dict()
# Wait for completion and get both the deserialized object and raw HTTP response
_, raw_http_response = await poller.result()
# [END analyze_return_raw_json]

# [START parse_raw_json]
# Parse the raw JSON response
response_json = raw_http_response.json()

# Pretty-print the JSON
pretty_json = json.dumps(result_dict, indent=2, ensure_ascii=False, default=str)
pretty_json = json.dumps(response_json, indent=2, ensure_ascii=False)

# Create output directory if it doesn't exist
output_dir = Path(__file__).parent.parent / "sample_output"
Expand All @@ -94,17 +108,26 @@ async def main() -> None:
with open(output_path, "w", encoding="utf-8") as f:
f.write(pretty_json)

print(f"\nRaw JSON response saved to: {output_path}")
print(f"Raw JSON response saved to: {output_path}")
print(f"File size: {len(pretty_json):,} characters")

# Show a preview of the JSON structure
print("\nJSON Structure Preview:")
print("=" * 50)
preview = pretty_json[:2000] + "..." if len(pretty_json) > 2000 else pretty_json
print(preview)
print("=" * 50)
# [END parse_raw_json]

# [START extract_from_raw_json]
# Extract key information from raw JSON

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we follow samples/sample_analyze_binary.py and print the markdown along with the equivalent document structure, to show two ways of accessing the same functionality?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, updated

if "result" in response_json:
result_data = response_json["result"]
if "analyzerId" in result_data:
print(f"Analyzer ID: {result_data['analyzerId']}")
if "contents" in result_data and isinstance(result_data["contents"], list):
print(f"Contents count: {len(result_data['contents'])}")
if len(result_data["contents"]) > 0:
first_content = result_data["contents"][0]
if "kind" in first_content:
print(f"Content kind: {first_content['kind']}")
if "mimeType" in first_content:
print(f"MIME type: {first_content['mimeType']}")
# [END extract_from_raw_json]

if not isinstance(credential, AzureKeyCredential):
await credential.close()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,18 @@

DESCRIPTION:
This sample demonstrates how to access the raw JSON response from analysis operations
using protocol methods. This is useful for advanced scenarios where you need direct access
to the JSON structure.
using the 'cls' callback parameter. This is useful for advanced scenarios where you need
direct access to the JSON structure.

The Content Understanding SDK provides two approaches for accessing analysis results:

1. Object model approach (recommended): Returns strongly-typed AnalyzeResult objects
2. Protocol method approach: Returns raw BinaryData containing the JSON response
that are easier to navigate and use. This is shown in sample_analyze_binary.py.

2. Protocol method approach: Returns the raw HTTP response containing the JSON. This sample
demonstrates this approach, via the 'cls' callback parameter, for advanced scenarios.

For production use, prefer the object model approach as it provides:
IMPORTANT: For production use, prefer the object model approach as it provides:
- Type safety
- IntelliSense support
- Easier navigation of results
Expand Down Expand Up @@ -66,21 +70,31 @@ def main() -> None:

print(f"Analyzing {file_path} with prebuilt-documentSearch...")

# Use the standard method which returns an AnalyzeResult
# Then serialize to JSON for raw access
# Use the 'cls' callback parameter to get the raw HTTP response
# The 'cls' parameter allows us to intercept the response and return custom data
# We return a tuple: (deserialized_object, raw_http_response)
# Note: For production use, prefer the object model approach (without cls parameter)
# which returns AnalyzeResult objects that are easier to work with
poller = client.begin_analyze_binary(
analyzer_id="prebuilt-documentSearch",
binary_input=file_bytes,
content_type="application/pdf",
cls=lambda pipeline_response, deserialized_obj, response_headers: (
deserialized_obj,
pipeline_response.http_response,
),
)
result = poller.result()

# Convert to dictionary and then to JSON
result_dict = result.as_dict()
# Wait for completion and get both the deserialized object and raw HTTP response
_, raw_http_response = poller.result()
# [END analyze_return_raw_json]

# [START parse_raw_json]
# Parse the raw JSON response
response_json = raw_http_response.json()

# Pretty-print the JSON
pretty_json = json.dumps(result_dict, indent=2, ensure_ascii=False, default=str)
pretty_json = json.dumps(response_json, indent=2, ensure_ascii=False)

# Create output directory if it doesn't exist
output_dir = Path(__file__).parent / "sample_output"
Expand All @@ -94,17 +108,26 @@ def main() -> None:
with open(output_path, "w", encoding="utf-8") as f:
f.write(pretty_json)

print(f"\nRaw JSON response saved to: {output_path}")
print(f"Raw JSON response saved to: {output_path}")
print(f"File size: {len(pretty_json):,} characters")

# Show a preview of the JSON structure
print("\nJSON Structure Preview:")
print("=" * 50)
preview = pretty_json[:2000] + "..." if len(pretty_json) > 2000 else pretty_json
print(preview)
print("=" * 50)
# [END parse_raw_json]

# [START extract_from_raw_json]
# Extract key information from raw JSON
if "result" in response_json:
result_data = response_json["result"]
if "analyzerId" in result_data:
print(f"Analyzer ID: {result_data['analyzerId']}")
if "contents" in result_data and isinstance(result_data["contents"], list):
print(f"Contents count: {len(result_data['contents'])}")
if len(result_data["contents"]) > 0:
first_content = result_data["contents"][0]
if "kind" in first_content:
print(f"Content kind: {first_content['kind']}")
if "mimeType" in first_content:
print(f"MIME type: {first_content['mimeType']}")
# [END extract_from_raw_json]


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import os
import json
import pytest
from devtools_testutils import recorded_by_proxy
from testpreparer import ContentUnderstandingPreparer, ContentUnderstandingClientTestBase

Expand All @@ -32,11 +31,11 @@ def test_sample_analyze_return_raw_json(self, azure_content_understanding_endpoi
"""Test analyzing a document and getting raw JSON response.

This test validates:
1. Document analysis using protocol method
1. Document analysis using 'cls' callback to get raw HTTP response
2. Raw JSON response format
3. JSON structure validation

11_AnalyzeReturnRawJson.AnalyzeReturnRawJsonAsync()
11_AnalyzeReturnRawJson.AnalyzeReturnRawJson()
"""
client = self.create_client(endpoint=azure_content_understanding_endpoint)

Expand All @@ -55,80 +54,64 @@ def test_sample_analyze_return_raw_json(self, azure_content_understanding_endpoi
assert len(file_bytes) > 0, "File should not be empty"
print(f"[PASS] File loaded: {len(file_bytes)} bytes")

# Analyze the document and get raw response
# Note: The Python SDK returns structured objects by default
# We can access the raw response through the result
# Use 'cls' callback to get raw HTTP response
# The 'cls' callback receives the pipeline response together with the deserialized object, so it can also hand back the raw HTTP response
# We return a tuple: (deserialized_object, raw_http_response)
poller = client.begin_analyze_binary(
analyzer_id="prebuilt-documentSearch",
binary_input=file_bytes,
content_type="application/pdf"
content_type="application/pdf",
cls=lambda pipeline_response, deserialized_obj, response_headers: (
deserialized_obj,
pipeline_response.http_response,
),
)

result = poller.result()
# Wait for completion and get both model and raw HTTP response
_, raw_http_response = poller.result()

# Assertion: Verify analysis operation completed
assert poller is not None, "Analysis operation should not be null"
assert poller.done(), "Operation should be completed"

# Verify raw response status
if hasattr(poller, '_polling_method'):
polling_method = getattr(poller, '_polling_method', None)
if polling_method and hasattr(polling_method, '_initial_response'):
raw_response = getattr(polling_method, '_initial_response', None) # type: ignore
if raw_response:
if hasattr(raw_response, 'http_response'):
status = raw_response.http_response.status_code
elif hasattr(raw_response, 'status_code'):
status = raw_response.status_code
else:
status = None

if status:
assert status >= 200 and status < 300, \
f"Response status should be successful (200-299), but was {status}"
print(f"[PASS] Raw response status verified: {status}")

assert poller.status() == "Succeeded", f"Operation status should be Succeeded, but was {poller.status()}"
print("[PASS] Analysis operation completed successfully")

# Assertion: Verify result
assert result is not None, "Analysis result should not be null"
print("[PASS] Response data is not null")

# Convert result to JSON string to verify raw format capability
# In Python SDK, we can serialize the result to JSON
try:
# Try to access the raw response data
if hasattr(result, '__dict__'):
result_dict = result.__dict__
json_str = json.dumps(result_dict, default=str)
assert json_str is not None, "Response string should not be null"
assert len(json_str) > 0, "Response string should not be empty"
print(f"[PASS] Response converted to JSON string: {len(json_str)} characters")

# Verify it's valid JSON
parsed_json = json.loads(json_str)
assert parsed_json is not None, "Response should be valid JSON"
print("[PASS] Response is valid JSON format")
else:
print("[INFO] Result does not have __dict__ attribute, using alternative method")

# Alternative: Check if result has contents (which confirms it's a valid response)
assert hasattr(result, "contents"), "Result should have contents attribute"
assert result.contents is not None, "Result contents should not be null"
print("[PASS] Response data structure verified")

except json.JSONDecodeError as e:
pytest.fail(f"Response should be valid JSON format: {str(e)}")
except Exception as e:
print(f"[WARN] Could not serialize to JSON: {str(e)}")
# Still verify basic structure
assert result is not None, "Result should not be null"
print("[PASS] Response data verified (structured format)")

# Verify the response contains expected data
assert hasattr(result, "contents"), "Result should have contents"
if result.contents and len(result.contents) > 0:
print(f"[PASS] Response contains {len(result.contents)} content(s)")
# Assertion: Verify raw HTTP response
assert raw_http_response is not None, "Raw HTTP response should not be null"
print("[PASS] Raw HTTP response is not null")

# Parse the raw JSON response
response_json = raw_http_response.json()

# Assertion: Verify the parsed JSON is not None
assert response_json is not None, "Response JSON should not be null"
print("[PASS] Response JSON parsed successfully")

# Verify it's valid JSON by serializing
json_str = json.dumps(response_json, indent=2, ensure_ascii=False)
assert json_str is not None, "Response string should not be null"
assert len(json_str) > 0, "Response string should not be empty"
print(f"[PASS] Response converted to JSON string: {len(json_str)} characters")

# Verify the response contains expected structure (matching C# sample validation)
assert "result" in response_json, "Response should contain 'result' key"
result_data = response_json["result"]
print("[PASS] Response contains 'result' key")

# Verify analyzerId
if "analyzerId" in result_data:
print(f"[PASS] Analyzer ID: {result_data['analyzerId']}")

# Verify contents
if "contents" in result_data and isinstance(result_data["contents"], list):
contents_count = len(result_data["contents"])
print(f"[PASS] Contents count: {contents_count}")

if contents_count > 0:
first_content = result_data["contents"][0]
if "kind" in first_content:
print(f"[PASS] Content kind: {first_content['kind']}")
if "mimeType" in first_content:
print(f"[PASS] MIME type: {first_content['mimeType']}")

print("\n[SUCCESS] All test_sample_analyze_return_raw_json assertions passed")
Loading