diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/samples/async_samples/sample_analyze_return_raw_json_async.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/samples/async_samples/sample_analyze_return_raw_json_async.py index 3b9d7371dfe4..d190635c212c 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/samples/async_samples/sample_analyze_return_raw_json_async.py +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/samples/async_samples/sample_analyze_return_raw_json_async.py @@ -9,14 +9,18 @@ DESCRIPTION: This sample demonstrates how to access the raw JSON response from analysis operations - using protocol methods. This is useful for advanced scenarios where you need direct access - to the JSON structure. + using the 'cls' callback parameter (async version). This is useful for advanced scenarios + where you need direct access to the JSON structure. The Content Understanding SDK provides two approaches for accessing analysis results: + 1. Object model approach (recommended): Returns strongly-typed AnalyzeResult objects - 2. Protocol method approach: Returns raw BinaryData containing the JSON response + that are easier to navigate and use. This is shown in sample_analyze_binary_async.py. + + 2. Protocol method approach: Returns raw HTTP response containing the JSON. This sample + demonstrates this approach for advanced scenarios. 
- For production use, prefer the object model approach as it provides: + IMPORTANT: For production use, prefer the object model approach as it provides: - Type safety - IntelliSense support - Easier navigation of results @@ -66,21 +70,31 @@ async def main() -> None: print(f"Analyzing {file_path} with prebuilt-documentSearch...") - # Use the standard method which returns an AnalyzeResult - # Then serialize to JSON for raw access + # Use the 'cls' callback parameter to get the raw HTTP response + # The 'cls' parameter allows us to intercept the response and return custom data + # We return a tuple: (deserialized_object, raw_http_response) + # Note: For production use, prefer the object model approach (without cls parameter) + # which returns AnalyzeResult objects that are easier to work with poller = await client.begin_analyze_binary( analyzer_id="prebuilt-documentSearch", binary_input=file_bytes, + content_type="application/pdf", + cls=lambda pipeline_response, deserialized_obj, response_headers: ( + deserialized_obj, + pipeline_response.http_response, + ), ) - result = await poller.result() - # Convert to dictionary and then to JSON - result_dict = result.as_dict() + # Wait for completion and get both the deserialized object and raw HTTP response + _, raw_http_response = await poller.result() # [END analyze_return_raw_json] # [START parse_raw_json] + # Parse the raw JSON response + response_json = raw_http_response.json() + # Pretty-print the JSON - pretty_json = json.dumps(result_dict, indent=2, ensure_ascii=False, default=str) + pretty_json = json.dumps(response_json, indent=2, ensure_ascii=False) # Create output directory if it doesn't exist output_dir = Path(__file__).parent.parent / "sample_output" @@ -94,17 +108,68 @@ async def main() -> None: with open(output_path, "w", encoding="utf-8") as f: f.write(pretty_json) - print(f"\nRaw JSON response saved to: {output_path}") + print(f"Raw JSON response saved to: {output_path}") print(f"File size: 
{len(pretty_json):,} characters") - - # Show a preview of the JSON structure - print("\nJSON Structure Preview:") - print("=" * 50) - preview = pretty_json[:2000] + "..." if len(pretty_json) > 2000 else pretty_json - print(preview) - print("=" * 50) # [END parse_raw_json] + # [START extract_from_raw_json] + # Extract key information from raw JSON + # This demonstrates accessing the same data that would be available via the object model + if "result" in response_json: + result_data = response_json["result"] + + if "analyzerId" in result_data: + print(f"\nAnalyzer ID: {result_data['analyzerId']}") + + if "contents" in result_data and isinstance(result_data["contents"], list): + print(f"Contents count: {len(result_data['contents'])}") + + if len(result_data["contents"]) > 0: + first_content = result_data["contents"][0] + + if "kind" in first_content: + print(f"Content kind: {first_content['kind']}") + if "mimeType" in first_content: + print(f"MIME type: {first_content['mimeType']}") + + # Extract markdown content from raw JSON + # Object model equivalent: content.markdown + print("\nMarkdown Content (from raw JSON):") + print("=" * 50) + if "markdown" in first_content and first_content["markdown"]: + print(first_content["markdown"]) + else: + print("No markdown content available.") + print("=" * 50) + + # Extract document properties from raw JSON + # Object model equivalent: document_content.start_page_number, etc. 
+ if first_content.get("kind") == "document": + print("\nDocument Information (from raw JSON):") + if "startPageNumber" in first_content: + print(f" Start page: {first_content['startPageNumber']}") + if "endPageNumber" in first_content: + print(f" End page: {first_content['endPageNumber']}") + + start_page = first_content.get("startPageNumber") + end_page = first_content.get("endPageNumber") + if start_page and end_page: + total_pages = end_page - start_page + 1 + print(f" Total pages: {total_pages}") + + # Extract pages information + # Object model equivalent: document_content.pages + if "pages" in first_content and first_content["pages"]: + pages = first_content["pages"] + unit = first_content.get("unit", "units") + print(f"\nPages ({len(pages)}):") + for page in pages: + page_num = page.get("pageNumber") + width = page.get("width") + height = page.get("height") + print(f" Page {page_num}: {width} x {height} {unit}") + # [END extract_from_raw_json] + if not isinstance(credential, AzureKeyCredential): await credential.close() diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/samples/sample_analyze_return_raw_json.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/samples/sample_analyze_return_raw_json.py index 19f9411a9f7a..d8af896d59af 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/samples/sample_analyze_return_raw_json.py +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/samples/sample_analyze_return_raw_json.py @@ -9,14 +9,18 @@ DESCRIPTION: This sample demonstrates how to access the raw JSON response from analysis operations - using protocol methods. This is useful for advanced scenarios where you need direct access - to the JSON structure. + using the 'cls' callback parameter. This is useful for advanced scenarios where you need + direct access to the JSON structure. The Content Understanding SDK provides two approaches for accessing analysis results: + 1. 
Object model approach (recommended): Returns strongly-typed AnalyzeResult objects - 2. Protocol method approach: Returns raw BinaryData containing the JSON response + that are easier to navigate and use. This is shown in sample_analyze_binary.py. + + 2. Protocol method approach: Returns raw HTTP response containing the JSON. This sample + demonstrates this approach for advanced scenarios. - For production use, prefer the object model approach as it provides: + IMPORTANT: For production use, prefer the object model approach as it provides: - Type safety - IntelliSense support - Easier navigation of results @@ -66,21 +70,31 @@ def main() -> None: print(f"Analyzing {file_path} with prebuilt-documentSearch...") - # Use the standard method which returns an AnalyzeResult - # Then serialize to JSON for raw access + # Use the 'cls' callback parameter to get the raw HTTP response + # The 'cls' parameter allows us to intercept the response and return custom data + # We return a tuple: (deserialized_object, raw_http_response) + # Note: For production use, prefer the object model approach (without cls parameter) + # which returns AnalyzeResult objects that are easier to work with poller = client.begin_analyze_binary( analyzer_id="prebuilt-documentSearch", binary_input=file_bytes, + content_type="application/pdf", + cls=lambda pipeline_response, deserialized_obj, response_headers: ( + deserialized_obj, + pipeline_response.http_response, + ), ) - result = poller.result() - # Convert to dictionary and then to JSON - result_dict = result.as_dict() + # Wait for completion and get both the deserialized object and raw HTTP response + _, raw_http_response = poller.result() # [END analyze_return_raw_json] # [START parse_raw_json] + # Parse the raw JSON response + response_json = raw_http_response.json() + # Pretty-print the JSON - pretty_json = json.dumps(result_dict, indent=2, ensure_ascii=False, default=str) + pretty_json = json.dumps(response_json, indent=2, ensure_ascii=False) # 
Create output directory if it doesn't exist output_dir = Path(__file__).parent / "sample_output" @@ -94,17 +108,68 @@ def main() -> None: with open(output_path, "w", encoding="utf-8") as f: f.write(pretty_json) - print(f"\nRaw JSON response saved to: {output_path}") + print(f"Raw JSON response saved to: {output_path}") print(f"File size: {len(pretty_json):,} characters") - - # Show a preview of the JSON structure - print("\nJSON Structure Preview:") - print("=" * 50) - preview = pretty_json[:2000] + "..." if len(pretty_json) > 2000 else pretty_json - print(preview) - print("=" * 50) # [END parse_raw_json] + # [START extract_from_raw_json] + # Extract key information from raw JSON + # This demonstrates accessing the same data that would be available via the object model + if "result" in response_json: + result_data = response_json["result"] + + if "analyzerId" in result_data: + print(f"\nAnalyzer ID: {result_data['analyzerId']}") + + if "contents" in result_data and isinstance(result_data["contents"], list): + print(f"Contents count: {len(result_data['contents'])}") + + if len(result_data["contents"]) > 0: + first_content = result_data["contents"][0] + + if "kind" in first_content: + print(f"Content kind: {first_content['kind']}") + if "mimeType" in first_content: + print(f"MIME type: {first_content['mimeType']}") + + # Extract markdown content from raw JSON + # Object model equivalent: content.markdown + print("\nMarkdown Content (from raw JSON):") + print("=" * 50) + if "markdown" in first_content and first_content["markdown"]: + print(first_content["markdown"]) + else: + print("No markdown content available.") + print("=" * 50) + + # Extract document properties from raw JSON + # Object model equivalent: document_content.start_page_number, etc. 
+ if first_content.get("kind") == "document": + print("\nDocument Information (from raw JSON):") + if "startPageNumber" in first_content: + print(f" Start page: {first_content['startPageNumber']}") + if "endPageNumber" in first_content: + print(f" End page: {first_content['endPageNumber']}") + + start_page = first_content.get("startPageNumber") + end_page = first_content.get("endPageNumber") + if start_page and end_page: + total_pages = end_page - start_page + 1 + print(f" Total pages: {total_pages}") + + # Extract pages information + # Object model equivalent: document_content.pages + if "pages" in first_content and first_content["pages"]: + pages = first_content["pages"] + unit = first_content.get("unit", "units") + print(f"\nPages ({len(pages)}):") + for page in pages: + page_num = page.get("pageNumber") + width = page.get("width") + height = page.get("height") + print(f" Page {page_num}: {width} x {height} {unit}") + # [END extract_from_raw_json] + if __name__ == "__main__": main() diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_analyze_return_raw_json.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_analyze_return_raw_json.py index 0fd6cfe69a73..da6ae5d40ae7 100644 --- a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_analyze_return_raw_json.py +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_analyze_return_raw_json.py @@ -18,7 +18,6 @@ import os import json -import pytest from devtools_testutils import recorded_by_proxy from testpreparer import ContentUnderstandingPreparer, ContentUnderstandingClientTestBase @@ -32,11 +31,11 @@ def test_sample_analyze_return_raw_json(self, azure_content_understanding_endpoi """Test analyzing a document and getting raw JSON response. This test validates: - 1. Document analysis using protocol method + 1. Document analysis using 'cls' callback to get raw HTTP response 2. 
Raw JSON response format 3. JSON structure validation - 11_AnalyzeReturnRawJson.AnalyzeReturnRawJsonAsync() + 11_AnalyzeReturnRawJson.AnalyzeReturnRawJson() """ client = self.create_client(endpoint=azure_content_understanding_endpoint) @@ -55,80 +54,64 @@ def test_sample_analyze_return_raw_json(self, azure_content_understanding_endpoi assert len(file_bytes) > 0, "File should not be empty" print(f"[PASS] File loaded: {len(file_bytes)} bytes") - # Analyze the document and get raw response - # Note: The Python SDK returns structured objects by default - # We can access the raw response through the result + # Use 'cls' callback to get raw HTTP response + # The 'cls' callback receives the final pipeline response together with the already-deserialized object model + # We return a tuple: (deserialized_object, raw_http_response) poller = client.begin_analyze_binary( analyzer_id="prebuilt-documentSearch", binary_input=file_bytes, - content_type="application/pdf" + content_type="application/pdf", + cls=lambda pipeline_response, deserialized_obj, response_headers: ( + deserialized_obj, + pipeline_response.http_response, + ), ) - result = poller.result() + # Wait for completion and get both model and raw HTTP response + _, raw_http_response = poller.result() # Assertion: Verify analysis operation completed assert poller is not None, "Analysis operation should not be null" assert poller.done(), "Operation should be completed" - - # Verify raw response status - if hasattr(poller, '_polling_method'): - polling_method = getattr(poller, '_polling_method', None) - if polling_method and hasattr(polling_method, '_initial_response'): - raw_response = getattr(polling_method, '_initial_response', None) # type: ignore - if raw_response: - if hasattr(raw_response, 'http_response'): - status = raw_response.http_response.status_code - elif hasattr(raw_response, 'status_code'): - status = raw_response.status_code - else: - status = None - - if status: - assert status >= 200 and status < 300, \ - 
f"Response status should be successful (200-299), but was {status}" - print(f"[PASS] Raw response status verified: {status}") - assert poller.status() == "Succeeded", f"Operation status should be Succeeded, but was {poller.status()}" print("[PASS] Analysis operation completed successfully") - # Assertion: Verify result - assert result is not None, "Analysis result should not be null" - print("[PASS] Response data is not null") - - # Convert result to JSON string to verify raw format capability - # In Python SDK, we can serialize the result to JSON - try: - # Try to access the raw response data - if hasattr(result, '__dict__'): - result_dict = result.__dict__ - json_str = json.dumps(result_dict, default=str) - assert json_str is not None, "Response string should not be null" - assert len(json_str) > 0, "Response string should not be empty" - print(f"[PASS] Response converted to JSON string: {len(json_str)} characters") - - # Verify it's valid JSON - parsed_json = json.loads(json_str) - assert parsed_json is not None, "Response should be valid JSON" - print("[PASS] Response is valid JSON format") - else: - print("[INFO] Result does not have __dict__ attribute, using alternative method") - - # Alternative: Check if result has contents (which confirms it's a valid response) - assert hasattr(result, "contents"), "Result should have contents attribute" - assert result.contents is not None, "Result contents should not be null" - print("[PASS] Response data structure verified") - - except json.JSONDecodeError as e: - pytest.fail(f"Response should be valid JSON format: {str(e)}") - except Exception as e: - print(f"[WARN] Could not serialize to JSON: {str(e)}") - # Still verify basic structure - assert result is not None, "Result should not be null" - print("[PASS] Response data verified (structured format)") - - # Verify the response contains expected data - assert hasattr(result, "contents"), "Result should have contents" - if result.contents and len(result.contents) > 0: 
- print(f"[PASS] Response contains {len(result.contents)} content(s)") + # Assertion: Verify raw HTTP response + assert raw_http_response is not None, "Raw HTTP response should not be null" + print("[PASS] Raw HTTP response is not null") + + # Parse the raw JSON response + response_json = raw_http_response.json() + + # Assertion: Verify JSON is not empty + assert response_json is not None, "Response JSON should not be null" + print("[PASS] Response JSON parsed successfully") + + # Verify it's valid JSON by serializing + json_str = json.dumps(response_json, indent=2, ensure_ascii=False) + assert json_str is not None, "Response string should not be null" + assert len(json_str) > 0, "Response string should not be empty" + print(f"[PASS] Response converted to JSON string: {len(json_str)} characters") + + # Verify the response contains expected structure (matching C# sample validation) + assert "result" in response_json, "Response should contain 'result' key" + result_data = response_json["result"] + print("[PASS] Response contains 'result' key") + + # Verify analyzerId + if "analyzerId" in result_data: + print(f"[PASS] Analyzer ID: {result_data['analyzerId']}") + + # Verify contents + if "contents" in result_data and isinstance(result_data["contents"], list): + contents_count = len(result_data["contents"]) + print(f"[PASS] Contents count: {contents_count}") + + if contents_count > 0: + first_content = result_data["contents"][0] + if "kind" in first_content: + print(f"[PASS] Content kind: {first_content['kind']}") + if "mimeType" in first_content: + print(f"[PASS] MIME type: {first_content['mimeType']}") print("\n[SUCCESS] All test_sample_analyze_return_raw_json assertions passed") diff --git a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_analyze_return_raw_json_async.py b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_analyze_return_raw_json_async.py index cb6eb376ddf7..f9e00b62c0ac 100644 --- 
a/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_analyze_return_raw_json_async.py +++ b/sdk/contentunderstanding/azure-ai-contentunderstanding/tests/samples/test_sample_analyze_return_raw_json_async.py @@ -18,7 +18,6 @@ import os import json -import pytest from devtools_testutils.aio import recorded_by_proxy_async from testpreparer_async import ContentUnderstandingPreparer, ContentUnderstandingClientTestBaseAsync @@ -32,7 +31,7 @@ async def test_sample_analyze_return_raw_json_async(self, azure_content_understa """Test analyzing a document and getting raw JSON response (async version). This test validates: - 1. Document analysis using protocol method + 1. Document analysis using 'cls' callback to get raw HTTP response 2. Raw JSON response format 3. JSON structure validation @@ -55,81 +54,65 @@ async def test_sample_analyze_return_raw_json_async(self, azure_content_understa assert len(file_bytes) > 0, "File should not be empty" print(f"[PASS] File loaded: {len(file_bytes)} bytes") - # Analyze the document and get raw response - # Note: The Python SDK returns structured objects by default - # We can access the raw response through the result + # Use 'cls' callback to get raw HTTP response + # The 'cls' callback receives the final pipeline response together with the already-deserialized object model + # We return a tuple: (deserialized_object, raw_http_response) poller = await client.begin_analyze_binary( analyzer_id="prebuilt-documentSearch", binary_input=file_bytes, - content_type="application/pdf" + content_type="application/pdf", + cls=lambda pipeline_response, deserialized_obj, response_headers: ( + deserialized_obj, + pipeline_response.http_response, + ), ) - result = await poller.result() + # Wait for completion and get both model and raw HTTP response + _, raw_http_response = await poller.result() # Assertion: Verify analysis operation completed assert poller is not None, "Analysis operation should not be null" assert poller.done(), 
"Operation should be completed" - - # Verify raw response status - if hasattr(poller, '_polling_method'): - polling_method = getattr(poller, '_polling_method', None) - if polling_method and hasattr(polling_method, '_initial_response'): - raw_response = getattr(polling_method, '_initial_response', None) # type: ignore - if raw_response: - if hasattr(raw_response, 'http_response'): - status = raw_response.http_response.status_code - elif hasattr(raw_response, 'status_code'): - status = raw_response.status_code - else: - status = None - - if status: - assert status >= 200 and status < 300, \ - f"Response status should be successful (200-299), but was {status}" - print(f"[PASS] Raw response status verified: {status}") - assert poller.status() == "Succeeded", f"Operation status should be Succeeded, but was {poller.status()}" print("[PASS] Analysis operation completed successfully") - # Assertion: Verify result - assert result is not None, "Analysis result should not be null" - print("[PASS] Response data is not null") - - # Convert result to JSON string to verify raw format capability - # In Python SDK, we can serialize the result to JSON - try: - # Try to access the raw response data - if hasattr(result, '__dict__'): - result_dict = result.__dict__ - json_str = json.dumps(result_dict, default=str) - assert json_str is not None, "Response string should not be null" - assert len(json_str) > 0, "Response string should not be empty" - print(f"[PASS] Response converted to JSON string: {len(json_str)} characters") - - # Verify it's valid JSON - parsed_json = json.loads(json_str) - assert parsed_json is not None, "Response should be valid JSON" - print("[PASS] Response is valid JSON format") - else: - print("[INFO] Result does not have __dict__ attribute, using alternative method") - - # Alternative: Check if result has contents (which confirms it's a valid response) - assert hasattr(result, "contents"), "Result should have contents attribute" - assert result.contents is not 
None, "Result contents should not be null" - print("[PASS] Response data structure verified") - - except json.JSONDecodeError as e: - pytest.fail(f"Response should be valid JSON format: {str(e)}") - except Exception as e: - print(f"[WARN] Could not serialize to JSON: {str(e)}") - # Still verify basic structure - assert result is not None, "Result should not be null" - print("[PASS] Response data verified (structured format)") - - # Verify the response contains expected data - assert hasattr(result, "contents"), "Result should have contents" - if result.contents and len(result.contents) > 0: - print(f"[PASS] Response contains {len(result.contents)} content(s)") + # Assertion: Verify raw HTTP response + assert raw_http_response is not None, "Raw HTTP response should not be null" + print("[PASS] Raw HTTP response is not null") + + # Parse the raw JSON response + response_json = raw_http_response.json() + + # Assertion: Verify JSON is not empty + assert response_json is not None, "Response JSON should not be null" + print("[PASS] Response JSON parsed successfully") + + # Verify it's valid JSON by serializing + json_str = json.dumps(response_json, indent=2, ensure_ascii=False) + assert json_str is not None, "Response string should not be null" + assert len(json_str) > 0, "Response string should not be empty" + print(f"[PASS] Response converted to JSON string: {len(json_str)} characters") + + # Verify the response contains expected structure (matching C# sample validation) + assert "result" in response_json, "Response should contain 'result' key" + result_data = response_json["result"] + print("[PASS] Response contains 'result' key") + + # Verify analyzerId + if "analyzerId" in result_data: + print(f"[PASS] Analyzer ID: {result_data['analyzerId']}") + + # Verify contents + if "contents" in result_data and isinstance(result_data["contents"], list): + contents_count = len(result_data["contents"]) + print(f"[PASS] Contents count: {contents_count}") + + if contents_count > 
0: + first_content = result_data["contents"][0] + if "kind" in first_content: + print(f"[PASS] Content kind: {first_content['kind']}") + if "mimeType" in first_content: + print(f"[PASS] MIME type: {first_content['mimeType']}") await client.close() print("\n[SUCCESS] All test_sample_analyze_return_raw_json_async assertions passed")