From baa1f2c7cd355dda0caafd58733e2ea28c22c997 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 7 Jan 2026 12:44:05 +0100 Subject: [PATCH 1/5] Add AzureDocumentIntelligenceConverter using the azure-ai-documentintelligence package --- haystack/components/converters/__init__.py | 3 +- haystack/components/converters/azure.py | 374 ++++++++++++++++++ pyproject.toml | 7 +- ...telligence-converter-55bbd3bb1fe0a714.yaml | 14 + .../test_azure_document_intelligence.py | 152 +++++++ 5 files changed, 546 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml create mode 100644 test/components/converters/test_azure_document_intelligence.py diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py index 97d1978782..bb37cb7f93 100644 --- a/haystack/components/converters/__init__.py +++ b/haystack/components/converters/__init__.py @@ -8,7 +8,7 @@ from lazy_imports import LazyImporter _import_structure = { - "azure": ["AzureOCRDocumentConverter"], + "azure": ["AzureOCRDocumentConverter", "AzureDocumentIntelligenceConverter"], "csv": ["CSVToDocument"], "docx": ["DOCXToDocument"], "html": ["HTMLToDocument"], @@ -27,6 +27,7 @@ } if TYPE_CHECKING: + from .azure import AzureDocumentIntelligenceConverter as AzureDocumentIntelligenceConverter from .azure import AzureOCRDocumentConverter as AzureOCRDocumentConverter from .csv import CSVToDocument as CSVToDocument from .docx import DOCXToDocument as DOCXToDocument diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index 887cb60f32..7bf48fc16f 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -25,6 +25,9 @@ with LazyImport(message="Run 'pip install pandas'") as pandas_import: from pandas import DataFrame +with LazyImport(message="Run 'pip install \"azure-ai-documentintelligence>=1.0.0\"'") as azure_di_import: + from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, AnalyzeResult, DocumentContentFormat + @component class AzureOCRDocumentConverter: @@ -98,6 +101,17 @@ def __init__( # pylint: disable=too-many-positional-arguments If True, the full path of the file is stored in the metadata of the document. If False, only the file name is stored. """ + import warnings + + warnings.warn( + "AzureOCRDocumentConverter is deprecated and will be removed in Haystack 2.25. " + "Please migrate to AzureDocumentIntelligenceConverter which uses the latest " + "azure-ai-documentintelligence package and supports markdown output for better " + "RAG/LLM integration.", + DeprecationWarning, + stacklevel=2, + ) + azure_import.check() pandas_import.check() @@ -486,3 +500,363 @@ def _check_if_in_table( in_table = True break return in_table + + +@component +class AzureDocumentIntelligenceConverter: + """ + Converts files to Documents using Azure's Document Intelligence service (2024 API). + + This component uses the latest azure-ai-documentintelligence package and supports + markdown output for better integration with LLM/RAG applications. + + Supported file formats: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, HTML. 
+ + Key features: + - Markdown output with preserved structure (headings, tables, lists) + - Inline table integration (no separate table documents) + - Improved layout analysis and reading order + - Better table extraction + - Support for section headings + + To use this component, you need an active Azure account + and a Document Intelligence or Cognitive Services resource. For help with setting up your resource, see + [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api). + + ### Usage example + + ```python + import os + from haystack.components.converters import AzureDocumentIntelligenceConverter + from haystack.utils import Secret + + # Basic usage with markdown output (recommended for RAG) + converter = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + output_format="markdown" + ) + + results = converter.run(sources=["invoice.pdf", "contract.docx"]) + documents = results["documents"] + + # Documents contain markdown with inline tables + print(documents[0].content) + # Output: + # # Invoice + # + # | Item | Quantity | Price | + # |------|----------|-------| + # | Widget | 10 | $50.00 | + # + # Total: $500.00 + + # For backward compatibility, use text mode with CSV tables + converter_compat = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + output_format="text", + table_format="csv" + ) + ``` + + Migration from AzureOCRDocumentConverter: + - Replace `page_layout="natural"` with `output_format="markdown"` + - Remove `preceding_context_len`, `following_context_len` (not needed) + - Tables are now inline in markdown mode + - Install: `pip install azure-ai-documentintelligence` + """ + + def __init__( + self, + endpoint: str, + api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"), + model_id: str = "prebuilt-read", + output_format: Literal["text", "markdown"] = "markdown", + table_format: Literal["csv", "markdown"] = "markdown", + store_full_path: bool = False, + ): + """ + Creates an AzureDocumentIntelligenceConverter component. + + :param endpoint: + The endpoint URL of your Azure Document Intelligence resource. + Example: "https://YOUR_RESOURCE.cognitiveservices.azure.com/" + :param api_key: + API key for Azure authentication. Can use Secret.from_env_var() + to load from AZURE_AI_API_KEY environment variable. + :param model_id: + Azure model to use for analysis. Options: + - "prebuilt-read": Fast OCR for text extraction (default) + - "prebuilt-layout": Enhanced layout analysis with better table/structure detection + - "prebuilt-document": General document analysis + - Custom model IDs from your Azure resource + :param output_format: + Output format for document content. + - "markdown": Returns GitHub Flavored Markdown with inline tables (recommended for RAG) + - "text": Returns plain text with optional separate table documents + :param table_format: + How to format tables when output_format="text". + - "markdown": Format tables as markdown (inline) + - "csv": Format tables as CSV in separate documents (like old AzureOCRDocumentConverter) + Ignored when output_format="markdown" (tables are already in markdown). + :param store_full_path: + If True, stores complete file path in metadata. + If False, stores only the filename (default). 
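+
+        A minimal sketch of constructing the component (the endpoint is a placeholder;
+        `prebuilt-layout` is chosen here for richer structure detection, and the API key
+        falls back to the `AZURE_AI_API_KEY` environment variable by default):
+
+        ```python
+        converter = AzureDocumentIntelligenceConverter(
+            endpoint="https://YOUR_RESOURCE.cognitiveservices.azure.com/",
+            model_id="prebuilt-layout",
+            output_format="markdown",
+        )
+        ```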
+ """ + azure_di_import.check() + from azure.ai.documentintelligence import DocumentIntelligenceClient + from azure.core.credentials import AzureKeyCredential + + if output_format == "text" and table_format == "csv": + pandas_import.check() + + self.client = DocumentIntelligenceClient( + endpoint=endpoint, credential=AzureKeyCredential(api_key.resolve_value() or "") + ) + self.endpoint = endpoint + self.api_key = api_key + self.model_id = model_id + self.output_format = output_format + self.table_format = table_format + self.store_full_path = store_full_path + + @component.output_types(documents=list[Document], raw_azure_response=list[dict]) + def run(self, sources: list[str | Path | ByteStream], meta: dict[str, Any] | list[dict[str, Any]] | None = None): + """ + Convert a list of files to Documents using Azure's Document Intelligence service. + + :param sources: + List of file paths or ByteStream objects. + :param meta: + Optional metadata to attach to the Documents. + This value can be either a list of dictionaries or a single dictionary. + If it's a single dictionary, its content is added to the metadata of all produced Documents. + If it's a list, the length of the list must match the number of sources, because the two lists will be + zipped. If `sources` contains ByteStream objects, their `meta` will be added to the output Documents. + + :returns: + A dictionary with the following keys: + - `documents`: List of created Documents + - `raw_azure_response`: List of raw Azure responses used to create the Documents + """ + documents = [] + azure_responses = [] + meta_list: list[dict[str, Any]] = normalize_metadata(meta=meta, sources_count=len(sources)) + + for source, metadata in zip(sources, meta_list): + try: + bytestream = get_bytestream_from_source(source=source) + except Exception as e: + logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e) + continue + + try: + # Determine output format + content_format = DocumentContentFormat.MARKDOWN if self.output_format == "markdown" else None + + # Create analyze request + analyze_request = AnalyzeDocumentRequest(bytes_source=bytestream.data) + + # Call Azure API + poller = self.client.begin_analyze_document( + model_id=self.model_id, body=analyze_request, output_content_format=content_format + ) + result = poller.result() + azure_responses.append(result.as_dict()) + + # Merge metadata + merged_metadata = {**bytestream.meta, **metadata} + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) + + # Process based on output format + if self.output_format == "markdown": + doc = self._process_markdown_result(result, merged_metadata) + documents.append(doc) + else: + docs = self._process_text_result(result, merged_metadata) + documents.extend(docs) + + except Exception as e: + logger.warning( + "Failed to convert {source} using Azure Document Intelligence. Skipping it. Error: {error}", + source=source, + error=e, + ) + continue + + return {"documents": documents, "raw_azure_response": azure_responses} + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. 
+ """ + return default_to_dict( + self, + api_key=self.api_key.to_dict(), + endpoint=self.endpoint, + model_id=self.model_id, + output_format=self.output_format, + table_format=self.table_format, + store_full_path=self.store_full_path, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "AzureDocumentIntelligenceConverter": + """ + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) + return default_from_dict(cls, data) + + def _process_markdown_result(self, result: "AnalyzeResult", meta: dict[str, Any]) -> Document: + """ + Process result when output_format='markdown'. + + :param result: The AnalyzeResult from Azure Document Intelligence. + :param meta: Metadata dictionary to attach to the document. + :returns: A single Document with markdown content. + """ + # Azure returns complete markdown in result.content + markdown_content = result.content or "" + + # Build metadata + doc_meta = { + **meta, + "content_format": "markdown", + "model_id": self.model_id, + "page_count": len(result.pages) if result.pages else 0, + } + + return Document(content=markdown_content, meta=doc_meta) + + def _process_text_result(self, result: "AnalyzeResult", meta: dict[str, Any]) -> list[Document]: + """ + Process result when output_format='text'. + + :param result: The AnalyzeResult from Azure Document Intelligence. + :param meta: Metadata dictionary to attach to the documents. + :returns: List of Documents (text + optional table documents). + """ + documents = [] + + # Extract tables if table_format='csv' + if self.table_format == "csv" and result.tables: + table_docs = self._extract_csv_tables(result, meta) + documents.extend(table_docs) + + # Extract main text content + text_doc = self._extract_text_content(result, meta) + documents.append(text_doc) + + return documents + + def _extract_text_content(self, result: "AnalyzeResult", meta: dict[str, Any]) -> Document: + """ + Extract text from paragraphs. + + :param result: The AnalyzeResult from Azure Document Intelligence. + :param meta: Metadata dictionary to attach to the document. + :returns: A single Document with all text. + """ + # Group paragraphs by page + pages_text = [] + if result.paragraphs: + page_to_paragraphs = defaultdict(list) + for para in result.paragraphs: + page_num = para.bounding_regions[0].page_number if para.bounding_regions else 1 + # Skip paragraphs that are part of tables if extracting tables separately + if self.table_format == "csv" and self._is_paragraph_in_table(para, result.tables): + continue + page_to_paragraphs[page_num].append(para.content) + + # Join paragraphs with page breaks + max_page = max(page_to_paragraphs.keys()) if page_to_paragraphs else 0 + for page_num in range(1, max_page + 1): + page_text = "\n".join(page_to_paragraphs.get(page_num, [])) + pages_text.append(page_text) + + all_text = "\f".join(pages_text) + return Document(content=all_text, meta={**meta, "content_format": "text"}) + + def _extract_csv_tables(self, result: "AnalyzeResult", meta: dict[str, Any]) -> list[Document]: + """ + Extract tables as CSV (backward compatibility mode). + + :param result: The AnalyzeResult from Azure Document Intelligence. + :param meta: Metadata dictionary to attach to the documents. + :returns: List of Documents containing table CSV content. 
+ """ + table_documents = [] + + if not result.tables: + return table_documents + + for table in result.tables: + # Build table as 2D array + table_data = [[""] * table.column_count for _ in range(table.row_count)] + + for cell in table.cells: + # Remove selection markers + content = cell.content.replace(":selected:", "").replace(":unselected:", "") + + # Handle cell spanning + column_span = cell.column_span if cell.column_span else 1 + row_span = cell.row_span if cell.row_span else 1 + + for r in range(row_span): + for c in range(column_span): + row_idx = cell.row_index + r + col_idx = cell.column_index + c + if row_idx < table.row_count and col_idx < table.column_count: + table_data[row_idx][col_idx] = content + + # Convert to CSV + df = DataFrame(data=table_data) + csv_content = df.to_csv(header=False, index=False, lineterminator="\n") + + # Build metadata + table_meta = {**meta, "table_format": "csv", "content_format": "table"} + + if table.bounding_regions: + table_meta["page"] = table.bounding_regions[0].page_number + + table_documents.append(Document(content=csv_content, meta=table_meta)) + + return table_documents + + def _is_paragraph_in_table(self, paragraph: Any, tables: list | None) -> bool: + """ + Check if a paragraph is part of a table. + + :param paragraph: Paragraph object to check. + :param tables: List of table objects. + :returns: True if paragraph is in a table, False otherwise. + """ + if not tables or not paragraph.spans: + return False + + para_offset = paragraph.spans[0].offset + para_length = paragraph.spans[0].length + para_end = para_offset + para_length + + for table in tables: + if not table.spans: + continue + table_offset = table.spans[0].offset + table_end = table_offset + table.spans[0].length + + # Check if paragraph overlaps with table + if table_offset <= para_offset <= table_end or table_offset <= para_end <= table_end: + return True + + return False diff --git a/pyproject.toml b/pyproject.toml index 49b2168766..b09df70013 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,9 +106,10 @@ dependencies = [ "pdfminer.six", # PDFMinerToDocument "markdown-it-py", # MarkdownToDocument "mdit_plain", # MarkdownToDocument - "tika", # TikaDocumentConverter - "azure-ai-formrecognizer>=3.2.0b2", # AzureOCRDocumentConverter - "trafilatura", # HTMLToDocument + "tika", # TikaDocumentConverter + "azure-ai-formrecognizer>=3.2.0b2", # AzureOCRDocumentConverter (deprecated) + "azure-ai-documentintelligence>=1.0.0", # AzureDocumentIntelligenceConverter + "trafilatura", # HTMLToDocument "python-pptx", # PPTXToDocument "python-docx", # DocxToDocument "jq", # JSONConverter diff --git a/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml b/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml new file mode 100644 index 0000000000..457889b7ab --- /dev/null +++ b/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml @@ -0,0 +1,14 @@ +--- +features: + - | + Added `AzureDocumentIntelligenceConverter` component that uses the new + azure-ai-documentintelligence package (v1.0.0). Supports markdown output + for better LLM/RAG integration. Key features: + - Markdown output with inline tables using Azure's native support + - Improved layout analysis and table extraction + - Simplified API (removes layout complexity) + - Backward compatible text mode with CSV tables +deprecations: + - | + `AzureOCRDocumentConverter` is deprecated and will be removed in + Haystack 2.25. 
Migrate to `AzureDocumentIntelligenceConverter`. diff --git a/test/components/converters/test_azure_document_intelligence.py b/test/components/converters/test_azure_document_intelligence.py new file mode 100644 index 0000000000..ed30f9cfab --- /dev/null +++ b/test/components/converters/test_azure_document_intelligence.py @@ -0,0 +1,152 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import pytest + +from haystack.components.converters.azure import AzureDocumentIntelligenceConverter +from haystack.utils import Secret + + +class TestAzureDocumentIntelligenceConverter: + def test_init(self): + """Test basic initialization with defaults""" + converter = AzureDocumentIntelligenceConverter( + endpoint="https://test.cognitiveservices.azure.com/", api_key=Secret.from_token("test_api_key") + ) + + assert converter.endpoint == "https://test.cognitiveservices.azure.com/" + assert converter.model_id == "prebuilt-read" + assert converter.output_format == "markdown" + assert converter.table_format == "markdown" + assert converter.store_full_path is False + + def test_to_dict(self): + """Test serialization with Secret handling""" + converter = AzureDocumentIntelligenceConverter( + endpoint="https://test.cognitiveservices.azure.com/", + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + model_id="prebuilt-layout", + output_format="text", + table_format="csv", + store_full_path=True, + ) + + data = converter.to_dict() + + assert data == { + "type": "haystack.components.converters.azure.AzureDocumentIntelligenceConverter", + "init_parameters": { + "api_key": {"type": "env_var", "env_vars": ["AZURE_AI_API_KEY"], "strict": True}, + "endpoint": "https://test.cognitiveservices.azure.com/", + "model_id": "prebuilt-layout", + "output_format": "text", + "table_format": "csv", + "store_full_path": True, + }, + } + + def test_from_dict(self): + """Test deserialization""" + data = { + "type": "haystack.components.converters.azure.AzureDocumentIntelligenceConverter", + "init_parameters": { + "api_key": {"type": "env_var", "env_vars": ["AZURE_AI_API_KEY"], "strict": True}, + "endpoint": "https://test.cognitiveservices.azure.com/", + "model_id": "prebuilt-layout", + "output_format": "markdown", + "table_format": "markdown", + "store_full_path": False, + }, + } + + converter = AzureDocumentIntelligenceConverter.from_dict(data) + + assert converter.endpoint == "https://test.cognitiveservices.azure.com/" + assert converter.model_id == "prebuilt-layout" + assert converter.output_format == "markdown" + + @pytest.mark.integration + @pytest.mark.skipif(not os.environ.get("AZURE_DI_ENDPOINT", None), reason="Azure endpoint not available") + @pytest.mark.skipif(not os.environ.get("AZURE_AI_API_KEY", None), reason="Azure credentials not available") + @pytest.mark.flaky(reruns=5, reruns_delay=5) + def test_run_with_markdown_output(self, test_files_path): + """Integration test with real Azure API - markdown mode""" + converter = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + output_format="markdown", + ) + + results = converter.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"]) + + assert "documents" in results + assert len(results["documents"]) == 1 + assert len(results["documents"][0].content) > 0 + assert results["documents"][0].meta["content_format"] == "markdown" + assert "A sample PDF file" in results["documents"][0].content + + @pytest.mark.integration + 
@pytest.mark.skipif(not os.environ.get("AZURE_DI_ENDPOINT", None), reason="Azure endpoint not available") + @pytest.mark.skipif(not os.environ.get("AZURE_AI_API_KEY", None), reason="Azure credentials not available") + @pytest.mark.flaky(reruns=5, reruns_delay=5) + def test_run_with_text_output_csv_tables(self, test_files_path): + """Integration test with real Azure API - text mode with CSV tables""" + converter = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + output_format="text", + table_format="csv", + ) + + results = converter.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"]) + + assert "documents" in results + assert len(results["documents"]) >= 1 + + # Check that we have text document + text_docs = [d for d in results["documents"] if d.meta.get("content_format") == "text"] + assert len(text_docs) == 1 + assert "A sample PDF file" in text_docs[0].content + + @pytest.mark.integration + @pytest.mark.skipif(not os.environ.get("AZURE_DI_ENDPOINT", None), reason="Azure endpoint not available") + @pytest.mark.skipif(not os.environ.get("AZURE_AI_API_KEY", None), reason="Azure credentials not available") + @pytest.mark.flaky(reruns=5, reruns_delay=5) + def test_run_with_metadata(self, test_files_path): + """Integration test - verify metadata handling""" + converter = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + store_full_path=False, + ) + + results = converter.run( + sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"custom_key": "custom_value"} + ) + + doc = results["documents"][0] + assert doc.meta["custom_key"] == "custom_value" + # Should be basename only + assert doc.meta["file_path"] == "sample_pdf_1.pdf" + + @pytest.mark.integration + @pytest.mark.skipif(not os.environ.get("AZURE_DI_ENDPOINT", None), reason="Azure endpoint not available") + @pytest.mark.skipif(not os.environ.get("AZURE_AI_API_KEY", None), reason="Azure credentials not available") + @pytest.mark.flaky(reruns=5, reruns_delay=5) + def test_run_with_multiple_files(self, test_files_path): + """Integration test - process multiple files""" + converter = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], api_key=Secret.from_env_var("AZURE_AI_API_KEY") + ) + + results = converter.run( + sources=[test_files_path / "pdf" / "sample_pdf_1.pdf", test_files_path / "pdf" / "sample_pdf_2.pdf"] + ) + + assert "documents" in results + assert len(results["documents"]) == 2 + assert "A sample PDF file" in results["documents"][0].content + assert "wiki" in results["documents"][1].content.lower() From 31d4ae9ea76929494d54c40215eb781d3e0dd3f3 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Jan 2026 12:24:58 +0100 Subject: [PATCH 2/5] Use double backticks in repo notes --- ...re-document-intelligence-converter-55bbd3bb1fe0a714.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml b/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml index 457889b7ab..7938bd962a 100644 --- a/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml +++ b/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml @@ -1,7 +1,7 @@ --- features: - | - Added `AzureDocumentIntelligenceConverter` component that uses the new + Added 
``AzureDocumentIntelligenceConverter`` component that uses the new azure-ai-documentintelligence package (v1.0.0). Supports markdown output for better LLM/RAG integration. Key features: - Markdown output with inline tables using Azure's native support @@ -10,5 +10,5 @@ features: - Backward compatible text mode with CSV tables deprecations: - | - `AzureOCRDocumentConverter` is deprecated and will be removed in - Haystack 2.25. Migrate to `AzureDocumentIntelligenceConverter`. + ``AzureOCRDocumentConverter`` is deprecated and will be removed in + Haystack 2.25. Migrate to ``AzureDocumentIntelligenceConverter``. From 857dfd826b74ef78704fdba83f099950a36386ae Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Jan 2026 12:40:15 +0100 Subject: [PATCH 3/5] Add AZURE_AI_API_KEY and AZURE_DI_ENDPOINT env vars from Github secrets --- .github/workflows/tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b54dedcd12..060d7c17b0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -30,6 +30,8 @@ env: CORE_AZURE_CS_API_KEY: ${{ secrets.CORE_AZURE_CS_API_KEY }} AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + AZURE_DI_ENDPOINT: ${{ secrets.AZURE_DI_ENDPOINT }} + AZURE_AI_API_KEY: ${{ secrets.AZURE_AI_API_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} HF_API_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }} PYTHON_VERSION: "3.10" From b41ab47e5da25fcc2f0d32a2a82e6b241bf5f7b3 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Jan 2026 13:56:12 +0100 Subject: [PATCH 4/5] Linting --- haystack/components/converters/azure.py | 49 +++++++++++++------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index 7bf48fc16f..09d634036e 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -26,7 +26,8 @@ from pandas import DataFrame with LazyImport(message="Run 'pip install \"azure-ai-documentintelligence>=1.0.0\"'") as azure_di_import: - from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, AnalyzeResult, DocumentContentFormat + from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat + from azure.ai.documentintelligence.models import AnalyzeResult as DIAnalyzeResult @component @@ -212,11 +213,11 @@ def _convert_tables_and_text(self, result: "AnalyzeResult", meta: dict[str, Any] """ Converts the tables and text extracted by Azure's Document Intelligence service into Haystack Documents. - :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result - can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). + :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. + Docs on AnalyzeResult can be found in the Azure FormRecognizer documentation. :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. Can be any custom keys and values. - :returns: List of Documents containing the tables and text extracted from the AnalyzeResult object. + :returns: List of Documents containing the tables and text extracted from the DIAnalyzeResult object. 
""" tables = self._convert_tables(result=result, meta=meta) if self.page_layout == "natural": @@ -231,10 +232,10 @@ def _convert_tables(self, result: "AnalyzeResult", meta: dict[str, Any] | None) """ Converts the tables extracted by Azure's Document Intelligence service into Haystack Documents. - :param result: The AnalyzeResult Azure object + :param result: The DIAnalyzeResult Azure object :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. - :returns: List of Documents containing the tables extracted from the AnalyzeResult object. + :returns: List of Documents containing the tables extracted from the DIAnalyzeResult object. """ converted_tables: list[Document] = [] @@ -336,16 +337,16 @@ def _convert_tables(self, result: "AnalyzeResult", meta: dict[str, Any] | None) def _convert_to_natural_text(self, result: "AnalyzeResult", meta: dict[str, Any] | None) -> Document: """ - This converts the `AnalyzeResult` object into a single document. + This converts the `DIAnalyzeResult` object into a single document. We add "\f" separators between to differentiate between the text on separate pages. This is the expected format for the PreProcessor. - :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result - can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). + :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. + Docs on AnalyzeResult can be found in the Azure FormRecognizer documentation. :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. Can be any custom keys and values. - :returns: A single Document containing all the text extracted from the AnalyzeResult object. + :returns: A single Document containing all the text extracted from the DIAnalyzeResult object. """ table_spans_by_page = self._collect_table_spans(result=result) @@ -382,17 +383,17 @@ def _convert_to_single_column_text( self, result: "AnalyzeResult", meta: dict[str, str] | None, threshold_y: float = 0.05 ) -> Document: """ - This converts the `AnalyzeResult` object into a single Haystack Document. + This converts the `DIAnalyzeResult` object into a single Haystack Document. We add "\f" separators between to differentiate between the text on separate pages. This is the expected format for the PreProcessor. - :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result - can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). + :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. + Docs on AnalyzeResult can be found in the Azure FormRecognizer documentation. :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. Can be any custom keys and values. :param threshold_y: height threshold in inches for PDF and pixels for images - :returns: A single Document containing all the text extracted from the AnalyzeResult object. + :returns: A single Document containing all the text extracted from the DIAnalyzeResult object. 
""" table_spans_by_page = self._collect_table_spans(result=result) @@ -472,7 +473,7 @@ def _collect_table_spans(self, result: "AnalyzeResult") -> dict: """ Collect the spans of all tables by page number. - :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. + :param result: The DIAnalyzeResult object returned by the `begin_analyze_document` method. :returns: A dictionary with the page number as key and a list of table spans as value. """ table_spans_by_page = defaultdict(list) @@ -718,11 +719,11 @@ def from_dict(cls, data: dict[str, Any]) -> "AzureDocumentIntelligenceConverter" deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) - def _process_markdown_result(self, result: "AnalyzeResult", meta: dict[str, Any]) -> Document: + def _process_markdown_result(self, result: "DIAnalyzeResult", meta: dict[str, Any]) -> Document: """ Process result when output_format='markdown'. - :param result: The AnalyzeResult from Azure Document Intelligence. + :param result: The DIAnalyzeResult from Azure Document Intelligence. :param meta: Metadata dictionary to attach to the document. :returns: A single Document with markdown content. """ @@ -739,11 +740,11 @@ def _process_markdown_result(self, result: "AnalyzeResult", meta: dict[str, Any] return Document(content=markdown_content, meta=doc_meta) - def _process_text_result(self, result: "AnalyzeResult", meta: dict[str, Any]) -> list[Document]: + def _process_text_result(self, result: "DIAnalyzeResult", meta: dict[str, Any]) -> list[Document]: """ Process result when output_format='text'. - :param result: The AnalyzeResult from Azure Document Intelligence. + :param result: The DIAnalyzeResult from Azure Document Intelligence. :param meta: Metadata dictionary to attach to the documents. :returns: List of Documents (text + optional table documents). """ @@ -760,11 +761,11 @@ def _process_text_result(self, result: "AnalyzeResult", meta: dict[str, Any]) -> return documents - def _extract_text_content(self, result: "AnalyzeResult", meta: dict[str, Any]) -> Document: + def _extract_text_content(self, result: "DIAnalyzeResult", meta: dict[str, Any]) -> Document: """ Extract text from paragraphs. - :param result: The AnalyzeResult from Azure Document Intelligence. + :param result: The DIAnalyzeResult from Azure Document Intelligence. :param meta: Metadata dictionary to attach to the document. :returns: A single Document with all text. """ @@ -788,15 +789,15 @@ def _extract_text_content(self, result: "AnalyzeResult", meta: dict[str, Any]) - all_text = "\f".join(pages_text) return Document(content=all_text, meta={**meta, "content_format": "text"}) - def _extract_csv_tables(self, result: "AnalyzeResult", meta: dict[str, Any]) -> list[Document]: + def _extract_csv_tables(self, result: "DIAnalyzeResult", meta: dict[str, Any]) -> list[Document]: """ Extract tables as CSV (backward compatibility mode). - :param result: The AnalyzeResult from Azure Document Intelligence. + :param result: The DIAnalyzeResult from Azure Document Intelligence. :param meta: Metadata dictionary to attach to the documents. :returns: List of Documents containing table CSV content. 
""" - table_documents = [] + table_documents: list[Document] = [] if not result.tables: return table_documents From 53362e799d91f8273933e09bdb651e5006dfe169 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Jan 2026 14:15:16 +0100 Subject: [PATCH 5/5] More linting --- haystack/components/converters/azure.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index 09d634036e..6c65b32d1a 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -26,7 +26,10 @@ from pandas import DataFrame with LazyImport(message="Run 'pip install \"azure-ai-documentintelligence>=1.0.0\"'") as azure_di_import: - from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat + from azure.ai.documentintelligence.models import ( # pylint: disable=ungrouped-imports + AnalyzeDocumentRequest, + DocumentContentFormat, + ) from azure.ai.documentintelligence.models import AnalyzeResult as DIAnalyzeResult @@ -571,6 +574,7 @@ class AzureDocumentIntelligenceConverter: def __init__( self, endpoint: str, + *, api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"), model_id: str = "prebuilt-read", output_format: Literal["text", "markdown"] = "markdown",