From baa1f2c7cd355dda0caafd58733e2ea28c22c997 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Wed, 7 Jan 2026 12:44:05 +0100 Subject: [PATCH 1/5] Add AzureDocumentIntelligenceConverter using the azure-ai-documentintelligence package --- haystack/components/converters/__init__.py | 3 +- haystack/components/converters/azure.py | 374 ++++++++++++++++++ pyproject.toml | 7 +- ...telligence-converter-55bbd3bb1fe0a714.yaml | 14 + .../test_azure_document_intelligence.py | 152 +++++++ 5 files changed, 546 insertions(+), 4 deletions(-) create mode 100644 releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml create mode 100644 test/components/converters/test_azure_document_intelligence.py diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py index 97d1978782..bb37cb7f93 100644 --- a/haystack/components/converters/__init__.py +++ b/haystack/components/converters/__init__.py @@ -8,7 +8,7 @@ from lazy_imports import LazyImporter _import_structure = { - "azure": ["AzureOCRDocumentConverter"], + "azure": ["AzureOCRDocumentConverter", "AzureDocumentIntelligenceConverter"], "csv": ["CSVToDocument"], "docx": ["DOCXToDocument"], "html": ["HTMLToDocument"], @@ -27,6 +27,7 @@ } if TYPE_CHECKING: + from .azure import AzureDocumentIntelligenceConverter as AzureDocumentIntelligenceConverter from .azure import AzureOCRDocumentConverter as AzureOCRDocumentConverter from .csv import CSVToDocument as CSVToDocument from .docx import DOCXToDocument as DOCXToDocument diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index 887cb60f32..7bf48fc16f 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -25,6 +25,9 @@ with LazyImport(message="Run 'pip install pandas'") as pandas_import: from pandas import DataFrame +with LazyImport(message="Run 'pip install \"azure-ai-documentintelligence>=1.0.0\"'") as azure_di_import: + from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, AnalyzeResult, DocumentContentFormat + @component class AzureOCRDocumentConverter: @@ -98,6 +101,17 @@ def __init__( # pylint: disable=too-many-positional-arguments If True, the full path of the file is stored in the metadata of the document. If False, only the file name is stored. """ + import warnings + + warnings.warn( + "AzureOCRDocumentConverter is deprecated and will be removed in Haystack 2.25. " + "Please migrate to AzureDocumentIntelligenceConverter which uses the latest " + "azure-ai-documentintelligence package and supports markdown output for better " + "RAG/LLM integration.", + DeprecationWarning, + stacklevel=2, + ) + azure_import.check() pandas_import.check() @@ -486,3 +500,363 @@ def _check_if_in_table( in_table = True break return in_table + + +@component +class AzureDocumentIntelligenceConverter: + """ + Converts files to Documents using Azure's Document Intelligence service (2024 API). + + This component uses the latest azure-ai-documentintelligence package and supports + markdown output for better integration with LLM/RAG applications. + + Supported file formats: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, HTML. 
+ + Key features: + - Markdown output with preserved structure (headings, tables, lists) + - Inline table integration (no separate table documents) + - Improved layout analysis and reading order + - Better table extraction + - Support for section headings + + To use this component, you need an active Azure account + and a Document Intelligence or Cognitive Services resource. For help with setting up your resource, see + [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api). + + ### Usage example + + ```python + import os + from haystack.components.converters import AzureDocumentIntelligenceConverter + from haystack.utils import Secret + + # Basic usage with markdown output (recommended for RAG) + converter = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + output_format="markdown" + ) + + results = converter.run(sources=["invoice.pdf", "contract.docx"]) + documents = results["documents"] + + # Documents contain markdown with inline tables + print(documents[0].content) + # Output: + # # Invoice + # + # | Item | Quantity | Price | + # |------|----------|-------| + # | Widget | 10 | $50.00 | + # + # Total: $500.00 + + # For backward compatibility, use text mode with CSV tables + converter_compat = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + output_format="text", + table_format="csv" + ) + ``` + + Migration from AzureOCRDocumentConverter: + - Replace `page_layout="natural"` with `output_format="markdown"` + - Remove `preceding_context_len`, `following_context_len` (not needed) + - Tables are now inline in markdown mode + - Install: `pip install azure-ai-documentintelligence` + """ + + def __init__( + self, + endpoint: str, + api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"), + model_id: str = "prebuilt-read", + output_format: Literal["text", "markdown"] = "markdown", + table_format: Literal["csv", "markdown"] = "markdown", + store_full_path: bool = False, + ): + """ + Creates an AzureDocumentIntelligenceConverter component. + + :param endpoint: + The endpoint URL of your Azure Document Intelligence resource. + Example: "https://YOUR_RESOURCE.cognitiveservices.azure.com/" + :param api_key: + API key for Azure authentication. Can use Secret.from_env_var() + to load from AZURE_AI_API_KEY environment variable. + :param model_id: + Azure model to use for analysis. Options: + - "prebuilt-read": Fast OCR for text extraction (default) + - "prebuilt-layout": Enhanced layout analysis with better table/structure detection + - "prebuilt-document": General document analysis + - Custom model IDs from your Azure resource + :param output_format: + Output format for document content. + - "markdown": Returns GitHub Flavored Markdown with inline tables (recommended for RAG) + - "text": Returns plain text with optional separate table documents + :param table_format: + How to format tables when output_format="text". + - "markdown": Format tables as markdown (inline) + - "csv": Format tables as CSV in separate documents (like old AzureOCRDocumentConverter) + Ignored when output_format="markdown" (tables are already in markdown). + :param store_full_path: + If True, stores complete file path in metadata. + If False, stores only the filename (default). 
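+
+        A minimal sketch of constructing the component (the endpoint is a placeholder;
+        `prebuilt-layout` is chosen here for richer structure detection, and the API key
+        falls back to the `AZURE_AI_API_KEY` environment variable by default):
+
+        ```python
+        converter = AzureDocumentIntelligenceConverter(
+            endpoint="https://YOUR_RESOURCE.cognitiveservices.azure.com/",
+            model_id="prebuilt-layout",
+            output_format="markdown",
+        )
+        ```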
+ """ + azure_di_import.check() + from azure.ai.documentintelligence import DocumentIntelligenceClient + from azure.core.credentials import AzureKeyCredential + + if output_format == "text" and table_format == "csv": + pandas_import.check() + + self.client = DocumentIntelligenceClient( + endpoint=endpoint, credential=AzureKeyCredential(api_key.resolve_value() or "") + ) + self.endpoint = endpoint + self.api_key = api_key + self.model_id = model_id + self.output_format = output_format + self.table_format = table_format + self.store_full_path = store_full_path + + @component.output_types(documents=list[Document], raw_azure_response=list[dict]) + def run(self, sources: list[str | Path | ByteStream], meta: dict[str, Any] | list[dict[str, Any]] | None = None): + """ + Convert a list of files to Documents using Azure's Document Intelligence service. + + :param sources: + List of file paths or ByteStream objects. + :param meta: + Optional metadata to attach to the Documents. + This value can be either a list of dictionaries or a single dictionary. + If it's a single dictionary, its content is added to the metadata of all produced Documents. + If it's a list, the length of the list must match the number of sources, because the two lists will be + zipped. If `sources` contains ByteStream objects, their `meta` will be added to the output Documents. + + :returns: + A dictionary with the following keys: + - `documents`: List of created Documents + - `raw_azure_response`: List of raw Azure responses used to create the Documents + """ + documents = [] + azure_responses = [] + meta_list: list[dict[str, Any]] = normalize_metadata(meta=meta, sources_count=len(sources)) + + for source, metadata in zip(sources, meta_list): + try: + bytestream = get_bytestream_from_source(source=source) + except Exception as e: + logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e) + continue + + try: + # Determine output format + content_format = DocumentContentFormat.MARKDOWN if self.output_format == "markdown" else None + + # Create analyze request + analyze_request = AnalyzeDocumentRequest(bytes_source=bytestream.data) + + # Call Azure API + poller = self.client.begin_analyze_document( + model_id=self.model_id, body=analyze_request, output_content_format=content_format + ) + result = poller.result() + azure_responses.append(result.as_dict()) + + # Merge metadata + merged_metadata = {**bytestream.meta, **metadata} + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) + + # Process based on output format + if self.output_format == "markdown": + doc = self._process_markdown_result(result, merged_metadata) + documents.append(doc) + else: + docs = self._process_text_result(result, merged_metadata) + documents.extend(docs) + + except Exception as e: + logger.warning( + "Failed to convert {source} using Azure Document Intelligence. Skipping it. Error: {error}", + source=source, + error=e, + ) + continue + + return {"documents": documents, "raw_azure_response": azure_responses} + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. 
+ """ + return default_to_dict( + self, + api_key=self.api_key.to_dict(), + endpoint=self.endpoint, + model_id=self.model_id, + output_format=self.output_format, + table_format=self.table_format, + store_full_path=self.store_full_path, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "AzureDocumentIntelligenceConverter": + """ + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) + return default_from_dict(cls, data) + + def _process_markdown_result(self, result: "AnalyzeResult", meta: dict[str, Any]) -> Document: + """ + Process result when output_format='markdown'. + + :param result: The AnalyzeResult from Azure Document Intelligence. + :param meta: Metadata dictionary to attach to the document. + :returns: A single Document with markdown content. + """ + # Azure returns complete markdown in result.content + markdown_content = result.content or "" + + # Build metadata + doc_meta = { + **meta, + "content_format": "markdown", + "model_id": self.model_id, + "page_count": len(result.pages) if result.pages else 0, + } + + return Document(content=markdown_content, meta=doc_meta) + + def _process_text_result(self, result: "AnalyzeResult", meta: dict[str, Any]) -> list[Document]: + """ + Process result when output_format='text'. + + :param result: The AnalyzeResult from Azure Document Intelligence. + :param meta: Metadata dictionary to attach to the documents. + :returns: List of Documents (text + optional table documents). + """ + documents = [] + + # Extract tables if table_format='csv' + if self.table_format == "csv" and result.tables: + table_docs = self._extract_csv_tables(result, meta) + documents.extend(table_docs) + + # Extract main text content + text_doc = self._extract_text_content(result, meta) + documents.append(text_doc) + + return documents + + def _extract_text_content(self, result: "AnalyzeResult", meta: dict[str, Any]) -> Document: + """ + Extract text from paragraphs. + + :param result: The AnalyzeResult from Azure Document Intelligence. + :param meta: Metadata dictionary to attach to the document. + :returns: A single Document with all text. + """ + # Group paragraphs by page + pages_text = [] + if result.paragraphs: + page_to_paragraphs = defaultdict(list) + for para in result.paragraphs: + page_num = para.bounding_regions[0].page_number if para.bounding_regions else 1 + # Skip paragraphs that are part of tables if extracting tables separately + if self.table_format == "csv" and self._is_paragraph_in_table(para, result.tables): + continue + page_to_paragraphs[page_num].append(para.content) + + # Join paragraphs with page breaks + max_page = max(page_to_paragraphs.keys()) if page_to_paragraphs else 0 + for page_num in range(1, max_page + 1): + page_text = "\n".join(page_to_paragraphs.get(page_num, [])) + pages_text.append(page_text) + + all_text = "\f".join(pages_text) + return Document(content=all_text, meta={**meta, "content_format": "text"}) + + def _extract_csv_tables(self, result: "AnalyzeResult", meta: dict[str, Any]) -> list[Document]: + """ + Extract tables as CSV (backward compatibility mode). + + :param result: The AnalyzeResult from Azure Document Intelligence. + :param meta: Metadata dictionary to attach to the documents. + :returns: List of Documents containing table CSV content. 
+ """ + table_documents = [] + + if not result.tables: + return table_documents + + for table in result.tables: + # Build table as 2D array + table_data = [[""] * table.column_count for _ in range(table.row_count)] + + for cell in table.cells: + # Remove selection markers + content = cell.content.replace(":selected:", "").replace(":unselected:", "") + + # Handle cell spanning + column_span = cell.column_span if cell.column_span else 1 + row_span = cell.row_span if cell.row_span else 1 + + for r in range(row_span): + for c in range(column_span): + row_idx = cell.row_index + r + col_idx = cell.column_index + c + if row_idx < table.row_count and col_idx < table.column_count: + table_data[row_idx][col_idx] = content + + # Convert to CSV + df = DataFrame(data=table_data) + csv_content = df.to_csv(header=False, index=False, lineterminator="\n") + + # Build metadata + table_meta = {**meta, "table_format": "csv", "content_format": "table"} + + if table.bounding_regions: + table_meta["page"] = table.bounding_regions[0].page_number + + table_documents.append(Document(content=csv_content, meta=table_meta)) + + return table_documents + + def _is_paragraph_in_table(self, paragraph: Any, tables: list | None) -> bool: + """ + Check if a paragraph is part of a table. + + :param paragraph: Paragraph object to check. + :param tables: List of table objects. + :returns: True if paragraph is in a table, False otherwise. + """ + if not tables or not paragraph.spans: + return False + + para_offset = paragraph.spans[0].offset + para_length = paragraph.spans[0].length + para_end = para_offset + para_length + + for table in tables: + if not table.spans: + continue + table_offset = table.spans[0].offset + table_end = table_offset + table.spans[0].length + + # Check if paragraph overlaps with table + if table_offset <= para_offset <= table_end or table_offset <= para_end <= table_end: + return True + + return False diff --git a/pyproject.toml b/pyproject.toml index 49b2168766..b09df70013 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,9 +106,10 @@ dependencies = [ "pdfminer.six", # PDFMinerToDocument "markdown-it-py", # MarkdownToDocument "mdit_plain", # MarkdownToDocument - "tika", # TikaDocumentConverter - "azure-ai-formrecognizer>=3.2.0b2", # AzureOCRDocumentConverter - "trafilatura", # HTMLToDocument + "tika", # TikaDocumentConverter + "azure-ai-formrecognizer>=3.2.0b2", # AzureOCRDocumentConverter (deprecated) + "azure-ai-documentintelligence>=1.0.0", # AzureDocumentIntelligenceConverter + "trafilatura", # HTMLToDocument "python-pptx", # PPTXToDocument "python-docx", # DocxToDocument "jq", # JSONConverter diff --git a/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml b/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml new file mode 100644 index 0000000000..457889b7ab --- /dev/null +++ b/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml @@ -0,0 +1,14 @@ +--- +features: + - | + Added `AzureDocumentIntelligenceConverter` component that uses the new + azure-ai-documentintelligence package (v1.0.0). Supports markdown output + for better LLM/RAG integration. Key features: + - Markdown output with inline tables using Azure's native support + - Improved layout analysis and table extraction + - Simplified API (removes layout complexity) + - Backward compatible text mode with CSV tables +deprecations: + - | + `AzureOCRDocumentConverter` is deprecated and will be removed in + Haystack 2.25. 
Migrate to `AzureDocumentIntelligenceConverter`. diff --git a/test/components/converters/test_azure_document_intelligence.py b/test/components/converters/test_azure_document_intelligence.py new file mode 100644 index 0000000000..ed30f9cfab --- /dev/null +++ b/test/components/converters/test_azure_document_intelligence.py @@ -0,0 +1,152 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import pytest + +from haystack.components.converters.azure import AzureDocumentIntelligenceConverter +from haystack.utils import Secret + + +class TestAzureDocumentIntelligenceConverter: + def test_init(self): + """Test basic initialization with defaults""" + converter = AzureDocumentIntelligenceConverter( + endpoint="https://test.cognitiveservices.azure.com/", api_key=Secret.from_token("test_api_key") + ) + + assert converter.endpoint == "https://test.cognitiveservices.azure.com/" + assert converter.model_id == "prebuilt-read" + assert converter.output_format == "markdown" + assert converter.table_format == "markdown" + assert converter.store_full_path is False + + def test_to_dict(self): + """Test serialization with Secret handling""" + converter = AzureDocumentIntelligenceConverter( + endpoint="https://test.cognitiveservices.azure.com/", + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + model_id="prebuilt-layout", + output_format="text", + table_format="csv", + store_full_path=True, + ) + + data = converter.to_dict() + + assert data == { + "type": "haystack.components.converters.azure.AzureDocumentIntelligenceConverter", + "init_parameters": { + "api_key": {"type": "env_var", "env_vars": ["AZURE_AI_API_KEY"], "strict": True}, + "endpoint": "https://test.cognitiveservices.azure.com/", + "model_id": "prebuilt-layout", + "output_format": "text", + "table_format": "csv", + "store_full_path": True, + }, + } + + def test_from_dict(self): + """Test deserialization""" + data = { + "type": "haystack.components.converters.azure.AzureDocumentIntelligenceConverter", + "init_parameters": { + "api_key": {"type": "env_var", "env_vars": ["AZURE_AI_API_KEY"], "strict": True}, + "endpoint": "https://test.cognitiveservices.azure.com/", + "model_id": "prebuilt-layout", + "output_format": "markdown", + "table_format": "markdown", + "store_full_path": False, + }, + } + + converter = AzureDocumentIntelligenceConverter.from_dict(data) + + assert converter.endpoint == "https://test.cognitiveservices.azure.com/" + assert converter.model_id == "prebuilt-layout" + assert converter.output_format == "markdown" + + @pytest.mark.integration + @pytest.mark.skipif(not os.environ.get("AZURE_DI_ENDPOINT", None), reason="Azure endpoint not available") + @pytest.mark.skipif(not os.environ.get("AZURE_AI_API_KEY", None), reason="Azure credentials not available") + @pytest.mark.flaky(reruns=5, reruns_delay=5) + def test_run_with_markdown_output(self, test_files_path): + """Integration test with real Azure API - markdown mode""" + converter = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + output_format="markdown", + ) + + results = converter.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"]) + + assert "documents" in results + assert len(results["documents"]) == 1 + assert len(results["documents"][0].content) > 0 + assert results["documents"][0].meta["content_format"] == "markdown" + assert "A sample PDF file" in results["documents"][0].content + + @pytest.mark.integration + 
@pytest.mark.skipif(not os.environ.get("AZURE_DI_ENDPOINT", None), reason="Azure endpoint not available") + @pytest.mark.skipif(not os.environ.get("AZURE_AI_API_KEY", None), reason="Azure credentials not available") + @pytest.mark.flaky(reruns=5, reruns_delay=5) + def test_run_with_text_output_csv_tables(self, test_files_path): + """Integration test with real Azure API - text mode with CSV tables""" + converter = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + output_format="text", + table_format="csv", + ) + + results = converter.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"]) + + assert "documents" in results + assert len(results["documents"]) >= 1 + + # Check that we have text document + text_docs = [d for d in results["documents"] if d.meta.get("content_format") == "text"] + assert len(text_docs) == 1 + assert "A sample PDF file" in text_docs[0].content + + @pytest.mark.integration + @pytest.mark.skipif(not os.environ.get("AZURE_DI_ENDPOINT", None), reason="Azure endpoint not available") + @pytest.mark.skipif(not os.environ.get("AZURE_AI_API_KEY", None), reason="Azure credentials not available") + @pytest.mark.flaky(reruns=5, reruns_delay=5) + def test_run_with_metadata(self, test_files_path): + """Integration test - verify metadata handling""" + converter = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], + api_key=Secret.from_env_var("AZURE_AI_API_KEY"), + store_full_path=False, + ) + + results = converter.run( + sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"], meta={"custom_key": "custom_value"} + ) + + doc = results["documents"][0] + assert doc.meta["custom_key"] == "custom_value" + # Should be basename only + assert doc.meta["file_path"] == "sample_pdf_1.pdf" + + @pytest.mark.integration + @pytest.mark.skipif(not os.environ.get("AZURE_DI_ENDPOINT", None), reason="Azure endpoint not available") + @pytest.mark.skipif(not os.environ.get("AZURE_AI_API_KEY", None), reason="Azure credentials not available") + @pytest.mark.flaky(reruns=5, reruns_delay=5) + def test_run_with_multiple_files(self, test_files_path): + """Integration test - process multiple files""" + converter = AzureDocumentIntelligenceConverter( + endpoint=os.environ["AZURE_DI_ENDPOINT"], api_key=Secret.from_env_var("AZURE_AI_API_KEY") + ) + + results = converter.run( + sources=[test_files_path / "pdf" / "sample_pdf_1.pdf", test_files_path / "pdf" / "sample_pdf_2.pdf"] + ) + + assert "documents" in results + assert len(results["documents"]) == 2 + assert "A sample PDF file" in results["documents"][0].content + assert "wiki" in results["documents"][1].content.lower() From 31d4ae9ea76929494d54c40215eb781d3e0dd3f3 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Jan 2026 12:24:58 +0100 Subject: [PATCH 2/5] Use double backticks in repo notes --- ...re-document-intelligence-converter-55bbd3bb1fe0a714.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml b/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml index 457889b7ab..7938bd962a 100644 --- a/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml +++ b/releasenotes/notes/add-azure-document-intelligence-converter-55bbd3bb1fe0a714.yaml @@ -1,7 +1,7 @@ --- features: - | - Added `AzureDocumentIntelligenceConverter` component that uses the new + Added 
``AzureDocumentIntelligenceConverter`` component that uses the new azure-ai-documentintelligence package (v1.0.0). Supports markdown output for better LLM/RAG integration. Key features: - Markdown output with inline tables using Azure's native support @@ -10,5 +10,5 @@ features: - Backward compatible text mode with CSV tables deprecations: - | - `AzureOCRDocumentConverter` is deprecated and will be removed in - Haystack 2.25. Migrate to `AzureDocumentIntelligenceConverter`. + ``AzureOCRDocumentConverter`` is deprecated and will be removed in + Haystack 2.25. Migrate to ``AzureDocumentIntelligenceConverter``. From 857dfd826b74ef78704fdba83f099950a36386ae Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Jan 2026 12:40:15 +0100 Subject: [PATCH 3/5] Add AZURE_AI_API_KEY and AZURE_DI_ENDPOINT env vars from Github secrets --- .github/workflows/tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b54dedcd12..060d7c17b0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -30,6 +30,8 @@ env: CORE_AZURE_CS_API_KEY: ${{ secrets.CORE_AZURE_CS_API_KEY }} AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + AZURE_DI_ENDPOINT: ${{ secrets.AZURE_DI_ENDPOINT }} + AZURE_AI_API_KEY: ${{ secrets.AZURE_AI_API_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} HF_API_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }} PYTHON_VERSION: "3.10" From b41ab47e5da25fcc2f0d32a2a82e6b241bf5f7b3 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Jan 2026 13:56:12 +0100 Subject: [PATCH 4/5] Linting --- haystack/components/converters/azure.py | 49 +++++++++++++------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index 7bf48fc16f..09d634036e 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -26,7 +26,8 @@ from pandas import DataFrame with LazyImport(message="Run 'pip install \"azure-ai-documentintelligence>=1.0.0\"'") as azure_di_import: - from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, AnalyzeResult, DocumentContentFormat + from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat + from azure.ai.documentintelligence.models import AnalyzeResult as DIAnalyzeResult @component @@ -212,11 +213,11 @@ def _convert_tables_and_text(self, result: "AnalyzeResult", meta: dict[str, Any] """ Converts the tables and text extracted by Azure's Document Intelligence service into Haystack Documents. - :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result - can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). + :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. + Docs on AnalyzeResult can be found in the Azure FormRecognizer documentation. :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. Can be any custom keys and values. - :returns: List of Documents containing the tables and text extracted from the AnalyzeResult object. + :returns: List of Documents containing the tables and text extracted from the DIAnalyzeResult object. 
""" tables = self._convert_tables(result=result, meta=meta) if self.page_layout == "natural": @@ -231,10 +232,10 @@ def _convert_tables(self, result: "AnalyzeResult", meta: dict[str, Any] | None) """ Converts the tables extracted by Azure's Document Intelligence service into Haystack Documents. - :param result: The AnalyzeResult Azure object + :param result: The DIAnalyzeResult Azure object :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. - :returns: List of Documents containing the tables extracted from the AnalyzeResult object. + :returns: List of Documents containing the tables extracted from the DIAnalyzeResult object. """ converted_tables: list[Document] = [] @@ -336,16 +337,16 @@ def _convert_tables(self, result: "AnalyzeResult", meta: dict[str, Any] | None) def _convert_to_natural_text(self, result: "AnalyzeResult", meta: dict[str, Any] | None) -> Document: """ - This converts the `AnalyzeResult` object into a single document. + This converts the `DIAnalyzeResult` object into a single document. We add "\f" separators between to differentiate between the text on separate pages. This is the expected format for the PreProcessor. - :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result - can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). + :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. + Docs on AnalyzeResult can be found in the Azure FormRecognizer documentation. :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. Can be any custom keys and values. - :returns: A single Document containing all the text extracted from the AnalyzeResult object. + :returns: A single Document containing all the text extracted from the DIAnalyzeResult object. """ table_spans_by_page = self._collect_table_spans(result=result) @@ -382,17 +383,17 @@ def _convert_to_single_column_text( self, result: "AnalyzeResult", meta: dict[str, str] | None, threshold_y: float = 0.05 ) -> Document: """ - This converts the `AnalyzeResult` object into a single Haystack Document. + This converts the `DIAnalyzeResult` object into a single Haystack Document. We add "\f" separators between to differentiate between the text on separate pages. This is the expected format for the PreProcessor. - :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. Docs on Analyze result - can be found [here](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.3.0/azure.ai.formrecognizer.html?highlight=read#azure.ai.formrecognizer.AnalyzeResult). + :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. + Docs on AnalyzeResult can be found in the Azure FormRecognizer documentation. :param meta: Optional dictionary with metadata that shall be attached to all resulting documents. Can be any custom keys and values. :param threshold_y: height threshold in inches for PDF and pixels for images - :returns: A single Document containing all the text extracted from the AnalyzeResult object. + :returns: A single Document containing all the text extracted from the DIAnalyzeResult object. 
""" table_spans_by_page = self._collect_table_spans(result=result) @@ -472,7 +473,7 @@ def _collect_table_spans(self, result: "AnalyzeResult") -> dict: """ Collect the spans of all tables by page number. - :param result: The AnalyzeResult object returned by the `begin_analyze_document` method. + :param result: The DIAnalyzeResult object returned by the `begin_analyze_document` method. :returns: A dictionary with the page number as key and a list of table spans as value. """ table_spans_by_page = defaultdict(list) @@ -718,11 +719,11 @@ def from_dict(cls, data: dict[str, Any]) -> "AzureDocumentIntelligenceConverter" deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) - def _process_markdown_result(self, result: "AnalyzeResult", meta: dict[str, Any]) -> Document: + def _process_markdown_result(self, result: "DIAnalyzeResult", meta: dict[str, Any]) -> Document: """ Process result when output_format='markdown'. - :param result: The AnalyzeResult from Azure Document Intelligence. + :param result: The DIAnalyzeResult from Azure Document Intelligence. :param meta: Metadata dictionary to attach to the document. :returns: A single Document with markdown content. """ @@ -739,11 +740,11 @@ def _process_markdown_result(self, result: "AnalyzeResult", meta: dict[str, Any] return Document(content=markdown_content, meta=doc_meta) - def _process_text_result(self, result: "AnalyzeResult", meta: dict[str, Any]) -> list[Document]: + def _process_text_result(self, result: "DIAnalyzeResult", meta: dict[str, Any]) -> list[Document]: """ Process result when output_format='text'. - :param result: The AnalyzeResult from Azure Document Intelligence. + :param result: The DIAnalyzeResult from Azure Document Intelligence. :param meta: Metadata dictionary to attach to the documents. :returns: List of Documents (text + optional table documents). """ @@ -760,11 +761,11 @@ def _process_text_result(self, result: "AnalyzeResult", meta: dict[str, Any]) -> return documents - def _extract_text_content(self, result: "AnalyzeResult", meta: dict[str, Any]) -> Document: + def _extract_text_content(self, result: "DIAnalyzeResult", meta: dict[str, Any]) -> Document: """ Extract text from paragraphs. - :param result: The AnalyzeResult from Azure Document Intelligence. + :param result: The DIAnalyzeResult from Azure Document Intelligence. :param meta: Metadata dictionary to attach to the document. :returns: A single Document with all text. """ @@ -788,15 +789,15 @@ def _extract_text_content(self, result: "AnalyzeResult", meta: dict[str, Any]) - all_text = "\f".join(pages_text) return Document(content=all_text, meta={**meta, "content_format": "text"}) - def _extract_csv_tables(self, result: "AnalyzeResult", meta: dict[str, Any]) -> list[Document]: + def _extract_csv_tables(self, result: "DIAnalyzeResult", meta: dict[str, Any]) -> list[Document]: """ Extract tables as CSV (backward compatibility mode). - :param result: The AnalyzeResult from Azure Document Intelligence. + :param result: The DIAnalyzeResult from Azure Document Intelligence. :param meta: Metadata dictionary to attach to the documents. :returns: List of Documents containing table CSV content. 
""" - table_documents = [] + table_documents: list[Document] = [] if not result.tables: return table_documents From 53362e799d91f8273933e09bdb651e5006dfe169 Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Thu, 8 Jan 2026 14:15:16 +0100 Subject: [PATCH 5/5] More linting --- haystack/components/converters/azure.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index 09d634036e..6c65b32d1a 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -26,7 +26,10 @@ from pandas import DataFrame with LazyImport(message="Run 'pip install \"azure-ai-documentintelligence>=1.0.0\"'") as azure_di_import: - from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentContentFormat + from azure.ai.documentintelligence.models import ( # pylint: disable=ungrouped-imports + AnalyzeDocumentRequest, + DocumentContentFormat, + ) from azure.ai.documentintelligence.models import AnalyzeResult as DIAnalyzeResult @@ -571,6 +574,7 @@ class AzureDocumentIntelligenceConverter: def __init__( self, endpoint: str, + *, api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"), model_id: str = "prebuilt-read", output_format: Literal["text", "markdown"] = "markdown",