Querent-ai · Ansh5461 · Sep 15, 2023 · Sep 15, 2023 · Sep 15, 2023 · Sep 15, 2023
diff --git a/querent/ingestors/csv/csv_ingestor.py b/querent/ingestors/csv/csv_ingestor.py
@@ -77,7 +77,6 @@ async def extract_text_from_csv(
         self, collected_bytes: CollectedBytes
     ) -> csv.reader:
         text_data = collected_bytes.data.decode("utf-8")
-        print(text_data)
         text = csv.reader(io.StringIO(text_data))
         return text
 

diff --git a/querent/ingestors/doc/doc_ingestor.py b/querent/ingestors/doc/doc_ingestor.py
@@ -60,7 +60,6 @@ async def ingest(
                 CollectedBytes(file=current_file, data=collected_bytes)
             ):
                 yield text
-            pass
 
     async def extract_and_process_doc(
         self, collected_bytes: CollectedBytes

diff --git a/querent/ingestors/html/html_ingestor.py b/querent/ingestors/html/html_ingestor.py
@@ -0,0 +1,113 @@
+"""Ingestor file for html"""
+import logging
+from typing import AsyncGenerator, List, Optional
+
+from bs4 import BeautifulSoup
+
+from querent.processors.async_processor import AsyncProcessor
+from querent.ingestors.ingestor_factory import IngestorFactory
+from querent.ingestors.base_ingestor import BaseIngestor
+from querent.config.ingestor_config import IngestorBackend
+from querent.common.types.collected_bytes import CollectedBytes
+
+logger = logging.getLogger(__name__)
+
+
+class HtmlIngestorFactory(IngestorFactory):
+    """Ingestor factory for html files"""
+
+    SUPPORTED_EXTENSIONS = {"html"}
+
+    async def supports(self, file_extension: str) -> bool:
+        """Return True if the extension (case-insensitive) is supported."""
+        return file_extension.lower() in self.SUPPORTED_EXTENSIONS
+
+    async def create(
+        self, file_extension: str, processors: List[AsyncProcessor]
+    ) -> Optional[BaseIngestor]:
+        """Return an HtmlIngestor, or None for an unsupported extension."""
+        if not await self.supports(file_extension):
+            return None
+        return HtmlIngestor(processors)
+
+
+class HtmlIngestor(BaseIngestor):
+    """Ingestor for html"""
+
+    # Tags whose text is extracted. Anchors are not listed: their text is
+    # only kept as part of an enclosing extracted element.
+    TEXT_TAGS = ("p", "h1", "h2", "h3", "h4", "h5", "footer", "article")
+
+    def __init__(self, processors: List[AsyncProcessor]):
+        super().__init__(IngestorBackend.HTML)
+        self.processors = processors
+
+    async def ingest(
+        self, poll_function: AsyncGenerator[CollectedBytes, None]
+    ) -> AsyncGenerator[str, None]:
+        """Ingest the bytes of html files.
+
+        Chunks are accumulated per file; a change in `chunk.file` ends the
+        previous file, whose processed text is then yielded. Chunks flagged
+        as errors are skipped. An empty string is yielded if polling or
+        processing raises.
+        """
+        current_file = None
+        collected_bytes = b""
+        try:
+            async for chunk_bytes in poll_function:
+                if chunk_bytes.is_error():
+                    # TODO handle error
+                    continue
+                if current_file is None:
+                    current_file = chunk_bytes.file
+                elif current_file != chunk_bytes.file:
+                    # we have a new file, process the old one
+                    async for text in self.extract_and_process_html(
+                        CollectedBytes(file=current_file, data=collected_bytes)
+                    ):
+                        yield text
+                    collected_bytes = b""
+                    current_file = chunk_bytes.file
+                collected_bytes += chunk_bytes.data
+        except Exception:
+            # Keep yielding "" on failure as before, but log the error
+            # instead of discarding it silently.
+            logger.exception("Failed to ingest html file %s", current_file)
+            yield ""
+
+        # Process the last file. This must not live in a ``finally`` block:
+        # yielding while the generator is being closed raises RuntimeError.
+        if current_file is None and not collected_bytes:
+            return  # nothing was collected, so there is nothing to process
+        async for text in self.extract_and_process_html(
+            CollectedBytes(file=current_file, data=collected_bytes)
+        ):
+            yield text
+
+    async def extract_and_process_html(
+        self, collected_bytes: CollectedBytes
+    ) -> AsyncGenerator[List[str], None]:
+        """Extract text from one html document and run the processors on it."""
+        text = await self.extract_text_from_html(collected_bytes)
+        processed_text = await self.process_data(text)
+        yield processed_text
+
+    async def extract_text_from_html(
+        self, collected_bytes: CollectedBytes
+    ) -> List[str]:
+        """Return the stripped text of each ``TEXT_TAGS`` element, in order."""
+        # Replace undecodable bytes rather than failing the whole document.
+        html_content = collected_bytes.data.decode("utf-8", errors="replace")
+        soup = BeautifulSoup(html_content, "html.parser")
+        return [
+            element.get_text().strip()
+            for element in soup.find_all(list(self.TEXT_TAGS))
+        ]
+
+    async def process_data(self, text: List[str]) -> List[str]:
+        """Pass the text through every processor in order."""
+        processed_data = text
+        for processor in self.processors:
+            processed_data = await processor.process(processed_data)
+        return processed_data
diff --git a/querent/ingestors/ingestor_manager.py b/querent/ingestors/ingestor_manager.py
@@ -14,6 +14,8 @@
 from querent.ingestors.csv.csv_ingestor import CsvIngestorFactory
 from querent.ingestors.xlsx.xlsx_ingestor import XlsxIngestorFactory
 from querent.ingestors.ppt.ppt_ingestor import PptIngestorFactory
+from querent.ingestors.xml.xml_ingestor import XmlIngestorFactory
+from querent.ingestors.html.html_ingestor import HtmlIngestorFactory
 
 
 class IngestorFactoryManager:
@@ -34,6 +36,8 @@ def __init__(self):
             IngestorBackend.XLSX.value: XlsxIngestorFactory(),
             IngestorBackend.PPT.value: PptIngestorFactory(),
             IngestorBackend.PPTX.value: PptIngestorFactory(),
+            IngestorBackend.XML.value: XmlIngestorFactory(),
+            IngestorBackend.HTML.value: HtmlIngestorFactory(),
             # Ingestor.TEXT.value: TextIngestor(),
             # Add more mappings as needed
         }

diff --git a/querent/ingestors/xml/xml_ingestor.py b/querent/ingestors/xml/xml_ingestor.py
@@ -0,0 +1,103 @@
+"""Ingestor file for xml"""
+import logging
+from typing import AsyncGenerator, List, Optional
+import xml.etree.ElementTree as ET
+from io import BytesIO
+
+from querent.processors.async_processor import AsyncProcessor
+from querent.ingestors.ingestor_factory import IngestorFactory
+from querent.config.ingestor_config import IngestorBackend
+from querent.ingestors.base_ingestor import BaseIngestor
+from querent.common.types.collected_bytes import CollectedBytes
+
+logger = logging.getLogger(__name__)
+
+
+class XmlIngestorFactory(IngestorFactory):
+    """Ingestor factory for xml files"""
+
+    SUPPORTED_EXTENSIONS = {"xml"}
+
+    async def supports(self, file_extension: str) -> bool:
+        """Return True if the extension (case-insensitive) is supported."""
+        return file_extension.lower() in self.SUPPORTED_EXTENSIONS
+
+    async def create(
+        self, file_extension: str, processors: List[AsyncProcessor]
+    ) -> Optional[BaseIngestor]:
+        """Return an XmlIngestor, or None for an unsupported extension."""
+        if not await self.supports(file_extension):
+            return None
+        return XmlIngestor(processors)
+
+
+class XmlIngestor(BaseIngestor):
+    """Ingestor for xml"""
+
+    def __init__(self, processors: List[AsyncProcessor]):
+        super().__init__(IngestorBackend.XML)
+        self.processors = processors
+
+    async def ingest(
+        self, poll_function: AsyncGenerator[CollectedBytes, None]
+    ) -> AsyncGenerator[str, None]:
+        """Ingest the bytes of xml files.
+
+        Chunks are accumulated per file; a change in `chunk.file` ends the
+        previous file, whose processed text is then yielded. Chunks flagged
+        as errors are skipped. An empty string is yielded if polling or
+        processing raises.
+        """
+        current_file = None
+        collected_bytes = b""
+        try:
+            async for chunk_bytes in poll_function:
+                if chunk_bytes.is_error():
+                    # TODO handle error
+                    continue
+                if current_file is None:
+                    current_file = chunk_bytes.file
+                elif current_file != chunk_bytes.file:
+                    # we have a new file, process the old one
+                    async for text in self.extract_and_process_xml(
+                        CollectedBytes(file=current_file, data=collected_bytes)
+                    ):
+                        yield text
+                    collected_bytes = b""
+                    current_file = chunk_bytes.file
+                collected_bytes += chunk_bytes.data
+        except Exception:
+            # Keep yielding "" on failure as before, but log the error
+            # instead of discarding it silently.
+            logger.exception("Failed to ingest xml file %s", current_file)
+            yield ""
+
+        # Process the last file. This must not live in a ``finally`` block:
+        # yielding while the generator is being closed raises RuntimeError.
+        if current_file is None and not collected_bytes:
+            return  # nothing was collected, so there is nothing to process
+        async for text in self.extract_and_process_xml(
+            CollectedBytes(file=current_file, data=collected_bytes)
+        ):
+            yield text
+
+    async def extract_and_process_xml(
+        self, collected_bytes: CollectedBytes
+    ) -> AsyncGenerator[str, None]:
+        """Extract text from one xml document and run the processors on it."""
+        text = await self.extract_text_from_xml(collected_bytes)
+        processed_text = await self.process_data(text)
+        yield processed_text
+
+    async def extract_text_from_xml(self, collected_bytes: CollectedBytes) -> str:
+        """Return the document decoded as text; the xml markup is kept as-is."""
+        # Replace undecodable bytes rather than failing the whole document.
+        text = collected_bytes.data.decode("utf-8", errors="replace")
+        return text
+
+    async def process_data(self, text: str) -> List[str]:
+        """Pass the text through every processor in order."""
+        processed_data = text
+        for processor in self.processors:
+            processed_data = await processor.process(processed_data)
+        return processed_data
diff --git a/requirements.txt b/requirements.txt
@@ -165,3 +165,4 @@ pandas
 python-pptx
 tika
 openpyxl
+xml
diff --git a/tests/data/html/gnn_and_cnn.html b/tests/data/html/gnn_and_cnn.html
@@ -0,0 +1,53 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>GNNs and CNNs</title>
+</head>
+<body>
+    <header>
+        <h1>Graph Neural Networks (GNNs)</h1>
+    </header>
+    <section>
+        <h2>Introduction</h2>
+        <p>Graph Neural Networks (GNNs) are a class of neural networks designed for handling graph-structured data.</p>
+        <p>They have gained popularity in various domains, including social networks, recommendation systems, and biology.</p>
+    </section>
+    <section>
+        <h2>Key Features</h2>
+        <ul>
+            <li>Graph Convolutional Layers</li>
+            <li>Node and Graph Classification</li>
+            <li>Message Passing</li>
+        </ul>
+    </section>
+    <section>
+        <h2>Learn More</h2>
+        <p>For more information on GNNs, visit the <a href="https://en.wikipedia.org/wiki/Graph_neural_network">Wikipedia page</a>.</p>
+        <img src="gnn_image.png" alt="Graph Neural Network Diagram">
+    </section>
+    <hr>
+    <header>
+        <h1>Convolutional Neural Networks (CNNs)</h1>
+    </header>
+    <section>
+        <h2>Introduction</h2>
+        <p>Convolutional Neural Networks (CNNs) are a class of deep learning models commonly used in computer vision tasks.</p>
+        <p>They are known for their ability to capture spatial patterns in data.</p>
+    </section>
+    <section>
+        <h2>Key Features</h2>
+        <ul>
+            <li>Convolutional Layers</li>
+            <li>Pooling Layers</li>
+            <li>Feature Maps</li>
+        </ul>
+    </section>
+    <section>
+        <h2>Learn More</h2>
+        <p>For more information on CNNs, visit the <a href="https://en.wikipedia.org/wiki/Convolutional_neural_network">Wikipedia page</a>.</p>
+        <img src="cnn_image.png" alt="Convolutional Neural Network Diagram">
+    </section>
+</body>
+</html>
diff --git a/tests/data/xml/cd_catalog.xml b/tests/data/xml/cd_catalog.xml
@@ -0,0 +1,210 @@
+<CATALOG>
+<CD>
+<TITLE>Empire Burlesque</TITLE>
+<ARTIST>Bob Dylan</ARTIST>
+<COUNTRY>USA</COUNTRY>
+<COMPANY>Columbia</COMPANY>
+<PRICE>10.90</PRICE>
+<YEAR>1985</YEAR>
+</CD>
+<CD>
+<TITLE>Hide your heart</TITLE>
+<ARTIST>Bonnie Tyler</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>CBS Records</COMPANY>
+<PRICE>9.90</PRICE>
+<YEAR>1988</YEAR>
+</CD>
+<CD>
+<TITLE>Greatest Hits</TITLE>
+<ARTIST>Dolly Parton</ARTIST>
+<COUNTRY>USA</COUNTRY>
+<COMPANY>RCA</COMPANY>
+<PRICE>9.90</PRICE>
+<YEAR>1982</YEAR>
+</CD>
+<CD>
+<TITLE>Still got the blues</TITLE>
+<ARTIST>Gary Moore</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>Virgin records</COMPANY>
+<PRICE>10.20</PRICE>
+<YEAR>1990</YEAR>
+</CD>
+<CD>
+<TITLE>Eros</TITLE>
+<ARTIST>Eros Ramazzotti</ARTIST>
+<COUNTRY>EU</COUNTRY>
+<COMPANY>BMG</COMPANY>
+<PRICE>9.90</PRICE>
+<YEAR>1997</YEAR>
+</CD>
+<CD>
+<TITLE>One night only</TITLE>
+<ARTIST>Bee Gees</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>Polydor</COMPANY>
+<PRICE>10.90</PRICE>
+<YEAR>1998</YEAR>
+</CD>
+<CD>
+<TITLE>Sylvias Mother</TITLE>
+<ARTIST>Dr.Hook</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>CBS</COMPANY>
+<PRICE>8.10</PRICE>
+<YEAR>1973</YEAR>
+</CD>
+<CD>
+<TITLE>Maggie May</TITLE>
+<ARTIST>Rod Stewart</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>Pickwick</COMPANY>
+<PRICE>8.50</PRICE>
+<YEAR>1990</YEAR>
+</CD>
+<CD>
+<TITLE>Romanza</TITLE>
+<ARTIST>Andrea Bocelli</ARTIST>
+<COUNTRY>EU</COUNTRY>
+<COMPANY>Polydor</COMPANY>
+<PRICE>10.80</PRICE>
+<YEAR>1996</YEAR>
+</CD>
+<CD>
+<TITLE>When a man loves a woman</TITLE>
+<ARTIST>Percy Sledge</ARTIST>
+<COUNTRY>USA</COUNTRY>
+<COMPANY>Atlantic</COMPANY>
+<PRICE>8.70</PRICE>
+<YEAR>1987</YEAR>
+</CD>
+<CD>
+<TITLE>Black angel</TITLE>
+<ARTIST>Savage Rose</ARTIST>
+<COUNTRY>EU</COUNTRY>
+<COMPANY>Mega</COMPANY>
+<PRICE>10.90</PRICE>
+<YEAR>1995</YEAR>
+</CD>
+<CD>
+<TITLE>1999 Grammy Nominees</TITLE>
+<ARTIST>Many</ARTIST>
+<COUNTRY>USA</COUNTRY>
+<COMPANY>Grammy</COMPANY>
+<PRICE>10.20</PRICE>
+<YEAR>1999</YEAR>
+</CD>
+<CD>
+<TITLE>For the good times</TITLE>
+<ARTIST>Kenny Rogers</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>Mucik Master</COMPANY>
+<PRICE>8.70</PRICE>
+<YEAR>1995</YEAR>
+</CD>
+<CD>
+<TITLE>Big Willie style</TITLE>
+<ARTIST>Will Smith</ARTIST>
+<COUNTRY>USA</COUNTRY>
+<COMPANY>Columbia</COMPANY>
+<PRICE>9.90</PRICE>
+<YEAR>1997</YEAR>
+</CD>
+<CD>
+<TITLE>Tupelo Honey</TITLE>
+<ARTIST>Van Morrison</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>Polydor</COMPANY>
+<PRICE>8.20</PRICE>
+<YEAR>1971</YEAR>
+</CD>
+<CD>
+<TITLE>Soulsville</TITLE>
+<ARTIST>Jorn Hoel</ARTIST>
+<COUNTRY>Norway</COUNTRY>
+<COMPANY>WEA</COMPANY>
+<PRICE>7.90</PRICE>
+<YEAR>1996</YEAR>
+</CD>
+<CD>
+<TITLE>The very best of</TITLE>
+<ARTIST>Cat Stevens</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>Island</COMPANY>
+<PRICE>8.90</PRICE>
+<YEAR>1990</YEAR>
+</CD>
+<CD>
+<TITLE>Stop</TITLE>
+<ARTIST>Sam Brown</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>A and M</COMPANY>
+<PRICE>8.90</PRICE>
+<YEAR>1988</YEAR>
+</CD>
+<CD>
+<TITLE>Bridge of Spies</TITLE>
+<ARTIST>T'Pau</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>Siren</COMPANY>
+<PRICE>7.90</PRICE>
+<YEAR>1987</YEAR>
+</CD>
+<CD>
+<TITLE>Private Dancer</TITLE>
+<ARTIST>Tina Turner</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>Capitol</COMPANY>
+<PRICE>8.90</PRICE>
+<YEAR>1983</YEAR>
+</CD>
+<CD>
+<TITLE>Midt om natten</TITLE>
+<ARTIST>Kim Larsen</ARTIST>
+<COUNTRY>EU</COUNTRY>
+<COMPANY>Medley</COMPANY>
+<PRICE>7.80</PRICE>
+<YEAR>1983</YEAR>
+</CD>
+<CD>
+<TITLE>Pavarotti Gala Concert</TITLE>
+<ARTIST>Luciano Pavarotti</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>DECCA</COMPANY>
+<PRICE>9.90</PRICE>
+<YEAR>1991</YEAR>
+</CD>
+<CD>
+<TITLE>The dock of the bay</TITLE>
+<ARTIST>Otis Redding</ARTIST>
+<COUNTRY>USA</COUNTRY>
+<COMPANY>Stax Records</COMPANY>
+<PRICE>7.90</PRICE>
+<YEAR>1968</YEAR>
+</CD>
+<CD>
+<TITLE>Picture book</TITLE>
+<ARTIST>Simply Red</ARTIST>
+<COUNTRY>EU</COUNTRY>
+<COMPANY>Elektra</COMPANY>
+<PRICE>7.20</PRICE>
+<YEAR>1985</YEAR>
+</CD>
+<CD>
+<TITLE>Red</TITLE>
+<ARTIST>The Communards</ARTIST>
+<COUNTRY>UK</COUNTRY>
+<COMPANY>London</COMPANY>
+<PRICE>7.80</PRICE>
+<YEAR>1987</YEAR>
+</CD>
+<CD>
+<TITLE>Unchain my heart</TITLE>
+<ARTIST>Joe Cocker</ARTIST>
+<COUNTRY>USA</COUNTRY>
+<COMPANY>EMI</COMPANY>
+<PRICE>8.20</PRICE>
+<YEAR>1987</YEAR>
+</CD>
+</CATALOG>
diff --git a/tests/data/xml/notes.xml b/tests/data/xml/notes.xml
@@ -0,0 +1,6 @@
+<note>
+<to>My friend</to>
+<from>Me</from>
+<heading>Reminder</heading>
+<body>Don't forget me this weekend!</body>
+</note>
diff --git a/tests/test_html_ingestor.py b/tests/test_html_ingestor.py
@@ -0,0 +1,40 @@
+import asyncio
+from pathlib import Path
+from querent.collectors.fs.fs_collector import FSCollectorFactory
+from querent.config.collector_config import FSCollectorConfig
+from querent.common.uri import Uri
+from querent.ingestors.ingestor_manager import IngestorFactoryManager
+import pytest
+
+
+@pytest.mark.asyncio
+async def test_collect_and_ingest_xml():
+    """Ingest the html fixtures and expect exactly one non-empty result."""
+    # Set up the collector
+    collector_factory = FSCollectorFactory()
+    uri = Uri("file://" + str(Path("./tests/data/html/").resolve()))
+    config = FSCollectorConfig(root_path=uri.path)
+    collector = collector_factory.resolve(uri, config)
+
+    # Set up the ingestor
+    ingestor_factory_manager = IngestorFactoryManager()
+    ingestor_factory = await ingestor_factory_manager.get_factory("html")
+    ingestor = await ingestor_factory.create("html", [])
+
+    # Collect and ingest the html files
+    ingested_call = ingestor.ingest(collector.poll())
+
+    async def poll_and_print():
+        counter = 0
+        async for ingested in ingested_call:
+            assert ingested is not None
+            # The ingestor yields "" when ingestion fails; do not count it.
+            if ingested != "":
+                counter += 1
+        assert counter == 1
+
+    await poll_and_print()
+
+
+if __name__ == "__main__":
+    asyncio.run(test_collect_and_ingest_xml())
diff --git a/tests/test_xml_ingestor.py b/tests/test_xml_ingestor.py
@@ -0,0 +1,40 @@
+import asyncio
+from pathlib import Path
+from querent.collectors.fs.fs_collector import FSCollectorFactory
+from querent.config.collector_config import FSCollectorConfig
+from querent.common.uri import Uri
+from querent.ingestors.ingestor_manager import IngestorFactoryManager
+import pytest
+
+
+@pytest.mark.asyncio
+async def test_collect_and_ingest_xml():
+    """Ingest the xml fixtures and expect exactly two non-empty results."""
+    # Set up the collector
+    collector_factory = FSCollectorFactory()
+    uri = Uri("file://" + str(Path("./tests/data/xml/").resolve()))
+    config = FSCollectorConfig(root_path=uri.path)
+    collector = collector_factory.resolve(uri, config)
+
+    # Set up the ingestor
+    ingestor_factory_manager = IngestorFactoryManager()
+    ingestor_factory = await ingestor_factory_manager.get_factory("xml")
+    ingestor = await ingestor_factory.create("xml", [])
+
+    # Collect and ingest the xml files
+    ingested_call = ingestor.ingest(collector.poll())
+
+    async def poll_and_print():
+        counter = 0
+        async for ingested in ingested_call:
+            assert ingested is not None
+            # The ingestor yields "" when ingestion fails; do not count it.
+            if ingested != "":
+                counter += 1
+        assert counter == 2
+
+    await poll_and_print()
+
+
+if __name__ == "__main__":
+    asyncio.run(test_collect_and_ingest_xml())
-Original file line number
+Diff line change
@@ @@ -165,3 +165,4 @@ pandas @@
     python-pptx
     tika
     openpyxl
+    xml