Xlsx ingestor (#51)

Ansh5461 · web-flow · commit b0259a22e945 · 2023-09-14T01:48:45.000+05:30
* Added xlsx ingestor

* Resolved conflicts

* removed unused import

* added dependencies
diff --git a/querent/ingestors/ingestor_manager.py b/querent/ingestors/ingestor_manager.py
@@ -12,6 +12,7 @@
 from querent.ingestors.images.image_ingestor import ImageIngestorFactory
 from querent.ingestors.doc.doc_ingestor import DocIngestorFactory
 from querent.ingestors.csv.csv_ingestor import CsvIngestorFactory
+from querent.ingestors.xlsx.xlsx_ingestor import XlsxIngestorFactory
 from querent.ingestors.ppt.ppt_ingestor import PptIngestorFactory
 
 
@@ -30,6 +31,7 @@ def __init__(self):
             IngestorBackend.DOCX.value: DocIngestorFactory(),
             IngestorBackend.DOC.value: DocIngestorFactory(),
             IngestorBackend.CSV.value: CsvIngestorFactory(),
+            IngestorBackend.XLSX.value: XlsxIngestorFactory(),
             IngestorBackend.PPT.value: PptIngestorFactory(),
             IngestorBackend.PPTX.value: PptIngestorFactory(),
             # Ingestor.TEXT.value: TextIngestor(),
diff --git a/querent/ingestors/xlsx/xlsx_ingestor.py b/querent/ingestors/xlsx/xlsx_ingestor.py
@@ -0,0 +1,86 @@
+"""Ingestor file for xlsx files"""
+from typing import List, AsyncGenerator
+import io
+import pandas as pd
+
+from querent.ingestors.ingestor_factory import IngestorFactory
+from querent.ingestors.base_ingestor import BaseIngestor
+from querent.processors.async_processor import AsyncProcessor
+from querent.config.ingestor_config import IngestorBackend
+from querent.common.types.collected_bytes import CollectedBytes
+
+
+class XlsxIngestorFactory(IngestorFactory):
+    """Factory that creates ``XlsxIngestor`` instances for xlsx files."""
+
+    SUPPORTED_EXTENSIONS = {"xlsx"}
+
+    async def supports(self, file_extension: str) -> bool:
+        """Return True if the lower-cased extension is in SUPPORTED_EXTENSIONS."""
+        return file_extension.lower() in self.SUPPORTED_EXTENSIONS
+
+    async def create(
+        self, file_extension: str, processors: List[AsyncProcessor]
+    ) -> BaseIngestor:
+        """Return an ``XlsxIngestor`` for *processors*, or None if unsupported."""
+        return XlsxIngestor(processors) if await self.supports(file_extension) else None
+
+
+class XlsxIngestor(BaseIngestor):
+    """Ingestor that turns collected xlsx bytes into pandas DataFrames."""
+
+    def __init__(self, processors: List[AsyncProcessor]):
+        super().__init__(IngestorBackend.XLSX)
+        self.processors = processors
+
+    async def ingest(
+        self, poll_function: AsyncGenerator[CollectedBytes, None]
+    ) -> AsyncGenerator[str, None]:
+        """Yield one DataFrame per polled file; yield '' if polling/parsing raises."""
+        current_file = None
+        collected_bytes = b""
+        try:
+            async for chunk_bytes in poll_function:
+                if chunk_bytes.is_error():
+                    # TODO handle error
+                    continue
+                if current_file is None:
+                    current_file = chunk_bytes.file
+                elif current_file != chunk_bytes.file:
+                    # we have a new file, process the old one
+                    async for frames in self.extract_and_process_xlsx(
+                        CollectedBytes(file=current_file, data=collected_bytes)
+                    ):
+                        yield frames
+                    collected_bytes = b""
+                    current_file = chunk_bytes.file
+                collected_bytes += chunk_bytes.data
+        except Exception:
+            # TODO handle exception
+            yield ""
+        # Flush the last file outside ``finally`` (yielding there breaks generator
+        # close) and only if a file was seen (read_excel fails on empty bytes).
+        if current_file is not None:
+            async for frames in self.extract_and_process_xlsx(
+                CollectedBytes(file=current_file, data=collected_bytes)
+            ):
+                yield frames
+
+    async def extract_and_process_xlsx(
+        self, collected_bytes: CollectedBytes
+    ) -> AsyncGenerator[pd.DataFrame, None]:
+        """Yield the DataFrame parsed from one file's collected bytes."""
+        yield await self.extract_text_from_xlsx(collected_bytes)
+
+    async def extract_text_from_xlsx(
+        self, collected_bytes: CollectedBytes
+    ) -> pd.DataFrame:
+        """Read the collected bytes with ``pd.read_excel``; return the DataFrame."""
+        return pd.read_excel(io.BytesIO(collected_bytes.data))
+
+    async def process_data(self, text: str) -> List[str]:
+        """Pass *text* through every processor in order and return the result."""
+        processed_data = text
+        for processor in self.processors:
+            processed_data = await processor.process(processed_data)
+        return processed_data
diff --git a/requirements.txt b/requirements.txt
@@ -161,5 +161,7 @@ SpeechRecognition
 pytesseract
 pillow
 pytextract
+pandas
 python-pptx
-tika
+tika
+openpyxl
diff --git a/tests/data/xlsx/book1.xlsx b/tests/data/xlsx/book1.xlsx
diff --git a/tests/test_json_ingestor.py b/tests/test_json_ingestor.py
@@ -25,9 +25,9 @@ async def poll_and_print():
         counter = 0
         async for ingested in ingested_call:
             assert ingested is not None
-            if len(ingested) == 0:
+            if len(ingested) != 0:
                 counter += 1
-        assert counter == 0
+        assert counter == 2
 
     await poll_and_print()
 
diff --git a/tests/test_webscrapper.py b/tests/test_webscrapper.py
@@ -32,6 +32,5 @@ def test_scrapping_data():
     async def poll_and_print():
         async for result in collector.poll():
             assert not result.is_error()
-            print(result.unwrap())
 
     asyncio.run(poll_and_print())
diff --git a/tests/test_xlsx_ingestor.py b/tests/test_xlsx_ingestor.py
@@ -0,0 +1,36 @@
+import asyncio
+from pathlib import Path
+from querent.collectors.fs.fs_collector import FSCollectorFactory
+from querent.config.collector_config import FSCollectorConfig
+from querent.common.uri import Uri
+from querent.ingestors.ingestor_manager import IngestorFactoryManager
+import pytest
+
+
+@pytest.mark.asyncio
+async def test_collect_and_ingest_xlsx():
+    """Ingest the files under tests/data/xlsx and assert three rows in total."""
+    collector_factory = FSCollectorFactory()
+    uri = Uri("file://" + str(Path("./tests/data/xlsx/").resolve()))
+    config = FSCollectorConfig(root_path=uri.path)
+    collector = collector_factory.resolve(uri, config)
+
+    ingestor_factory_manager = IngestorFactoryManager()
+    ingestor_factory = await ingestor_factory_manager.get_factory("xlsx")
+    ingestor = await ingestor_factory.create("xlsx", [])
+
+    ingested_call = ingestor.ingest(collector.poll())
+
+    async def poll_and_count():
+        # Sum the row counts (shape[0]) of every frame the ingestor yields.
+        row_count = 0
+        async for ingested in ingested_call:
+            assert ingested is not None
+            row_count += ingested.shape[0]
+        assert row_count == 3
+
+    await poll_and_count()
+
+
+if __name__ == "__main__":  # allow running this test directly, without pytest
+    asyncio.run(test_collect_and_ingest_xlsx())