diff --git a/querent/collectors/aws/aws_collector.py b/querent/collectors/aws/aws_collector.py index f8437809..bad1e781 100644 --- a/querent/collectors/aws/aws_collector.py +++ b/querent/collectors/aws/aws_collector.py @@ -48,9 +48,11 @@ async def poll(self) -> AsyncGenerator[CollectedBytes, None]: file = self.download_object_as_byte_stream(obj["Key"]) async for chunk in self.read_chunks(file): yield CollectedBytes(file=obj["Key"], data=chunk, error=None) - except Exception as e: - # Handle exceptions gracefully, e.g., log the error - print(f"An error occurred: {e}") + + except PermissionError as exc: + print(f"Unable to open this file {file}, getting error as {exc}") + except OSError as exc: + print(f"Getting OS Error on file {file}, as {exc}") finally: await self.disconnect() # Disconnect when done diff --git a/querent/collectors/fs/fs_collector.py b/querent/collectors/fs/fs_collector.py index 14bb6844..03265f4e 100644 --- a/querent/collectors/fs/fs_collector.py +++ b/querent/collectors/fs/fs_collector.py @@ -7,6 +7,7 @@ from querent.common.uri import Uri from querent.config.collector_config import CollectorBackend, FSCollectorConfig import aiofiles +from querent.common import common_errors class FSCollector(Collector): @@ -24,9 +25,18 @@ async def disconnect(self): async def poll(self) -> AsyncGenerator[CollectedBytes, None]: async for file_path in self.walk_files(self.root_dir): - async with aiofiles.open(file_path, "rb") as file: - async for chunk in self.read_chunks(file): - yield CollectedBytes(file=file_path, data=chunk, error=None) + try: + async with aiofiles.open(file_path, "rb") as file: + async for chunk in self.read_chunks(file): + yield CollectedBytes(file=file_path, data=chunk, error=None) + except PermissionError as exc: + raise common_errors.PermissionError( + f"Unable to open this file {file_path}, getting error as {exc}" + ) from exc + except OSError as exc: + raise common_errors.OSError( + f"Getting OS Error on file {file_path}, as {exc}" + ) from exc async def read_chunks(self, file): while True: diff --git a/querent/collectors/gcs/gcs_collector.py b/querent/collectors/gcs/gcs_collector.py index 18a3fa8d..9d627c57 100644 --- a/querent/collectors/gcs/gcs_collector.py +++ b/querent/collectors/gcs/gcs_collector.py @@ -8,6 +8,7 @@ from querent.collectors.collector_base import Collector from querent.collectors.collector_factory import CollectorFactory from querent.common.uri import Uri +from querent.common import common_errors from google.cloud import storage from dotenv import load_dotenv @@ -23,7 +24,12 @@ def __init__(self, config: GcsCollectConfig): async def connect(self): if not self.client: - self.client = storage.Client.from_service_account_info(self.credentials) + try: + self.client = storage.Client.from_service_account_info(self.credentials) + except ConnectionError as exc: + raise common_errors.ConnectionError( + "Please pass the credentials file" + ) from exc async def disconnect(self): if self.client is not None: @@ -58,9 +64,18 @@ async def stream_blob(self, blob): if not chunk: break yield chunk - except Exception as e: - # Handle exceptions gracefully, e.g., log the error - print(f"An error occurred while streaming blob: {e}") + except PermissionError as exc: + raise common_errors.PermissionError( + f"Unable to open this file {blob_file}, getting error as {exc}" + ) from exc + except OSError as exc: + raise common_errors.OSError( + f"Getting OS Error on file {blob_file}, as {exc}" + ) from exc + except Exception as exc: + raise common_errors.UnknownError( + f"Getting OS Error on file {blob_file}, as {exc}" + ) from exc class GCSCollectorFactory(CollectorFactory): diff --git a/querent/common/common_errors.py b/querent/common/common_errors.py new file mode 100644 index 00000000..9ad7920f --- /dev/null +++ b/querent/common/common_errors.py @@ -0,0 +1,148 @@ +from enum import Enum + + +class IngestorError(Enum): + EOF = "End of File" + ETCD = "ETCD Error" + NETWORK = "Network Error" + TIMEOUT = "Timeout" + UNKNOWN = "Unknown Error" + FILE_NOT_FOUND = "File Not Found" + IOERROR = "IOError" + UIE = "UnidentifiedImageError" + WRONGPPTFILE = "Wrong PPt file" + INVALIDXMLERROR = "Invalid Xml Error" + BADZIPFILE = "BadZipFile" + UNICODEDECODEERROR = "UnicodeDecodeError" + LOOKUPERROR = "LookupError" + TYPEERROR = "TypeError" + UNKNOWNVALUEERROR = "UnknownValueError" + REQUESTERROR = "RequestError" + INDEXERROR = "IndexError" + CSVERROR = "CsvError" + RUNTIMEERROR = "RuntimeError" + JSONDECODEERROR = "JsonDecodeError" + DOCUMENTERROR = "DocumentError" + SHELLERROR = "ShellError" + PERMISSIONERROR = "PermissionError" + OSERROR = "OSError" + CONNECTIONERROR = "ConnectionError" + + +class IngestorErrorBase(Exception): + def __init__(self, error_code, message=None) -> None: + super().__init__(message) + self.error_code = error_code + + +class FileNotFoundError(IngestorErrorBase): + """Error function for file not found""" + + def __init__(self, message=None): + super().__init__(message=message, error_code=IngestorError.FILE_NOT_FOUND) + + +class IOError(IngestorErrorBase): + """Error function for I/O Errors""" + + def __init__(self, message=None) -> None: + super().__init__(IngestorError.IOERROR, message) + + +class UnidentifiedImageError(IngestorErrorBase): + """Error function for UnidentifiedImageError""" + + def __init__(self, message=None) -> None: + super().__init__(IngestorError.UIE, message) + + +class WrongPptFileError(IngestorErrorBase): + """Error function if ppt file is not being passed into ppt ingestor""" + + def __init__(self, message=None) -> None: + super().__init__(IngestorError.WRONGPPTFILE, message) + + +class InvalidXmlError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.INVALIDXMLERROR, message) + + +class BadZipFile(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.BADZIPFILE, message) + + +class UnicodeDecodeError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.UNICODEDECODEERROR, message) + + +class LookupError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.LOOKUPERROR, message) + + +class TypeError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.TYPEERROR, message) + + +class UnknownValueError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.UNKNOWNVALUEERROR, message) + + +class RequestError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.REQUESTERROR, message) + + +class IndexErrorException(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.INDEXERROR, message) + + +class UnknownError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.UNKNOWN, message) + + +class CsvError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.CSVERROR, message) + + +class RuntimeError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.RUNTIMEERROR, message) + + +class JsonDecodeError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.JSONDECODEERROR, message) + + +class DocumentError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.DOCUMENTERROR, message) + + +class ShellError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.SHELLERROR, message) + + +class PermissionError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.PERMISSIONERROR, message) + + +class OSError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.OSERROR, message) + + +class ConnectionError(IngestorErrorBase): + def __init__(self, message=None) -> None: + super().__init__(IngestorError.CONNECTIONERROR, message) diff --git a/querent/ingestors/audio/audio_ingestors.py b/querent/ingestors/audio/audio_ingestors.py index ccbb5b04..0632ace0 100644 --- a/querent/ingestors/audio/audio_ingestors.py +++ b/querent/ingestors/audio/audio_ingestors.py @@ -8,6 +8,12 @@ from querent.ingestors.base_ingestor import BaseIngestor from querent.config.ingestor_config import IngestorBackend from querent.common.types.collected_bytes import CollectedBytes +from querent.common.common_errors import ( + UnknownValueError, + RequestError, + IndexErrorException, + UnknownError, +) from querent.common.types.ingested_tokens import IngestedTokens @@ -63,32 +69,41 @@ async def ingest( async def extract_and_process_audio( self, collected_bytes: CollectedBytes ) -> AsyncGenerator[IngestedTokens, None]: - audio_segment = AudioSegment.from_file( - io.BytesIO(collected_bytes.data), format=collected_bytes.extension - ) - temp_wave = io.BytesIO() - audio_segment.export(temp_wave, format="wav") - # Initialize the recognizer - recognizer = sr.Recognizer() - - temp_wave.seek(0) - with sr.AudioFile(temp_wave) as source: - recognizer.adjust_for_ambient_noise(source, duration=1) - audio_data = recognizer.record(source) - # Recognize the text using the recognizer try: - recognized_text = recognizer.recognize_google(audio_data, language="en-US") - processed_text = await self.process_data(recognized_text) - yield IngestedTokens( - file=collected_bytes.file, data=[processed_text], error=None + audio_segment = AudioSegment.from_file( + io.BytesIO(collected_bytes.data), format=collected_bytes.extension ) - except sr.UnknownValueError: - print("Could not understand the audio") - except sr.RequestError as e: - print(f"Could not request results; {e}") - except Exception as e: - print(e) + temp_wave = io.BytesIO() + audio_segment.export(temp_wave, format="wav") + # Initialize the recognizer + recognizer = sr.Recognizer() + + temp_wave.seek(0) + with sr.AudioFile(temp_wave) as source: + recognizer.adjust_for_ambient_noise(source, duration=1) + audio_data = recognizer.record(source) + + # Recognize the text using the recognizer + + recognized_text = recognizer.recognize_google(audio_data, language="en-US") + yield recognized_text + except sr.UnknownValueError as exc: + raise UnknownValueError( + f"The following file gave Unknown Value Error {collected_bytes.file}" + ) from exc + except sr.RequestError as exc: + raise RequestError( + f"The following file gave Request Error {collected_bytes.file}" + ) from exc + except IndexError as exc: + raise IndexErrorException( + f"The following file gave Request Error {collected_bytes.file}" + ) from exc + except Exception as exc: + raise UnknownError( + f"Received unknown error {exc} from the file {collected_bytes.file}" + ) from exc async def process_data(self, text: str) -> str: processed_data = text diff --git a/querent/ingestors/csv/csv_ingestor.py b/querent/ingestors/csv/csv_ingestor.py index bca617a1..a03a5d6d 100644 --- a/querent/ingestors/csv/csv_ingestor.py +++ b/querent/ingestors/csv/csv_ingestor.py @@ -7,6 +7,7 @@ from querent.ingestors.base_ingestor import BaseIngestor from querent.config.ingestor_config import IngestorBackend from querent.common.types.collected_bytes import CollectedBytes +from querent.common import common_errors from querent.common.types.ingested_tokens import IngestedTokens @@ -19,7 +20,7 @@ async def supports(self, file_extension: str) -> bool: async def create( self, file_extension: str, processors: List[AsyncProcessor] ) -> BaseIngestor: - if not self.supports(file_extension): + if not await self.supports(file_extension): return None return CsvIngestor(processors) @@ -69,8 +70,25 @@ async def extract_and_process_csv( yield processed_row async def extract_text_from_csv(self, collected_bytes: CollectedBytes) -> str: - text_data = collected_bytes.data.decode("utf-8") - return text_data + try: + text_data = collected_bytes.data.decode("utf-8") + return text_data + except UnicodeDecodeError as exc: + raise common_errors.UnicodeDecodeError( + f"Getting UnicodeDecodeError on this file {collected_bytes.file}" + ) from exc + except LookupError as exc: + raise common_errors.LookupError( + f"Getting LookupError on this file {collected_bytes.file}" + ) from exc + except TypeError as exc: + raise common_errors.TypeError( + f"Getting TypeError on this file {collected_bytes.file}" + ) from exc + except Exception as exc: + raise common_errors.UnknownError( + f"Getting error message:- {exc} on file {collected_bytes.file}" + ) async def process_data(self, text: str) -> str: processed_data = text diff --git a/querent/ingestors/doc/doc_ingestor.py b/querent/ingestors/doc/doc_ingestor.py index 1689b867..ce42c9e4 100644 --- a/querent/ingestors/doc/doc_ingestor.py +++ b/querent/ingestors/doc/doc_ingestor.py @@ -9,6 +9,7 @@ from querent.ingestors.base_ingestor import BaseIngestor from querent.config.ingestor_config import IngestorBackend from querent.common.types.collected_bytes import CollectedBytes +from querent.common import common_errors from querent.common.types.ingested_tokens import IngestedTokens import pytextract @@ -88,7 +89,9 @@ async def extract_text_from_doc(self, collected_bytes: CollectedBytes) -> str: current_doc_text = await self.temp_extract_from(collected_bytes) return current_doc_text else: - return "" + raise common_errors.UnknownError( + f"Not a doc or docx file {collected_bytes.file}" + ) async def temp_extract_from(self, collected_bytes: CollectedBytes) -> str: suffix = "." + collected_bytes.extension @@ -99,6 +102,26 @@ async def temp_extract_from(self, collected_bytes: CollectedBytes) -> str: try: txt = pytextract.process(temp_file_path).decode("utf-8") return txt + except RuntimeError as exc: + raise common_errors.RuntimeError( + f"Getting ExtractionError on this file {collected_bytes.file}" + ) from exc + except UnicodeDecodeError as exc: + raise common_errors.UnicodeDecodeError( + f"Getting UnicodeDecodeError on this file {collected_bytes.file}" + ) from exc + except LookupError as exc: + raise common_errors.LookupError( + f"Getting LookupError on this file {collected_bytes.file}" + ) from exc + except TypeError as exc: + raise common_errors.TypeError( + f"Getting TypeError on this file {collected_bytes.file}" + ) from exc + except pytextract.exceptions.ShellError as exc: + raise common_errors.ShellError( + f"Getting ShellError on this file {collected_bytes.file}" + ) from exc finally: os.remove(temp_file_path) diff --git a/querent/ingestors/html/html_ingestor.py b/querent/ingestors/html/html_ingestor.py index 387a1adb..996babe8 100644 --- a/querent/ingestors/html/html_ingestor.py +++ b/querent/ingestors/html/html_ingestor.py @@ -6,6 +6,7 @@ from querent.ingestors.base_ingestor import BaseIngestor from querent.config.ingestor_config import IngestorBackend from querent.common.types.collected_bytes import CollectedBytes +from querent.common import common_errors from querent.common.types.ingested_tokens import IngestedTokens @@ -72,18 +73,31 @@ async def extract_text_from_html( self, collected_bytes: CollectedBytes ) -> List[str]: """Function to extract text from xml""" - html_content = collected_bytes.data.decode("UTF-8") - soup = BeautifulSoup(html_content, "html.parser") - elements = [] - tags = ["p", "h1", "h2", "h3", "h4", "h5", "a", "footer", "article"] - for element in soup.find_all(tags): - if element.name == "a": - link_text = element.get_text().strip() - link_href = element.get("href") - elements.append(f"Link: {link_text}, URL: {link_href}") - else: - element_text = element.get_text().strip() - elements.append(element_text) + try: + html_content = collected_bytes.data.decode("UTF-8") + soup = BeautifulSoup(html_content, "html.parser") + elements = [] + tags = ["p", "h1", "h2", "h3", "h4", "h5", "a", "footer", "article"] + for element in soup.find_all(tags): + if element.name == "a": + link_text = element.get_text().strip() + link_href = element.get("href") + elements.append(f"Link: {link_text}, URL: {link_href}") + else: + element_text = element.get_text().strip() + elements.append(element_text) + except UnicodeDecodeError as exc: + raise common_errors.UnicodeDecodeError( + f"Getting UnicodeDecodeError on this file {collected_bytes.file}" + ) from exc + except LookupError as exc: + raise common_errors.LookupError( + f"Getting LookupError on this file {collected_bytes.file}" + ) from exc + except TypeError as exc: + raise common_errors.TypeError( + f"Getting TypeError on this file {collected_bytes.file}" + ) from exc return elements diff --git a/querent/ingestors/images/image_ingestor.py b/querent/ingestors/images/image_ingestor.py index 77704b9b..5b3d2f29 100644 --- a/querent/ingestors/images/image_ingestor.py +++ b/querent/ingestors/images/image_ingestor.py @@ -4,8 +4,14 @@ from querent.ingestors.ingestor_factory import IngestorFactory from querent.processors.async_processor import AsyncProcessor from querent.config.ingestor_config import IngestorBackend +from querent.processors.async_processor import AsyncProcessor +from querent.common.common_errors import ( + FileNotFoundError, + IOError, + UnidentifiedImageError, +) import pytesseract -from PIL import Image +from PIL import Image, UnidentifiedImageError import io from querent.common.types.ingested_tokens import IngestedTokens @@ -19,7 +25,7 @@ async def supports(self, file_extension: str) -> bool: async def create( self, file_extension: str, processors: List[AsyncProcessor] ) -> BaseIngestor: - if not self.supports(file_extension): + if not await self.supports(file_extension): return None return ImageIngestor(processors) @@ -65,7 +71,22 @@ async def extract_and_process_image(self, collected_bytes: CollectedBytes) -> st return await self.process_data(text) async def extract_text_from_image(self, collected_bytes: CollectedBytes) -> str: - image = Image.open(io.BytesIO(collected_bytes.data)) + try: + image = Image.open(io.BytesIO(collected_bytes.data)) + + except FileNotFoundError as exc: + raise FileNotFoundError( + f"Given file with name as {collected_bytes.file} not found" + ) from exc + except IOError as exc: + raise IOError( + f"Unable to open given file with name as {collected_bytes.file}" + ) from exc + except UnidentifiedImageError as exc: + raise UnidentifiedImageError( + f"Unable to open the given file with given name {collected_bytes.file}" + ) from exc + text = pytesseract.image_to_string(image) return text diff --git a/querent/ingestors/ingestor_errors.py b/querent/ingestors/ingestor_errors.py deleted file mode 100644 index 169f71f1..00000000 --- a/querent/ingestors/ingestor_errors.py +++ /dev/null @@ -1,9 +0,0 @@ -from enum import Enum - - -class IngestorError(Enum): - EOF = "End of File" - ETCD = "ETCD Error" - NETWORK = "Network Error" - TIMEOUT = "Timeout" - UNKNOWN = "Unknown Error" diff --git a/querent/ingestors/json/json_ingestor.py b/querent/ingestors/json/json_ingestor.py index 003bfcbe..85a4e67b 100644 --- a/querent/ingestors/json/json_ingestor.py +++ b/querent/ingestors/json/json_ingestor.py @@ -5,6 +5,7 @@ from querent.ingestors.base_ingestor import BaseIngestor from querent.ingestors.ingestor_factory import IngestorFactory from querent.processors.async_processor import AsyncProcessor +from querent.common import common_errors from querent.common.types.ingested_tokens import IngestedTokens @@ -76,9 +77,23 @@ async def extract_and_process_json( try: json_data = json.loads(collected_bytes.data.decode("utf-8")) return [json_data] if isinstance(json_data, dict) else [] + except json.JSONDecodeError as exc: + raise common_errors.JsonDecodeError( + f"Getting UnicodeDecodeError on this file {collected_bytes.file}" + ) from exc - except json.JSONDecodeError: - return [] + except UnicodeDecodeError as exc: + raise common_errors.UnicodeDecodeError( + f"Getting UnicodeDecodeError on this file {collected_bytes.file}" + ) from exc + except LookupError as exc: + raise common_errors.LookupError( + f"Getting LookupError on this file {collected_bytes.file}" + ) from exc + except TypeError as exc: + raise common_errors.TypeError( + f"Getting TypeError on this file {collected_bytes.file}" + ) from exc async def process_data(self, text: dict) -> dict: processed_data = text diff --git a/querent/ingestors/pdfs/pdf_ingestor_v1.py b/querent/ingestors/pdfs/pdf_ingestor_v1.py index 4d3ec319..cff9606d 100644 --- a/querent/ingestors/pdfs/pdf_ingestor_v1.py +++ b/querent/ingestors/pdfs/pdf_ingestor_v1.py @@ -6,6 +6,7 @@ from querent.ingestors.base_ingestor import BaseIngestor from querent.ingestors.ingestor_factory import IngestorFactory from querent.processors.async_processor import AsyncProcessor +from querent.common import common_errors class PdfIngestorFactory(IngestorFactory): @@ -57,15 +58,27 @@ async def ingest( yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}") finally: # process the last file - async for page_text in self.extract_and_process_pdf( - CollectedBytes(file=current_file, data=collected_bytes) - ): - yield page_text + try: + async for page_text in self.extract_and_process_pdf( + CollectedBytes(file=current_file, data=collected_bytes) + ): + yield page_text + except Exception as exc: + yield None async def extract_and_process_pdf( self, collected_bytes: CollectedBytes ) -> AsyncGenerator[IngestedTokens, None]: - pdf = fitz.open(stream=collected_bytes.data, filetype="pdf") + try: + pdf = fitz.open(stream=collected_bytes.data, filetype="pdf") + except fitz.DocumentError as exc: + raise common_errors.DocumentError( + f"Getting Document error on this file {collected_bytes.file}" + ) from exc + except TypeError as exc: + raise common_errors.TypeError( + f"Getting type error on this file {collected_bytes.file}" + ) from exc for page in pdf: text = page.get_text() if not text: diff --git a/querent/ingestors/ppt/ppt_ingestor.py b/querent/ingestors/ppt/ppt_ingestor.py index a72d403e..ac5f0eab 100644 --- a/querent/ingestors/ppt/ppt_ingestor.py +++ b/querent/ingestors/ppt/ppt_ingestor.py @@ -1,13 +1,16 @@ from typing import List, AsyncGenerator from io import BytesIO from pptx import Presentation +from pptx.exc import InvalidXmlError from tika import parser +from zipfile import BadZipFile from querent.ingestors.ingestor_factory import IngestorFactory from querent.processors.async_processor import AsyncProcessor from querent.ingestors.base_ingestor import BaseIngestor from querent.config.ingestor_config import IngestorBackend from querent.common.types.collected_bytes import CollectedBytes +from querent.common import common_errors from querent.common.types.ingested_tokens import IngestedTokens @@ -63,21 +66,38 @@ async def ingest( async def extract_and_process_ppt( self, collected_bytes: CollectedBytes ) -> AsyncGenerator[str, None]: - if collected_bytes.extension == "pptx": - ppt_file = BytesIO(collected_bytes.data) - presentation = Presentation(ppt_file) - for slide in presentation.slides: - text = [] - for shape in slide.shapes: - if hasattr(shape, "text"): - text.append(shape.text) - slide_text = "\n".join(text) - processed_slide_text = await self.process_data(slide_text) - yield processed_slide_text - elif collected_bytes.extension == "ppt": - parsed = parser.from_buffer(collected_bytes.data) - extracted_text = parsed["content"] - yield extracted_text + try: + if collected_bytes.extension == "pptx": + ppt_file = BytesIO(collected_bytes.data) + presentation = Presentation(ppt_file) + for slide in presentation.slides: + text = [] + for shape in slide.shapes: + if hasattr(shape, "text"): + text.append(shape.text) + slide_text = "\n".join(text) + processed_slide_text = await self.process_data(slide_text) + yield processed_slide_text + elif collected_bytes.extension == "ppt": + parsed = parser.from_buffer(collected_bytes.data) + extracted_text = parsed["content"] + yield extracted_text + else: + raise common_errors.WrongPptFileError( + f"Given file is not ppt {collected_bytes.file}" + ) + except InvalidXmlError as exc: + raise common_errors.InvalidXmlError( + f"The following file is not in proper xml format {collected_bytes.file}" + ) from exc + except BadZipFile as exc: + raise common_errors.BadZipFile( + f"The following file is not a zip file{collected_bytes.file}" + ) from exc + except Exception as exc: + raise common_errors.UnknownError( + f"Received Unknown error as {exc} from file {collected_bytes.file}" + ) from exc async def process_data(self, text: str) -> str: processed_data = text diff --git a/querent/ingestors/texts/text_ingestor.py b/querent/ingestors/texts/text_ingestor.py index ec4e4541..4d92ae19 100644 --- a/querent/ingestors/texts/text_ingestor.py +++ b/querent/ingestors/texts/text_ingestor.py @@ -4,6 +4,7 @@ from querent.ingestors.ingestor_factory import IngestorFactory from querent.processors.async_processor import AsyncProcessor from querent.config.ingestor_config import IngestorBackend +from querent.common import common_errors from querent.common.types.ingested_tokens import IngestedTokens @@ -74,7 +75,21 @@ async def extract_and_process_text( yield line async def extract_text_from_file(self, collected_bytes: CollectedBytes) -> str: - text = collected_bytes.data.decode("utf-8") + text = "" + try: + text = collected_bytes.data.decode("utf-8") + except UnicodeDecodeError as exc: + raise common_errors.UnicodeDecodeError( + f"Getting UnicodeDecodeError on this file {collected_bytes.file}" + ) from exc + except LookupError as exc: + raise common_errors.LookupError( + f"Getting LookupError on this file {collected_bytes.file}" + ) from exc + except TypeError as exc: + raise common_errors.TypeError( + f"Getting TypeError on this file {collected_bytes.file}" + ) from exc return text async def process_data(self, text: str) -> List[str]: diff --git a/tests/collector_tests/__init__.py b/tests/collector_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/sad_tests/__init__.py b/tests/sad_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/sad_tests/test_audio_error.py b/tests/sad_tests/test_audio_error.py new file mode 100644 index 00000000..0ecfb65c --- /dev/null +++ b/tests/sad_tests/test_audio_error.py @@ -0,0 +1,34 @@ +import asyncio +from pathlib import Path +from querent.collectors.fs.fs_collector import FSCollectorFactory +from querent.config.collector_config import FSCollectorConfig +from querent.common.uri import Uri +from querent.ingestors.ingestor_manager import IngestorFactoryManager +from querent.common import common_errors +import pytest + + +@pytest.mark.asyncio +async def test_collect_and_ingest_wrong_audio(): + collector_factory = FSCollectorFactory() + uri = Uri("file://" + str(Path("./tests/data/image/").resolve())) + config = FSCollectorConfig(root_path=uri.path) + collector = collector_factory.resolve(uri, config) + + ingestor_factory_manager = IngestorFactoryManager() + ingestor_factory = await ingestor_factory_manager.get_factory("mp3") + ingestor = await ingestor_factory.create("mp3", []) + + ingested_call = ingestor.ingest(collector.poll()) + + async def poll_and_print(): + with pytest.raises(common_errors.IndexErrorException): + async for ingested in ingested_call: + if ingested.data is None: + continue + + await poll_and_print() + + +if __name__ == "__main__": + asyncio.run(test_collect_and_ingest_wrong_audio()) diff --git a/tests/sad_tests/test_csv_error.py b/tests/sad_tests/test_csv_error.py new file mode 100644 index 00000000..60f0d275 --- /dev/null +++ b/tests/sad_tests/test_csv_error.py @@ -0,0 +1,34 @@ +import asyncio +from pathlib import Path +from querent.collectors.fs.fs_collector import FSCollectorFactory +from querent.config.collector_config import FSCollectorConfig +from querent.common.uri import Uri +from querent.ingestors.ingestor_manager import IngestorFactoryManager +from querent.common import common_errors +import pytest + + +@pytest.mark.asyncio +async def test_collect_and_ingest_wrong_csv(): + collector_factory = FSCollectorFactory() + uri = Uri("file://" + str(Path("./tests/data/image/").resolve())) + config = FSCollectorConfig(root_path=uri.path) + collector = collector_factory.resolve(uri, config) + + ingestor_factory_manager = IngestorFactoryManager() + ingestor_factory = await ingestor_factory_manager.get_factory("csv") + ingestor = await ingestor_factory.create("csv", []) + + ingested_call = ingestor.ingest(collector.poll()) + + async def poll_and_print(): + with pytest.raises(common_errors.UnicodeDecodeError): + async for ingested in ingested_call: + if ingested.data is None: + continue + + await poll_and_print() + + +if __name__ == "__main__": + asyncio.run(test_collect_and_ingest_wrong_csv()) diff --git a/tests/sad_tests/test_doc_ingestor.py b/tests/sad_tests/test_doc_ingestor.py new file mode 100644 index 00000000..96d18608 --- /dev/null +++ b/tests/sad_tests/test_doc_ingestor.py @@ -0,0 +1,35 @@ +import asyncio +from pathlib import Path +from querent.collectors.fs.fs_collector import FSCollectorFactory +from querent.config.collector_config import FSCollectorConfig +from querent.common.uri import Uri +from querent.ingestors.ingestor_manager import IngestorFactoryManager +from querent.common.common_errors import UnknownError +import pytest + + +@pytest.mark.asyncio +async def test_collect_and_ingest_wrong_doc(): + collector_factory = FSCollectorFactory() + uri = Uri("file://" + str(Path("./tests/data/audio/").resolve())) + config = FSCollectorConfig(root_path=uri.path) + collector = collector_factory.resolve(uri, config) + + ingestor_factory_manager = IngestorFactoryManager() + ingestor_factory = await ingestor_factory_manager.get_factory("docx") + ingestor = await ingestor_factory.create("docx", []) + + ingested_call = ingestor.ingest(collector.poll()) + + async def poll_and_print(): + with pytest.raises(UnknownError): + async for ingested in ingested_call: + print(ingested) + if ingested.data is None: + continue + + await poll_and_print() + + +if __name__ == "__main__": + asyncio.run(test_collect_and_ingest_wrong_doc()) diff --git a/tests/sad_tests/test_html_ingestor_error.py b/tests/sad_tests/test_html_ingestor_error.py new file mode 100644 index 00000000..563c1f28 --- /dev/null +++ b/tests/sad_tests/test_html_ingestor_error.py @@ -0,0 +1,34 @@ +import asyncio +from pathlib import Path +from querent.collectors.fs.fs_collector import FSCollectorFactory +from querent.config.collector_config import FSCollectorConfig +from querent.common.uri import Uri +from querent.ingestors.ingestor_manager import IngestorFactoryManager +from querent.common import common_errors +import pytest + + +@pytest.mark.asyncio +async def test_collect_and_ingest_wrong_html(): + collector_factory = FSCollectorFactory() + uri = Uri("file://" + str(Path("./tests/data/ppt/").resolve())) + config = FSCollectorConfig(root_path=uri.path) + collector = collector_factory.resolve(uri, config) + + ingestor_factory_manager = IngestorFactoryManager() + ingestor_factory = await ingestor_factory_manager.get_factory("html") + ingestor = await ingestor_factory.create("html", []) + + ingested_call = ingestor.ingest(collector.poll()) + + async def poll_and_print(): + with pytest.raises(common_errors.UnicodeDecodeError): + async for ingested in ingested_call: + if ingested.data is None: + continue + + await poll_and_print() + + +if __name__ == "__main__": + asyncio.run(test_collect_and_ingest_wrong_html()) diff --git a/tests/sad_tests/test_image_ingestor_error.py b/tests/sad_tests/test_image_ingestor_error.py new file mode 100644 index 00000000..98935963 --- /dev/null +++ b/tests/sad_tests/test_image_ingestor_error.py @@ -0,0 +1,38 @@ +import asyncio +from pathlib import Path +from querent.collectors.fs.fs_collector import FSCollectorFactory +from querent.config.collector_config import FSCollectorConfig +from querent.common.uri import Uri +from querent.ingestors.ingestor_manager import IngestorFactoryManager +import pytest + + +@pytest.mark.asyncio +async def test_collect_and_ingest_wrong_image_bytes(): + collector_factory = FSCollectorFactory() + uri = Uri("file://" + str(Path("./tests/data/html/").resolve())) + config = FSCollectorConfig(root_path=uri.path) + collector = collector_factory.resolve(uri, config) + + ingestor_factory_manager = IngestorFactoryManager() + ingestor_factory = await ingestor_factory_manager.get_factory("jpg") + ingestor = await ingestor_factory.create("jpg", []) + + ingested_call = ingestor.ingest(collector.poll()) + counter = 0 + + async def poll_and_print(): + counter = 0 + async for ingested in ingested_call: + if ingested.data is None: + continue + if len(ingested.data) != 0: + counter += 1 + # Counter would be zero as we are not able to open the given image file + assert counter == 0 + + await poll_and_print() + + +if __name__ == "__main__": + asyncio.run(test_collect_and_ingest_wrong_image_bytes()) diff --git a/tests/sad_tests/test_ppt_ingestor_error.py b/tests/sad_tests/test_ppt_ingestor_error.py new file mode 100644 index 00000000..69310787 --- /dev/null +++ b/tests/sad_tests/test_ppt_ingestor_error.py @@ -0,0 +1,37 @@ +import asyncio +from pathlib import Path +from querent.collectors.fs.fs_collector import FSCollectorFactory +from querent.config.collector_config import FSCollectorConfig +from querent.common.uri import Uri +from querent.ingestors.ingestor_manager import IngestorFactoryManager +from querent.common import common_errors +import pytest + + +@pytest.mark.asyncio +async def test_collect_and_ingest_ppt(): + # Set up the collector + collector_factory = FSCollectorFactory() + uri = Uri("file://" + str(Path("./tests/data/html/").resolve())) + config = FSCollectorConfig(root_path=uri.path) + collector = collector_factory.resolve(uri, config) + + # Set up the ingestor + ingestor_factory_manager = IngestorFactoryManager() + ingestor_factory = await ingestor_factory_manager.get_factory("ppt") + ingestor = await ingestor_factory.create("ppt", []) + + # Collect and ingest the PDF + ingested_call = ingestor.ingest(collector.poll()) + + async def poll_and_print(): + with pytest.raises(common_errors.UnknownError): + async for ingested in ingested_call: + if ingested.data is None: + continue + + await poll_and_print() + + +if __name__ == "__main__": + asyncio.run(test_collect_and_ingest_ppt()) diff --git a/tests/sad_tests/test_txt_ingestor_error.py b/tests/sad_tests/test_txt_ingestor_error.py new file mode 100644 index 00000000..1b66c4d7 --- /dev/null +++ b/tests/sad_tests/test_txt_ingestor_error.py @@ -0,0 +1,34 @@ +import asyncio +from pathlib import Path +from querent.collectors.fs.fs_collector import FSCollectorFactory +from querent.config.collector_config import FSCollectorConfig +from querent.common.uri import Uri +from querent.ingestors.ingestor_manager import IngestorFactoryManager +from querent.common import common_errors +import pytest + + +@pytest.mark.asyncio +async def test_collect_and_ingest_wrong_txt(): + collector_factory = FSCollectorFactory() + uri = Uri("file://" + str(Path("./tests/data/image/").resolve())) + config = FSCollectorConfig(root_path=uri.path) + collector = collector_factory.resolve(uri, config) + + ingestor_factory_manager = IngestorFactoryManager() + ingestor_factory = await ingestor_factory_manager.get_factory("html") + ingestor = await ingestor_factory.create("html", []) + + ingested_call = ingestor.ingest(collector.poll()) + + async def poll_and_print(): + with pytest.raises(common_errors.UnicodeDecodeError): + async for ingested in ingested_call: + if ingested.data is None: + continue + + await poll_and_print() + + +if __name__ == "__main__": + asyncio.run(test_collect_and_ingest_wrong_txt()) diff --git a/tests/test_aws_collector.py b/tests/test_aws_collector.py index f8558d3f..1275ae13 100644 --- a/tests/test_aws_collector.py +++ b/tests/test_aws_collector.py @@ -52,7 +52,7 @@ async def poll_and_print(): assert not result.is_error() chunk = result.unwrap() assert chunk is not None - if chunk is not "" or chunk is not None: + if chunk != "" or chunk is not None: counter += 1 assert counter == 5392 diff --git a/tests/test_gcs_collector.py b/tests/test_gcs_collector.py index d7a5de1f..7f47b26b 100644 --- a/tests/test_gcs_collector.py +++ b/tests/test_gcs_collector.py @@ -47,7 +47,7 @@ async def poll_and_print(): assert not result.is_error() chunk = result.unwrap() assert chunk is not None - if chunk is not "" or chunk is not None: + if chunk != "" or chunk is not None: counter += 1 assert counter == 797 diff --git a/tests/test_html_ingestor.py b/tests/test_html_ingestor.py index a1b33f88..79870e44 100644 --- a/tests/test_html_ingestor.py +++ b/tests/test_html_ingestor.py @@ -27,7 +27,6 @@ async def test_collect_and_ingest_html(): async def poll_and_print(): counter = 0 async for ingested in ingested_call: - assert ingested is not None if ingested != "" or ingested is not None: counter += 1 assert counter == 16 diff --git a/tests/test_local_storage.py b/tests/test_local_storage.py index 24bfe3f3..d013b6d0 100644 --- a/tests/test_local_storage.py +++ b/tests/test_local_storage.py @@ -18,7 +18,6 @@ def temp_dir(): temp_dir.cleanup() -@pytest.mark.asyncio def test_local_storage(temp_dir): uri = Uri("file://" + temp_dir) # Use the temp_dir as the base URI storage = LocalFileStorage(uri, Path(temp_dir)) # Provide the 'uri' argument only @@ -39,7 +38,6 @@ def test_local_storage(temp_dir): assert content == b"test" -@pytest.mark.asyncio def test_storage_resolver(temp_dir): uri = Uri("file://" + temp_dir) # Use the temp_dir as the base URI resolver = StorageResolver() diff --git a/tests/test_ppt_ingestor.py b/tests/test_ppt_ingestor.py index 93e97be5..945a06e4 100644 --- a/tests/test_ppt_ingestor.py +++ b/tests/test_ppt_ingestor.py @@ -27,7 +27,6 @@ async def test_collect_and_ingest_ppt(): async def poll_and_print(): counter = 0 async for ingested in ingested_call: - assert ingested is not None if ingested != "" or ingested is not None: counter += 1 assert counter == 10 diff --git a/tests/test_webscrapper.py b/tests/test_webscrapper.py index cef40428..452a80cd 100644 --- a/tests/test_webscrapper.py +++ b/tests/test_webscrapper.py @@ -29,12 +29,8 @@ def test_scrapping_data(): collector = resolver.resolve(uri, webscrapperConfig) assert collector is not None - print("REached here") - async def poll_and_print(): - print("Part 2") async for result in collector.poll(): - print("Hola...") assert not result.is_error() asyncio.run(poll_and_print()) diff --git a/tests/test_xml_ingestor.py b/tests/test_xml_ingestor.py index 055c2bf3..59c94795 100644 --- a/tests/test_xml_ingestor.py +++ b/tests/test_xml_ingestor.py @@ -27,7 +27,6 @@ async def test_collect_and_ingest_xml(): async def poll_and_print(): counter = 0 async for ingested in ingested_call: - assert ingested is not None if ingested != "" or ingested is not None: counter += 1 assert counter == 2