Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added html and xml ingestor #52

Merged
merged 5 commits into from
Sep 15, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion querent/ingestors/csv/csv_ingestor.py
Original file line number Diff line number Diff line change
@@ -77,7 +77,6 @@ async def extract_text_from_csv(
self, collected_bytes: CollectedBytes
) -> csv.reader:
text_data = collected_bytes.data.decode("utf-8")
print(text_data)
text = csv.reader(io.StringIO(text_data))
return text

1 change: 0 additions & 1 deletion querent/ingestors/doc/doc_ingestor.py
Original file line number Diff line number Diff line change
@@ -60,7 +60,6 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield text
pass

async def extract_and_process_doc(
self, collected_bytes: CollectedBytes
97 changes: 97 additions & 0 deletions querent/ingestors/html/html_ingestor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""Ingestor file for html"""
from typing import List, AsyncGenerator
from bs4 import BeautifulSoup

from querent.processors.async_processor import AsyncProcessor
from querent.ingestors.ingestor_factory import IngestorFactory
from querent.ingestors.base_ingestor import BaseIngestor
from querent.config.ingestor_config import IngestorBackend
from querent.common.types.collected_bytes import CollectedBytes


class HtmlIngestorFactory(IngestorFactory):
    """Factory producing :class:`HtmlIngestor` objects for html files."""

    SUPPORTED_EXTENSIONS = {"html"}

    async def supports(self, file_extension: str) -> bool:
        """Tell whether ``file_extension`` (case-insensitive) is handled here."""
        normalized = file_extension.lower()
        return normalized in self.SUPPORTED_EXTENSIONS

    async def create(
        self, file_extension: str, processors: List[AsyncProcessor]
    ) -> BaseIngestor:
        """Build an ingestor for ``file_extension``.

        Returns ``None`` when the extension is not supported, otherwise an
        ``HtmlIngestor`` configured with ``processors``.
        """
        is_supported = await self.supports(file_extension)
        if is_supported:
            return HtmlIngestor(processors)
        return None


class HtmlIngestor(BaseIngestor):
    """Ingestor that turns collected html bytes into processed text.

    Chunks coming from the collector are buffered per file. When the file
    name of the incoming chunks changes, the buffered markup is parsed with
    BeautifulSoup and the extracted text is run through the processors.
    """

    def __init__(self, processors: List[AsyncProcessor]):
        """Store the processors applied, in order, to every extracted text."""
        super().__init__(IngestorBackend.HTML)
        self.processors = processors

    async def ingest(
        self, poll_function: AsyncGenerator[CollectedBytes, None]
    ) -> AsyncGenerator[str, None]:
        """Buffer the chunks of each html file and yield its processed text.

        Args:
            poll_function: Async generator of ``CollectedBytes`` chunks. A
                change of ``chunk.file`` is treated as the end of the
                previous file. Chunks reporting an error are skipped.

        Yields:
            The processed content of each completed file. An empty string
            is yielded when an exception interrupts polling or processing.
        """
        current_file = None
        collected_bytes = b""
        try:
            async for chunk_bytes in poll_function:
                if chunk_bytes.is_error():
                    # TODO handle error
                    continue
                if current_file is None:
                    current_file = chunk_bytes.file
                elif current_file != chunk_bytes.file:
                    # we have a new file, process the old one
                    async for text in self.extract_and_process_html(
                        CollectedBytes(file=current_file, data=collected_bytes)
                    ):
                        yield text
                    collected_bytes = b""
                    current_file = chunk_bytes.file
                collected_bytes += chunk_bytes.data
        except Exception:
            # TODO handle exception
            yield ""

        # Process the last file. This deliberately lives outside a
        # ``finally`` clause: yielding from ``finally`` while the generator
        # is being closed raises RuntimeError. The guard avoids emitting a
        # result when no file was seen at all.
        if current_file is not None:
            async for text in self.extract_and_process_html(
                CollectedBytes(file=current_file, data=collected_bytes)
            ):
                yield text

    async def extract_and_process_html(
        self, collected_bytes: CollectedBytes
    ) -> AsyncGenerator[str, None]:
        """Extract the text of one html file and yield it after processing."""
        text = await self.extract_text_from_html(collected_bytes)
        processed_text = await self.process_data(text)
        yield processed_text

    async def extract_text_from_html(
        self, collected_bytes: CollectedBytes
    ) -> List[str]:
        """Return the stripped text of the content-bearing html elements.

        The bytes are decoded as UTF-8 and parsed with ``html.parser``. One
        list entry is produced per matching element, in document order.

        Raises:
            UnicodeDecodeError: If the data is not valid UTF-8.
        """
        html_content = collected_bytes.data.decode("UTF-8")
        soup = BeautifulSoup(html_content, "html.parser")
        # Standalone <a> elements are not part of the output; only the tags
        # listed below contribute an entry.
        tags = ["p", "h1", "h2", "h3", "h4", "h5", "footer", "article"]
        return [element.get_text().strip() for element in soup.find_all(tags)]

    async def process_data(self, text: List[str]) -> List[str]:
        """Pipe ``text`` through every processor and return the final result.

        With no processors configured the input is returned unchanged.
        """
        processed_data = text
        for processor in self.processors:
            processed_data = await processor.process(processed_data)
        return processed_data
4 changes: 4 additions & 0 deletions querent/ingestors/ingestor_manager.py
Original file line number Diff line number Diff line change
@@ -14,6 +14,8 @@
from querent.ingestors.csv.csv_ingestor import CsvIngestorFactory
from querent.ingestors.xlsx.xlsx_ingestor import XlsxIngestorFactory
from querent.ingestors.ppt.ppt_ingestor import PptIngestorFactory
from querent.ingestors.xml.xml_ingestor import XmlIngestorFactory
from querent.ingestors.html.html_ingestor import HtmlIngestorFactory


class IngestorFactoryManager:
@@ -34,6 +36,8 @@ def __init__(self):
IngestorBackend.XLSX.value: XlsxIngestorFactory(),
IngestorBackend.PPT.value: PptIngestorFactory(),
IngestorBackend.PPTX.value: PptIngestorFactory(),
IngestorBackend.XML.value: XmlIngestorFactory(),
IngestorBackend.HTML.value: HtmlIngestorFactory(),
# Ingestor.TEXT.value: TextIngestor(),
# Add more mappings as needed
}
85 changes: 85 additions & 0 deletions querent/ingestors/xml/xml_ingestor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""Ingestor file for xml"""
from typing import List, AsyncGenerator
import xml.etree.ElementTree as ET
from io import BytesIO

from querent.processors.async_processor import AsyncProcessor
from querent.ingestors.ingestor_factory import IngestorFactory
from querent.config.ingestor_config import IngestorBackend
from querent.ingestors.base_ingestor import BaseIngestor
from querent.common.types.collected_bytes import CollectedBytes


class XmlIngestorFactory(IngestorFactory):
    """Ingestor factory for xml files"""

    SUPPORTED_EXTENSIONS = {"xml"}

    async def supports(self, file_extension: str) -> bool:
        """Return True when ``file_extension`` (case-insensitive) is supported."""
        return file_extension.lower() in self.SUPPORTED_EXTENSIONS

    async def create(
        self, file_extension: str, processors: List[AsyncProcessor]
    ) -> BaseIngestor:
        """Create an ``XmlIngestor`` using ``processors``.

        Returns ``None`` when ``file_extension`` is not supported.
        """
        if not await self.supports(file_extension):
            return None
        return XmlIngestor(processors)


class XmlIngestor(BaseIngestor):
    """Ingestor that turns collected xml bytes into processed text.

    Chunks coming from the collector are buffered per file. When the file
    name of the incoming chunks changes, the buffered bytes are decoded and
    run through the processors.
    """

    def __init__(self, processors: List[AsyncProcessor]):
        """Store the processors applied, in order, to every extracted text."""
        super().__init__(IngestorBackend.XML)
        self.processors = processors

    async def ingest(
        self, poll_function: AsyncGenerator[CollectedBytes, None]
    ) -> AsyncGenerator[str, None]:
        """Buffer the chunks of each xml file and yield its processed text.

        Args:
            poll_function: Async generator of ``CollectedBytes`` chunks. A
                change of ``chunk.file`` is treated as the end of the
                previous file. Chunks reporting an error are skipped.

        Yields:
            The processed content of each completed file. An empty string
            is yielded when an exception interrupts polling or processing.
        """
        current_file = None
        collected_bytes = b""
        try:
            async for chunk_bytes in poll_function:
                if chunk_bytes.is_error():
                    # TODO handle error
                    continue
                if current_file is None:
                    current_file = chunk_bytes.file
                elif current_file != chunk_bytes.file:
                    # we have a new file, process the old one
                    async for text in self.extract_and_process_xml(
                        CollectedBytes(file=current_file, data=collected_bytes)
                    ):
                        yield text
                    collected_bytes = b""
                    current_file = chunk_bytes.file
                collected_bytes += chunk_bytes.data
        except Exception:
            # TODO handle exception
            yield ""

        # Process the last file. This deliberately lives outside a
        # ``finally`` clause: yielding from ``finally`` while the generator
        # is being closed raises RuntimeError. The guard avoids emitting a
        # result when no file was seen at all.
        if current_file is not None:
            async for text in self.extract_and_process_xml(
                CollectedBytes(file=current_file, data=collected_bytes)
            ):
                yield text

    async def extract_and_process_xml(
        self, collected_bytes: CollectedBytes
    ) -> AsyncGenerator[str, None]:
        """Extract the text of one xml file and yield it after processing."""
        text = await self.extract_text_from_xml(collected_bytes)
        processed_text = await self.process_data(text)
        yield processed_text

    async def extract_text_from_xml(self, collected_bytes: CollectedBytes) -> str:
        """Return the file content decoded as UTF-8; the markup is not parsed.

        Raises:
            UnicodeDecodeError: If the data is not valid UTF-8.
        """
        text = collected_bytes.data.decode("UTF-8")
        return text

    async def process_data(self, text: str) -> List[str]:
        """Pipe ``text`` through every processor and return the final result.

        With no processors configured the input is returned unchanged.
        """
        processed_data = text
        for processor in self.processors:
            processed_data = await processor.process(processed_data)
        return processed_data
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -165,3 +165,4 @@ pandas
python-pptx
tika
openpyxl
xml
53 changes: 53 additions & 0 deletions tests/data/html/gnn_and_cnn.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GNNs and CNNs</title>
</head>
<body>
<header>
<h1>Graph Neural Networks (GNNs)</h1>
</header>
<section>
<h2>Introduction</h2>
<p>Graph Neural Networks (GNNs) are a class of neural networks designed for handling graph-structured data.</p>
<p>They have gained popularity in various domains, including social networks, recommendation systems, and biology.</p>
</section>
<section>
<h2>Key Features</h2>
<ul>
<li>Graph Convolutional Layers</li>
<li>Node and Graph Classification</li>
<li>Message Passing</li>
</ul>
</section>
<section>
<h2>Learn More</h2>
<p>For more information on GNNs, visit the <a href="https://en.wikipedia.org/wiki/Graph_neural_network">Wikipedia page</a>.</p>
<img src="gnn_image.png" alt="Graph Neural Network Diagram">
</section>
<hr>
<header>
<h1>Convolutional Neural Networks (CNNs)</h1>
</header>
<section>
<h2>Introduction</h2>
<p>Convolutional Neural Networks (CNNs) are a class of deep learning models commonly used in computer vision tasks.</p>
<p>They are known for their ability to capture spatial patterns in data.</p>
</section>
<section>
<h2>Key Features</h2>
<ul>
<li>Convolutional Layers</li>
<li>Pooling Layers</li>
<li>Feature Maps</li>
</ul>
</section>
<section>
<h2>Learn More</h2>
<p>For more information on CNNs, visit the <a href="https://en.wikipedia.org/wiki/Convolutional_neural_network">Wikipedia page</a>.</p>
<img src="cnn_image.png" alt="Convolutional Neural Network Diagram">
</section>
</body>
</html>
210 changes: 210 additions & 0 deletions tests/data/xml/cd_catalog.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
<CATALOG>
<CD>
<TITLE>Empire Burlesque</TITLE>
<ARTIST>Bob Dylan</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>Columbia</COMPANY>
<PRICE>10.90</PRICE>
<YEAR>1985</YEAR>
</CD>
<CD>
<TITLE>Hide your heart</TITLE>
<ARTIST>Bonnie Tyler</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>CBS Records</COMPANY>
<PRICE>9.90</PRICE>
<YEAR>1988</YEAR>
</CD>
<CD>
<TITLE>Greatest Hits</TITLE>
<ARTIST>Dolly Parton</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>RCA</COMPANY>
<PRICE>9.90</PRICE>
<YEAR>1982</YEAR>
</CD>
<CD>
<TITLE>Still got the blues</TITLE>
<ARTIST>Gary Moore</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Virgin records</COMPANY>
<PRICE>10.20</PRICE>
<YEAR>1990</YEAR>
</CD>
<CD>
<TITLE>Eros</TITLE>
<ARTIST>Eros Ramazzotti</ARTIST>
<COUNTRY>EU</COUNTRY>
<COMPANY>BMG</COMPANY>
<PRICE>9.90</PRICE>
<YEAR>1997</YEAR>
</CD>
<CD>
<TITLE>One night only</TITLE>
<ARTIST>Bee Gees</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Polydor</COMPANY>
<PRICE>10.90</PRICE>
<YEAR>1998</YEAR>
</CD>
<CD>
<TITLE>Sylvias Mother</TITLE>
<ARTIST>Dr.Hook</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>CBS</COMPANY>
<PRICE>8.10</PRICE>
<YEAR>1973</YEAR>
</CD>
<CD>
<TITLE>Maggie May</TITLE>
<ARTIST>Rod Stewart</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Pickwick</COMPANY>
<PRICE>8.50</PRICE>
<YEAR>1990</YEAR>
</CD>
<CD>
<TITLE>Romanza</TITLE>
<ARTIST>Andrea Bocelli</ARTIST>
<COUNTRY>EU</COUNTRY>
<COMPANY>Polydor</COMPANY>
<PRICE>10.80</PRICE>
<YEAR>1996</YEAR>
</CD>
<CD>
<TITLE>When a man loves a woman</TITLE>
<ARTIST>Percy Sledge</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>Atlantic</COMPANY>
<PRICE>8.70</PRICE>
<YEAR>1987</YEAR>
</CD>
<CD>
<TITLE>Black angel</TITLE>
<ARTIST>Savage Rose</ARTIST>
<COUNTRY>EU</COUNTRY>
<COMPANY>Mega</COMPANY>
<PRICE>10.90</PRICE>
<YEAR>1995</YEAR>
</CD>
<CD>
<TITLE>1999 Grammy Nominees</TITLE>
<ARTIST>Many</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>Grammy</COMPANY>
<PRICE>10.20</PRICE>
<YEAR>1999</YEAR>
</CD>
<CD>
<TITLE>For the good times</TITLE>
<ARTIST>Kenny Rogers</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Mucik Master</COMPANY>
<PRICE>8.70</PRICE>
<YEAR>1995</YEAR>
</CD>
<CD>
<TITLE>Big Willie style</TITLE>
<ARTIST>Will Smith</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>Columbia</COMPANY>
<PRICE>9.90</PRICE>
<YEAR>1997</YEAR>
</CD>
<CD>
<TITLE>Tupelo Honey</TITLE>
<ARTIST>Van Morrison</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Polydor</COMPANY>
<PRICE>8.20</PRICE>
<YEAR>1971</YEAR>
</CD>
<CD>
<TITLE>Soulsville</TITLE>
<ARTIST>Jorn Hoel</ARTIST>
<COUNTRY>Norway</COUNTRY>
<COMPANY>WEA</COMPANY>
<PRICE>7.90</PRICE>
<YEAR>1996</YEAR>
</CD>
<CD>
<TITLE>The very best of</TITLE>
<ARTIST>Cat Stevens</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Island</COMPANY>
<PRICE>8.90</PRICE>
<YEAR>1990</YEAR>
</CD>
<CD>
<TITLE>Stop</TITLE>
<ARTIST>Sam Brown</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>A and M</COMPANY>
<PRICE>8.90</PRICE>
<YEAR>1988</YEAR>
</CD>
<CD>
<TITLE>Bridge of Spies</TITLE>
<ARTIST>T'Pau</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Siren</COMPANY>
<PRICE>7.90</PRICE>
<YEAR>1987</YEAR>
</CD>
<CD>
<TITLE>Private Dancer</TITLE>
<ARTIST>Tina Turner</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Capitol</COMPANY>
<PRICE>8.90</PRICE>
<YEAR>1983</YEAR>
</CD>
<CD>
<TITLE>Midt om natten</TITLE>
<ARTIST>Kim Larsen</ARTIST>
<COUNTRY>EU</COUNTRY>
<COMPANY>Medley</COMPANY>
<PRICE>7.80</PRICE>
<YEAR>1983</YEAR>
</CD>
<CD>
<TITLE>Pavarotti Gala Concert</TITLE>
<ARTIST>Luciano Pavarotti</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>DECCA</COMPANY>
<PRICE>9.90</PRICE>
<YEAR>1991</YEAR>
</CD>
<CD>
<TITLE>The dock of the bay</TITLE>
<ARTIST>Otis Redding</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>Stax Records</COMPANY>
<PRICE>7.90</PRICE>
<YEAR>1968</YEAR>
</CD>
<CD>
<TITLE>Picture book</TITLE>
<ARTIST>Simply Red</ARTIST>
<COUNTRY>EU</COUNTRY>
<COMPANY>Elektra</COMPANY>
<PRICE>7.20</PRICE>
<YEAR>1985</YEAR>
</CD>
<CD>
<TITLE>Red</TITLE>
<ARTIST>The Communards</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>London</COMPANY>
<PRICE>7.80</PRICE>
<YEAR>1987</YEAR>
</CD>
<CD>
<TITLE>Unchain my heart</TITLE>
<ARTIST>Joe Cocker</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>EMI</COMPANY>
<PRICE>8.20</PRICE>
<YEAR>1987</YEAR>
</CD>
</CATALOG>
6 changes: 6 additions & 0 deletions tests/data/xml/notes.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<note>
<to>My friend</to>
<from>Me</from>
<heading>Reminder</heading>
<body>Don't forget me this weekend!</body>
</note>
39 changes: 39 additions & 0 deletions tests/test_html_ingestor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import asyncio
from pathlib import Path
from querent.collectors.fs.fs_collector import FSCollectorFactory
from querent.config.collector_config import FSCollectorConfig
from querent.common.uri import Uri
from querent.ingestors.ingestor_manager import IngestorFactoryManager
import pytest


@pytest.mark.asyncio
async def test_collect_and_ingest_xml():
    """Collect the html fixtures from disk and check one document is ingested."""
    # NOTE(review): the name says "xml" although the html ingestor is
    # exercised here; it is kept so the test id does not change.

    # Set up the collector
    collector_factory = FSCollectorFactory()
    uri = Uri("file://" + str(Path("./tests/data/html/").resolve()))
    config = FSCollectorConfig(root_path=uri.path)
    collector = collector_factory.resolve(uri, config)

    # Set up the ingestor
    ingestor_factory_manager = IngestorFactoryManager()
    ingestor_factory = await ingestor_factory_manager.get_factory("html")
    ingestor = await ingestor_factory.create("html", [])

    # Collect and ingest the html fixtures
    ingested_call = ingestor.ingest(collector.poll())

    async def poll_and_print():
        counter = 0
        async for ingested in ingested_call:
            assert ingested is not None
            # The ingestor yields "" when it hits an exception; such
            # entries must not count as an ingested document.
            if ingested != "":
                counter += 1
        assert counter == 1

    await poll_and_print()


if __name__ == "__main__":
    asyncio.run(test_collect_and_ingest_xml())
39 changes: 39 additions & 0 deletions tests/test_xml_ingestor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import asyncio
from pathlib import Path
from querent.collectors.fs.fs_collector import FSCollectorFactory
from querent.config.collector_config import FSCollectorConfig
from querent.common.uri import Uri
from querent.ingestors.ingestor_manager import IngestorFactoryManager
import pytest


@pytest.mark.asyncio
async def test_collect_and_ingest_xml():
    """Collect the xml fixtures from disk and check two documents are ingested."""
    # Set up the collector
    collector_factory = FSCollectorFactory()
    uri = Uri("file://" + str(Path("./tests/data/xml/").resolve()))
    config = FSCollectorConfig(root_path=uri.path)
    collector = collector_factory.resolve(uri, config)

    # Set up the ingestor
    ingestor_factory_manager = IngestorFactoryManager()
    ingestor_factory = await ingestor_factory_manager.get_factory("xml")
    ingestor = await ingestor_factory.create("xml", [])

    # Collect and ingest the xml fixtures
    ingested_call = ingestor.ingest(collector.poll())

    async def poll_and_print():
        counter = 0
        async for ingested in ingested_call:
            assert ingested is not None
            # The ingestor yields "" when it hits an exception; such
            # entries must not count as an ingested document.
            if ingested != "":
                counter += 1
        assert counter == 2

    await poll_and_print()


if __name__ == "__main__":
    asyncio.run(test_collect_and_ingest_xml())