Skip to content

Commit

Permalink
Pptx support (#296)
Browse files Browse the repository at this point in the history
Co-authored-by: Philip Meier <[email protected]>
  • Loading branch information
davidedigrande and pmeier authored Jan 28, 2024
1 parent 51fcda4 commit 06954f0
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 1 deletion.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ all = [
"pyarrow",
"pymupdf>=1.23.6",
"python-docx",
"python-pptx",
"tiktoken",
]

Expand Down Expand Up @@ -143,6 +144,7 @@ module = [
"param",
"pyarrow",
"docx",
"pptx",
"sentence_transformers",
]
ignore_missing_imports = true
Expand Down
2 changes: 2 additions & 0 deletions ragna/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"DocumentHandler",
"DocumentUploadParameters",
"DocxDocumentHandler",
"PptxDocumentHandler",
"EnvVarRequirement",
"LocalDocument",
"Message",
Expand Down Expand Up @@ -39,6 +40,7 @@
Page,
PdfDocumentHandler,
PlainTextDocumentHandler,
PptxDocumentHandler,
)

# isort: split
Expand Down
30 changes: 30 additions & 0 deletions ragna/core/_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,3 +301,33 @@ def extract_pages(self, document: Document) -> Iterator[Page]:
text = paragraph.text
if len(text) > 0:
yield Page(text=text)


@DOCUMENT_HANDLERS.load_if_available
class PptxDocumentHandler(DocumentHandler):
    """Document handler for `.pptx` documents.

    Every slide of the presentation is turned into one page whose text is
    made up of the non-empty texts of the slide's shapes.

    !!! info "Package requirements"

        - [`python-pptx`](https://github.com/scanny/python-pptx)
    """

    @classmethod
    def requirements(cls) -> list[Requirement]:
        """Return the requirements of this handler: the `python-pptx` package."""
        python_pptx = PackageRequirement("python-pptx")
        return [python_pptx]

    @classmethod
    def supported_suffixes(cls) -> list[str]:
        """Return the file suffixes this handler is responsible for."""
        suffixes = [".pptx"]
        return suffixes

    def extract_pages(self, document: Document) -> Iterator[Page]:
        """Yield one page per slide of the presentation.

        Args:
            document: Document whose content is read as the bytes of a
                `.pptx` file.

        Yields:
            A page per slide, numbered from 1 in slide order. The page text is
            the texts of all shapes on the slide that have a text frame and
            whose text is not empty, separated by a blank line.
        """
        # python-pptx is listed in requirements() and is imported only here,
        # at the point where pages are actually extracted.
        import pptx

        presentation = pptx.Presentation(io.BytesIO(document.read()))
        for slide_number, slide in enumerate(presentation.slides, start=1):
            shape_texts = []
            for shape in slide.shapes:
                # Shapes without a text frame carry no text to extract.
                if not shape.has_text_frame:
                    continue
                shape_text = shape.text
                if shape_text:
                    shape_texts.append(shape_text)
            yield Page(text="\n\n".join(shape_texts), number=slide_number)
24 changes: 23 additions & 1 deletion tests/core/test_document.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import docx
import pptx

from ragna.core import DocxDocumentHandler, LocalDocument
from ragna.core import DocxDocumentHandler, LocalDocument, PptxDocumentHandler


def get_docx_document(tmp_path, docx_text):
Expand All @@ -20,3 +21,24 @@ def test_docx(tmp_path):
assert len(pages) == 2
for page in pages:
assert page.text == docx_text


def get_pptx_document(tmp_path, pptx_text):
    """Write a two-slide presentation below *tmp_path* and load it as a document.

    The first slide carries *pptx_text* in its title, the second one in an
    additional text box. Both slides use the first slide layout of the default
    presentation.
    """
    presentation = pptx.Presentation()
    layout = presentation.slide_layouts[0]

    title_slide = presentation.slides.add_slide(layout)
    title_slide.shapes.title.text = pptx_text

    textbox_slide = presentation.slides.add_slide(layout)
    textbox = textbox_slide.shapes.add_textbox(0, 0, 100, 100)
    textbox.text = pptx_text

    target = tmp_path / "test_document.pptx"
    presentation.save(target)
    return LocalDocument.from_path(target)


def test_pptx(tmp_path):
    """A `.pptx` file is handled by the pptx handler and gives one page per slide."""
    expected_text = "ragna is neat!"
    document = get_pptx_document(tmp_path, expected_text)

    assert isinstance(document.handler, PptxDocumentHandler)

    # Both slides of the generated presentation contain only the expected text.
    page_texts = [page.text for page in document.extract_pages()]
    assert page_texts == [expected_text, expected_text]

0 comments on commit 06954f0

Please sign in to comment.