Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added pdf transform and tests #502

Merged
merged 15 commits into from
Aug 7, 2023
3 changes: 2 additions & 1 deletion src/autolabel/transforms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import logging

from .base import BaseTransform
from .pdf import PDFTransform
from typing import Dict, List
from autolabel.schema import TransformType

logger = logging.getLogger(__name__)

TRANSFORM_REGISTRY = {}
TRANSFORM_REGISTRY = {PDFTransform.name(): PDFTransform}
Tyrest marked this conversation as resolved.
Show resolved Hide resolved


class TransformFactory:
Expand Down
3 changes: 2 additions & 1 deletion src/autolabel/transforms/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ def __init__(self, output_columns: List[str]) -> None:
super().__init__()
self.output_columns = output_columns

@staticmethod
@abstractmethod
def name(self) -> str:
def name() -> str:
pass

@abstractmethod
Expand Down
46 changes: 46 additions & 0 deletions src/autolabel/transforms/pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from typing import List, Dict, Any

from langchain.document_loaders import PyPDFLoader

from autolabel.transforms import BaseTransform


class PDFTransform(BaseTransform):
    """Transform that reads a PDF from disk and emits its text and page count.

    The text of every page is prefixed with a formatted page header and the
    pages are joined with a configurable separator.
    """

    def __init__(
        self,
        output_columns: List[str],
        file_path_column: str,
        page_header: str = "Page {page_num}: ",
        page_sep: str = "\n\n",
    ) -> None:
        """Configure the transform.

        Args:
            output_columns: Names of the output columns, in the order
                [content_column, num_pages_column]. Both entries are
                required; ``transform`` indexes positions 0 and 1.
            file_path_column: Key of the input row that holds the PDF path.
            page_header: Template placed in front of each page's text. It is
                formatted with ``page_num`` (1-based).
            page_sep: String inserted between consecutive pages.
        """
        super().__init__(output_columns)
        self.file_path_column = file_path_column
        self.page_header = page_header
        self.page_sep = page_sep

    @staticmethod
    def name() -> str:
        """Return the key under which this transform is registered."""
        return "pdf"

    def transform(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """Convert the PDF referenced by ``row`` into a single text string.

        Args:
            row: The row of data to be transformed. It must contain the
                PDF path under ``self.file_path_column``.

        Returns:
            A dict with two entries: the joined page text under
            ``output_columns[0]`` and the number of pages under
            ``output_columns[1]``.
        """
        loader = PyPDFLoader(row[self.file_path_column])
        # Use load() rather than load_and_split(): the latter runs a text
        # splitter over the pages, so a long page would come back as several
        # chunks and both the page numbering and the page count would be off.
        # load() yields one document per PDF page.
        # NOTE(review): strip() keeps header/separator flush with the text;
        # confirm no caller relies on leading/trailing page whitespace.
        page_contents = [
            self.page_header.format(page_num=page_num) + page.page_content.strip()
            for page_num, page in enumerate(loader.load(), start=1)
        ]
        return {
            self.output_columns[0]: self.page_sep.join(page_contents),
            self.output_columns[1]: len(page_contents),
        }
Binary file added tests/assets/data_loading/Resume.pdf
Binary file not shown.
67 changes: 67 additions & 0 deletions tests/unit/data_loaders/test_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import pytest
from autolabel.transforms.pdf import PDFTransform


# Expected "content" output for tests/assets/data_loading/Resume.pdf when the
# transform is configured with page_header="Page {page_num}: " (hence the
# leading "Page 1: "). This string is compared for exact equality, so it must
# not be reformatted.
# NOTE(review): the irregular spacing inside some words (e.g. "di verse")
# presumably mirrors the raw PDF text extraction - confirm before "fixing" it.
RESUME_PDF_CONTENT = """Page 1: Functional Resume Sample

John W. Smith
2002 Front Range Way Fort Collins, CO 80525
[email protected]

Career Summary

Four years experience in early childhood development with a di verse background in the care of
special needs children and adults.

Adult Care Experience

• Determined work placement for 150 special needs adult clients.
• Maintained client databases and records.
• Coordinated client contact with local health care professionals on a monthly basis.
• Managed 25 volunteer workers.

Childcare Experience

• Coordinated service assignments for 20 part -time counselors and 100 client families.
• Oversaw daily activity and outing planning for 100 clients.
• Assisted families of special needs clients with researching financial assistance and
healthcare.
• Assisted teachers with managing daily classroom activities.
• Oversaw daily and special st udent activities.

Employment History
1999-2002 Counseling Supervisor, The Wesley Ce nter, Little Rock, Arkansas.
1997-1999 Client Specialist, Rainbow Special Ca re Center, Little Rock, Arkansas
1996-1997 Teacher’s Assistant, Cowell Elem entary, Conway, Arkansas

Education

University of Arkansas at Little Rock, Little Rock, AR

• BS in Early Childhood Development (1999)
• BA in Elementary Education (1998)
• GPA (4.0 Scale): Early Childhood Developm ent – 3.8, Elementary Education – 3.5,
Overall 3.4.
• Dean’s List, Chancellor’s List"""


def test_pdf_transform():
    """PDFTransform returns the sample resume's text and a page count of one."""
    pdf_transform = PDFTransform(
        output_columns=["content", "num_pages"],
        file_path_column="file_path",
        page_header="Page {page_num}: ",
        page_sep="\n\n",
    )

    # Run the transform on a single row that points at the bundled sample PDF.
    result = pdf_transform.transform(
        {"file_path": "tests/assets/data_loading/Resume.pdf"}
    )

    # Exactly the two configured output columns must be present.
    assert set(result.keys()) == {"content", "num_pages"}

    content = result["content"]
    num_pages = result["num_pages"]
    assert isinstance(content, str)
    assert isinstance(num_pages, int)
    assert num_pages == 1
    assert content == RESUME_PDF_CONTENT
Loading