Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added pdf transform and tests #502

Merged
merged 15 commits into from
Aug 7, 2023
Merged
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ all = [
"transformers >= 4.25.0",
"google-cloud-aiplatform>=1.25.0",
"cohere>=4.11.2",
"sentence_transformers"
"sentence_transformers",
"pypdf >= 3.14.0"
Tyrest marked this conversation as resolved.
Show resolved Hide resolved
]

[project.urls]
Expand Down
1 change: 1 addition & 0 deletions src/autolabel/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,4 @@ class TransformType(str, Enum):
"""Enum containing all Transforms supported by autolabel"""

WEBPAGE_TRANSFORM = "webpage_transform"
PDF = "pdf"
3 changes: 2 additions & 1 deletion src/autolabel/transforms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import logging

from .base import BaseTransform
from .pdf import PDFTransform
from typing import Dict, List
from autolabel.schema import TransformType

logger = logging.getLogger(__name__)

TRANSFORM_REGISTRY = {}
TRANSFORM_REGISTRY = {TransformType.PDF: PDFTransform}


class TransformFactory:
Expand Down
3 changes: 2 additions & 1 deletion src/autolabel/transforms/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ def __init__(self, output_columns: List[str]) -> None:
super().__init__()
self.output_columns = output_columns

@staticmethod
@abstractmethod
def name(self) -> str:
def name() -> str:
pass

@abstractmethod
Expand Down
68 changes: 68 additions & 0 deletions src/autolabel/transforms/pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from typing import List, Dict, Any

from pdf2image import convert_from_path
Tyrest marked this conversation as resolved.
Show resolved Hide resolved
import pytesseract

from autolabel.transforms import BaseTransform


class PDFTransform(BaseTransform):
    """Transform that turns a PDF file referenced by a row into text via OCR.

    Pages are obtained with ``pdf2image.convert_from_path`` and each page is
    passed to ``pytesseract.image_to_string``. The per-page texts are then
    prefixed with a header, joined into one string, and returned together
    with the number of pages.
    """

    def __init__(
        self,
        output_columns: List[str],
        file_path_column: str,
        page_header: str = "Page {page_num}: ",
        page_sep: str = "\n\n",
    ) -> None:
        """Configure the transform.

        Args:
            output_columns (List[str]): Names of the output columns, in the
                order ``[content_column, num_pages_column]``.
            file_path_column (str): Key of the input row that holds the path
                to the PDF file.
            page_header (str): Template prepended to every page's text. It is
                formatted with ``page_num`` (1-based).
            page_sep (str): Separator placed between consecutive pages.
        """
        super().__init__(output_columns)
        self.file_path_column = file_path_column
        self.page_header = page_header
        self.page_sep = page_sep

    @staticmethod
    def name() -> str:
        """Return the identifier of this transform."""
        return "pdf"

    @staticmethod
    def extract_text(path: str) -> List[str]:
        """Extract text from a PDF file using pdf2image and pytesseract.

        Args:
            path (str): The path to the PDF file.

        Returns:
            List[str]: A list of strings, one for each page of the PDF file.
        """
        return [
            pytesseract.image_to_string(page) for page in convert_from_path(path)
        ]

    def transform(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """Transform the PDF file referenced by ``row`` into a single string.

        The text of every page (see ``extract_text``) is prefixed with
        ``page_header`` and the pages are joined with ``page_sep``.

        Args:
            row (Dict[str, Any]): The row of data to be transformed. It must
                contain ``self.file_path_column``.

        Returns:
            Dict[str, Any]: A mapping with two entries: ``output_columns[0]``
            holds the combined text and ``output_columns[1]`` holds the
            number of pages.
        """
        # NOTE: no langchain/pypdf import guard here. Extraction relies only on
        # pdf2image and pytesseract, which are imported at module level.
        pages = self.extract_text(row[self.file_path_column])
        page_contents = [
            self.page_header.format(page_num=page_num) + text
            for page_num, text in enumerate(pages, start=1)
        ]
        return {
            self.output_columns[0]: self.page_sep.join(page_contents),
            self.output_columns[1]: len(page_contents),
        }
Binary file added tests/assets/data_loading/Resume.pdf
Binary file not shown.
66 changes: 66 additions & 0 deletions tests/unit/data_loaders/test_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from autolabel.transforms.pdf import PDFTransform


RESUME_PDF_CONTENT = """Page 1: Functional Resume Sample

John W. Smith
2002 Front Range Way Fort Collins, CO 80525
[email protected]

Career Summary

Four years experience in early childhood development with a di verse background in the care of
special needs children and adults.

Adult Care Experience

• Determined work placement for 150 special needs adult clients.
• Maintained client databases and records.
• Coordinated client contact with local health care professionals on a monthly basis.
• Managed 25 volunteer workers.

Childcare Experience

• Coordinated service assignments for 20 part -time counselors and 100 client families.
• Oversaw daily activity and outing planning for 100 clients.
• Assisted families of special needs clients with researching financial assistance and
healthcare.
• Assisted teachers with managing daily classroom activities.
• Oversaw daily and special st udent activities.

Employment History
1999-2002 Counseling Supervisor, The Wesley Ce nter, Little Rock, Arkansas.
1997-1999 Client Specialist, Rainbow Special Ca re Center, Little Rock, Arkansas
1996-1997 Teacher’s Assistant, Cowell Elem entary, Conway, Arkansas

Education

University of Arkansas at Little Rock, Little Rock, AR

• BS in Early Childhood Development (1999)
• BA in Elementary Education (1998)
• GPA (4.0 Scale): Early Childhood Developm ent – 3.8, Elementary Education – 3.5,
Overall 3.4.
• Dean’s List, Chancellor’s List"""


def test_pdf_transform():
    """Check that PDFTransform yields the expected text and page count for the sample resume."""
    # Build the transform with explicit header/separator settings.
    pdf_transform = PDFTransform(
        file_path_column="file_path",
        output_columns=["content", "num_pages"],
        page_sep="\n\n",
        page_header="Page {page_num}: ",
    )

    # Run it on a single row pointing at the PDF fixture.
    result = pdf_transform.transform(
        {"file_path": "tests/assets/data_loading/Resume.pdf"}
    )

    # Exactly the two configured output columns must be present.
    assert set(result.keys()) == {"content", "num_pages"}

    content = result["content"]
    num_pages = result["num_pages"]
    assert isinstance(content, str)
    assert isinstance(num_pages, int)
    assert num_pages == 1
    assert content == RESUME_PDF_CONTENT
Loading