Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added pdf transform and tests #502

Merged
merged 15 commits into from
Aug 7, 2023
Merged
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ all = [
"transformers >= 4.25.0",
"google-cloud-aiplatform>=1.25.0",
"cohere>=4.11.2",
"sentence_transformers"
"sentence_transformers",
"pdfplumber >= 0.10.2"
]

[project.urls]
Expand Down
1 change: 1 addition & 0 deletions src/autolabel/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,4 @@ class TransformType(str, Enum):
"""Enum containing all Transforms supported by autolabel"""

WEBPAGE_TRANSFORM = "webpage_transform"
PDF = "pdf"
3 changes: 2 additions & 1 deletion src/autolabel/transforms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import logging

from .base import BaseTransform
from .pdf import PDFTransform
from typing import Dict, List
from autolabel.schema import TransformType

logger = logging.getLogger(__name__)

# Module-level registry keyed by TransformType member, with the transform class as the value.
# NOTE(review): both assignments are present because this view interleaves the old and
# new lines of the change; executed top to bottom, the second assignment is the one that sticks.
TRANSFORM_REGISTRY = {}
TRANSFORM_REGISTRY = {TransformType.PDF: PDFTransform}


class TransformFactory:
Expand Down
3 changes: 2 additions & 1 deletion src/autolabel/transforms/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ def __init__(self, output_columns: List[str]) -> None:
super().__init__()
self.output_columns = output_columns

@staticmethod
@abstractmethod
def name(self) -> str:
def name() -> str:
pass

@abstractmethod
Expand Down
102 changes: 102 additions & 0 deletions src/autolabel/transforms/pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from typing import List, Dict, Any

from autolabel.transforms import BaseTransform


class PDFTransform(BaseTransform):
    """Transform that reads the PDF referenced by a row and emits its text.

    Text is extracted page by page, either through langchain's
    ``PDFPlumberLoader`` (default) or through OCR with ``pdf2image`` and
    ``pytesseract`` when ``ocr_enabled`` is set. Each page is prefixed with
    ``page_header`` and the pages are joined with ``page_sep``.
    """

    def __init__(
        self,
        output_columns: List[str],
        file_path_column: str,
        ocr_enabled: bool = False,
        page_header: str = "Page {page_num}: ",
        page_sep: str = "\n\n",
    ) -> None:
        """Configure the transform and import the backend it needs.

        The output columns for this class should be in the order:
        [content_column, num_pages_column]

        Args:
            output_columns (List[str]): Names of the produced columns, in the
                order described above.
            file_path_column (str): Key of the row entry holding the PDF path.
            ocr_enabled (bool): Use pdf2image + pytesseract instead of
                PDFPlumberLoader.
            page_header (str): Template prepended to every page; it is
                formatted with ``page_num`` (1-based).
            page_sep (str): Separator placed between consecutive pages.

        Raises:
            ImportError: If the libraries for the selected backend cannot be
                imported.
        """
        super().__init__(output_columns)
        self.file_path_column = file_path_column
        self.ocr_enabled = ocr_enabled
        self.page_header = page_header
        self.page_sep = page_sep

        # Backends are imported lazily so that only the dependencies of the
        # selected mode have to be installed.
        if self.ocr_enabled:
            try:
                from pdf2image import convert_from_path
                import pytesseract
            except ImportError as e:
                raise ImportError(
                    "pdf2image and pytesseract are required to use the pdf transform with ocr. Please install pdf2image and pytesseract with the following command: pip install pdf2image pytesseract"
                ) from e
            self.convert_from_path = convert_from_path
            self.pytesseract = pytesseract
        else:
            try:
                from langchain.document_loaders import PDFPlumberLoader
            except ImportError as e:
                raise ImportError(
                    "pdfplumber is required to use the pdf transform. Please install pdfplumber with the following command: pip install pdfplumber"
                ) from e
            self.PDFPlumberLoader = PDFPlumberLoader

    @staticmethod
    def name() -> str:
        """Return the identifier of this transform."""
        return "pdf"

    def extract_text(self, path: str) -> List[str]:
        """Extract text from a PDF file using langchain's PDFPlumberLoader.

        Args:
            path (str): The path to the PDF file.

        Returns:
            List[str]: The ``page_content`` of every document returned by the
                loader, in loader order.
        """
        loader = self.PDFPlumberLoader(path)
        return [doc.page_content for doc in loader.load()]

    def extract_text_ocr(self, path: str) -> List[str]:
        """Extract text from a PDF file using pdf2image and pytesseract.

        Args:
            path (str): The path to the PDF file.

        Returns:
            List[str]: A list of strings, one for each page image produced by
                ``convert_from_path``.

        Raises:
            ImportError: If pytesseract reports that the tesseract engine is
                not installed. Any other OCR failure propagates unchanged so
                that it is not misreported as a missing installation.
        """
        pages = self.convert_from_path(path)
        try:
            return [self.pytesseract.image_to_string(page) for page in pages]
        except self.pytesseract.TesseractNotFoundError as e:
            raise ImportError(
                "The tesseract engine is required to use the pdf transform with ocr. Please see https://tesseract-ocr.github.io/tessdoc/Installation.html for installation instructions."
            ) from e

    def transform(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """Convert the PDF referenced by ``row`` into a single text string.

        The path is read from ``row[self.file_path_column]``. Pages are
        extracted with OCR when ``ocr_enabled`` is set and with
        PDFPlumberLoader otherwise; each page is prefixed with the formatted
        page header and the pages are joined with ``page_sep``.

        Args:
            row (Dict[str, Any]): The row of data to be transformed.

        Returns:
            Dict[str, Any]: A mapping with the joined text under
                ``output_columns[0]`` and the number of pages under
                ``output_columns[1]``.
        """
        path = row[self.file_path_column]
        pages = self.extract_text_ocr(path) if self.ocr_enabled else self.extract_text(path)
        # Page numbers shown in the header are 1-based.
        page_contents = [
            self.page_header.format(page_num=page_num) + page
            for page_num, page in enumerate(pages, start=1)
        ]
        return {
            self.output_columns[0]: self.page_sep.join(page_contents),
            self.output_columns[1]: len(page_contents),
        }
Binary file added tests/assets/data_loading/Resume.pdf
Binary file not shown.
55 changes: 55 additions & 0 deletions tests/unit/data_loaders/test_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from autolabel.transforms.pdf import PDFTransform


# Text that test_pdf_transform expects in the "content" column for
# tests/assets/data_loading/Resume.pdf: one page, prefixed with the "Page 1: " header.
# The comparison is an exact string match, so whitespace and punctuation here matter.
RESUME_PDF_CONTENT = """Page 1: Functional Resume Sample
John W. Smith
2002 Front Range Way Fort Collins, CO 80525
[email protected]
Career Summary
Four years experience in early childhood development with a diverse background in the care of
special needs children and adults.
Adult Care Experience
• Determined work placement for 150 special needs adult clients.
• Maintained client databases and records.
• Coordinated client contact with local health care professionals on a monthly basis.
• Managed 25 volunteer workers.
Childcare Experience
• Coordinated service assignments for 20 part-time counselors and 100 client families.
• Oversaw daily activity and outing planning for 100 clients.
• Assisted families of special needs clients with researching financial assistance and
healthcare.
• Assisted teachers with managing daily classroom activities.
• Oversaw daily and special student activities.
Employment History
1999-2002 Counseling Supervisor, The Wesley Center, Little Rock, Arkansas.
1997-1999 Client Specialist, Rainbow Special Care Center, Little Rock, Arkansas
1996-1997 Teacher’s Assistant, Cowell Elementary, Conway, Arkansas
Education
University of Arkansas at Little Rock, Little Rock, AR
• BS in Early Childhood Development (1999)
• BA in Elementary Education (1998)
• GPA (4.0 Scale): Early Childhood Development – 3.8, Elementary Education – 3.5,
Overall 3.4.
• Dean’s List, Chancellor’s List"""


def test_pdf_transform():
    """Check the text and page count PDFTransform produces for the sample resume PDF."""
    # Default (non-OCR) mode, with the header and separator spelled out explicitly.
    pdf_transform = PDFTransform(
        output_columns=["content", "num_pages"],
        file_path_column="file_path",
        page_header="Page {page_num}: ",
        page_sep="\n\n",
    )

    # A row only needs the column that holds the path to the PDF.
    result = pdf_transform.transform(
        {"file_path": "tests/assets/data_loading/Resume.pdf"}
    )

    # Exactly the two configured output columns must come back.
    assert set(result) == {"content", "num_pages"}

    content = result["content"]
    num_pages = result["num_pages"]
    assert isinstance(content, str)
    assert isinstance(num_pages, int)
    assert num_pages == 1
    assert content == RESUME_PDF_CONTENT
Loading