refuel-ai · Tyrest · Aug 7, 2023 · Aug 1, 2023 · Aug 1, 2023 · Aug 1, 2023
diff --git a/pyproject.toml b/pyproject.toml
@@ -90,7 +90,8 @@ all = [
     "transformers >= 4.25.0",
     "google-cloud-aiplatform>=1.25.0",
     "cohere>=4.11.2",
-    "sentence_transformers"
+    "sentence_transformers",
+    "pdfplumber >= 0.10.2"
 ]
 
 [project.urls]

diff --git a/src/autolabel/schema.py b/src/autolabel/schema.py
@@ -218,3 +218,4 @@ class TransformType(str, Enum):
     """Enum containing all Transforms supported by autolabel"""
 
     WEBPAGE_TRANSFORM = "webpage_transform"
+    PDF = "pdf"
diff --git a/src/autolabel/transforms/__init__.py b/src/autolabel/transforms/__init__.py
@@ -1,12 +1,13 @@
 import logging
 
 from .base import BaseTransform
+from .pdf import PDFTransform
 from typing import Dict, List
 from autolabel.schema import TransformType
 
 logger = logging.getLogger(__name__)
 
-TRANSFORM_REGISTRY = {}
+TRANSFORM_REGISTRY = {TransformType.PDF: PDFTransform}
 
 
 class TransformFactory:

diff --git a/src/autolabel/transforms/base.py b/src/autolabel/transforms/base.py
@@ -7,8 +7,9 @@ def __init__(self, output_columns: List[str]) -> None:
         super().__init__()
         self.output_columns = output_columns
 
+    @staticmethod
     @abstractmethod
-    def name(self) -> str:
+    def name() -> str:
         pass
 
     @abstractmethod

diff --git a/src/autolabel/transforms/pdf.py b/src/autolabel/transforms/pdf.py
@@ -0,0 +1,128 @@
+from typing import List, Dict, Any
+
+from autolabel.transforms.base import BaseTransform
+
+
+class PDFTransform(BaseTransform):
+    """Transform that reads a PDF file and returns its text and page count.
+
+    Text is extracted page by page, either through langchain's PDFPlumberLoader
+    or, when ``ocr_enabled`` is set, by converting the file with pdf2image and
+    running pytesseract on each resulting page image.
+    """
+
+    def __init__(
+        self,
+        output_columns: List[str],
+        file_path_column: str,
+        ocr_enabled: bool = False,
+        page_header: str = "Page {page_num}: ",
+        page_sep: str = "\n\n",
+    ) -> None:
+        """Store the configuration and import the selected extraction backend.
+
+        The output columns for this class should be in the order: [content_column, num_pages_column]
+
+        Args:
+            output_columns (List[str]): Names of the content and page-count columns, in that order.
+            file_path_column (str): Key of the row entry that holds the path to the PDF file.
+            ocr_enabled (bool): If True, use pdf2image and pytesseract instead of PDFPlumberLoader.
+            page_header (str): Template prepended to every page; ``{page_num}`` becomes the 1-based page number.
+            page_sep (str): String placed between consecutive pages in the output text.
+
+        Raises:
+            ImportError: If the libraries needed by the selected backend cannot be imported.
+        """
+        super().__init__(output_columns)
+        self.file_path_column = file_path_column
+        self.ocr_enabled = ocr_enabled
+        self.page_header = page_header
+        self.page_sep = page_sep
+
+        # The backends are optional dependencies, so they are imported here and kept
+        # on the instance instead of being imported at module load time.
+        if self.ocr_enabled:
+            try:
+                from pdf2image import convert_from_path
+                import pytesseract
+            except ImportError as e:
+                raise ImportError(
+                    "pdf2image and pytesseract are required to use the pdf transform with ocr. Please install pdf2image and pytesseract with the following command: pip install pdf2image pytesseract"
+                ) from e
+
+            self.convert_from_path = convert_from_path
+            self.pytesseract = pytesseract
+        else:
+            try:
+                from langchain.document_loaders import PDFPlumberLoader
+            except ImportError as e:
+                raise ImportError(
+                    "pdfplumber is required to use the pdf transform. Please install pdfplumber with the following command: pip install pdfplumber"
+                ) from e
+
+            self.PDFPlumberLoader = PDFPlumberLoader
+
+    @staticmethod
+    def name() -> str:
+        """Return the identifier of this transform."""
+        return "pdf"
+
+    def extract_text(self, path: str) -> List[str]:
+        """
+        This function extracts text from a PDF file using the pdfplumber library.
+
+        Args:
+            path (str): The path to the PDF file.
+
+        Returns:
+            List[str]: A list of strings, each index containing the extracted text from each page of the PDF file.
+        """
+        loader = self.PDFPlumberLoader(path)
+        return [doc.page_content for doc in loader.load()]
+
+    def extract_text_ocr(self, path: str) -> List[str]:
+        """This function extracts text from a PDF file using the pdf2image and pytesseract libraries.
+
+        Args:
+            path (str): The path to the PDF file.
+
+        Returns:
+            List[str]: A list of strings, one for each page of the PDF file.
+
+        Raises:
+            ImportError: If pytesseract cannot find the tesseract engine.
+        """
+        pages = self.convert_from_path(path)
+        try:
+            return [self.pytesseract.image_to_string(page) for page in pages]
+        except self.pytesseract.TesseractNotFoundError as e:
+            # Only a missing tesseract binary is an installation problem; any other
+            # failure propagates unchanged instead of being reported as one.
+            raise ImportError(
+                "The tesseract engine is required to use the pdf transform with ocr. Please see https://tesseract-ocr.github.io/tessdoc/Installation.html for installation instructions."
+            ) from e
+
+    def transform(self, row: Dict[str, Any]) -> Dict[str, Any]:
+        """This function transforms a PDF file into a string of text. The pages are extracted with OCR when
+        ocr_enabled is set and with pdfplumber otherwise. Each page is prefixed with the page header and the
+        pages are joined with the page separator.
+
+        Args:
+            row (Dict[str, Any]): The row of data to be transformed.
+
+        Returns:
+            Dict[str, Any]: The text under the first output column and the number of pages under the second.
+        """
+        path = row[self.file_path_column]
+        if self.ocr_enabled:
+            pages = self.extract_text_ocr(path)
+        else:
+            pages = self.extract_text(path)
+        page_contents = [
+            self.page_header.format(page_num=page_num) + page
+            for page_num, page in enumerate(pages, start=1)
+        ]
+        return {
+            self.output_columns[0]: self.page_sep.join(page_contents),
+            self.output_columns[1]: len(page_contents),
+        }
diff --git a/tests/assets/data_loading/Resume.pdf b/tests/assets/data_loading/Resume.pdf
diff --git a/tests/unit/data_loaders/test_transform.py b/tests/unit/data_loaders/test_transform.py
@@ -0,0 +1,55 @@
+from autolabel.transforms.pdf import PDFTransform
+
+
+RESUME_PDF_CONTENT = """Page 1: Functional Resume Sample
+John W. Smith
+2002 Front Range Way Fort Collins, CO 80525
+jwsmith@colostate.edu
+Career Summary
+Four years experience in early childhood development with a diverse background in the care of
+special needs children and adults.
+Adult Care Experience
+• Determined work placement for 150 special needs adult clients.
+• Maintained client databases and records.
+• Coordinated client contact with local health care professionals on a monthly basis.
+• Managed 25 volunteer workers.
+Childcare Experience
+• Coordinated service assignments for 20 part-time counselors and 100 client families.
+• Oversaw daily activity and outing planning for 100 clients.
+• Assisted families of special needs clients with researching financial assistance and
+healthcare.
+• Assisted teachers with managing daily classroom activities.
+• Oversaw daily and special student activities.
+Employment History
+1999-2002 Counseling Supervisor, The Wesley Center, Little Rock, Arkansas.
+1997-1999 Client Specialist, Rainbow Special Care Center, Little Rock, Arkansas
+1996-1997 Teacher’s Assistant, Cowell Elementary, Conway, Arkansas
+Education
+University of Arkansas at Little Rock, Little Rock, AR
+• BS in Early Childhood Development (1999)
+• BA in Elementary Education (1998)
+• GPA (4.0 Scale): Early Childhood Development – 3.8, Elementary Education – 3.5,
+Overall 3.4.
+• Dean’s List, Chancellor’s List"""
+
+
+def test_pdf_transform():
+    """Run the non-OCR PDF transform on the bundled resume and check both output columns."""
+    pdf_transform = PDFTransform(
+        output_columns=["content", "num_pages"],
+        file_path_column="file_path",
+        page_header="Page {page_num}: ",
+        page_sep="\n\n",
+    )
+
+    # A row only needs the column that holds the path of the PDF to read.
+    input_row = {"file_path": "tests/assets/data_loading/Resume.pdf"}
+    result = pdf_transform.transform(input_row)
+
+    # Exactly the two configured output columns come back, with the expected types.
+    assert set(result) == {"content", "num_pages"}
+    content, num_pages = result["content"], result["num_pages"]
+    assert isinstance(content, str)
+    assert isinstance(num_pages, int)
+    assert num_pages == 1
+    assert content == RESUME_PDF_CONTENT