refuel-ai · Tyrest · Aug 7, 2023 · Aug 1, 2023 · Aug 1, 2023 · Aug 1, 2023
diff --git a/pyproject.toml b/pyproject.toml
@@ -90,7 +90,8 @@ all = [
     "transformers >= 4.25.0",
     "google-cloud-aiplatform>=1.25.0",
     "cohere>=4.11.2",
-    "sentence_transformers"
+    "sentence_transformers",
+    "pypdf >= 3.14.0"
 ]
 
 [project.urls]

diff --git a/src/autolabel/schema.py b/src/autolabel/schema.py
@@ -218,3 +218,4 @@ class TransformType(str, Enum):
     """Enum containing all Transforms supported by autolabel"""
 
     WEBPAGE_TRANSFORM = "webpage_transform"
+    PDF = "pdf"
diff --git a/src/autolabel/transforms/__init__.py b/src/autolabel/transforms/__init__.py
@@ -1,12 +1,13 @@
 import logging
 
 from .base import BaseTransform
+from .pdf import PDFTransform
 from typing import Dict, List
 from autolabel.schema import TransformType
 
 logger = logging.getLogger(__name__)
 
-TRANSFORM_REGISTRY = {}
+TRANSFORM_REGISTRY = {TransformType.PDF: PDFTransform}
 
 
 class TransformFactory:

diff --git a/src/autolabel/transforms/base.py b/src/autolabel/transforms/base.py
@@ -7,8 +7,9 @@ def __init__(self, output_columns: List[str]) -> None:
         super().__init__()
         self.output_columns = output_columns
 
+    @staticmethod
     @abstractmethod
-    def name(self) -> str:
+    def name() -> str:
+        """Return the string name of the transform; subclasses must override."""
         pass
 
     @abstractmethod

diff --git a/src/autolabel/transforms/pdf.py b/src/autolabel/transforms/pdf.py
@@ -0,0 +1,68 @@
+from typing import List, Dict, Any
+
+from pdf2image import convert_from_path
+import pytesseract
+
+from autolabel.transforms import BaseTransform
+
+
+class PDFTransform(BaseTransform):
+    """Transform that turns a PDF file into page-labelled text via OCR.
+
+    Pages are rendered to images with pdf2image and read with pytesseract;
+    each page's text is prefixed with ``page_header`` and the pages are
+    joined with ``page_sep``.
+    """
+
+    def __init__(
+        self,
+        output_columns: List[str],
+        file_path_column: str,
+        page_header: str = "Page {page_num}: ",
+        page_sep: str = "\n\n",
+    ) -> None:
+        """Configure the transform.
+
+        Args:
+            output_columns (List[str]): Names of the output columns, in the
+                order [content_column, num_pages_column].
+            file_path_column (str): Key of the row entry holding the PDF path.
+            page_header (str): Template prepended to each page's text; it is
+                formatted with ``page_num`` (1-based).
+            page_sep (str): Separator placed between consecutive pages.
+        """
+        super().__init__(output_columns)
+        self.file_path_column = file_path_column
+        self.page_header = page_header
+        self.page_sep = page_sep
+
+    @staticmethod
+    def name() -> str:
+        """Return the name of this transform ("pdf")."""
+        return "pdf"
+
+    @staticmethod
+    def extract_text(path: str) -> List[str]:
+        """Extract text from a PDF file using the pdf2image and pytesseract libraries.
+
+        Args:
+            path (str): The path to the PDF file.
+
+        Returns:
+            List[str]: A list of strings, one for each page of the PDF file.
+        """
+        # Each rendered page image is OCR'd in document order.
+        return [
+            pytesseract.image_to_string(page) for page in convert_from_path(path)
+        ]
+
+    def transform(self, row: Dict[str, Any]) -> Dict[str, Any]:
+        """Extract the text of the PDF referenced by ``row`` and label each page.
+
+        The file at ``row[self.file_path_column]`` is read with
+        ``extract_text``. Every page is prefixed with ``page_header``
+        (formatted with its 1-based page number) and the pages are joined
+        with ``page_sep``.
+
+        Args:
+            row (Dict[str, Any]): The row of data to be transformed.
+
+        Returns:
+            Dict[str, Any]: The joined text under ``output_columns[0]`` and
+            the number of pages under ``output_columns[1]``.
+
+        Raises:
+            KeyError: If ``row`` has no ``file_path_column`` entry.
+            IndexError: If fewer than two output columns were configured.
+        """
+        pages = self.extract_text(row[self.file_path_column])
+        page_contents = [
+            self.page_header.format(page_num=page_num) + text
+            for page_num, text in enumerate(pages, start=1)
+        ]
+        return {
+            self.output_columns[0]: self.page_sep.join(page_contents),
+            self.output_columns[1]: len(page_contents),
+        }
diff --git a/tests/assets/data_loading/Resume.pdf b/tests/assets/data_loading/Resume.pdf
diff --git a/tests/unit/data_loaders/test_transform.py b/tests/unit/data_loaders/test_transform.py
@@ -0,0 +1,66 @@
+from autolabel.transforms.pdf import PDFTransform
+
+
+RESUME_PDF_CONTENT = """Page 1: Functional Resume Sample 
+
+John W. Smith   
+2002 Front Range Way Fort Collins, CO 80525  
+[email protected]  
+
+Career Summary 
+
+Four years experience in early childhood development with a di verse background in the care of 
+special needs children and adults.  
+
+Adult Care Experience  
+
+• Determined work placement for 150 special needs adult clients.  
+• Maintained client databases and records.  
+• Coordinated client contact with local health care professionals on a monthly basis.     
+• Managed 25 volunteer workers.     
+
+Childcare Experience  
+
+• Coordinated service assignments for 20 part -time counselors and 100 client families. 
+• Oversaw daily activity and outing planning for 100 clients.  
+• Assisted families of special needs clients with researching financial assistance and 
+healthcare. 
+• Assisted teachers with managing daily classroom activities.    
+• Oversaw daily and special st udent activities.     
+
+Employment History  
+ 1999-2002  Counseling Supervisor, The Wesley Ce nter, Little Rock, Arkansas.    
+1997-1999  Client Specialist, Rainbow Special Ca re Center, Little Rock, Arkansas  
+1996-1997 Teacher’s Assistant, Cowell Elem entary, Conway, Arkansas     
+
+Education 
+
+University of Arkansas at Little Rock, Little Rock, AR  
+
+• BS in Early Childhood Development (1999) 
+• BA in Elementary Education (1998) 
+• GPA (4.0 Scale):  Early Childhood Developm ent – 3.8, Elementary Education – 3.5, 
+Overall 3.4.  
+• Dean’s List, Chancellor’s List"""
+
+
+def test_pdf_transform():
+    """Run PDFTransform on the resume PDF from the test assets and check its output.
+
+    The content assertion is an exact string match, so it passes only when
+    the extracted text equals RESUME_PDF_CONTENT verbatim.
+    """
+    expected_columns = {"content", "num_pages"}
+    pdf_transform = PDFTransform(
+        output_columns=["content", "num_pages"],
+        file_path_column="file_path",
+        page_header="Page {page_num}: ",
+        page_sep="\n\n",
+    )
+
+    # Single input row pointing at the PDF stored under the test assets.
+    result = pdf_transform.transform(
+        {"file_path": "tests/assets/data_loading/Resume.pdf"}
+    )
+
+    assert set(result) == expected_columns
+    content, num_pages = result["content"], result["num_pages"]
+    assert isinstance(content, str)
+    assert isinstance(num_pages, int)
+    assert num_pages == 1
+    assert content == RESUME_PDF_CONTENT
Original file line number	Diff line number	Diff line change
Expand Up		@@ -218,3 +218,4 @@ class TransformType(str, Enum):
		"""Enum containing all Transforms supported by autolabel"""

		WEBPAGE_TRANSFORM = "webpage_transform"
		PDF = "pdf"