-
Notifications
You must be signed in to change notification settings - Fork 10
Working with PDF and OCR
Somkiat Puisungnoen edited this page Jul 13, 2025 · 4 revisions
# Ref: https://platform.openai.com/docs/guides/pdf-files?api-mode=chat&lang=python
from openai import OpenAI
# Upload a local PDF through the Files API, then ask a chat model to
# extract its text by referencing the uploaded file's id.
client = OpenAI()
file_path = "./data/doc-scan.pdf"

# Open the PDF in a context manager so the local file handle is closed as
# soon as the upload call returns, even if the upload raises.
with open(file_path, "rb") as pdf_file:
    file = client.files.create(
        file=pdf_file,
        purpose="user_data",
    )

completion = client.chat.completions.create(
    model="gpt-4.1",
    messages=[
        {
            "role": "user",
            "content": [
                # The document itself, referenced by the id returned above.
                {
                    "type": "file",
                    "file": {
                        "file_id": file.id,
                    },
                },
                # The instruction applied to that document.
                {
                    "type": "text",
                    "text": "Extract the text from the PDF file",
                },
            ],
        }
    ],
)

# Print the text of the first (and only requested) choice.
print(completion.choices[0].message.content)
import base64
import os
from mistralai import Mistral
def encode_pdf(pdf_path):
    """Return the contents of the file at *pdf_path* as a base64 string.

    The file is read in binary mode and its bytes are base64-encoded; the
    encoded bytes are then decoded as UTF-8 so the result is a ``str``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The base64 text of the file, or ``None`` if the file is missing or
        any other error occurs while reading/encoding it. Failures are
        reported with ``print`` instead of being raised.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            return base64.b64encode(pdf_file.read()).decode("utf-8")
    except FileNotFoundError:
        # Dedicated message for the most common mistake: a wrong path.
        print(f"Error: The file {pdf_path} was not found.")
        return None
    except Exception as e:
        # Deliberate best-effort: report anything else (permissions,
        # directory given instead of a file, ...) and signal failure
        # through the None return value.
        print(f"Error: {e}")
        return None
# Create a markdown file from the OCR response
def create_markdown_file(ocr_response, output_filename="output.md"):
    """Write the markdown of every page in *ocr_response* to a file.

    Args:
        ocr_response: Object exposing a ``pages`` iterable whose items each
            have a ``markdown`` string attribute.
        output_filename: Path of the file to create or overwrite.
            Defaults to ``"output.md"``.

    The file is always written as UTF-8 so non-ASCII text does not depend
    on the platform's default encoding. Pages are separated by a blank
    line so the last line of one page is never fused with the first line
    of the next.
    """
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write("\n\n".join(page.markdown for page in ocr_response.pages))
# Script entry point: OCR a local PDF with Mistral and save the result as
# markdown.
if __name__ == "__main__":
    # Read the API key once; a missing or empty value aborts the run.
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        print("Error: MISTRAL_API_KEY environment variable is not set.")
        # SystemExit is used directly because the exit() helper is only
        # injected by the site module and is not always available.
        raise SystemExit(1)

    # Path to your pdf
    pdf_path = "doc-scan.pdf"

    # Client initialization
    client = Mistral(api_key=api_key)

    # encode_pdf reports its own error and returns None on failure.
    base64_pdf = encode_pdf(pdf_path)
    if base64_pdf is None:
        print("Error: Failed to encode the PDF file.")
        raise SystemExit(1)

    # Send the PDF inline as a base64 data URL.
    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{base64_pdf}",
        },
        include_image_base64=True,
    )

    # Print the OCR response
    print("OCR Response:", ocr_response)

    # Create a markdown file from the OCR response
    create_markdown_file(ocr_response)
    print("OCR processing complete. Markdown file created.")
from typhoon_ocr import ocr_document
import os
# Requires the TYPHOON_API_KEY or OPENAI_API_KEY environment variable to be
# set before ocr_document is called.
# NOTE(review): presumably a single call handles only one page of a
# multi-page PDF — confirm against the typhoon_ocr documentation.
document_path = "./data/doc-scan.pdf"
markdown_text = ocr_document(document_path)
print(markdown_text)