Skip to content

Commit

Permalink
Merge pull request #52 from Menghuan1918/dev
Browse files Browse the repository at this point in the history
V0.4.9
  • Loading branch information
Menghuan1918 authored Nov 18, 2024
2 parents 1861fb3 + 768fbbd commit 13e3a0c
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 3 deletions.
19 changes: 19 additions & 0 deletions examples/convert_folder_pdfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Example: convert every PDF file in a folder to DOCX.

from pdfdeal import Doc2X

# The API key is read from the DOC2X_APIKEY environment variable.
# To pass it explicitly instead, use the `apikey` parameter:
#     Doc2X(apikey="Your API key", debug=True)
client = Doc2X(debug=True)

# `pdf_file` points at a folder here; the converted files go to `output_path`.
result = client.pdf2file(
    pdf_file="tests/pdf/test",
    output_path="./Output",
    output_format="docx",
)

# pdf2file returns three values; print each one on its own line.
success, failed, flag = result
print(success, failed, flag, sep="\n")
19 changes: 19 additions & 0 deletions examples/convert_pdfs_multiple_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Example: convert every PDF file in a folder to several output formats at once.

from pdfdeal import Doc2X

# The API key is read from the DOC2X_APIKEY environment variable.
# To pass it explicitly instead, use the `apikey` parameter:
#     Doc2X(apikey="Your API key", debug=True)
client = Doc2X(debug=True)

# Use a repository-relative sample folder (the same one as the folder example)
# rather than a path that only exists on the author's machine.
# Multiple output formats are requested as one comma-separated string.
success, failed, flag = client.pdf2file(
    pdf_file="tests/pdf/test",
    output_path="./Output",
    output_format="docx,md",
)

# Print the three values returned by pdf2file, one per line.
print(success)
print(failed)
print(flag)
19 changes: 19 additions & 0 deletions examples/convert_single_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Example: convert a single PDF file to a single DOCX file.

from pdfdeal import Doc2X

# The API key is read from the DOC2X_APIKEY environment variable.
# To pass it explicitly instead, use the `apikey` parameter:
#     Doc2X(apikey="Your API key", debug=True)
client = Doc2X(debug=True)

# `pdf_file` is one file here; the converted file goes to `output_path`.
result = client.pdf2file(
    pdf_file="tests/pdf/sample.pdf",
    output_path="Output",
    output_format="docx",
)

# pdf2file returns three values; print each one on its own line.
success, failed, flag = result
print(success, failed, flag, sep="\n")
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "pdfdeal"
version = "0.4.8"
version = "0.4.9"
authors = [{ name = "Menghuan1918", email = "[email protected]" }]
description = "A python wrapper for the Doc2X API and comes with native texts processing (to improve texts recall in RAG)."
readme = "README.md"
Expand All @@ -13,8 +13,8 @@ classifiers = [
dependencies = ["httpx[http2]>=0.23.1, <1", "pypdf"]

[project.optional-dependencies]
rag = ["emoji", "Pillow", "reportlab", "oss2", "boto3"]
dev = ["pytest", "emoji", "Pillow", "reportlab", "oss2", "boto3"]
rag = ["emoji", "Pillow", "reportlab", "oss2", "boto3", "minio"]
dev = ["pytest", "emoji", "Pillow", "reportlab", "oss2", "boto3", "minio"]

[project.urls]
Issues = "https://github.com/Menghuan1918/pdfdeal/issues"
Expand Down
84 changes: 84 additions & 0 deletions src/pdfdeal/FileTools/Img/MinIO.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import os
from minio import Minio, S3Error
import logging
from urllib.parse import urlparse
import json


class MINIO:
    """Upload local files to a MinIO bucket and return their public URLs."""

    def __init__(self, minio_address, minio_admin, minio_password, bucket_name):
        """Create a MinIO client for the given server and remember the bucket.

        Args:
            minio_address (str): Address of the MinIO server, e.g.
                ``https://minio.example.com:9000``. A bare ``host:port``
                (no scheme) is also accepted and treated as plain HTTP.
            minio_admin (str): Access key used to authenticate.
            minio_password (str): Secret key used to authenticate.
            bucket_name (str): Bucket that uploaded files are stored in.
        """
        scheme, netloc = self._split_address(minio_address)
        self.minioClient = Minio(
            endpoint=netloc,
            access_key=minio_admin,
            secret_key=minio_password,
            # A secure (TLS) connection is used only for an explicit https address.
            secure=scheme == "https",
        )
        self.bucket_name = bucket_name
        self.minio_address = minio_address

    @staticmethod
    def _split_address(minio_address):
        """Return ``(scheme, netloc)`` for a server address, defaulting to http."""
        parsed = urlparse(minio_address)
        if not parsed.netloc:
            # urlparse only recognises a host when it is introduced by "//",
            # so a bare "host:port" yields an empty netloc. Re-parse it as an
            # http URL instead of handing an empty endpoint to the client.
            parsed = urlparse(f"http://{minio_address}")
        return parsed.scheme or "http", parsed.netloc

    def _ensure_public_bucket(self):
        """Create the bucket with a public read-only policy if it is missing."""
        if self.minioClient.bucket_exists(self.bucket_name):
            return
        self.minioClient.make_bucket(self.bucket_name)
        # Anonymous principals may locate/list the bucket and read its objects.
        policy = {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Principal": {"AWS": "*"},
                    "Action": ["s3:GetBucketLocation", "s3:ListBucket"],
                    "Resource": f"arn:aws:s3:::{self.bucket_name}",
                },
                {
                    "Effect": "Allow",
                    "Principal": {"AWS": "*"},
                    "Action": "s3:GetObject",
                    "Resource": f"arn:aws:s3:::{self.bucket_name}/*",
                },
            ],
        }
        self.minioClient.set_bucket_policy(self.bucket_name, json.dumps(policy))

    def upload_file(self, local_file_path, remote_file_path):
        """Upload a file

        Args:
            local_file_path (str): The path of the local file to upload.
            remote_file_path (str): Accepted as part of the uploader call
                signature but not used: the object is always stored under
                the base name of ``local_file_path``.

        Returns:
            tuple: ``(url, True)`` with the URL of the uploaded file on
            success, or ``(error, False)`` with the caught exception when
            the local file is missing or the server reports an S3 error.
        """
        file_name = os.path.basename(local_file_path)
        try:
            # Bucket setup is inside the try so that a server-side failure is
            # reported through the (error, False) result like upload failures.
            self._ensure_public_bucket()
            self.minioClient.fput_object(
                bucket_name=self.bucket_name,
                object_name=file_name,
                file_path=local_file_path,
            )
        except (FileNotFoundError, S3Error) as err:
            logging.exception(f"Error to upload the file: {local_file_path}, {err}")
            return err, False
        scheme, netloc = self._split_address(self.minio_address)
        return (f"{scheme}://{netloc}/{self.bucket_name}/{file_name}", True)


def Min(minio_address, minio_admin, minio_password, bucket_name) -> callable:
    """Build a MinIO uploader and return its upload function.

    Args:
        minio_address (str): Address of the MinIO server.
        minio_admin (str): Access key used to authenticate.
        minio_password (str): Secret key used to authenticate.
        bucket_name (str): Bucket that uploaded files are stored in.

    Returns:
        callable: The bound ``upload_file`` method of a new ``MINIO``
        instance, called as ``(local_file_path, remote_file_path)``.
    """
    uploader = MINIO(minio_address, minio_admin, minio_password, bucket_name)
    return uploader.upload_file

0 comments on commit 13e3a0c

Please sign in to comment.