-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #52 from Menghuan1918/dev
V0.4.9
- Loading branch information
Showing
5 changed files
with
144 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Example: convert every PDF file inside a folder into DOCX files.

from pdfdeal import Doc2X

# By default the API key is read from the DOC2X_APIKEY environment variable.
# To supply it explicitly instead, pass it through the `apikey` parameter:
#   client = Doc2X(apikey="Your API key", debug=True)
client = Doc2X(debug=True)

# `pdf_file` points at a folder here; results are written under `output_path`.
success, failed, flag = client.pdf2file(
    output_format="docx",
    output_path="./Output",
    pdf_file="tests/pdf/test",
)

# Show the three values returned by the conversion call, one per line.
print(success, failed, flag, sep="\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Example: convert every PDF file inside a folder into several output formats
# in a single call.

from pdfdeal import Doc2X

# By default the API key is read from the DOC2X_APIKEY environment variable.
# To supply it explicitly instead, pass it through the `apikey` parameter:
#   client = Doc2X(apikey="Your API key", debug=True)
client = Doc2X(debug=True)

# Use a repository-relative sample folder (the same one the single-format
# folder example uses) rather than a path inside one developer's home
# directory, so the example runs unchanged on any checkout.
# Several output formats are requested as one comma-separated string.
success, failed, flag = client.pdf2file(
    pdf_file="tests/pdf/test",
    output_path="./Output",
    output_format="docx,md",
)
print(success)
print(failed)
print(flag)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# Example: convert one PDF file into one DOCX file.

from pdfdeal import Doc2X

# By default the API key is read from the DOC2X_APIKEY environment variable.
# To supply it explicitly instead, pass it through the `apikey` parameter:
#   client = Doc2X(apikey="Your API key", debug=True)
client = Doc2X(debug=True)

# `pdf_file` names a single file here; the result is written under `output_path`.
success, failed, flag = client.pdf2file(
    output_format="docx",
    output_path="Output",
    pdf_file="tests/pdf/sample.pdf",
)

# Show the three values returned by the conversion call, one per line.
print(success, failed, flag, sep="\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[project] | ||
name = "pdfdeal" | ||
version = "0.4.8" | ||
version = "0.4.9" | ||
authors = [{ name = "Menghuan1918", email = "[email protected]" }] | ||
description = "A python wrapper for the Doc2X API and comes with native texts processing (to improve texts recall in RAG)." | ||
readme = "README.md" | ||
|
@@ -13,8 +13,8 @@ classifiers = [ | |
dependencies = ["httpx[http2]>=0.23.1, <1", "pypdf"] | ||
|
||
[project.optional-dependencies] | ||
rag = ["emoji", "Pillow", "reportlab", "oss2", "boto3"] | ||
dev = ["pytest", "emoji", "Pillow", "reportlab", "oss2", "boto3"] | ||
rag = ["emoji", "Pillow", "reportlab", "oss2", "boto3", "minio"] | ||
dev = ["pytest", "emoji", "Pillow", "reportlab", "oss2", "boto3", "minio"] | ||
|
||
[project.urls] | ||
Issues = "https://github.com/Menghuan1918/pdfdeal/issues" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import os | ||
from minio import Minio, S3Error | ||
import logging | ||
from urllib.parse import urlparse | ||
import json | ||
|
||
|
||
class MINIO:
    """Uploader that pushes local files into one bucket of a MinIO server."""

    def __init__(self, minio_address, minio_admin, minio_password, bucket_name):
        """Create the MinIO client used for uploads.

        Args:
            minio_address (str): Address of the MinIO server, with or without a
                scheme (e.g. ``https://minio.example.com`` or
                ``localhost:9000``). A missing scheme is treated as ``http``.
            minio_admin (str): Passed to the client as the access key.
            minio_password (str): Passed to the client as the secret key.
            bucket_name (str): Bucket that uploaded files are stored in.
        """
        scheme, netloc = self._split_address(minio_address)
        self.minioClient = Minio(
            endpoint=netloc,
            access_key=minio_admin,
            secret_key=minio_password,
            # A secure (TLS) connection is used only for https addresses.
            secure=scheme == "https",
        )
        self.bucket_name = bucket_name
        self.minio_address = minio_address

    @staticmethod
    def _split_address(minio_address):
        """Split a server address into ``(scheme, netloc)``.

        ``urlparse`` only recognises a network location after ``//``, so a bare
        ``host:port`` would otherwise produce an empty netloc (and, for names
        such as ``localhost:9000``, a bogus scheme). Addresses without a scheme
        are therefore prefixed with ``http://`` before parsing.
        """
        address = minio_address.strip()
        if "://" not in address:
            address = "http://" + address.lstrip("/")
        parsed = urlparse(address)
        return parsed.scheme or "http", parsed.netloc

    def _ensure_bucket(self):
        """Create the bucket with a public read-only policy if it is missing."""
        if self.minioClient.bucket_exists(self.bucket_name):
            return
        self.minioClient.make_bucket(self.bucket_name)
        # Public read-only policy: anyone may list the bucket and fetch objects.
        policy = {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Effect": "Allow",
                    "Principal": {"AWS": "*"},
                    "Action": ["s3:GetBucketLocation", "s3:ListBucket"],
                    "Resource": f"arn:aws:s3:::{self.bucket_name}",
                },
                {
                    "Effect": "Allow",
                    "Principal": {"AWS": "*"},
                    "Action": "s3:GetObject",
                    "Resource": f"arn:aws:s3:::{self.bucket_name}/*",
                },
            ],
        }
        self.minioClient.set_bucket_policy(self.bucket_name, json.dumps(policy))

    def upload_file(self, local_file_path, remote_file_path):
        """Upload a file

        Args:
            local_file_path (str): The path of the local file to upload.
            remote_file_path (str): The path of the remote file to upload to.
                NOTE: this value is currently not used; the object is stored
                under the base name of ``local_file_path``.

        Returns:
            tuple: ``(url, True)`` when the upload succeeds, where ``url`` is
            ``<scheme>://<host>/<bucket>/<file name>``; ``(error, False)`` when
            the upload raises ``FileNotFoundError`` or ``S3Error``. Errors
            raised while checking or creating the bucket are not caught here.
        """
        self._ensure_bucket()
        try:
            file_name = os.path.basename(local_file_path)
            self.minioClient.fput_object(
                bucket_name=self.bucket_name,
                object_name=file_name,
                file_path=local_file_path,
            )
            scheme, netloc = self._split_address(self.minio_address)
            file_url = f"{scheme}://{netloc}/{self.bucket_name}/{file_name}"
            return (file_url, True)
        except (FileNotFoundError, S3Error) as err:
            logging.exception(f"Error to upload the file: {local_file_path}, {err}")
            return err, False
|
||
|
||
def Min(minio_address, minio_admin, minio_password, bucket_name) -> callable:
    """Build a MinIO uploader and return its upload function.

    Args:
        minio_address (str): Address of the MinIO server.
        minio_admin (str): Account used to connect to the server.
        minio_password (str): Password used to connect to the server.
        bucket_name (str): Bucket the uploader stores files in.

    Returns:
        callable: The bound ``upload_file`` method of a newly created
        ``MINIO`` instance.
    """
    uploader = MINIO(minio_address, minio_admin, minio_password, bucket_name)
    return uploader.upload_file