From 2e4b55e085482469a4e36e266dd22ecd3b493539 Mon Sep 17 00:00:00 2001 From: minyuli Date: Wed, 13 Nov 2024 15:28:59 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E6=8F=90=E4=BE=9B=E5=BC=80=E6=BA=90?= =?UTF-8?q?=E5=AF=B9=E8=B1=A1=E5=AD=98=E5=82=A8=E6=9C=8D=E5=8A=A1MinIO?= =?UTF-8?q?=E7=9A=84=E6=94=AF=E6=8C=81=EF=BC=8C=E5=8F=AF=E4=BB=A5=E8=AE=A9?= =?UTF-8?q?=E5=9B=BE=E7=89=87=E5=AD=98=E5=82=A8=E8=87=B3=E6=9C=AC=E5=9C=B0?= =?UTF-8?q?=EF=BC=8C=E5=B9=B6=E4=BB=A5URL=E9=A2=84=E8=A7=88=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/pdfdeal/FileTools/Img/MinIO.py | 61 ++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 src/pdfdeal/FileTools/Img/MinIO.py diff --git a/src/pdfdeal/FileTools/Img/MinIO.py b/src/pdfdeal/FileTools/Img/MinIO.py new file mode 100644 index 0000000..444534b --- /dev/null +++ b/src/pdfdeal/FileTools/Img/MinIO.py @@ -0,0 +1,61 @@ + + +import os +from minio import Minio, S3Error + + +# MinIO使用bucket(桶)来组织对象。 +# bucket类似于文件夹或目录,其中每个bucket可以容纳任意数量的对象。 +class MINIO: + + def __init__(self, minio_address, minio_admin, minio_password, bucket_name): + # 通过ip 账号 密码 连接minio server + # Http连接 将secure设置为False + self.minioClient = Minio(endpoint=minio_address, + access_key=minio_admin, + secret_key=minio_password, + secure=False, + ) + self.bucket_name = bucket_name + + def upload_file(self, local_file_path, remote_file_path): + """Upload a file + + Args: + local_file_path (str): The path of the local file to upload. + remote_file_path (str): The path of the remote file to upload to. + + Returns: + tuple: A tuple containing the URL of the uploaded file and a boolean indicating whether the upload was successful. + """ + # 桶是否存在 不存在则新建 + check_bucket = self.minioClient.bucket_exists(self.bucket_name) + if not check_bucket: + self.minioClient.make_bucket(self.bucket_name) + + try: + path, file_name = os.path.split(local_file_path) + self.minioClient.fput_object(bucket_name=self.bucket_name, + object_name=file_name, + file_path=local_file_path) + remote_file_path = "http://127.0.0.1:9000" + '/' + self.bucket_name + '/' + file_name + return (remote_file_path, True) + except FileNotFoundError as err: + print('upload_failed: ' + str(err)) + except S3Error as err: + print("upload_failed:", err) + +def MiN(minio_address, minio_admin, minio_password, bucket_name) -> callable: + Min_uploader = MINIO( + minio_address = minio_address, + minio_admin = minio_admin, + minio_password = minio_password, + bucket_name = bucket_name) + return Min_uploader.upload_file + + + + + + + From be9cc5ce748a2a9e32a4eb6cbb095bbb0660d1d8 Mon Sep 17 00:00:00 2001 From: minyuli Date: Wed, 13 Nov 2024 16:28:30 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E6=8F=90=E4=BE=9B=E5=BC=80=E6=BA=90?= =?UTF-8?q?=E5=AF=B9=E8=B1=A1=E5=AD=98=E5=82=A8=E6=9C=8D=E5=8A=A1MinIO?= =?UTF-8?q?=E7=9A=84=E6=94=AF=E6=8C=81=EF=BC=8C=E5=8F=AF=E4=BB=A5=E8=AE=A9?= =?UTF-8?q?=E5=9B=BE=E7=89=87=E5=AD=98=E5=82=A8=E8=87=B3=E6=9C=AC=E5=9C=B0?= =?UTF-8?q?=EF=BC=8C=E5=B9=B6=E4=BB=A5URL=E9=A2=84=E8=A7=88=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/pdfdeal/FileTools/Img/MinIO.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/pdfdeal/FileTools/Img/MinIO.py b/src/pdfdeal/FileTools/Img/MinIO.py index 444534b..ec3cc49 100644 --- a/src/pdfdeal/FileTools/Img/MinIO.py +++ b/src/pdfdeal/FileTools/Img/MinIO.py @@ -1,11 +1,5 @@ - - import os from minio import Minio, S3Error - - -# MinIO使用bucket(桶)来组织对象。 -# bucket类似于文件夹或目录,其中每个bucket可以容纳任意数量的对象。 class MINIO: def __init__(self, minio_address, minio_admin, minio_password, bucket_name): @@ -32,7 +26,6 @@ def upload_file(self, local_file_path, remote_file_path): check_bucket = self.minioClient.bucket_exists(self.bucket_name) if not check_bucket: self.minioClient.make_bucket(self.bucket_name) - try: path, file_name = os.path.split(local_file_path) self.minioClient.fput_object(bucket_name=self.bucket_name, @@ -45,7 +38,7 @@ def upload_file(self, local_file_path, remote_file_path): except S3Error as err: print("upload_failed:", err) -def MiN(minio_address, minio_admin, minio_password, bucket_name) -> callable: +def Min(minio_address, minio_admin, minio_password, bucket_name) -> callable: Min_uploader = MINIO( minio_address = minio_address, minio_admin = minio_admin, From 78c5dafb020b0cfb6159d2c51cbd639a8a9117a9 Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Wed, 13 Nov 2024 22:40:01 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E9=83=A8=E5=88=86?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 适配https部署的minio - 自动创建仅可读储存筒 --- src/pdfdeal/FileTools/Img/MinIO.py | 94 ++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/src/pdfdeal/FileTools/Img/MinIO.py b/src/pdfdeal/FileTools/Img/MinIO.py index ec3cc49..6cce38d 100644 --- a/src/pdfdeal/FileTools/Img/MinIO.py +++ b/src/pdfdeal/FileTools/Img/MinIO.py @@ -1,54 +1,84 @@ import os from minio import Minio, S3Error -class MINIO: +import logging +from urllib.parse import urlparse +import json + +class MINIO: def __init__(self, minio_address, minio_admin, minio_password, bucket_name): # 通过ip 账号 密码 连接minio server - # Http连接 将secure设置为False - self.minioClient = Minio(endpoint=minio_address, - access_key=minio_admin, - secret_key=minio_password, - secure=False, - ) + # 根据地址自动判断是否使用安全连接 + parsed_url = urlparse(minio_address) + secure = parsed_url.scheme == "https" + self.minioClient = Minio( + endpoint=parsed_url.netloc, + access_key=minio_admin, + secret_key=minio_password, + secure=secure, + ) self.bucket_name = bucket_name + self.minio_address = minio_address def upload_file(self, local_file_path, remote_file_path): """Upload a file - Args: - local_file_path (str): The path of the local file to upload. - remote_file_path (str): The path of the remote file to upload to. + Args: + local_file_path (str): The path of the local file to upload. + remote_file_path (str): The path of the remote file to upload to. - Returns: - tuple: A tuple containing the URL of the uploaded file and a boolean indicating whether the upload was successful. + Returns: + tuple: A tuple containing the URL of the uploaded file and a boolean indicating whether the upload was successful. """ - # 桶是否存在 不存在则新建 + # 检查桶是否存在,不存在则新建并设置为公开只读 check_bucket = self.minioClient.bucket_exists(self.bucket_name) if not check_bucket: self.minioClient.make_bucket(self.bucket_name) + # 设置桶策略为公开只读 + policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"AWS": "*"}, + "Action": ["s3:GetBucketLocation", "s3:ListBucket"], + "Resource": f"arn:aws:s3:::{self.bucket_name}", + }, + { + "Effect": "Allow", + "Principal": {"AWS": "*"}, + "Action": "s3:GetObject", + "Resource": f"arn:aws:s3:::{self.bucket_name}/*", + }, + ], + } + self.minioClient.set_bucket_policy(self.bucket_name, json.dumps(policy)) try: path, file_name = os.path.split(local_file_path) - self.minioClient.fput_object(bucket_name=self.bucket_name, - object_name=file_name, - file_path=local_file_path) - remote_file_path = "http://127.0.0.1:9000" + '/' + self.bucket_name + '/' + file_name + self.minioClient.fput_object( + bucket_name=self.bucket_name, + object_name=file_name, + file_path=local_file_path, + ) + parsed_url = urlparse(self.minio_address) + scheme = parsed_url.scheme or "http" + remote_file_path = ( + f"{scheme}://{parsed_url.netloc}/{self.bucket_name}/{file_name}" + ) return (remote_file_path, True) except FileNotFoundError as err: - print('upload_failed: ' + str(err)) + logging.exception(f"Error to upload the file: {local_file_path}, {err}") + return err, False except S3Error as err: - print("upload_failed:", err) - -def Min(minio_address, minio_admin, minio_password, bucket_name) -> callable: - Min_uploader = MINIO( - minio_address = minio_address, - minio_admin = minio_admin, - minio_password = minio_password, - bucket_name = bucket_name) - return Min_uploader.upload_file - - - - - + logging.exception(f"Error to upload the file: {local_file_path}, {err}") + return err, False +def Min(minio_address, minio_admin, minio_password, bucket_name) -> callable: + Min_uploader = MINIO( + minio_address=minio_address, + minio_admin=minio_admin, + minio_password=minio_password, + bucket_name=bucket_name, + ) + return Min_uploader.upload_file From 768fbbd02ddfb55c5e73920586b8a45cb2811538 Mon Sep 17 00:00:00 2001 From: Menghuan1918 Date: Mon, 18 Nov 2024 12:38:49 +0800 Subject: [PATCH 4/4] [example] Add examples --- examples/convert_folder_pdfs.py | 19 +++++++++++++++++++ examples/convert_pdfs_multiple_types.py | 19 +++++++++++++++++++ examples/convert_single_pdf.py | 19 +++++++++++++++++++ pyproject.toml | 6 +++--- 4 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 examples/convert_folder_pdfs.py create mode 100644 examples/convert_pdfs_multiple_types.py create mode 100644 examples/convert_single_pdf.py diff --git a/examples/convert_folder_pdfs.py b/examples/convert_folder_pdfs.py new file mode 100644 index 0000000..1501ec9 --- /dev/null +++ b/examples/convert_folder_pdfs.py @@ -0,0 +1,19 @@ +# This is an example of how to convert all PDF files in a folder to DOCX files. +# 这是一个将文件夹中的所有 PDF 文件转换为 DOCX 文件的示例。 + +from pdfdeal import Doc2X + +# gets API Key from environment variable DOC2X_APIKEY, or you can pass it as a string to the apikey parameter +# 从环境变量 DOC2X_APIKEY 获取 API Key, 或者您可以将其作为字符串传递给 apikey 参数 + +# client = Doc2X(apikey="Your API key",debug=True) +client = Doc2X(debug=True) + +success, failed, flag = client.pdf2file( + pdf_file="tests/pdf/test", + output_path="./Output", + output_format="docx", +) +print(success) +print(failed) +print(flag) diff --git a/examples/convert_pdfs_multiple_types.py b/examples/convert_pdfs_multiple_types.py new file mode 100644 index 0000000..5434ea1 --- /dev/null +++ b/examples/convert_pdfs_multiple_types.py @@ -0,0 +1,19 @@ +# This is an example of how to convert all PDF files in a folder to multiple types of files. +# 这是一个将文件夹中的所有 PDF 文件转换为多种类型文件的示例。 + +from pdfdeal import Doc2X + +# gets API Key from environment variable DOC2X_APIKEY, or you can pass it as a string to the apikey parameter +# 从环境变量 DOC2X_APIKEY 获取 API Key, 或者您可以将其作为字符串传递给 apikey 参数 + +# client = Doc2X(apikey="Your API key",debug=True) +client = Doc2X(debug=True) + +success, failed, flag = client.pdf2file( + pdf_file="/home/menghuan/文档/Test/pdf", + output_path="./Output", + output_format="docx,md", +) +print(success) +print(failed) +print(flag) diff --git a/examples/convert_single_pdf.py b/examples/convert_single_pdf.py new file mode 100644 index 0000000..2341c7c --- /dev/null +++ b/examples/convert_single_pdf.py @@ -0,0 +1,19 @@ +# This is an example of how to convert a single PDF file to a single DOCX file. +# 这是一个将单个 PDF 文件转换为单个 DOCX 文件的示例。 + +from pdfdeal import Doc2X + +# gets API Key from environment variable DOC2X_APIKEY, or you can pass it as a string to the apikey parameter +# 从环境变量 DOC2X_APIKEY 获取 API Key, 或者您可以将其作为字符串传递给 apikey 参数 + +# client = Doc2X(apikey="Your API key",debug=True) +client = Doc2X(debug=True) + +success, failed, flag = client.pdf2file( + pdf_file="tests/pdf/sample.pdf", + output_path="Output", + output_format="docx", +) +print(success) +print(failed) +print(flag) diff --git a/pyproject.toml b/pyproject.toml index a1f4b6d..cb77a48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "pdfdeal" -version = "0.4.8" +version = "0.4.9" authors = [{ name = "Menghuan1918", email = "menghuan@menghuan1918.com" }] description = "A python wrapper for the Doc2X API and comes with native texts processing (to improve texts recall in RAG)." readme = "README.md" @@ -13,8 +13,8 @@ classifiers = [ dependencies = ["httpx[http2]>=0.23.1, <1", "pypdf"] [project.optional-dependencies] -rag = ["emoji", "Pillow", "reportlab", "oss2", "boto3"] -dev = ["pytest", "emoji", "Pillow", "reportlab", "oss2", "boto3"] +rag = ["emoji", "Pillow", "reportlab", "oss2", "boto3", "minio"] +dev = ["pytest", "emoji", "Pillow", "reportlab", "oss2", "boto3", "minio"] [project.urls] Issues = "https://github.com/Menghuan1918/pdfdeal/issues"