
Commit

Merge pull request #35 from Menghuan1918/dev
[ALL] Change `print` to `logging`
Menghuan1918 authored Aug 29, 2024
2 parents 05c9503 + 99caa19 commit 0cbabdb
Showing 14 changed files with 110 additions and 85 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "pdfdeal"
version = "0.3.0"
version = "0.3.1"
authors = [{ name = "Menghuan1918", email = "[email protected]" }]
description = "A python wrapper for the Doc2X API and comes with native PDF processing (to improve PDF recall in RAG)."
readme = "README.md"
3 changes: 3 additions & 0 deletions pytest.ini
@@ -0,0 +1,3 @@
[pytest]
log_cli = true
log_cli_level = INFO
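
These two settings enable pytest's live log display at INFO level, so the logging calls introduced in this commit are visible while the test suite runs. A minimal sketch of the effect (hypothetical test file, not part of this change):

# test_logging_demo.py, illustrative only, not part of this commit
import logging

logger = logging.getLogger(__name__)

def test_logs_are_streamed_live():
    # With log_cli = true and log_cli_level = INFO in pytest.ini, this record
    # is printed to the terminal while the test runs instead of staying captured.
    logger.info("processing started")
    assert True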
6 changes: 6 additions & 0 deletions src/pdfdeal/CLI/doc2x.py
@@ -8,6 +8,7 @@
from pdfdeal import Doc2X
from pdfdeal.file_tools import get_files
from pdfdeal.Watch.config import curses_select
import logging

LANGUAGES = ["简体中文", "Enlish"]
WORDS_CN = [
@@ -153,6 +154,11 @@ def main():
args = parser.parse_args()
rpm = None

# logging set to info
logging.basicConfig(level=logging.INFO)
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.WARNING)

if args.clear:
delete_one_global_setting("Doc2X_Key")
delete_one_global_setting("Doc2X_RPM")
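
For context, a small sketch of how the two settings above interact (standard-library behaviour, not code from this diff): basicConfig sets the root logger's threshold once, and raising the httpx logger to WARNING keeps its per-request INFO lines out of the CLI output while the tool's own messages still appear.

# Illustrative only: how the root level and a per-library level interact.
import logging

logging.basicConfig(level=logging.INFO)               # root logger now emits INFO and above
logging.getLogger("httpx").setLevel(logging.WARNING)  # hide httpx's per-request INFO chatter

logging.info("shown: tool-level progress message")
logging.getLogger("httpx").info("hidden: below the httpx logger's WARNING threshold")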
61 changes: 23 additions & 38 deletions src/pdfdeal/Doc2X/Exception.py
@@ -1,9 +1,9 @@
import asyncio
from functools import wraps
import time
import traceback
import sys
from concurrent.futures import ThreadPoolExecutor
import logging


class RateLimit(Exception):
@@ -41,42 +41,35 @@ def decorator(func):
@wraps(func)
async def wrapper(*args, **kwargs):
retries = 0
while retries < max_retries:
while retries < max_retries + 1:
try:
return await func(*args, **kwargs)
except RateLimit:
raise RateLimit
except FileError as e:
print(
f"Error in function '{func.__name__}': {type(e).__name__} - {e}"
logging.exception(
f"Error in function '{func.__name__}': {type(e).__name__}:"
)
print("Error details:\n")
print(traceback.format_exc())
print("===================")
raise FileError(e)
except RequestError as e:
print(
f"Error in function '{func.__name__}': {type(e).__name__} - {e}"
logging.exception(
f"Error in function '{func.__name__}': {type(e).__name__}:"
)
print("Error details:\n")
print(traceback.format_exc())
print("===================")
raise RequestError(f"{e} \nThis usually means the file is broken.")
except Exception as e:
last_exception = e
if retries == max_retries:
logging.exception(
f"Error in function '{func.__name__}': {type(e).__name__}:"
)
break
wait_time = backoff_factor**retries
print("===================")
print(
logging.warning(
f"⚠️ Exception in function '{func.__name__}': {type(e).__name__} - {e}"
)
print(f"⌛ Retrying in {wait_time} seconds.")
logging.warning(f"⌛ Retrying in {wait_time} seconds.")
await asyncio.sleep(wait_time)
retries += 1
if retries == max_retries:
print("===================")
print("Error details:\n")
print(traceback.format_exc())
print("===================")
raise last_exception

return wrapper
@@ -95,38 +88,30 @@ def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
retries = 0
while retries < max_retries:
while retries < max_retries + 1:
try:
return func(*args, **kwargs)
except RateLimit:
raise RateLimit
except FileError as e:
print(
f"Error in function '{func.__name__}': {type(e).__name__} - {e}"
logging.exception(
f"Error in function '{func.__name__}': {type(e).__name__}:"
)
print("===================")
print("Error details:\n")
print(traceback.format_exc())
print("===================")
raise e
except Exception as e:
print(
f"Error in function '{func.__name__}': {type(e).__name__} - {e}"
)
last_exception = e
if retries == max_retries:
logging.exception(
f"Error in function '{func.__name__}': {type(e).__name__}:"
)
break
wait_time = backoff_factor**retries
print("===================")
print(
logging.warning(
f"⚠️ Exception in function '{func.__name__}': {type(e).__name__} - {e}"
)
print(f"⌛ Retrying in {wait_time} seconds.")
logging.warning(f"⌛ Retrying in {wait_time} seconds.")
time.sleep(wait_time)
retries += 1
if retries == max_retries:
print("===================")
print("Error details:\n")
print(traceback.format_exc())
print("===================")
raise last_exception

return wrapper
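
Both decorators in this file now follow the same shape: retry with exponential backoff, logging.warning on each intermediate failure, and logging.exception (message plus traceback) once the retries are exhausted. A condensed, hypothetical sketch of that pattern follows; the name and default values are assumptions, not the module's actual signature:

# Illustrative only: simplified version of the retry/backoff pattern used above.
import logging
import time
from functools import wraps

def retry_with_backoff(max_retries=3, backoff_factor=2):  # hypothetical name and defaults
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0
            while retries < max_retries + 1:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    last_exception = e
                    if retries == max_retries:
                        # logging.exception records the message plus the full traceback at ERROR level
                        logging.exception(f"Error in function '{func.__name__}':")
                        break
                    wait_time = backoff_factor**retries
                    logging.warning(f"Retrying in {wait_time} seconds.")
                    time.sleep(wait_time)
                    retries += 1
            raise last_exception
        return wrapper
    return decorator

As the diff reads, the new loop bound (retries < max_retries + 1) gives one initial attempt plus max_retries retries, with no backoff sleep after the final failure.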
3 changes: 2 additions & 1 deletion src/pdfdeal/FileTools/Img/Ali_OSS.py
@@ -1,4 +1,5 @@
import oss2
import logging


class OSS:
@@ -32,7 +33,7 @@ def upload_file(self, local_file_path, remote_file_path):
True,
)
except Exception as e:
print(f"Error to upload the file: {local_file_path}, {e}")
logging.error(f"Error to upload the file: {local_file_path}, {e}")
return e, False


3 changes: 2 additions & 1 deletion src/pdfdeal/FileTools/Img/S3.py
@@ -1,5 +1,6 @@
import boto3
from botocore.exceptions import NoCredentialsError, ClientError
import logging


class S3_Settings:
@@ -52,7 +53,7 @@ def upload_file(self, local_file_path, remote_file_path):
True,
)
except (NoCredentialsError, ClientError) as e:
print(f"Error to upload the file: {local_file_path}, {e}")
logging.exception(f"Error to upload the file: {local_file_path}, {e}")
return e, False
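
Note that the two uploaders end up at different severities: Ali_OSS.py logs with logging.error (message only), while S3.py uses logging.exception, which adds the current traceback to the same ERROR-level record. A small illustration, outside this codebase:

# Illustrative only: logging.error versus logging.exception inside an except block.
import logging

def risky_upload():
    raise ValueError("credentials rejected")

try:
    risky_upload()
except Exception as e:
    logging.error(f"Error to upload the file: {e}")  # one line, no traceback
    logging.exception("Error to upload the file:")   # same ERROR record plus the stack trace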


7 changes: 4 additions & 3 deletions src/pdfdeal/FileTools/dealpdfs.py
@@ -9,6 +9,7 @@
import uuid
from typing import Tuple, Callable
from .file_tools import list_rename
import logging


def strore_pdf(pdf_path, Text):
@@ -141,17 +142,17 @@ def deal_pdf(
success_file.append("")
failed_file.append({"error": str(e), "file": pdf_path})
error_flag = True
print(
logging.info(
f"PDFDEAL Progress: {sum(1 for s in success_file if s != '')}/{len(pdf_file)} files successfully processed."
)
if All_Done is False:
print(
logging.warning(
"Some pictures are failed to OCR, but the text and reset pictures is extracted"
)
if error_flag:
for f in failed_file:
if f["error"] != "":
print(
logging.error(
f"-----\nFailed to process file: {f['file']} with error: {f['error']}\n-----"
)
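
The hunk above emits one logging.info for the overall count and one logging.error per failed file rather than interleaving messages with the work. A minimal, hypothetical sketch of that collect-then-report pattern (file names are placeholders):

# Illustrative only: collect failures during the batch, then report them through logging.
import logging

logging.basicConfig(level=logging.INFO)

def process(path):
    if path.endswith(".bad"):
        raise ValueError("unreadable file")

files = ["a.pdf", "b.bad", "c.pdf"]
success, failed = [], []
for f in files:
    try:
        process(f)
        success.append(f)
    except Exception as e:
        failed.append({"file": f, "error": str(e)})

logging.info(f"Progress: {len(success)}/{len(files)} files successfully processed.")
for item in failed:
    logging.error(f"Failed to process file: {item['file']} with error: {item['error']}")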

49 changes: 28 additions & 21 deletions src/pdfdeal/FileTools/extract_img.py
@@ -4,6 +4,7 @@
import os
from ..Doc2X.Exception import nomal_retry
import concurrent.futures
import logging


def get_imgcdnlink_list(text: str) -> Tuple[list, list]:
@@ -84,7 +85,7 @@ def md_replace_imgs(

imglist, imgpath = get_imgcdnlink_list(content)
if len(imglist) == 0:
print("No image links found in the markdown file.")
logging.warning("No image links found in the markdown file.")
return True

no_outputppath_flag = False
@@ -93,14 +94,14 @@
outputpath = os.path.splitext(mdfile)[0] + "_img"
os.makedirs(outputpath, exist_ok=True)

print(f"Start to download images from file {mdfile}")
logging.info(f"Start to download images from file {mdfile}")

def download_image(i, imgurl, outputpath, relative, mdfile):
if not imgurl.startswith("http"):
print(f"Not a valid url: {imgurl}, Skip it.")
logging.info(f"Not a valid url: {imgurl}, Skip it.")
return None
elif skip and imgurl.startswith(skip):
print(f"Skip the image: {imgurl}, because it starts with {skip}.")
logging.info(f"Skip the image: {imgurl}, because it starts with {skip}.")
return None
try:
savepath = f"{outputpath}/img{i}"
@@ -113,7 +114,7 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
savepath = os.path.abspath(savepath)
return (imglist[i], f"![{imgurl}](<{savepath}>)\n")
except Exception as e:
print(
logging.warning(
f"Error to download the image: {imgurl}, continue to download the next image:\n {e}"
)
return None
@@ -141,7 +142,9 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
content = content.replace(old, new)

if len(replacements) < len(imglist):
print("Some images may not be downloaded successfully. Please check the log.")
logging.info(
"Some images may not be downloaded successfully. Please check the log."
)
flag = False

if isinstance(replace, Callable):
@@ -150,7 +153,7 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
@nomal_retry()
def upload_task(i, img_path, replace):
if img_path.startswith("http://") or img_path.startswith("https://"):
print(f"Skip the image: {img_path}, because it is a url.")
logging.info(f"Skip the image: {img_path}, because it is a url.")
return None, None, None
if os.path.isabs(img_path) is False:
img_path = os.path.join(os.path.dirname(mdfile), img_path)
@@ -161,12 +164,14 @@ def upload_task(i, img_path, replace):
img_url = f"![{os.path.splitext(os.path.basename(mdfile))[0]}](<{new_url}>)\n"
return img_url, True, i
else:
print(f"=====\nError to upload the image: {img_path}, {new_url}")
print("Continue to upload the next image.")
logging.error(
f"Error to upload the image: {img_path}, {new_url}, continue to upload the next image."
)
return new_url, False, i
except Exception as e:
print(f"=====\nError to upload the image: {img_path}, {e}")
print("Continue to upload the next image.")
except Exception:
logging.exception(
f"=====\nError to upload the image: {img_path}, Continue to upload the next image:"
)
return new_url, False, i

with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
@@ -181,8 +186,9 @@ def upload_task(i, img_path, replace):
elif flag is None:
pass
else:
print(f"=====\nError to upload the image: {imgpath[i]}, {new_url}")
print("Continue to upload the next image.")
logging.warning(
f"=====\nError to upload the image: {imgpath[i]}, {new_url}, continue to upload the next image."
)
flag = False

if no_outputppath_flag:
@@ -194,12 +200,12 @@ def upload_task(i, img_path, replace):
try:
os.rmdir(outputpath)
except Exception as e:
print(f"\nError to remove the folder: {outputpath}, {e}")
logging.error(f"\nError to remove the folder: {outputpath}, {e}")

with open(mdfile, "w", encoding="utf-8") as file:
file.write(content)

print(f"Finish to process images in file {mdfile}.")
logging.info(f"Finish to process images in file {mdfile}.")
return flag


@@ -240,7 +246,7 @@ def mds_replace_imgs(

mdfiles = gen_folder_list(path=path, mode="md", recursive=True)
if len(mdfiles) == 0:
print("No markdown file found in the path.")
logging.warning("No markdown file found in the path.")
return [], [], True

import concurrent.futures
@@ -273,17 +279,18 @@ def process_mdfile(mdfile, replace, outputpath, relative):
if error:
Fail_flag = False
fail_files.append({"error": str(error), "path": mdfile})
print(f"Error to process the markdown file: {mdfile}, {error}")
print("Continue to process the next markdown file.")
logging.warning(
f"Error to process the markdown file: {mdfile}, {error}, continue to process the next markdown file."
)
else:
success_files.append(mdfile)

print(
logging.info(
f"\n[MARKDOWN REPLACE] Successfully processed {len(success_files)}/{len(mdfiles)} markdown files."
)

if Fail_flag is False:
print("Some markdown files process failed.")
logging.info("Some markdown files process failed.")
return success_files, fail_files, True

return success_files, fail_files, False
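
The helpers in this file fan image downloads and uploads out over a thread pool and log each failure instead of aborting the whole batch. A stripped-down, hypothetical sketch of that shape (names and URLs are placeholders):

# Illustrative only: thread-pool fan-out with per-item logging, modelled on the helpers above.
import concurrent.futures
import logging

logging.basicConfig(level=logging.INFO)

def fetch(url):
    if not url.startswith("http"):
        raise ValueError(f"not a valid url: {url}")
    return url  # stand-in for the real download/upload work

urls = ["http://example.com/a.png", "ftp://broken", "http://example.com/b.png"]
done = []
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(fetch, u): u for u in urls}
    for future in concurrent.futures.as_completed(futures):
        try:
            done.append(future.result())
        except Exception:
            # Keep going; logging.exception records which item failed and why.
            logging.exception(f"Error to download the image: {futures[future]}, continue with the next one:")
logging.info(f"{len(done)}/{len(urls)} images processed.")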
11 changes: 5 additions & 6 deletions src/pdfdeal/FileTools/file_tools.py
@@ -10,7 +10,7 @@
from typing import Tuple
from ..Doc2X.Types import Support_File_Type, OutputFormat
from .dealmd import split_of_md
import traceback
import logging


def clean_text(text):
@@ -341,8 +341,7 @@ def auto_split_md(
try:
new_content = split_of_md(mdfile=mdfile, mode="title")
except Exception as e:
print(traceback.format_exc())
print(f"=====\nError deal with {mdfile} : {e}")
logging.exception(f"Error deal with {mdfile} :")
return f"Error deal with {mdfile} : {e}", False

if out_type == "multi":
@@ -433,14 +432,14 @@ def auto_split_mds(
success.append("")
failed.append({"error": e, "file": mdfile})
flag = True
print(
logging.info(
f"MD SPLIT: {sum([1 for i in success if i != ''])}/{len(success)} files are successfully splited."
)
print(f"Note the split string is :\n{split_str}")
logging.warning(f"Note the split string is :\n{split_str}")
if flag:
for failed_file in failed:
if failed_file["error"] != "":
print(
logging.warning(
f"=====\nError deal with {failed_file['file']} : {failed_file['error']}"
)
return success, failed, flag