
Commit

Merge pull request #35 from Menghuan1918/dev
[ALL] Change `print` to `logging`
Menghuan1918 authored Aug 29, 2024
2 parents 05c9503 + 99caa19 commit 0cbabdb
Showing 14 changed files with 110 additions and 85 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "pdfdeal"
version = "0.3.0"
version = "0.3.1"
authors = [{ name = "Menghuan1918", email = "[email protected]" }]
description = "A python wrapper for the Doc2X API and comes with native PDF processing (to improve PDF recall in RAG)."
readme = "README.md"
3 changes: 3 additions & 0 deletions pytest.ini
@@ -0,0 +1,3 @@
[pytest]
log_cli = true
log_cli_level = INFO
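
These two settings enable pytest's live log display at INFO level, so the logging calls introduced in this commit are visible while the test suite runs. A minimal sketch of the effect (hypothetical test file, not part of this change):

# test_logging_demo.py, illustrative only, not part of this commit
import logging

logger = logging.getLogger(__name__)

def test_logs_are_streamed_live():
    # With log_cli = true and log_cli_level = INFO in pytest.ini, this record
    # is printed to the terminal while the test runs instead of staying captured.
    logger.info("processing started")
    assert True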
6 changes: 6 additions & 0 deletions src/pdfdeal/CLI/doc2x.py
@@ -8,6 +8,7 @@
from pdfdeal import Doc2X
from pdfdeal.file_tools import get_files
from pdfdeal.Watch.config import curses_select
import logging

LANGUAGES = ["简体中文", "Enlish"]
WORDS_CN = [
@@ -153,6 +154,11 @@ def main():
args = parser.parse_args()
rpm = None

# logging set to info
logging.basicConfig(level=logging.INFO)
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.WARNING)

if args.clear:
delete_one_global_setting("Doc2X_Key")
delete_one_global_setting("Doc2X_RPM")
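
For context, a small sketch of how the two settings above interact (standard-library behaviour, not code from this diff): basicConfig sets the root logger's threshold once, and raising the httpx logger to WARNING keeps its per-request INFO lines out of the CLI output while the tool's own messages still appear.

# Illustrative only: how the root level and a per-library level interact.
import logging

logging.basicConfig(level=logging.INFO)               # root logger now emits INFO and above
logging.getLogger("httpx").setLevel(logging.WARNING)  # hide httpx's per-request INFO chatter

logging.info("shown: tool-level progress message")
logging.getLogger("httpx").info("hidden: below the httpx logger's WARNING threshold")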
61 changes: 23 additions & 38 deletions src/pdfdeal/Doc2X/Exception.py
@@ -1,9 +1,9 @@
import asyncio
from functools import wraps
import time
import traceback
import sys
from concurrent.futures import ThreadPoolExecutor
import logging


class RateLimit(Exception):
@@ -41,42 +41,35 @@ def decorator(func):
@wraps(func)
async def wrapper(*args, **kwargs):
retries = 0
while retries < max_retries:
while retries < max_retries + 1:
try:
return await func(*args, **kwargs)
except RateLimit:
raise RateLimit
except FileError as e:
print(
f"Error in function '{func.__name__}': {type(e).__name__} - {e}"
logging.exception(
f"Error in function '{func.__name__}': {type(e).__name__}:"
)
print("Error details:\n")
print(traceback.format_exc())
print("===================")
raise FileError(e)
except RequestError as e:
print(
f"Error in function '{func.__name__}': {type(e).__name__} - {e}"
logging.exception(
f"Error in function '{func.__name__}': {type(e).__name__}:"
)
print("Error details:\n")
print(traceback.format_exc())
print("===================")
raise RequestError(f"{e} \nThis usually means the file is broken.")
except Exception as e:
last_exception = e
if retries == max_retries:
logging.exception(
f"Error in function '{func.__name__}': {type(e).__name__}:"
)
break
wait_time = backoff_factor**retries
print("===================")
print(
logging.warning(
f"⚠️ Exception in function '{func.__name__}': {type(e).__name__} - {e}"
)
print(f"⌛ Retrying in {wait_time} seconds.")
logging.warning(f"⌛ Retrying in {wait_time} seconds.")
await asyncio.sleep(wait_time)
retries += 1
if retries == max_retries:
print("===================")
print("Error details:\n")
print(traceback.format_exc())
print("===================")
raise last_exception

return wrapper
@@ -95,38 +88,30 @@ def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
retries = 0
while retries < max_retries:
while retries < max_retries + 1:
try:
return func(*args, **kwargs)
except RateLimit:
raise RateLimit
except FileError as e:
print(
f"Error in function '{func.__name__}': {type(e).__name__} - {e}"
logging.exception(
f"Error in function '{func.__name__}': {type(e).__name__}:"
)
print("===================")
print("Error details:\n")
print(traceback.format_exc())
print("===================")
raise e
except Exception as e:
print(
f"Error in function '{func.__name__}': {type(e).__name__} - {e}"
)
last_exception = e
if retries == max_retries:
logging.exception(
f"Error in function '{func.__name__}': {type(e).__name__}:"
)
break
wait_time = backoff_factor**retries
print("===================")
print(
logging.warning(
f"⚠️ Exception in function '{func.__name__}': {type(e).__name__} - {e}"
)
print(f"⌛ Retrying in {wait_time} seconds.")
logging.warning(f"⌛ Retrying in {wait_time} seconds.")
time.sleep(wait_time)
retries += 1
if retries == max_retries:
print("===================")
print("Error details:\n")
print(traceback.format_exc())
print("===================")
raise last_exception

return wrapper
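
Both decorators in this file now follow the same shape: retry with exponential backoff, logging.warning on each intermediate failure, and logging.exception (message plus traceback) once the retries are exhausted. A condensed, hypothetical sketch of that pattern follows; the name and default values are assumptions, not the module's actual signature:

# Illustrative only: simplified version of the retry/backoff pattern used above.
import logging
import time
from functools import wraps

def retry_with_backoff(max_retries=3, backoff_factor=2):  # hypothetical name and defaults
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0
            while retries < max_retries + 1:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    last_exception = e
                    if retries == max_retries:
                        # logging.exception records the message plus the full traceback at ERROR level
                        logging.exception(f"Error in function '{func.__name__}':")
                        break
                    wait_time = backoff_factor**retries
                    logging.warning(f"Retrying in {wait_time} seconds.")
                    time.sleep(wait_time)
                    retries += 1
            raise last_exception
        return wrapper
    return decorator

As the diff reads, the new loop bound (retries < max_retries + 1) gives one initial attempt plus max_retries retries, with no backoff sleep after the final failure.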
3 changes: 2 additions & 1 deletion src/pdfdeal/FileTools/Img/Ali_OSS.py
@@ -1,4 +1,5 @@
import oss2
import logging


class OSS:
@@ -32,7 +33,7 @@ def upload_file(self, local_file_path, remote_file_path):
True,
)
except Exception as e:
print(f"Error to upload the file: {local_file_path}, {e}")
logging.error(f"Error to upload the file: {local_file_path}, {e}")
return e, False


3 changes: 2 additions & 1 deletion src/pdfdeal/FileTools/Img/S3.py
@@ -1,5 +1,6 @@
import boto3
from botocore.exceptions import NoCredentialsError, ClientError
import logging


class S3_Settings:
@@ -52,7 +53,7 @@ def upload_file(self, local_file_path, remote_file_path):
True,
)
except (NoCredentialsError, ClientError) as e:
print(f"Error to upload the file: {local_file_path}, {e}")
logging.exception(f"Error to upload the file: {local_file_path}, {e}")
return e, False
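
Note that the two uploaders end up at different severities: Ali_OSS.py logs with logging.error (message only), while S3.py uses logging.exception, which adds the current traceback to the same ERROR-level record. A small illustration, outside this codebase:

# Illustrative only: logging.error versus logging.exception inside an except block.
import logging

def risky_upload():
    raise ValueError("credentials rejected")

try:
    risky_upload()
except Exception as e:
    logging.error(f"Error to upload the file: {e}")  # one line, no traceback
    logging.exception("Error to upload the file:")   # same ERROR record plus the stack trace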


7 changes: 4 additions & 3 deletions src/pdfdeal/FileTools/dealpdfs.py
@@ -9,6 +9,7 @@
import uuid
from typing import Tuple, Callable
from .file_tools import list_rename
import logging


def strore_pdf(pdf_path, Text):
@@ -141,17 +142,17 @@ def deal_pdf(
success_file.append("")
failed_file.append({"error": str(e), "file": pdf_path})
error_flag = True
print(
logging.info(
f"PDFDEAL Progress: {sum(1 for s in success_file if s != '')}/{len(pdf_file)} files successfully processed."
)
if All_Done is False:
print(
logging.warning(
"Some pictures are failed to OCR, but the text and reset pictures is extracted"
)
if error_flag:
for f in failed_file:
if f["error"] != "":
print(
logging.error(
f"-----\nFailed to process file: {f['file']} with error: {f['error']}\n-----"
)
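
The hunk above emits one logging.info for the overall count and one logging.error per failed file rather than interleaving messages with the work. A minimal, hypothetical sketch of that collect-then-report pattern (file names are placeholders):

# Illustrative only: collect failures during the batch, then report them through logging.
import logging

logging.basicConfig(level=logging.INFO)

def process(path):
    if path.endswith(".bad"):
        raise ValueError("unreadable file")

files = ["a.pdf", "b.bad", "c.pdf"]
success, failed = [], []
for f in files:
    try:
        process(f)
        success.append(f)
    except Exception as e:
        failed.append({"file": f, "error": str(e)})

logging.info(f"Progress: {len(success)}/{len(files)} files successfully processed.")
for item in failed:
    logging.error(f"Failed to process file: {item['file']} with error: {item['error']}")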

49 changes: 28 additions & 21 deletions src/pdfdeal/FileTools/extract_img.py
@@ -4,6 +4,7 @@
import os
from ..Doc2X.Exception import nomal_retry
import concurrent.futures
import logging


def get_imgcdnlink_list(text: str) -> Tuple[list, list]:
@@ -84,7 +85,7 @@ def md_replace_imgs(

imglist, imgpath = get_imgcdnlink_list(content)
if len(imglist) == 0:
print("No image links found in the markdown file.")
logging.warning("No image links found in the markdown file.")
return True

no_outputppath_flag = False
@@ -93,14 +94,14 @@
outputpath = os.path.splitext(mdfile)[0] + "_img"
os.makedirs(outputpath, exist_ok=True)

print(f"Start to download images from file {mdfile}")
logging.info(f"Start to download images from file {mdfile}")

def download_image(i, imgurl, outputpath, relative, mdfile):
if not imgurl.startswith("http"):
print(f"Not a valid url: {imgurl}, Skip it.")
logging.info(f"Not a valid url: {imgurl}, Skip it.")
return None
elif skip and imgurl.startswith(skip):
print(f"Skip the image: {imgurl}, because it starts with {skip}.")
logging.info(f"Skip the image: {imgurl}, because it starts with {skip}.")
return None
try:
savepath = f"{outputpath}/img{i}"
@@ -113,7 +114,7 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
savepath = os.path.abspath(savepath)
return (imglist[i], f"![{imgurl}](<{savepath}>)\n")
except Exception as e:
print(
logging.warning(
f"Error to download the image: {imgurl}, continue to download the next image:\n {e}"
)
return None
@@ -141,7 +142,9 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
content = content.replace(old, new)

if len(replacements) < len(imglist):
print("Some images may not be downloaded successfully. Please check the log.")
logging.info(
"Some images may not be downloaded successfully. Please check the log."
)
flag = False

if isinstance(replace, Callable):
@@ -150,7 +153,7 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
@nomal_retry()
def upload_task(i, img_path, replace):
if img_path.startswith("http://") or img_path.startswith("https://"):
print(f"Skip the image: {img_path}, because it is a url.")
logging.info(f"Skip the image: {img_path}, because it is a url.")
return None, None, None
if os.path.isabs(img_path) is False:
img_path = os.path.join(os.path.dirname(mdfile), img_path)
@@ -161,12 +164,14 @@ def upload_task(i, img_path, replace):
img_url = f"![{os.path.splitext(os.path.basename(mdfile))[0]}](<{new_url}>)\n"
return img_url, True, i
else:
print(f"=====\nError to upload the image: {img_path}, {new_url}")
print("Continue to upload the next image.")
logging.error(
f"Error to upload the image: {img_path}, {new_url}, continue to upload the next image."
)
return new_url, False, i
except Exception as e:
print(f"=====\nError to upload the image: {img_path}, {e}")
print("Continue to upload the next image.")
except Exception:
logging.exception(
f"=====\nError to upload the image: {img_path}, Continue to upload the next image:"
)
return new_url, False, i

with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
@@ -181,8 +186,9 @@ def upload_task(i, img_path, replace):
elif flag is None:
pass
else:
print(f"=====\nError to upload the image: {imgpath[i]}, {new_url}")
print("Continue to upload the next image.")
logging.warning(
f"=====\nError to upload the image: {imgpath[i]}, {new_url}, continue to upload the next image."
)
flag = False

if no_outputppath_flag:
@@ -194,12 +200,12 @@ def upload_task(i, img_path, replace):
try:
os.rmdir(outputpath)
except Exception as e:
print(f"\nError to remove the folder: {outputpath}, {e}")
logging.error(f"\nError to remove the folder: {outputpath}, {e}")

with open(mdfile, "w", encoding="utf-8") as file:
file.write(content)

print(f"Finish to process images in file {mdfile}.")
logging.info(f"Finish to process images in file {mdfile}.")
return flag


@@ -240,7 +246,7 @@ def mds_replace_imgs(

mdfiles = gen_folder_list(path=path, mode="md", recursive=True)
if len(mdfiles) == 0:
print("No markdown file found in the path.")
logging.warning("No markdown file found in the path.")
return [], [], True

import concurrent.futures
@@ -273,17 +279,18 @@ def process_mdfile(mdfile, replace, outputpath, relative):
if error:
Fail_flag = False
fail_files.append({"error": str(error), "path": mdfile})
print(f"Error to process the markdown file: {mdfile}, {error}")
print("Continue to process the next markdown file.")
logging.warning(
f"Error to process the markdown file: {mdfile}, {error}, continue to process the next markdown file."
)
else:
success_files.append(mdfile)

print(
logging.info(
f"\n[MARKDOWN REPLACE] Successfully processed {len(success_files)}/{len(mdfiles)} markdown files."
)

if Fail_flag is False:
print("Some markdown files process failed.")
logging.info("Some markdown files process failed.")
return success_files, fail_files, True

return success_files, fail_files, False
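
The helpers in this file fan image downloads and uploads out over a thread pool and log each failure instead of aborting the whole batch. A stripped-down, hypothetical sketch of that shape (names and URLs are placeholders):

# Illustrative only: thread-pool fan-out with per-item logging, modelled on the helpers above.
import concurrent.futures
import logging

logging.basicConfig(level=logging.INFO)

def fetch(url):
    if not url.startswith("http"):
        raise ValueError(f"not a valid url: {url}")
    return url  # stand-in for the real download/upload work

urls = ["http://example.com/a.png", "ftp://broken", "http://example.com/b.png"]
done = []
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(fetch, u): u for u in urls}
    for future in concurrent.futures.as_completed(futures):
        try:
            done.append(future.result())
        except Exception:
            # Keep going; logging.exception records which item failed and why.
            logging.exception(f"Error to download the image: {futures[future]}, continue with the next one:")
logging.info(f"{len(done)}/{len(urls)} images processed.")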
11 changes: 5 additions & 6 deletions src/pdfdeal/FileTools/file_tools.py
@@ -10,7 +10,7 @@
from typing import Tuple
from ..Doc2X.Types import Support_File_Type, OutputFormat
from .dealmd import split_of_md
import traceback
import logging


def clean_text(text):
@@ -341,8 +341,7 @@ def auto_split_md(
try:
new_content = split_of_md(mdfile=mdfile, mode="title")
except Exception as e:
print(traceback.format_exc())
print(f"=====\nError deal with {mdfile} : {e}")
logging.exception(f"Error deal with {mdfile} :")
return f"Error deal with {mdfile} : {e}", False

if out_type == "multi":
@@ -433,14 +432,14 @@ def auto_split_mds(
success.append("")
failed.append({"error": e, "file": mdfile})
flag = True
print(
logging.info(
f"MD SPLIT: {sum([1 for i in success if i != ''])}/{len(success)} files are successfully splited."
)
print(f"Note the split string is :\n{split_str}")
logging.warning(f"Note the split string is :\n{split_str}")
if flag:
for failed_file in failed:
if failed_file["error"] != "":
print(
logging.warning(
f"=====\nError deal with {failed_file['file']} : {failed_file['error']}"
)
return success, failed, flag