Skip to content

Commit

Permalink
[filetools] Bug fix and change for API translate
Browse files Browse the repository at this point in the history
  • Loading branch information
Menghuan1918 committed Dec 20, 2024
1 parent 8324c39 commit 97a8d36
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 23 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "pdfdeal"
version = "1.0.1"
version = "1.0.2"
authors = [{ name = "Menghuan1918", email = "[email protected]" }]
description = "A python wrapper for the Doc2X API and comes with native texts processing (to improve texts recall in RAG)."
readme = "README.md"
Expand Down
43 changes: 21 additions & 22 deletions src/pdfdeal/FileTools/extract_img.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import hashlib
from ..Doc2X.Exception import nomal_retry
import concurrent.futures
import logging

import uuid


Expand Down Expand Up @@ -79,6 +79,7 @@ def md_replace_imgs(
Returns:
bool: If all images are downloaded successfully, return True, else return False.
"""
flag = True
if isinstance(replace, str) and replace == "local":
pass
elif isinstance(replace, Callable):
Expand All @@ -91,7 +92,7 @@ def md_replace_imgs(

imglist, imgpath = get_imgcdnlink_list(content)
if len(imglist) == 0:
logging.warning("No image links found in the markdown file.")
print("No image links found in the markdown file.")
return True

no_outputppath_flag = False
Expand All @@ -100,14 +101,14 @@ def md_replace_imgs(
outputpath = os.path.splitext(mdfile)[0] + "_img"
os.makedirs(outputpath, exist_ok=True)

logging.info(f"Start to download images from file {mdfile}")
print(f"Start to download images from file {mdfile}")

def download_image(i, imgurl, outputpath, relative, mdfile):
if not imgurl.startswith("http"):
logging.info(f"Not a valid url: {imgurl}, Skip it.")
print(f"Not a valid url: {imgurl}, Skip it.")
return None
elif skip and imgurl.startswith(skip):
logging.info(f"Skip the image: {imgurl}, because it starts with {skip}.")
print(f"Skip the image: {imgurl}, because it starts with {skip}.")
return None
try:
savepath = f"{outputpath}/img{i}"
Expand All @@ -120,9 +121,7 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
savepath = os.path.abspath(savepath)
return (imglist[i], f"![{imgurl}](<{savepath}>)\n")
except Exception as e:
logging.warning(
f"Error to download the image: {imgurl}, keep original url:\n {e}"
)
print(f"Error to download the image: {imgurl}, keep original url:\n {e}")
return None

replacements = []
Expand All @@ -147,7 +146,7 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
content = content.replace(old, new)

if len(replacements) < len(imglist):
logging.info(
print(
"Some images were not downloaded successfully. Original URLs have been kept."
)
flag = False
Expand All @@ -158,7 +157,7 @@ def download_image(i, imgurl, outputpath, relative, mdfile):
@nomal_retry()
def upload_task(i, img_path, replace):
if img_path.startswith("http://") or img_path.startswith("https://"):
logging.info(f"Skip the image: {img_path}, because it is a url.")
print(f"Skip the image: {img_path}, because it is a url.")
return None, None, None
if os.path.isabs(img_path) is False:
img_path = os.path.join(os.path.dirname(mdfile), img_path)
Expand All @@ -172,7 +171,7 @@ def upload_task(i, img_path, replace):
os.rename(img_path, new_local_path)
img_path = new_local_path # 更新img_path为新的路径
except Exception as e:
logging.warning(
print(
f"Failed to rename file {img_path} to {new_local_path}: {e}"
)
if path_style:
Expand All @@ -187,13 +186,13 @@ def upload_task(i, img_path, replace):
img_url = f"![{os.path.splitext(os.path.basename(mdfile))[0]}](<{new_url}>)\n"
return img_url, True, i
else:
logging.error(
print(
f"Error to upload the image: {img_path}, {new_url}, keeping original path."
)
return new_url, False, i
except Exception:
logging.exception(
f"=====\nError to upload the image: {img_path}, keeping original path:"
except Exception as e:
print(
f"=====\nError to upload the image: {img_path}, as {e} keeping original path:"
)
return None, False, i

Expand All @@ -209,7 +208,7 @@ def upload_task(i, img_path, replace):
elif flag is None:
pass
else:
logging.warning(
print(
f"=====\nError to upload the image: {imgpath[i]}, keeping original path."
)
flag = False
Expand All @@ -225,12 +224,12 @@ def upload_task(i, img_path, replace):
if not os.listdir(outputpath):
os.rmdir(outputpath)
except Exception as e:
logging.error(f"\nError to remove the folder: {outputpath}, {e}")
print(f"\nError to remove the folder: {outputpath}, {e}")

with open(mdfile, "w", encoding="utf-8") as file:
file.write(content)

logging.info(f"Finish to process images in file {mdfile}.")
print(f"Finish to process images in file {mdfile}.")
return flag


Expand Down Expand Up @@ -275,7 +274,7 @@ def mds_replace_imgs(

mdfiles = gen_folder_list(path=path, mode="md", recursive=True)
if len(mdfiles) == 0:
logging.warning("No markdown file found in the path.")
print("No markdown file found in the path.")
return [], [], True

import concurrent.futures
Expand Down Expand Up @@ -310,18 +309,18 @@ def process_mdfile(mdfile, replace, outputpath, relative):
if error:
Fail_flag = False
fail_files.append({"error": str(error), "path": mdfile})
logging.warning(
print(
f"Error to process the markdown file: {mdfile}, {error}, continue to process the next markdown file."
)
else:
success_files.append(mdfile)

logging.info(
print(
f"\n[MARKDOWN REPLACE] Successfully processed {len(success_files)}/{len(mdfiles)} markdown files."
)

if Fail_flag is False:
logging.info("Some markdown files process failed.")
print("Some markdown files process failed.")
return success_files, fail_files, True

return success_files, fail_files, False

0 comments on commit 97a8d36

Please sign in to comment.