From e8acf80d49e30cd483c14fd470a556f7b2239fab Mon Sep 17 00:00:00 2001
From: ShangDaFen-GD
Date: Mon, 6 Jan 2025 18:34:11 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=86=E5=88=AB?=
 =?UTF-8?q?=E9=93=B6=E8=A1=8C=E5=8D=A1=E5=88=B0Excel=E3=80=81=E8=AF=86?=
 =?UTF-8?q?=E5=88=AB=E8=BD=A6=E7=89=8C=E5=88=B0Excel=EF=BC=8C=E4=BD=BFinpu?=
 =?UTF-8?q?t=5Fpath=E6=97=A2=E5=8F=AF=E4=BB=A5=E5=A1=AB=E5=8D=95=E4=B8=AA?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6=EF=BC=8C=E5=8F=88=E5=8F=AF=E4=BB=A5=E5=A1=AB?=
 =?UTF-8?q?=E4=B8=80=E4=B8=AA=E7=9B=AE=E5=BD=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this section is ignored by git am):
- LicensePlateOCR2Excel keeps the `output_path = './'` fallback; removing it made
  Path(None) raise TypeError whenever output_path was omitted.
- Per-image results are merged with pd.concat and written with index=False instead
  of the private DataFrame._append and a spurious row-index column.
- Invalid arguments raise FileNotFoundError / ValueError rather than BaseException,
  so callers using `except Exception` can handle them.
- NOTE(review): the blob ids on the index line predate these amendments, and the
  leading whitespace of context lines was reconstructed; confirm against the
  target file (or apply with --ignore-whitespace).

 poocr/api/ocr2excel.py | 52 +++++++++++++++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/poocr/api/ocr2excel.py b/poocr/api/ocr2excel.py
index 7c6879d..c99cea2 100644
--- a/poocr/api/ocr2excel.py
+++ b/poocr/api/ocr2excel.py
@@ -238,7 +238,7 @@ def BankCardOCR2Excel(input_path, output_path=None, output_excel='BankCardOCR2Ex
                       configPath=None, id=None, key=None):
     """
     识别银行卡,自动保存为Excel文件
-    :param input_path: 必填,银行卡图片的位置
+    :param input_path: Required. Location of the bank card image(s): a single file or a directory.
     :param output_path: 选填,输出Excel的位置
     :param output_excel: 选填,输出Excel的名称
     :param img_url: 选填,可以是网络图片
@@ -247,22 +247,37 @@ def BankCardOCR2Excel(input_path, output_path=None, output_excel='BankCardOCR2Ex
     :param key: 你的腾讯账号的密钥,获取方式:https://curl.qcloud.com/fuOGcm2R
     :return:
     """
-    test_json = poocr.ocr.BankCardOCR(img_path=input_path, img_url=img_url, configPath=configPath, id=id,
-                                      key=key)
-    df = pd.DataFrame(json.loads(str(test_json)), index=[0])
+    bankcard_img_files = get_files(input_path)
+    if bankcard_img_files is None:
+        raise FileNotFoundError(f'{input_path}这个路径下,没有存放任何银行卡,请确认后重新运行')
     if output_path == None:
         output_path = './'
     mkdir(Path(output_path).absolute())  # 如果不存在,则创建输出目录
     if output_excel.endswith('.xlsx') or output_excel.endswith('xls'):  # 如果指定的输出excel结尾不正确,则报错退出
         abs_output_excel = Path(output_path).absolute() / output_excel
-    df.to_excel(str(abs_output_excel), index=False)
+    else:  # A name was given, but it does not end in xls/xlsx.
+        raise ValueError(
+            f'输出结果名:output_excel参数,必须以xls或者xlsx结尾,您的输入:{output_excel}有误,请修改后重新运行')
+    res_df = []  # One single-row DataFrame per successfully recognised image.
+    for bankcard_img in simple_progress(bankcard_img_files):
+        try:
+            test_json = poocr.ocr.BankCardOCR(img_path=bankcard_img, img_url=img_url, configPath=configPath, id=id,
+                                              key=key)
+            res_df.append(pd.DataFrame(json.loads(str(test_json)), index=[0]))
+        except Exception as e:  # Best effort: one failing image must not abort the whole batch.
+            logger.error(e)
+    if res_df:
+        # pd.concat is the public replacement for DataFrame._append; index=False omits the row index.
+        pd.concat(res_df, ignore_index=True).to_excel(str(abs_output_excel), index=False)
+    else:
+        logger.info('该文件夹下,没有任何符合条件的银行卡图片')
 
 
 def LicensePlateOCR2Excel(input_path, output_path=None, output_excel='LicensePlateOCR2Excel.xlsx', img_url=None,
                           configPath=None, id=None, key=None):
     """
-    识别银行卡,自动保存为Excel文件
-    :param input_path: 必填,银行卡图片的位置
+    Recognise license plates and save the results to an Excel file.
+    :param input_path: Required. Location of the license plate image(s): a single file or a directory.
     :param output_path: 选填,输出Excel的位置
     :param output_excel: 选填,输出Excel的名称
     :param img_url: 选填,可以是网络图片
@@ -271,15 +286,30 @@ def LicensePlateOCR2Excel(input_path, output_path=None, output_excel='LicensePla
     :param key: 你的腾讯账号的密钥,获取方式:https://curl.qcloud.com/fuOGcm2R
     :return:
     """
-    test_json = poocr.ocr.LicensePlateOCR(img_path=input_path, img_url=img_url, configPath=configPath, id=id,
-                                          key=key)
-    df = pd.DataFrame(json.loads(str(test_json)), index=[0])
+    license_plates_img_files = get_files(input_path)
+    if license_plates_img_files is None:
+        raise FileNotFoundError(f'{input_path}这个路径下,没有存放任何车牌,请确认后重新运行')
     if output_path == None:
         output_path = './'
     mkdir(Path(output_path).absolute())  # 如果不存在,则创建输出目录
     if output_excel.endswith('.xlsx') or output_excel.endswith('xls'):  # 如果指定的输出excel结尾不正确,则报错退出
         abs_output_excel = Path(output_path).absolute() / output_excel
-    df.to_excel(str(abs_output_excel), index=False)
+    else:  # A name was given, but it does not end in xls/xlsx.
+        raise ValueError(
+            f'输出结果名:output_excel参数,必须以xls或者xlsx结尾,您的输入:{output_excel}有误,请修改后重新运行')
+    res_df = []  # One single-row DataFrame per successfully recognised image.
+    for license_plates_img in simple_progress(license_plates_img_files):
+        try:
+            test_json = poocr.ocr.LicensePlateOCR(img_path=license_plates_img, img_url=img_url, configPath=configPath,
+                                                  id=id, key=key)
+            res_df.append(pd.DataFrame(json.loads(str(test_json)), index=[0]))
+        except Exception as e:  # Best effort: one failing image must not abort the whole batch.
+            logger.error(e)
+    if res_df:
+        # pd.concat is the public replacement for DataFrame._append; index=False omits the row index.
+        pd.concat(res_df, ignore_index=True).to_excel(str(abs_output_excel), index=False)
+    else:
+        logger.info('该文件夹下,没有任何符合条件的车牌图片')
 
 
 def household2excel(ak, sk, img_path, output_excel='household2excel.xlsx'):

From 2a7751778bc861cab0d31f5a0ba6c61a53f92a55 Mon Sep 17 00:00:00 2001
From: Chryssolion Chen <151327326+chen66-chen@users.noreply.github.com>
Date: Wed, 5 Mar 2025 21:18:35 +0800
Subject: [PATCH 2/2] Add files via upload

add multi-page PDF
---
Review notes (this section is ignored by git am):
- process_pdf_invoice now works inside tempfile.TemporaryDirectory. The earlier
  version created a "temp" folder next to the PDF and deleted every file in it,
  including files that were already there, and never removed the folder itself.
- The bare `except: pass` around the clean-up is gone with the manual clean-up.
- NOTE(review): the blob ids on the index lines predate these amendments.

 multi-page PDF.py    | 14 ++++++++++
 pdf_invoice_utils.py | 58 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 multi-page PDF.py
 create mode 100644 pdf_invoice_utils.py

diff --git a/multi-page PDF.py b/multi-page PDF.py
new file mode 100644
index 0000000..7b59aff
--- /dev/null
+++ b/multi-page PDF.py
@@ -0,0 +1,14 @@
+import poocr
+from pdf_invoice_utils import process_pdf_invoice
+
+# Tencent Cloud API credentials; fill these in before running.
+SecretId = ''
+SecretKey = ''
+
+# Recognise every page of a multi-page PDF invoice and save the results to one Excel file.
+process_pdf_invoice(
+    pdf_path=r'C:\Users\Lenovo\Desktop\temp\增值税发票-test.pdf',
+    output_excel='./晚枫.xlsx',
+    id=SecretId,
+    key=SecretKey
+)
diff --git a/pdf_invoice_utils.py b/pdf_invoice_utils.py
new file mode 100644
index 0000000..57ccd6f
--- /dev/null
+++ b/pdf_invoice_utils.py
@@ -0,0 +1,58 @@
+import os
+import tempfile
+
+import pandas as pd
+import poocr
+from PyPDF2 import PdfReader, PdfWriter
+
+
+def process_pdf_invoice(pdf_path, output_excel, id=None, key=None):
+    """
+    Split a multi-page PDF invoice into single pages, OCR each page and merge the results into one Excel file.
+
+    Errors are reported with ``print`` instead of being raised: a page that fails recognition is skipped,
+    and any other failure ends the run after its message is printed.
+
+    :param pdf_path: path of the PDF file
+    :param output_excel: path of the Excel file to write
+    :param id: Tencent Cloud API SecretId
+    :param key: Tencent Cloud API SecretKey
+    :return: None
+    """
+    try:
+        # Work in a private scratch directory: clean-up cannot touch files the user keeps next to the
+        # PDF, and the directory is removed even when a step below fails.
+        with tempfile.TemporaryDirectory(prefix='pdf_invoice_') as temp_dir:
+            temp_excels = []
+            for page_no, page in enumerate(PdfReader(pdf_path).pages, start=1):
+                writer = PdfWriter()
+                writer.add_page(page)
+                temp_pdf = os.path.join(temp_dir, f"temp_{page_no}.pdf")
+                temp_excel = os.path.join(temp_dir, f"temp_{page_no}.xlsx")
+                temp_excels.append(temp_excel)
+
+                with open(temp_pdf, 'wb') as f:
+                    writer.write(f)
+                try:
+                    poocr.ocr2excel.VatInvoiceOCR2Excel(
+                        input_path=temp_pdf,
+                        output_excel=temp_excel,
+                        id=id,
+                        key=key
+                    )
+                    print(f"第 {page_no} 页处理完成")
+                except Exception as e:  # Best effort: carry on with the remaining pages.
+                    print(f"第 {page_no} 页处理失败: {e}")
+
+            # Merge only the pages whose workbook was actually written.
+            valid_excels = [excel for excel in temp_excels if os.path.exists(excel)]
+            if not valid_excels:
+                print("没有成功识别的发票页面,无法生成Excel")
+                return
+            try:
+                frames = [pd.read_excel(excel) for excel in valid_excels]
+                pd.concat(frames, ignore_index=True).to_excel(output_excel, index=False)
+            except Exception as e:
+                print(f"合并结果失败: {e}")
+    except Exception as e:
+        print(f"处理失败: {e}")