CoderWanFeng · chen66-chen · Jan 6, 2025 · Jan 6, 2025 · Mar 5, 2025
diff --git a/multi-page PDF.py b/multi-page PDF.py
@@ -0,0 +1,17 @@
+import poocr
+from pdf_invoice_utils import process_pdf_invoice
+
+# Tencent Cloud API credentials; fill in both values before running.
+SecretId = ''
+SecretKey = ''
+
+if __name__ == '__main__':
+    # Recognize every page of a multi-page PDF invoice and save to Excel.
+    process_pdf_invoice(
+        pdf_path=r'C:\Users\Lenovo\Desktop\temp\增值税发票-test.pdf',
+        output_excel='./晚枫.xlsx',
+        id=SecretId, key=SecretKey,
+    )
+
+
+
diff --git a/pdf_invoice_utils.py b/pdf_invoice_utils.py
@@ -0,0 +1,68 @@
+import os
+import poocr
+from PyPDF2 import PdfReader, PdfWriter
+import pandas as pd
+
+def process_pdf_invoice(pdf_path, output_excel, id=None, key=None):
+    """
+    Split a multi-page PDF invoice into single pages, OCR each page, and
+    merge the per-page results into one Excel file.
+
+    Intermediate files go into a private temporary directory that is always
+    removed afterwards, so files that already exist next to the PDF are never
+    deleted.  Failures are reported with print() and are not re-raised.
+
+    :param pdf_path: path of the PDF file to process
+    :param output_excel: path of the merged Excel file to write
+    :param id: Tencent Cloud API SecretId
+    :param key: Tencent Cloud API SecretKey
+    :return: None
+    """
+    import shutil
+    import tempfile
+
+    temp_dir = None
+    try:
+        # Fresh directory per call, so stale temp_*.xlsx files are never merged.
+        temp_dir = tempfile.mkdtemp(prefix="pdf_invoice_")
+
+        pdf = PdfReader(pdf_path)  # split and OCR page by page
+        temp_excels = []
+        for i, page in enumerate(pdf.pages):
+            temp_pdf = os.path.join(temp_dir, f"temp_{i}.pdf")
+            temp_excel = os.path.join(temp_dir, f"temp_{i}.xlsx")
+            temp_excels.append(temp_excel)
+
+            writer = PdfWriter()
+            writer.add_page(page)
+            with open(temp_pdf, 'wb') as f:
+                writer.write(f)
+            try:
+                poocr.ocr2excel.VatInvoiceOCR2Excel(
+                    input_path=temp_pdf,
+                    output_excel=temp_excel,
+                    id=id,
+                    key=key
+                )
+                print(f"第 {i+1} 页处理完成")
+            except Exception as e:
+                # One bad page must not abort the remaining pages.
+                print(f"第 {i+1} 页处理失败: {e}")
+
+        # Merge the workbooks of the pages that were recognized.
+        try:
+            valid_excels = [excel for excel in temp_excels if os.path.exists(excel)]
+            if valid_excels:
+                frames = [pd.read_excel(excel) for excel in valid_excels]
+                result_df = pd.concat(frames, ignore_index=True)
+                result_df.to_excel(output_excel, index=False)
+            else:
+                print("没有成功识别的发票页面，无法生成Excel")
+        except Exception as e:
+            print(f"合并结果失败: {e}")
+    except Exception as e:
+        print(f"处理失败: {e}")
+    finally:
+        # Best-effort cleanup that also runs when processing failed.
+        if temp_dir is not None:
+            shutil.rmtree(temp_dir, ignore_errors=True)
diff --git a/poocr/api/ocr2excel.py b/poocr/api/ocr2excel.py
@@ -238,7 +238,7 @@ def BankCardOCR2Excel(input_path, output_path=None, output_excel='BankCardOCR2Ex
                       configPath=None, id=None, key=None):
     """
     识别银行卡，自动保存为Excel文件
-    :param input_path: 必填，银行卡图片的位置
+    :param input_path: 必填，银行卡图片存放位置，可以填单个文件，也可以填一个目录
     :param output_path: 选填，输出Excel的位置
     :param output_excel: 选填，输出Excel的名称
     :param img_url: 选填，可以是网络图片
@@ -247,22 +247,43 @@ def BankCardOCR2Excel(input_path, output_path=None, output_excel='BankCardOCR2Ex
     :param key: 你的腾讯账号的密钥，获取方式：https://curl.qcloud.com/fuOGcm2R
     :return:
     """
-    test_json = poocr.ocr.BankCardOCR(img_path=input_path, img_url=img_url, configPath=configPath, id=id,
-                                      key=key)
-    df = pd.DataFrame(json.loads(str(test_json)), index=[0])
+    bankcard_img_files = get_files(input_path)
+    if bankcard_img_files == None:
+        raise BaseException(f'{input_path}这个路径下，没有存放任何银行卡，请确认后重新运行')
     if output_path == None:
         output_path = './'
     mkdir(Path(output_path).absolute())  # 如果不存在，则创建输出目录
     if output_excel.endswith('.xlsx') or output_excel.endswith('xls'):  # 如果指定的输出excel结尾不正确，则报错退出
         abs_output_excel = Path(output_path).absolute() / output_excel
-    df.to_excel(str(abs_output_excel), index=False)
+    else:  # 指定了，但不是xlsx或者xls结束
+        raise BaseException(
+            f'输出结果名：output_excel参数，必须以xls或者xlsx结尾，您的输入:{output_excel}有误，请修改后重新运行')
+    res_df = []  # 装全部识别的结果
+    for bankcard_img in simple_progress(bankcard_img_files):
+        try:
+            test_json = poocr.ocr.BankCardOCR(img_path=bankcard_img, img_url=img_url, configPath=configPath, id=id,
+                                              key=key)
+            res_df.append(pd.DataFrame(json.loads(str(test_json)), index=[0]))
+        except Exception as e:
+            logger.error(e)
+            continue
+    # 整理全部识别结果
+    if len(res_df) > 0:
+        res_excel = res_df[0]
+        for index, line_df in enumerate(res_df):
+            if index == 0:
+                continue
+            res_excel = res_excel._append(line_df)
+        pd.DataFrame(res_excel).to_excel(str(abs_output_excel))  # 写入Excel
+    else:
+        logger.info(f'该文件夹下，没有任何符合条件的银行卡图片')
 
 
 def LicensePlateOCR2Excel(input_path, output_path=None, output_excel='LicensePlateOCR2Excel.xlsx', img_url=None,
                           configPath=None, id=None, key=None):
     """
-    识别银行卡，自动保存为Excel文件
-    :param input_path: 必填，银行卡图片的位置
+    识别车牌，自动保存为Excel文件
+    :param input_path: 必填，车牌图片存放位置，可以填单个文件，也可以填一个目录
     :param output_path: 选填，输出Excel的位置
     :param output_excel: 选填，输出Excel的名称
     :param img_url: 选填，可以是网络图片
@@ -271,15 +292,36 @@ def LicensePlateOCR2Excel(input_path, output_path=None, output_excel='LicensePla
     :param key: 你的腾讯账号的密钥，获取方式：https://curl.qcloud.com/fuOGcm2R
     :return:
     """
-    test_json = poocr.ocr.LicensePlateOCR(img_path=input_path, img_url=img_url, configPath=configPath, id=id,
-                                          key=key)
-    df = pd.DataFrame(json.loads(str(test_json)), index=[0])
+    license_plates_img_files = get_files(input_path)
+    if license_plates_img_files == None:
+        raise BaseException(f'{input_path}这个路径下，没有存放任何车牌，请确认后重新运行')
     if output_path == None:
         output_path = './'
     mkdir(Path(output_path).absolute())  # 如果不存在，则创建输出目录
     if output_excel.endswith('.xlsx') or output_excel.endswith('xls'):  # 如果指定的输出excel结尾不正确，则报错退出
         abs_output_excel = Path(output_path).absolute() / output_excel
-    df.to_excel(str(abs_output_excel), index=False)
+    else:  # a name was given, but it does not end with xls or xlsx
+        raise BaseException(
+            f'输出结果名：output_excel参数，必须以xls或者xlsx结尾，您的输入:{output_excel}有误，请修改后重新运行')
+    res_df = []  # collects the result of every recognized image
+    for license_plates_img in simple_progress(license_plates_img_files):
+        try:
+            test_json = poocr.ocr.LicensePlateOCR(img_path=license_plates_img, img_url=img_url, configPath=configPath,
+                                                  id=id, key=key)
+            res_df.append(pd.DataFrame(json.loads(str(test_json)), index=[0]))
+        except Exception as e:
+            logger.error(e)
+            continue
+    # Combine all recognition results
+    if len(res_df) > 0:
+        res_excel = res_df[0]
+        for index, line_df in enumerate(res_df):
+            if index == 0:
+                continue
+            res_excel = res_excel._append(line_df)
+        pd.DataFrame(res_excel).to_excel(str(abs_output_excel))  # write to Excel
+    else:
+        logger.info(f'该文件夹下，没有任何符合条件的车牌图片')
 
 
 def household2excel(ak, sk, img_path, output_excel='household2excel.xlsx'):