8
8
from pathlib import Path
9
9
10
10
import pymupdf
11
- from joblib import Parallel , delayed
12
11
from natsort import natsorted , ns
13
12
from PIL import Image
14
13
@@ -34,7 +33,7 @@ def predict(base: Path, input_file: Path, args: list[str]) -> None:
34
33
[
35
34
"bash" ,
36
35
"-c" ,
37
- f"ocrmypdf --jobs 1 { ' ' .join (args )} { input_file } { output_file } " ,
36
+ f"ocrmypdf { ' ' .join (args )} { input_file } { output_file } " ,
38
37
],
39
38
check = True ,
40
39
)
@@ -45,50 +44,77 @@ def predict(base: Path, input_file: Path, args: list[str]) -> None:
45
44
pass
46
45
47
46
47
def cleanup(root: str, files: list[str]) -> None:
    """
    Removes the directory if it holds no files

    Removal is best effort: a directory that cannot be removed (for example
    because it still contains subdirectories or no longer exists) is left
    untouched and no error is raised.

    Args:
        root (str): The directory to remove
        files (list[str]): The files found directly inside the directory
    """
    if files:
        return

    try:
        os.rmdir(root)
    except OSError:
        # Only filesystem failures are ignored (not empty, missing, no
        # permission); anything else is a programming error and propagates.
        pass
60
+
61
+
62
def merge(base: Path, root: str, files: list[str]) -> None:
    """
    Merges the PDFs of a directory into a single PDF

    The PDFs are appended in natural, case-insensitive order of their
    document names and the result is saved as root + ".pdf". Nothing is done
    when root is the "done" directory itself or when files holds no PDF.

    Args:
        base (Path): The base directory
        root (str): The directory whose PDFs are merged
        files (list[str]): The names of the files inside root
    """
    proot = Path(root)
    if proot == base / "done":
        return

    pdf_paths = [proot / file for file in files if file.lower().endswith(".pdf")]
    if not pdf_paths:
        return

    pdf_list = []
    merged = None
    try:
        # Open inside the try block so that a failure half-way through still
        # closes the documents opened so far.
        for pdf_path in pdf_paths:
            pdf_list.append(pymupdf.open(pdf_path))

        merged = pymupdf.open()
        for pdf in natsorted(pdf_list, key=lambda x: x.name, alg=ns.IGNORECASE):
            merged.insert_pdf(pdf)

        merged.save(Path(root + ".pdf"), garbage=4, deflate=True)
    finally:
        if merged is not None:
            merged.close()
        for pdf in pdf_list:
            pdf.close()
90
+
91
+
48
92
if __name__ == "__main__":
    # First command-line argument is the working directory; defaults to the
    # current directory.
    pdfs = Path(sys.argv[1] if len(sys.argv) > 1 else ".")
    pdfs.mkdir(exist_ok=True, parents=True)
    (pdfs / "todo").mkdir(exist_ok=True, parents=True)
    (pdfs / "done").mkdir(exist_ok=True, parents=True)

    # Remaining command-line arguments are passed to predict for every file.
    # They do not change between files, so they are computed once.
    ocr_args = (
        sys.argv[2:]
        if len(sys.argv) > 2
        else [
            "--rotate-pages",
            "--deskew",
            "--skip-text",
            "--invalidate-digital-signatures",
            "--clean",
        ]
    )

    for root, _, files in os.walk(pdfs / "todo"):
        for file in files:
            predict(pdfs, Path(root) / file, ocr_args)

    # Remove empty directories, deepest first, so that a parent that becomes
    # empty once its children are removed is removed as well.
    for root, _, files in os.walk(pdfs / "todo", topdown=False):
        cleanup(root, files)

    # Merge PDFs
    for root, _, files in os.walk(pdfs / "done"):
        merge(pdfs, root, files)
0 commit comments