forked from VikParuchuri/marker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_single.py
executable file
·45 lines (33 loc) · 1.9 KB
/
convert_single.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import time
import pypdfium2 # Needs to be at the top to avoid warnings
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS
import argparse
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.output import save_markdown
configure_logging()
def main():
    """Convert a single PDF to markdown from the command line.

    Parses the CLI arguments, loads the models with ``load_all_models``,
    runs ``convert_single_pdf`` on the given file, and hands the result to
    ``save_markdown`` together with the output base folder. Prints the
    folder reported by ``save_markdown`` and, when ``--debug`` is set, the
    elapsed time since conversion started.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="PDF file to parse")
    parser.add_argument("output", help="Output base folder path")
    parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
    parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
    parser.add_argument("--langs", type=str, help="Optional languages to use for OCR, comma separated", default=None)
    parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
    parser.add_argument("--debug", action="store_true", help="Enable debug logging", default=False)
    parser.add_argument("--ocr_all_pages", action="store_true", help="Force OCR on all pages", default=False)
    args = parser.parse_args()

    # Tolerate whitespace and stray commas in --langs ("en, fr", "en,,fr",
    # "en,"): strip each entry and drop empty ones so no blank or padded
    # language name is passed on. Fall back to None when nothing is left.
    langs = None
    if args.langs:
        langs = [part.strip() for part in args.langs.split(",") if part.strip()] or None

    fname = args.filename
    model_lst = load_all_models()

    # The timer starts after the models are loaded, so the reported time
    # covers conversion and saving only. perf_counter() is monotonic, so the
    # measurement is not skewed by system clock adjustments.
    start = time.perf_counter()
    full_text, images, out_meta = convert_single_pdf(
        fname,
        model_lst,
        max_pages=args.max_pages,
        langs=langs,
        batch_multiplier=args.batch_multiplier,
        start_page=args.start_page,
        ocr_all_pages=args.ocr_all_pages,
    )

    # save_markdown receives only the file's base name, not the input path.
    fname = os.path.basename(fname)
    subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)

    print(f"Saved markdown to the {subfolder_path} folder")
    # Within this function, --debug only gates the timing printout.
    if args.debug:
        print(f"Total time: {time.perf_counter() - start}")


if __name__ == "__main__":
    main()