1+ from contextlib import contextmanager
2+ import io
13import os
24import sys
35from pathlib import Path
4- from typing import Union
6+ from typing import Union , Any , IO , TypeVar
57
68from pypdf import PdfReader
79from pypdf import PdfWriter
8- from pypdf ._utils import StrByteType
910
1011from .core import TableList
1112from .parsers import Lattice
1213from .parsers import Stream
1314from .utils import TemporaryDirectory
14- from .utils import download_url
15+ from .utils import InvalidArguments
16+ from .utils import get_url_bytes
1517from .utils import get_page_layout
1618from .utils import get_rotation
1719from .utils import get_text_objects
1820from .utils import is_url
1921
# Type alias for the `filepath` argument of PDFHandler: a string path/URL,
# an open file-like object, a pathlib.Path, or None.
# This is a plain alias rather than a TypeVar: TypeVar's first argument must
# be a string name, so passing a Union there does not define a usable type.
FilePathType = Union[str, IO[Any], Path, None]
2023
2124class PDFHandler :
2225 """Handles all operations like temp directory creation, splitting
@@ -25,21 +28,35 @@ class PDFHandler:
2528
2629 Parameters
2730 ----------
28- filepath : str
29- Filepath or URL of the PDF file.
31+ filepath : str | pathlib.Path, optional (default: None)
32+ Filepath or URL of the PDF file. Required if file_bytes is not given
3033 pages : str, optional (default: '1')
3134 Comma-separated page numbers.
3235 Example: '1,3,4' or '1,4-end' or 'all'.
3336 password : str, optional (default: None)
3437 Password for decryption.
38+ file_bytes : io.IOBase, optional (default: None)
39+ A file-like stream. Required if filepath is not given
3540
3641 """
3742
38- def __init__ (self , filepath : Union [ StrByteType , Path ] , pages = "1" , password = None ):
43+ def __init__ (self , filepath : FilePathType = None , pages = "1" , password = None , file_bytes = None ):
3944 if is_url (filepath ):
40- filepath = download_url (filepath )
41- self .filepath : Union [StrByteType , Path ] = filepath
42-
45+ file_bytes = get_url_bytes (filepath )
46+
47+ if not filepath and not file_bytes :
48+ raise InvalidArguments ('Either `filepath` or `file_bytes` is required' )
49+ if not filepath :
50+ # filepath must either be passed, or taken from the name attribute
51+ try :
52+ filepath = getattr (file_bytes , 'name' )
53+ except AttributeError :
54+ msg = ('Either pass a `filepath`, or give the '
55+ '`file_bytes` argument a name attribute' )
56+ raise InvalidArguments (msg )
57+ self .file_bytes = file_bytes # ok to be None
58+
59+ self .filepath = filepath
4360 if isinstance (filepath , str ) and not filepath .lower ().endswith (".pdf" ):
4461 raise NotImplementedError ("File format not supported" )
4562
@@ -51,13 +68,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
5168 self .password = self .password .encode ("ascii" )
5269 self .pages = self ._get_pages (pages )
5370
71+ @contextmanager
72+ def managed_file_context (self ):
73+ """Reads from either the `filepath` or `file_bytes`
74+ attribute of this instance, to return a file-like object.
75+ Closes any open file handles on exit or error.
76+
77+ Returns
78+ -------
79+ file_bytes : io.IOBase
80+ A readable, seekable, file-like object
81+ """
82+ if self .file_bytes :
83+ # if we can't seek, write to a BytesIO object that can,
84+ # then seek to the beginning before yielding
85+ if not hasattr (self .file_bytes , 'seek' ):
86+ self .file_bytes = io .BytesIO (self .file_bytes .read ())
87+ self .file_bytes .seek (0 )
88+ yield self .file_bytes
89+ else :
90+ with open (self .filepath , "rb" ) as file_bytes :
91+ yield file_bytes
92+
5493 def _get_pages (self , pages ):
5594 """Converts pages string to list of ints.
5695
5796 Parameters
5897 ----------
59- filepath : str
60- Filepath or URL of the PDF file.
98+ managed_file_context : io.IOBase
99+ A readable, seekable, file-like object
61100 pages : str, optional (default: '1')
62101 Comma-separated page numbers.
63102 Example: '1,3,4' or '1,4-end' or 'all'.
@@ -73,74 +112,77 @@ def _get_pages(self, pages):
73112 if pages == "1" :
74113 page_numbers .append ({"start" : 1 , "end" : 1 })
75114 else :
76- infile = PdfReader (self .filepath , strict = False )
77-
78- if infile .is_encrypted :
79- infile .decrypt (self .password )
80-
81- if pages == "all" :
82- page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
83- else :
84- for r in pages .split ("," ):
85- if "-" in r :
86- a , b = r .split ("-" )
87- if b == "end" :
88- b = len (infile .pages )
89- page_numbers .append ({"start" : int (a ), "end" : int (b )})
90- else :
91- page_numbers .append ({"start" : int (r ), "end" : int (r )})
115+ with self .managed_file_context () as f :
116+ infile = PdfReader (f , strict = False )
117+
118+ if infile .is_encrypted :
119+ infile .decrypt (self .password )
120+
121+ if pages == "all" :
122+ page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
123+ else :
124+ for r in pages .split ("," ):
125+ if "-" in r :
126+ a , b = r .split ("-" )
127+ if b == "end" :
128+ b = len (infile .pages )
129+ page_numbers .append ({"start" : int (a ), "end" : int (b )})
130+ else :
131+ page_numbers .append ({"start" : int (r ), "end" : int (r )})
92132
93133 result = []
94134 for p in page_numbers :
95135 result .extend (range (p ["start" ], p ["end" ] + 1 ))
96136 return sorted (set (result ))
97137
98- def _save_page (self , filepath : Union [ StrByteType , Path ], page , temp ):
138+ def _save_page (self , page , temp ):
99139 """Saves specified page from PDF into a temporary directory.
100140
101141 Parameters
102142 ----------
103- filepath : str
104- Filepath or URL of the PDF file.
143+ managed_file_context : io.IOBase
144+ A readable, seekable, file-like object
105145 page : int
106146 Page number.
107147 temp : str
108148 Tmp directory.
109149
110150 """
111- infile = PdfReader (filepath , strict = False )
112- if infile .is_encrypted :
113- infile .decrypt (self .password )
114- fpath = os .path .join (temp , f"page-{ page } .pdf" )
115- froot , fext = os .path .splitext (fpath )
116- p = infile .pages [page - 1 ]
117- outfile = PdfWriter ()
118- outfile .add_page (p )
119- with open (fpath , "wb" ) as f :
120- outfile .write (f )
121- layout , dim = get_page_layout (fpath )
122- # fix rotated PDF
123- chars = get_text_objects (layout , ltype = "char" )
124- horizontal_text = get_text_objects (layout , ltype = "horizontal_text" )
125- vertical_text = get_text_objects (layout , ltype = "vertical_text" )
126- rotation = get_rotation (chars , horizontal_text , vertical_text )
127- if rotation != "" :
128- fpath_new = "" .join ([froot .replace ("page" , "p" ), "_rotated" , fext ])
129- os .rename (fpath , fpath_new )
130- instream = open (fpath_new , "rb" )
131- infile = PdfReader (instream , strict = False )
151+
152+ with self .managed_file_context () as fileobj :
153+ infile = PdfReader (fileobj , strict = False )
132154 if infile .is_encrypted :
133155 infile .decrypt (self .password )
156+ fpath = os .path .join (temp , f"page-{ page } .pdf" )
157+ froot , fext = os .path .splitext (fpath )
158+ p = infile .pages [page - 1 ]
134159 outfile = PdfWriter ()
135- p = infile .pages [0 ]
136- if rotation == "anticlockwise" :
137- p .rotate (90 )
138- elif rotation == "clockwise" :
139- p .rotate (- 90 )
140160 outfile .add_page (p )
141161 with open (fpath , "wb" ) as f :
142162 outfile .write (f )
143- instream .close ()
163+ layout , dim = get_page_layout (fpath )
164+ # fix rotated PDF
165+ chars = get_text_objects (layout , ltype = "char" )
166+ horizontal_text = get_text_objects (layout , ltype = "horizontal_text" )
167+ vertical_text = get_text_objects (layout , ltype = "vertical_text" )
168+ rotation = get_rotation (chars , horizontal_text , vertical_text )
169+ if rotation != "" :
170+ fpath_new = "" .join ([froot .replace ("page" , "p" ), "_rotated" , fext ])
171+ os .rename (fpath , fpath_new )
172+ instream = open (fpath_new , "rb" )
173+ infile = PdfReader (instream , strict = False )
174+ if infile .is_encrypted :
175+ infile .decrypt (self .password )
176+ outfile = PdfWriter ()
177+ p = infile .pages [0 ]
178+ if rotation == "anticlockwise" :
179+ p .rotate (90 )
180+ elif rotation == "clockwise" :
181+ p .rotate (- 90 )
182+ outfile .add_page (p )
183+ with open (fpath , "wb" ) as f :
184+ outfile .write (f )
185+ instream .close ()
144186
145187 def parse (
146188 self , flavor = "lattice" , suppress_stdout = False , layout_kwargs = None , ** kwargs
@@ -155,7 +197,7 @@ def parse(
155197 Lattice is used by default.
156198 suppress_stdout : str (default: False)
157199 Suppress logs and warnings.
158- layout_kwargs : dict, optional (default: {} )
200+ layout_kwargs : dict, optional (default: None )
159201 A dict of `pdfminer.layout.LAParams
160202 <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
161203 kwargs : dict
@@ -173,7 +215,7 @@ def parse(
173215 tables = []
174216 with TemporaryDirectory () as tempdir :
175217 for p in self .pages :
176- self ._save_page (self . filepath , p , tempdir )
218+ self ._save_page (p , tempdir )
177219 pages = [os .path .join (tempdir , f"page-{ p } .pdf" ) for p in self .pages ]
178220 parser = Lattice (** kwargs ) if flavor == "lattice" else Stream (** kwargs )
179221 for p in pages :
0 commit comments