11"""Common IO api utilities"""
22
3- import sys
43import os
54import csv
65import codecs
76import mmap
8- import zipfile
97from contextlib import contextmanager , closing
108
119from pandas .compat import StringIO , BytesIO , string_types , text_type
@@ -141,39 +139,6 @@ def _is_s3_url(url):
141139 return False
142140
143141
144- def maybe_read_encoded_stream (reader , encoding = None , compression = None ):
145- """read an encoded stream from the reader and transform the bytes to
146- unicode if required based on the encoding
147-
148- Parameters
149- ----------
150- reader : a streamable file-like object
151- encoding : optional, the encoding to attempt to read
152-
153- Returns
154- -------
155- a tuple of (a stream of decoded bytes, the encoding which was used)
156-
157- """
158-
159- if compat .PY3 or encoding is not None : # pragma: no cover
160- if encoding :
161- errors = 'strict'
162- else :
163- errors = 'replace'
164- encoding = 'utf-8'
165-
166- if compression == 'gzip' :
167- reader = BytesIO (reader .read ())
168- else :
169- reader = StringIO (reader .read ().decode (encoding , errors ))
170- else :
171- if compression == 'gzip' :
172- reader = BytesIO (reader .read ())
173- encoding = None
174- return reader , encoding
175-
176-
177142def _expand_user (filepath_or_buffer ):
178143 """Return the argument with an initial component of ~ or ~user
179144 replaced by that user's home directory.
@@ -237,18 +202,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
237202 """
238203
239204 if _is_url (filepath_or_buffer ):
240- req = _urlopen (str (filepath_or_buffer ))
241- if compression == 'infer' :
242- content_encoding = req .headers .get ('Content-Encoding' , None )
243- if content_encoding == 'gzip' :
244- compression = 'gzip'
245- else :
246- compression = None
247- # cat on the compression to the tuple returned by the function
248- to_return = (list (maybe_read_encoded_stream (req , encoding ,
249- compression )) +
250- [compression ])
251- return tuple (to_return )
205+ url = str (filepath_or_buffer )
206+ req = _urlopen (url )
207+ content_encoding = req .headers .get ('Content-Encoding' , None )
208+ if content_encoding == 'gzip' :
209+ # Override compression based on Content-Encoding header
210+ compression = 'gzip'
211+ reader = BytesIO (req .read ())
212+ return reader , encoding , compression
252213
253214 if _is_s3_url (filepath_or_buffer ):
254215 from pandas .io .s3 import get_filepath_or_buffer
@@ -276,95 +237,145 @@ def file_path_to_url(path):
276237 return urljoin ('file:' , pathname2url (path ))
277238
278239
279- # ZipFile is not a context manager for <= 2.6
280- # must be tuple index here since 2.6 doesn't use namedtuple for version_info
281- if sys .version_info [1 ] <= 6 :
282- @contextmanager
283- def ZipFile (* args , ** kwargs ):
284- with closing (zipfile .ZipFile (* args , ** kwargs )) as zf :
285- yield zf
286- else :
287- ZipFile = zipfile .ZipFile
240+ _compression_to_extension = {
241+ 'gzip' : '.gz' ,
242+ 'bz2' : '.bz2' ,
243+ 'zip' : '.zip' ,
244+ 'xz' : '.xz' ,
245+ }
288246
289247
290- def _get_handle (source , mode , encoding = None , compression = None , memory_map = False ):
291- """Gets file handle for given path and mode.
248+ def _infer_compression (filepath_or_buffer , compression ):
249+ """
250+ If compression='infer', infer it from the path extension; else validate it.
292251 """
293252
294- f = source
295- is_path = isinstance (source , compat .string_types )
253+ # No compression has been explicitly specified
254+ if compression is None :
255+ return None
296256
297- # in Python 3, convert BytesIO or fileobjects passed with an encoding
298- if compat .PY3 and isinstance (source , compat .BytesIO ):
299- from io import TextIOWrapper
257+ # Cannot infer compression of a buffer. Hence assume no compression.
258+ is_path = isinstance (filepath_or_buffer , compat .string_types )
259+ if compression == 'infer' and not is_path :
260+ return None
261+
262+ # Infer compression from the filename/URL extension
263+ if compression == 'infer' :
264+ for compression , extension in _compression_to_extension .items ():
265+ if filepath_or_buffer .endswith (extension ):
266+ return compression
267+ return None
300268
301- return TextIOWrapper (source , encoding = encoding )
269+ # Compression has been specified. Check that it's valid
270+ if compression in _compression_to_extension :
271+ return compression
302272
303- elif compression is not None :
304- compression = compression .lower ()
305- if encoding is not None and not compat .PY3 and not is_path :
306- msg = 'encoding + compression not yet supported in Python 2'
273+ msg = 'Unrecognized compression type: {}' .format (compression )
274+ valid = ['infer' , None ] + sorted (_compression_to_extension )
275+ msg += '\n Valid compression types are {}' .format (valid )
276+ raise ValueError (msg )
277+
278+
279+ def _get_handle (path_or_buf , mode , encoding = None , compression = None ,
280+ memory_map = False ):
281+ """
282+ Get file handle for given path/buffer and mode.
283+
284+ Parameters
285+ ----------
286+ path_or_buf :
287+ a path (str) or buffer
288+ mode : str
289+ mode to open path_or_buf with
290+ encoding : str or None
291+ compression : str or None
292+ Supported compression protocols are gzip, bz2, zip, and xz
293+ memory_map : boolean, default False
294+ See parsers._parser_params for more information.
295+
296+ Returns
297+ -------
298+ f : file-like
299+ A file-like object
300+ handles : list of file-like objects
301+ A list of file-like objects that were opened in this function.
302+ """
303+
304+ handles = list ()
305+ f = path_or_buf
306+ is_path = isinstance (path_or_buf , compat .string_types )
307+
308+ if compression :
309+
310+ if compat .PY2 and not is_path and encoding :
311+ msg = 'compression with encoding is not yet supported in Python 2'
307312 raise ValueError (msg )
308313
309314 # GZ Compression
310315 if compression == 'gzip' :
311316 import gzip
312-
313- f = gzip .GzipFile (source , mode ) \
314- if is_path else gzip .GzipFile (fileobj = source )
317+ if is_path :
318+ f = gzip .open (path_or_buf , mode )
319+ else :
320+ f = gzip .GzipFile (fileobj = path_or_buf )
315321
316322 # BZ Compression
317323 elif compression == 'bz2' :
318324 import bz2
319-
320325 if is_path :
321- f = bz2 .BZ2File (source , mode )
322-
323- else :
324- f = bz2 .BZ2File (source ) if compat .PY3 else StringIO (
325- bz2 .decompress (source .read ()))
326+ f = bz2 .BZ2File (path_or_buf , mode )
327+ elif compat .PY2 :
326328 # Python 2's bz2 module can't take file objects, so have to
327329 # run through decompress manually
330+ f = StringIO (bz2 .decompress (path_or_buf .read ()))
331+ path_or_buf .close ()
332+ else :
333+ f = bz2 .BZ2File (path_or_buf )
328334
329335 # ZIP Compression
330336 elif compression == 'zip' :
331337 import zipfile
332- zip_file = zipfile .ZipFile (source )
338+ zip_file = zipfile .ZipFile (path_or_buf )
333339 zip_names = zip_file .namelist ()
334-
335340 if len (zip_names ) == 1 :
336341 f = zip_file .open (zip_names .pop ())
337342 elif len (zip_names ) == 0 :
338343 raise ValueError ('Zero files found in ZIP file {}'
339- .format (source ))
344+ .format (path_or_buf ))
340345 else :
341346 raise ValueError ('Multiple files found in ZIP file.'
342- ' Only one file per ZIP : {}'
347+ ' Only one file per ZIP: {}'
343348 .format (zip_names ))
344349
345350 # XZ Compression
346351 elif compression == 'xz' :
347352 lzma = compat .import_lzma ()
348- f = lzma .LZMAFile (source , mode )
353+ f = lzma .LZMAFile (path_or_buf , mode )
349354
355+ # Unrecognized Compression
350356 else :
351- raise ValueError ('Unrecognized compression: %s' % compression )
352-
353- if compat .PY3 :
354- from io import TextIOWrapper
355-
356- f = TextIOWrapper (f , encoding = encoding )
357+ msg = 'Unrecognized compression type: {}' .format (compression )
358+ raise ValueError (msg )
357359
358- return f
360+ handles . append ( f )
359361
360362 elif is_path :
361- if compat .PY3 :
362- if encoding :
363- f = open (source , mode , encoding = encoding )
364- else :
365- f = open (source , mode , errors = 'replace' )
363+ if compat .PY2 :
364+ # Python 2
365+ f = open (path_or_buf , mode )
366+ elif encoding :
367+ # Python 3 and encoding
368+ f = open (path_or_buf , mode , encoding = encoding )
366369 else :
367- f = open (source , mode )
370+ # Python 3 and no explicit encoding
371+ f = open (path_or_buf , mode , errors = 'replace' )
372+ handles .append (f )
373+
374+ # in Python 3, convert BytesIO or fileobjects passed with an encoding
375+ if compat .PY3 and (compression or isinstance (f , compat .BytesIO )):
376+ from io import TextIOWrapper
377+ f = TextIOWrapper (f , encoding = encoding )
378+ handles .append (f )
368379
369380 if memory_map and hasattr (f , 'fileno' ):
370381 try :
@@ -378,7 +389,7 @@ def _get_handle(source, mode, encoding=None, compression=None, memory_map=False)
378389 # leave the file handler as is then
379390 pass
380391
381- return f
392+ return f , handles
382393
383394
384395class MMapWrapper (BaseIterator ):
0 commit comments