@@ -375,60 +375,25 @@ def read_excel(io,
375375 ** kwds )
376376
377377
378- class _XlrdReader (object ):
379-
380- def __init__ (self , filepath_or_buffer ):
381- """Reader using xlrd engine.
382-
383- Parameters
384- ----------
385- filepath_or_buffer : string, path object or Workbook
386- Object to be parsed.
387- """
388- err_msg = "Install xlrd >= 1.0.0 for Excel support"
389-
390- try :
391- import xlrd
392- except ImportError :
393- raise ImportError (err_msg )
394- else :
395- if xlrd .__VERSION__ < LooseVersion ("1.0.0" ):
396- raise ImportError (err_msg +
397- ". Current version " + xlrd .__VERSION__ )
378+ @add_metaclass (abc .ABCMeta )
379+ class _BaseExcelReader (object ):
398380
399- # If filepath_or_buffer is a url, want to keep the data as bytes so
400- # can't pass to get_filepath_or_buffer()
401- if _is_url (filepath_or_buffer ):
402- filepath_or_buffer = _urlopen (filepath_or_buffer )
403- elif not isinstance (filepath_or_buffer , (ExcelFile , xlrd .Book )):
404- filepath_or_buffer , _ , _ , _ = get_filepath_or_buffer (
405- filepath_or_buffer )
381+ @property
382+ @abc .abstractmethod
383+ def sheet_names (self ):
384+ pass
406385
407- if isinstance (filepath_or_buffer , xlrd .Book ):
408- self .book = filepath_or_buffer
409- elif not isinstance (filepath_or_buffer , xlrd .Book ) and hasattr (
410- filepath_or_buffer , "read" ):
411- # N.B. xlrd.Book has a read attribute too
412- if hasattr (filepath_or_buffer , 'seek' ):
413- try :
414- # GH 19779
415- filepath_or_buffer .seek (0 )
416- except UnsupportedOperation :
417- # HTTPResponse does not support seek()
418- # GH 20434
419- pass
386+ @abc .abstractmethod
387+ def get_sheet_by_name (self , name ):
388+ pass
420389
421- data = filepath_or_buffer .read ()
422- self .book = xlrd .open_workbook (file_contents = data )
423- elif isinstance (filepath_or_buffer , compat .string_types ):
424- self .book = xlrd .open_workbook (filepath_or_buffer )
425- else :
426- raise ValueError ('Must explicitly set engine if not passing in'
427- ' buffer or path for io.' )
390+ @abc .abstractmethod
391+ def get_sheet_by_index (self , index ):
392+ pass
428393
429- @property
430- def sheet_names (self ):
431- return self . book . sheet_names ()
394+ @abc . abstractmethod
395+ def get_sheet_data (self , sheet , convert_float ):
396+ pass
432397
433398 def parse (self ,
434399 sheet_name = 0 ,
@@ -455,56 +420,14 @@ def parse(self,
455420
456421 _validate_header_arg (header )
457422
458- from xlrd import (xldate , XL_CELL_DATE ,
459- XL_CELL_ERROR , XL_CELL_BOOLEAN ,
460- XL_CELL_NUMBER )
461-
462- epoch1904 = self .book .datemode
463-
464- def _parse_cell (cell_contents , cell_typ ):
465- """converts the contents of the cell into a pandas
466- appropriate object"""
467-
468- if cell_typ == XL_CELL_DATE :
469-
470- # Use the newer xlrd datetime handling.
471- try :
472- cell_contents = xldate .xldate_as_datetime (
473- cell_contents , epoch1904 )
474- except OverflowError :
475- return cell_contents
476-
477- # Excel doesn't distinguish between dates and time,
478- # so we treat dates on the epoch as times only.
479- # Also, Excel supports 1900 and 1904 epochs.
480- year = (cell_contents .timetuple ())[0 :3 ]
481- if ((not epoch1904 and year == (1899 , 12 , 31 )) or
482- (epoch1904 and year == (1904 , 1 , 1 ))):
483- cell_contents = time (cell_contents .hour ,
484- cell_contents .minute ,
485- cell_contents .second ,
486- cell_contents .microsecond )
487-
488- elif cell_typ == XL_CELL_ERROR :
489- cell_contents = np .nan
490- elif cell_typ == XL_CELL_BOOLEAN :
491- cell_contents = bool (cell_contents )
492- elif convert_float and cell_typ == XL_CELL_NUMBER :
493- # GH5394 - Excel 'numbers' are always floats
494- # it's a minimal perf hit and less surprising
495- val = int (cell_contents )
496- if val == cell_contents :
497- cell_contents = val
498- return cell_contents
499-
500423 ret_dict = False
501424
502425 # Keep sheetname to maintain backwards compatibility.
503426 if isinstance (sheet_name , list ):
504427 sheets = sheet_name
505428 ret_dict = True
506429 elif sheet_name is None :
507- sheets = self .book . sheet_names ()
430+ sheets = self .sheet_names
508431 ret_dict = True
509432 else :
510433 sheets = [sheet_name ]
@@ -519,19 +442,13 @@ def _parse_cell(cell_contents, cell_typ):
519442 print ("Reading sheet {sheet}" .format (sheet = asheetname ))
520443
521444 if isinstance (asheetname , compat .string_types ):
522- sheet = self .book . sheet_by_name (asheetname )
445+ sheet = self .get_sheet_by_name (asheetname )
523446 else : # assume an integer if not a string
524- sheet = self .book . sheet_by_index (asheetname )
447+ sheet = self .get_sheet_by_index (asheetname )
525448
526- data = []
449+ data = self . get_sheet_data ( sheet , convert_float )
527450 usecols = _maybe_convert_usecols (usecols )
528451
529- for i in range (sheet .nrows ):
530- row = [_parse_cell (value , typ )
531- for value , typ in zip (sheet .row_values (i ),
532- sheet .row_types (i ))]
533- data .append (row )
534-
535452 if sheet .nrows == 0 :
536453 output [asheetname ] = DataFrame ()
537454 continue
@@ -620,6 +537,120 @@ def _parse_cell(cell_contents, cell_typ):
620537 return output [asheetname ]
621538
622539
class _XlrdReader(_BaseExcelReader):
    """Excel reader backed by the xlrd package.

    The opened workbook is stored on ``self.book``; the remaining methods
    delegate to it.
    """

    def __init__(self, filepath_or_buffer):
        """Reader using xlrd engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed.

        Raises
        ------
        ImportError
            If xlrd cannot be imported or is older than 1.0.0.
        ValueError
            If the input is neither a workbook, a readable buffer nor a
            string path.
        """
        install_hint = "Install xlrd >= 1.0.0 for Excel support"

        try:
            import xlrd
        except ImportError:
            raise ImportError(install_hint)

        # Only reached when the import succeeded.
        installed = xlrd.__VERSION__
        if installed < LooseVersion("1.0.0"):
            raise ImportError(install_hint + ". Current version " + installed)

        source = filepath_or_buffer

        # A url must stay as bytes, so it is opened directly instead of
        # being routed through get_filepath_or_buffer().
        if _is_url(source):
            source = _urlopen(source)
        elif not isinstance(source, (ExcelFile, xlrd.Book)):
            source, _, _, _ = get_filepath_or_buffer(source)

        # The workbook check has to come first: xlrd.Book has a ``read``
        # attribute too and would otherwise be mistaken for a buffer.
        if isinstance(source, xlrd.Book):
            self.book = source
        elif hasattr(source, "read"):
            if hasattr(source, 'seek'):
                try:
                    # GH 19779
                    source.seek(0)
                except UnsupportedOperation:
                    # HTTPResponse does not support seek()
                    # GH 20434
                    pass

            raw_bytes = source.read()
            self.book = xlrd.open_workbook(file_contents=raw_bytes)
        elif isinstance(source, compat.string_types):
            self.book = xlrd.open_workbook(source)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')

    @property
    def sheet_names(self):
        """Return the workbook's ``sheet_names()`` result."""
        workbook = self.book
        return workbook.sheet_names()

    def get_sheet_by_name(self, name):
        """Return the workbook sheet looked up by ``name``."""
        workbook = self.book
        return workbook.sheet_by_name(name)

    def get_sheet_by_index(self, index):
        """Return the workbook sheet looked up by position ``index``."""
        workbook = self.book
        return workbook.sheet_by_index(index)

    def get_sheet_data(self, sheet, convert_float):
        """Convert every cell of ``sheet`` and return the rows as lists.

        Parameters
        ----------
        sheet : object
            Sheet exposing ``nrows``, ``row_values(i)`` and ``row_types(i)``.
        convert_float : bool
            When true, numeric cells holding an integral value are
            returned as ``int``.

        Returns
        -------
        list of list
            One inner list per row, holding the converted cell values.
        """
        from xlrd import (XL_CELL_BOOLEAN, XL_CELL_DATE, XL_CELL_ERROR,
                          XL_CELL_NUMBER, xldate)

        uses_1904_epoch = self.book.datemode

        def _convert(value, kind):
            """Map one raw xlrd cell value to its pandas-friendly form."""
            if kind == XL_CELL_DATE:
                # Use the newer xlrd datetime handling.
                try:
                    stamp = xldate.xldate_as_datetime(value, uses_1904_epoch)
                except OverflowError:
                    # Leave the raw value untouched when it cannot be
                    # represented as a datetime.
                    return value

                # Excel doesn't distinguish between dates and time, so a
                # value falling on the epoch day is treated as time only.
                # Excel supports both the 1900 and the 1904 epoch.
                if uses_1904_epoch:
                    epoch_day = (1904, 1, 1)
                else:
                    epoch_day = (1899, 12, 31)

                if stamp.timetuple()[0:3] == epoch_day:
                    return time(stamp.hour, stamp.minute,
                                stamp.second, stamp.microsecond)
                return stamp

            if kind == XL_CELL_ERROR:
                return np.nan

            if kind == XL_CELL_BOOLEAN:
                return bool(value)

            if convert_float and kind == XL_CELL_NUMBER:
                # GH5394 - Excel 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                as_int = int(value)
                if as_int == value:
                    return as_int

            return value

        return [
            [_convert(raw, kind)
             for raw, kind in zip(sheet.row_values(row_idx),
                                  sheet.row_types(row_idx))]
            for row_idx in range(sheet.nrows)
        ]
652+
653+
623654class ExcelFile (object ):
624655 """
625656 Class for parsing tabular excel sheets into DataFrame objects.
0 commit comments