BUG, ENH: Add support for parsing duplicate columns

gfyoung · jreback · commit 9a6ce07ce19a · 2016-05-23T17:41:42.000-04:00
Closes #7160 Closes #9424 Author: gfyoung <gfyoung17@gmail.com> Closes #12935 from gfyoung/dupe-col-names and squashes the following commits: ef7636f [gfyoung] BUG, ENH: Add support for parsing duplicate columns
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -120,7 +120,8 @@ header : int or list of ints, default ``'infer'``
   rather than the first line of the file.
 names : array-like, default ``None``
   List of column names to use. If file contains no header row, then you should
-  explicitly pass ``header=None``.
+  explicitly pass ``header=None``. Duplicates in this list are not allowed unless
+  ``mangle_dupe_cols=True``, which is the default.
 index_col :  int or sequence or ``False``, default ``None``
   Column to use as the row labels of the DataFrame. If a sequence is given, a
   MultiIndex is used. If you have a malformed file with delimiters at the end of
@@ -139,6 +140,8 @@ prefix : str, default ``None``
   Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
 mangle_dupe_cols : boolean, default ``True``
   Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'.
+  Passing in False will cause data to be overwritten if there are duplicate
+  names in the columns.
 
 General Parsing Configuration
 +++++++++++++++++++++++++++++
@@ -432,6 +435,42 @@ If the header is in a row other than the first, pass the row number to
     data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9'
     pd.read_csv(StringIO(data), header=1)
 
+.. _io.dupe_names:
+
+Duplicate names parsing
+'''''''''''''''''''''''
+
+If the file or header contains duplicate names, pandas by default will deduplicate
+these names so as to prevent data overwrite:
+
+.. ipython :: python
+
+   data = 'a,b,a\n0,1,2\n3,4,5'
+   pd.read_csv(StringIO(data))
+
+There is no more duplicate data because ``mangle_dupe_cols=True`` by default, which modifies
+a series of duplicate columns 'X'...'X' to become 'X', 'X.1', ..., 'X.N'.  If
+``mangle_dupe_cols=False``, duplicate data can arise:
+
+.. code-block :: python
+
+   In [2]: data = 'a,b,a\n0,1,2\n3,4,5'
+   In [3]: pd.read_csv(StringIO(data), mangle_dupe_cols=False)
+   Out[3]:
+      a  b  a
+   0  2  1  2
+   1  5  4  5
+
+To prevent users from encountering this problem with duplicate data, a ``ValueError``
+exception is currently raised whenever ``mangle_dupe_cols=False`` is passed:
+
+.. code-block :: python
+
+   In [2]: data = 'a,b,a\n0,1,2\n3,4,5'
+   In [3]: pd.read_csv(StringIO(data), mangle_dupe_cols=False)
+   ...
+   ValueError: Setting mangle_dupe_cols=False is not supported yet
+
 .. _io.usecols:
 
 Filtering columns (``usecols``)
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -19,10 +19,37 @@ Highlights include:
 New features
 ~~~~~~~~~~~~
 
+.. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support:
 
+``pd.read_csv`` has improved support for duplicate column names
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+:ref:`Duplicate column names <io.dupe_names>` are now supported in ``pd.read_csv()`` whether
+they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`).
 
+.. ipython :: python
 
+   data = '0,1,2\n3,4,5'
+   names = ['a', 'b', 'a']
+
+Previous behaviour:
+
+.. code-block:: ipython
+
+   In [2]: pd.read_csv(StringIO(data), names=names)
+   Out[2]:
+      a  b  a
+   0  2  1  2
+   1  5  4  5
+
+The first 'a' column contains the same data as the second 'a' column, when it should have
+contained the array ``[0, 3]``.
+
+New behaviour:
+
+.. ipython :: python
+
+   In [2]: pd.read_csv(StringIO(data), names=names)
 
 .. _whatsnew_0182.enhancements.other:
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -73,7 +73,8 @@
     rather than the first line of the file.
 names : array-like, default None
     List of column names to use. If file contains no header row, then you
-    should explicitly pass header=None
+    should explicitly pass header=None. Duplicates in this list are not
+    allowed unless mangle_dupe_cols=True, which is the default.
 index_col : int or sequence or False, default None
     Column to use as the row labels of the DataFrame. If a sequence is given, a
     MultiIndex is used. If you have a malformed file with delimiters at the end
@@ -91,7 +92,9 @@
 prefix : str, default None
     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
 mangle_dupe_cols : boolean, default True
-    Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'
+    Duplicate columns will be specified as 'X', 'X.1', ..., 'X.N', rather
+    than 'X'...'X'. Passing in False will cause data to be overwritten if
+    there are duplicate names in the columns.
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     (Unsupported with engine='python'). Use `str` or `object` to preserve and
@@ -655,7 +658,14 @@ def _get_options_with_defaults(self, engine):
         options = {}
 
         for argname, default in compat.iteritems(_parser_defaults):
-            options[argname] = kwds.get(argname, default)
+            value = kwds.get(argname, default)
+
+            # see gh-12935
+            if argname == 'mangle_dupe_cols' and not value:
+                raise ValueError('Setting mangle_dupe_cols=False is '
+                                 'not supported yet')
+            else:
+                options[argname] = value
 
         for argname, default in compat.iteritems(_c_parser_defaults):
             if argname in kwds:
@@ -899,6 +909,7 @@ def __init__(self, kwds):
         self.true_values = kwds.get('true_values')
         self.false_values = kwds.get('false_values')
         self.tupleize_cols = kwds.get('tupleize_cols', False)
+        self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
         self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
 
         self._date_conv = _make_date_converter(
@@ -1012,6 +1023,26 @@ def tostr(x):
 
         return names, index_names, col_names, passed_names
 
+    def _maybe_dedup_names(self, names):
+        # see gh-7160 and gh-9424: renaming repeats to 'X.1', 'X.2',
+        # ... gives immediate relief from the duplicate names issue
+        # and appears to be satisfactory to users, but ultimately not
+        # needing to butcher the names would be nice! NOTE: a mangled
+        # name is not checked against existing names (e.g. 'a.1').
+        if self.mangle_dupe_cols:
+            names = list(names)  # so we can index
+            counts = {}
+
+            for i, col in enumerate(names):
+                cur_count = counts.get(col, 0)
+
+                if cur_count > 0:
+                    names[i] = '%s.%d' % (col, cur_count)
+
+                counts[col] = cur_count + 1
+
+        return names
+
     def _maybe_make_multi_index_columns(self, columns, col_names=None):
         # possibly create a column mi here
         if (not self.tupleize_cols and len(columns) and
@@ -1314,10 +1345,11 @@ def read(self, nrows=None):
         except StopIteration:
             if self._first_chunk:
                 self._first_chunk = False
+                names = self._maybe_dedup_names(self.orig_names)
 
                 index, columns, col_dict = _get_empty_meta(
-                    self.orig_names, self.index_col,
-                    self.index_names, dtype=self.kwds.get('dtype'))
+                    names, self.index_col, self.index_names,
+                    dtype=self.kwds.get('dtype'))
 
                 if self.usecols is not None:
                     columns = self._filter_usecols(columns)
@@ -1361,6 +1393,8 @@ def read(self, nrows=None):
             if self.usecols is not None:
                 names = self._filter_usecols(names)
 
+            names = self._maybe_dedup_names(names)
+
             # rename dict keys
             data = sorted(data.items())
             data = dict((k, v) for k, (i, v) in zip(names, data))
@@ -1373,6 +1407,7 @@ def read(self, nrows=None):
 
             # ugh, mutation
             names = list(self.orig_names)
+            names = self._maybe_dedup_names(names)
 
             if self.usecols is not None:
                 names = self._filter_usecols(names)
@@ -1567,7 +1602,6 @@ def __init__(self, f, **kwds):
         self.skipinitialspace = kwds['skipinitialspace']
         self.lineterminator = kwds['lineterminator']
         self.quoting = kwds['quoting']
-        self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
         self.usecols = _validate_usecols_arg(kwds['usecols'])
         self.skip_blank_lines = kwds['skip_blank_lines']
 
@@ -1756,8 +1790,8 @@ def read(self, rows=None):
         columns = list(self.orig_names)
         if not len(content):  # pragma: no cover
             # DataFrame with the right metadata, even though it's length 0
-            return _get_empty_meta(self.orig_names,
-                                   self.index_col,
+            names = self._maybe_dedup_names(self.orig_names)
+            return _get_empty_meta(names, self.index_col,
                                    self.index_names)
 
         # handle new style for names in index
@@ -1770,26 +1804,28 @@ def read(self, rows=None):
         alldata = self._rows_to_cols(content)
         data = self._exclude_implicit_index(alldata)
 
-        columns, data = self._do_date_conversions(self.columns, data)
+        columns = self._maybe_dedup_names(self.columns)
+        columns, data = self._do_date_conversions(columns, data)
 
         data = self._convert_data(data)
         index, columns = self._make_index(data, alldata, columns, indexnamerow)
 
         return index, columns, data
 
     def _exclude_implicit_index(self, alldata):
+        names = self._maybe_dedup_names(self.orig_names)
 
         if self._implicit_index:
             excl_indices = self.index_col
 
             data = {}
             offset = 0
-            for i, col in enumerate(self.orig_names):
+            for i, col in enumerate(names):
                 while i + offset in excl_indices:
                     offset += 1
                 data[col] = alldata[i + offset]
         else:
-            data = dict((k, v) for k, v in zip(self.orig_names, alldata))
+            data = dict((k, v) for k, v in zip(names, alldata))
 
         return data
 
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -293,23 +293,18 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
             {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
         tm.assert_frame_equal(result, expected, check_index_type=False)
 
-    def test_empty_with_dup_column_pass_dtype_by_names(self):
-        data = 'one,one'
-        result = self.read_csv(
-            StringIO(data), mangle_dupe_cols=False, dtype={'one': 'u1'})
-        expected = pd.concat([Series([], name='one', dtype='u1')] * 2, axis=1)
-        tm.assert_frame_equal(result, expected, check_index_type=False)
-
     def test_empty_with_dup_column_pass_dtype_by_indexes(self):
-        # FIXME in gh-9424
-        raise nose.SkipTest(
-            "gh-9424; known failure read_csv with duplicate columns")
+        # see gh-9424
+        expected = pd.concat([Series([], name='one', dtype='u1'),
+                              Series([], name='one.1', dtype='f')], axis=1)
 
         data = 'one,one'
-        result = self.read_csv(
-            StringIO(data), mangle_dupe_cols=False, dtype={0: 'u1', 1: 'f'})
-        expected = pd.concat([Series([], name='one', dtype='u1'),
-                              Series([], name='one', dtype='f')], axis=1)
+        result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
+        tm.assert_frame_equal(result, expected, check_index_type=False)
+
+        data = ''
+        result = self.read_csv(StringIO(data), names=['one', 'one'],
+                               dtype={0: 'u1', 1: 'f'})
         tm.assert_frame_equal(result, expected, check_index_type=False)
 
     def test_usecols_dtypes(self):
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -243,6 +243,8 @@ def test_unnamed_columns(self):
                                        'Unnamed: 4'])
 
     def test_duplicate_columns(self):
+        # TODO: add test for condition 'mangle_dupe_cols=False'
+        # once it is actually supported (gh-12935)
         data = """A,A,B,B,B
 1,2,3,4,5
 6,7,8,9,10
@@ -256,11 +258,6 @@ def test_duplicate_columns(self):
             self.assertEqual(list(df.columns),
                              ['A', 'A.1', 'B', 'B.1', 'B.2'])
 
-            df = getattr(self, method)(StringIO(data), sep=',',
-                                       mangle_dupe_cols=False)
-            self.assertEqual(list(df.columns),
-                             ['A', 'A', 'B', 'B', 'B'])
-
             df = getattr(self, method)(StringIO(data), sep=',',
                                        mangle_dupe_cols=True)
             self.assertEqual(list(df.columns),
@@ -1281,3 +1278,17 @@ def test_euro_decimal_format(self):
         self.assertEqual(df2['Number1'].dtype, float)
         self.assertEqual(df2['Number2'].dtype, float)
         self.assertEqual(df2['Number3'].dtype, float)
+
+    def test_read_duplicate_names(self):
+        # See gh-7160
+        data = "a,b,a\n0,1,2\n3,4,5"
+        df = self.read_csv(StringIO(data))
+        expected = DataFrame([[0, 1, 2], [3, 4, 5]],
+                             columns=['a', 'b', 'a.1'])
+        tm.assert_frame_equal(df, expected)
+
+        data = "0,1,2\n3,4,5"
+        df = self.read_csv(StringIO(data), names=["a", "b", "a"])
+        expected = DataFrame([[0, 1, 2], [3, 4, 5]],
+                             columns=['a', 'b', 'a.1'])
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py
@@ -84,13 +84,6 @@ def read_table(self, *args, **kwds):
 
 
 class TestPythonParser(BaseParser, PythonParserTests, tm.TestCase):
-    """
-    Class for Python parser testing. Unless specifically stated
-    as a PythonParser-specific issue, the goal is to eventually move
-    as many of these tests into ParserTests as soon as the C parser
-    can accept further specific arguments when parsing.
-    """
-
     engine = 'python'
     float_precision_choices = [None]
 
diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py
@@ -20,6 +20,16 @@
 
 
 class TestUnsupportedFeatures(tm.TestCase):
+    def test_mangle_dupe_cols_false(self):
+        # see gh-12935
+        data = 'a b c\n1 2 3'
+        msg = 'is not supported'
+
+        for engine in ('c', 'python'):
+            with tm.assertRaisesRegexp(ValueError, msg):
+                read_csv(StringIO(data), engine=engine,
+                         mangle_dupe_cols=False)
+
     def test_c_engine(self):
         # see gh-6607
         data = 'a b c\n1 2 3'