apache · jorisvandenbossche · Oct 29, 2019 · Oct 29, 2019 · Oct 30, 2019 · Nov 5, 2019
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
@@ -410,22 +410,28 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index):
             col = df[name]
             is_index = False
         except KeyError:
-            if preserve_index is not False and name in df.index.names:
-                col = df.index.get_level_values(name)
-                if (preserve_index is None and
-                        isinstance(col, _pandas_api.pd.RangeIndex)):
-                    raise ValueError(
-                        "name '{}' is present in the schema, but it is a "
-                        "RangeIndex which will not be converted as a column "
-                        "in the Table, but saved as metadata-only not in "
-                        "columns. Specify 'preserve_index=True' to force it "
-                        "being added as a column, or remove it from the "
-                        "specified schema".format(name))
-                is_index = True
-            else:
+            try:
+                col = _get_index_level(df, name)
+            except (KeyError, IndexError):
+                # name not found as index level
                 raise KeyError(
                     "name '{}' present in the specified schema is not found "
                     "in the columns or index".format(name))
+            if preserve_index is False:
+                raise ValueError(
+                    "name '{}' present in the specified schema corresponds "
+                    "to the index, but 'preserve_index=False' was "
+                    "specified".format(name))
+            elif (preserve_index is None and
+                    isinstance(col, _pandas_api.pd.RangeIndex)):
+                raise ValueError(
+                    "name '{}' is present in the schema, but it is a "
+                    "RangeIndex which will not be converted as a column "
+                    "in the Table, but saved as metadata-only not in "
+                    "columns. Specify 'preserve_index=True' to force it "
+                    "being added as a column, or remove it from the "
+                    "specified schema".format(name))
+            is_index = True
 
         name = _column_name_to_strings(name)
 
@@ -449,6 +455,19 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index):
             index_levels, columns_to_convert, convert_fields)
 
 
+def _get_index_level(df, name):
+    """
+    Get the index level of `df` for schema column 'name'; an unmatched
+    autogenerated name ('__index_level_N__') selects level N by position.
+    """
+    key = name
+    if name not in df.index.names and _is_generated_index_name(name):
+        # not an actual level name: strip the '__index_level_' prefix and
+        # the trailing '__' to obtain the integer position of the level
+        key = int(name[len("__index_level_"):-2])
+    return df.index.get_level_values(key)
+
+
 def _get_range_index_descriptor(level):
     # public start/stop/step attributes added in pandas 0.25.0
     return {

diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
@@ -2804,7 +2804,7 @@ def test_table_from_pandas_schema_index_columns():
                             expected_schema=schema)
 
     # schema includes correct index name but preserve_index=False
-    with pytest.raises(KeyError):
+    with pytest.raises(ValueError, match="'preserve_index=False' was"):
         pa.Table.from_pandas(df, schema=schema, preserve_index=False)
 
     # in case of preserve_index=None -> RangeIndex serialized as metadata
@@ -2873,6 +2873,27 @@ def test_table_from_pandas_schema_index_columns():
                             expected_schema=schema, expected=expected)
 
 
+def test_table_from_pandas_schema_index_columns__unnamed_index():
+    # ARROW-6999 - unnamed index listed in the schema as '__index_level_0__'
+    df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]})
+
+    expected_schema = pa.schema([
+        ('a', pa.int64()),
+        ('b', pa.float64()),
+        ('__index_level_0__', pa.int64()),
+    ])
+
+    schema = pa.Schema.from_pandas(df, preserve_index=True)
+    table = pa.Table.from_pandas(df, preserve_index=True, schema=schema)
+    assert table.schema.remove_metadata().equals(expected_schema)
+
+    # non-RangeIndex: preserved as a column by default (preserve_index unset)
+    df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]}, index=[0, 1, 2])
+    schema = pa.Schema.from_pandas(df)
+    table = pa.Table.from_pandas(df, schema=schema)
+    assert table.schema.remove_metadata().equals(expected_schema)
+
+
 # ----------------------------------------------------------------------
 # RecordBatch, Table