diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 944f92260e6..5b17e03d2fc 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -410,22 +410,28 @@ def _get_columns_to_convert_given_schema(df, schema, preserve_index): col = df[name] is_index = False except KeyError: - if preserve_index is not False and name in df.index.names: - col = df.index.get_level_values(name) - if (preserve_index is None and - isinstance(col, _pandas_api.pd.RangeIndex)): - raise ValueError( - "name '{}' is present in the schema, but it is a " - "RangeIndex which will not be converted as a column " - "in the Table, but saved as metadata-only not in " - "columns. Specify 'preserve_index=True' to force it " - "being added as a column, or remove it from the " - "specified schema".format(name)) - is_index = True - else: + try: + col = _get_index_level(df, name) + except (KeyError, IndexError): + # name not found as index level raise KeyError( "name '{}' present in the specified schema is not found " "in the columns or index".format(name)) + if preserve_index is False: + raise ValueError( + "name '{}' present in the specified schema corresponds " + "to the index, but 'preserve_index=False' was " + "specified".format(name)) + elif (preserve_index is None and + isinstance(col, _pandas_api.pd.RangeIndex)): + raise ValueError( + "name '{}' is present in the schema, but it is a " + "RangeIndex which will not be converted as a column " + "in the Table, but saved as metadata-only not in " + "columns. 
def _get_index_level(df, name):
    """
    Return the level of ``df.index`` that corresponds to 'name' (a column
    name coming from an arrow Schema).

    Resolution order:

    * if 'name' is one of ``df.index.names``, the level is looked up by that
      name;
    * otherwise, if ``_is_generated_index_name(name)`` is true, the text
      between the ``__index_level_`` prefix and the last two characters is
      parsed as an integer and the level is looked up with that number
      (e.g. '__index_level_0__' -> 0);
    * any other name is handed to ``get_level_values`` unchanged.

    No exception is caught here: whatever ``get_level_values`` raises for an
    unknown level propagates to the caller (presumably KeyError / IndexError
    -- confirm against the pandas version in use).
    """
    index = df.index
    if name in index.names or not _is_generated_index_name(name):
        # an actual level name, or a name we cannot interpret: let pandas
        # resolve (or reject) it by label
        return index.get_level_values(name)

    # autogenerated placeholder name => strip the prefix and the two trailing
    # characters to recover the level number, then fetch the level with it
    prefix = "__index_level_"
    level_number = int(name[len(prefix):-2])
    return index.get_level_values(level_number)
table.schema.remove_metadata().equals(expected_schema) + + # non-RangeIndex (preserved by default) + df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]}, index=[0, 1, 2]) + schema = pa.Schema.from_pandas(df) + table = pa.Table.from_pandas(df, schema=schema) + assert table.schema.remove_metadata().equals(expected_schema) + + # ---------------------------------------------------------------------- # RecordBatch, Table