diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index f366317d04c..5af93fb5865 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -49,5 +49,4 @@ list_, struct, field, DataType, Field, Schema, schema) -from pyarrow.table import (Column, RecordBatch, dataframe_from_batches, Table, - from_pandas_dataframe) +from pyarrow.table import Column, RecordBatch, Table, from_pandas_dataframe diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index 45cf7beccee..0a9805cfdf4 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -415,52 +415,6 @@ cdef class RecordBatch: return result -def dataframe_from_batches(batches): - """ - Convert a list of Arrow RecordBatches to a pandas.DataFrame - - Parameters - ---------- - - batches: list of RecordBatch - RecordBatch list to be converted, schemas must be equal - """ - - cdef: - vector[shared_ptr[CArray]] c_array_chunks - vector[shared_ptr[CColumn]] c_columns - shared_ptr[CTable] c_table - Array arr - Schema schema - - import pandas as pd - - schema = batches[0].schema - - # check schemas are equal - if any((not schema.equals(other.schema) for other in batches[1:])): - raise ArrowException("Error converting list of RecordBatches to " - "DataFrame, not all schemas are equal") - - cdef int K = batches[0].num_columns - - # create chunked columns from the batches - c_columns.resize(K) - for i in range(K): - for batch in batches: - arr = batch[i] - c_array_chunks.push_back(arr.sp_array) - c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i), - c_array_chunks)) - c_array_chunks.clear() - - # create a Table from columns and convert to DataFrame - c_table.reset(new CTable('', schema.sp_schema, c_columns)) - table = Table() - table.init(c_table) - return table.to_pandas() - - cdef class Table: """ A collection of top-level named, equal length Arrow arrays. 
@@ -567,6 +521,52 @@ cdef class Table: return result + @staticmethod + def from_batches(batches): + """ + Construct a Table from a list of Arrow RecordBatches + + Parameters + ---------- + + batches: list of RecordBatch + RecordBatch list to be converted, schemas must be equal + """ + + cdef: + vector[shared_ptr[CArray]] c_array_chunks + vector[shared_ptr[CColumn]] c_columns + shared_ptr[CTable] c_table + Array arr + Schema schema + + schema = batches[0].schema + + # check schemas are equal + for other in batches[1:]: + if not schema.equals(other.schema): + raise ArrowException("Error converting list of RecordBatches " + "to Table, not all schemas are equal: {%s} != {%s}" + % (str(schema), str(other.schema))) + + cdef int K = batches[0].num_columns + + # create chunked columns from the batches + c_columns.resize(K) + for i in range(K): + for batch in batches: + arr = batch[i] + c_array_chunks.push_back(arr.sp_array) + c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i), + c_array_chunks)) + c_array_chunks.clear() + + # create a Table from columns + c_table.reset(new CTable('', schema.sp_schema, c_columns)) + table = Table() + table.init(c_table) + return table + def to_pandas(self): """ Convert the arrow::Table to a pandas DataFrame diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index dc4f37a830e..25463145c00 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -69,7 +69,8 @@ def test_recordbatchlist_to_pandas(): batch1 = pa.RecordBatch.from_pandas(data1) batch2 = pa.RecordBatch.from_pandas(data2) - result = pa.dataframe_from_batches([batch1, batch2]) + table = pa.Table.from_batches([batch1, batch2]) + result = table.to_pandas() data = pd.concat([data1, data2], ignore_index=True) assert_frame_equal(data, result) @@ -82,7 +83,7 @@ def test_recordbatchlist_schema_equals(): batch2 = pa.RecordBatch.from_pandas(data2) with 
pytest.raises(pa.ArrowException): - pa.dataframe_from_batches([batch1, batch2]) + pa.Table.from_batches([batch1, batch2]) def test_table_basics():