Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/pyspark/sql/pandas/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone):

# Slice the DataFrame to be batched
step = -(-len(pdf) // self.sparkContext.defaultParallelism) # round int up
pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))
pdf_slices = (pdf.iloc[start:start + step] for start in xrange(0, len(pdf), step))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for fixing this!

While standard Python / Numpy expressions for selecting and setting are intuitive and come in handy for interactive work, for production code, we recommend the optimized pandas data access methods, .at, .iat, .loc and .iloc.

Is it the only place?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I can tell, yes.


# Create list of Arrow (columns, type) for serializer dump_stream
arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
Expand Down
6 changes: 6 additions & 0 deletions python/pyspark/sql/tests/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,12 @@ def test_createDateFrame_with_category_type(self):
self.assertIsInstance(arrow_first_category_element, str)
self.assertIsInstance(spark_first_category_element, str)

def test_createDataFrame_with_float_index(self):
# SPARK-32098: float index should not produce duplicated or truncated Spark DataFrame
self.assertEqual(
self.spark.createDataFrame(
pd.DataFrame({'a': [1, 2, 3]}, index=[2., 3., 4.])).distinct().count(), 3)


@unittest.skipIf(
not have_pandas or not have_pyarrow,
Expand Down