Skip to content

Commit

Permalink
#793: Added support for Pandas 2 pyarrow dtype columns for emitting data from Python UDFs (#357)
Browse files Browse the repository at this point in the history

* Update pandas to 2.0.2 in compatible template flavors 
* #796: Fixed silent data corruption when emitting dataframes with float16 dtype columns from Python UDFs
* Replaced np.asscalar with .item() in the test dataframe.py, because np.asscalar was removed from NumPy (the flavors now use NumPy 1.24.3)
* Added handleEmitPyFloat to also support float pyarrow-dtype columns containing NaN and object-dtype columns containing Python floats
* Refactored and split Pandas Tests
* Added tests for more dtypes to Pandas Tests
  • Loading branch information
tkilias committed Jun 1, 2023
1 parent 9e31ee4 commit 433f064
Show file tree
Hide file tree
Showing 11 changed files with 957 additions and 113 deletions.
275 changes: 206 additions & 69 deletions exaudfclient/base/python/python3/python_ext_dataframe.cc

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
generic_language_tests=python3
test_folders=python3/all
test_folders=python3/all pandas/all
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pandas|1.3.4
numpy|1.21.3
pandas|2.0.2
numpy|1.24.3
pyarrow|12.0.0
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
generic_language_tests=python3
test_folders=python3/all
test_folders=python3/all pandas/all pandas/pandas2
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
python|3.8.13
numpy|1.22.3
pandas|1.4.2
numpy|1.24.3
pandas|2.0.2
libblas|3.9.0=15_linux64_mkl
mamba|1.3.1
ld_impl_linux-64|2.36.1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
generic_language_tests=python3
test_folders=python3/all
test_folders=python3/all pandas/all pandas/pandas2
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
python|3.8.13
numpy|1.22.3
pandas|1.4.2
numpy|1.24.3
pandas|2.0.2
libblas|3.9.0=15_linux64_mkl
mamba|1.3.1
ld_impl_linux-64|2.36.1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
generic_language_tests=python3
test_folders=python3/all
test_folders=python3/all pandas/all pandas/pandas2
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@
from datetime import datetime

from exasol_python_test_framework import udf
from exasol_python_test_framework.exatest.testcase import useData
from exasol_python_test_framework.udf.udf_debug import UdfDebugger
from typing import List, Tuple, Union


class PandasDataFrame(udf.TestCase):
def setUp(self):
self.maxDiff=None

self.query('CREATE SCHEMA FN2', ignore_errors=True)
self.query('OPEN SCHEMA FN2', ignore_errors=True)
Expand Down Expand Up @@ -159,7 +163,7 @@ def test_dataframe_scalar_returns(self):
def run(ctx):
df = ctx.get_dataframe()
return np.asscalar(df.iloc[0, 0] + df.iloc[0, 1])
return (df.iloc[0, 0] + df.iloc[0, 1]).item()
/
''' % (self.col_defs_str))
self.query(udf_sql)
Expand Down Expand Up @@ -217,7 +221,7 @@ def test_dataframe_scalar_emits_unique(self):
def run(ctx):
df = ctx.get_dataframe()
ctx.emit(np.asscalar(df.C0))
ctx.emit(df.C0.item())
/
''')
print(udf_sql)
Expand All @@ -236,7 +240,7 @@ def test_dataframe_scalar_emits_all_unique(self):
def run(ctx):
df = ctx.get_dataframe(num_rows="all")
ctx.emit(np.asscalar(df.C0))
ctx.emit(df.C0.item())
/
''')
print(udf_sql)
Expand Down Expand Up @@ -331,7 +335,7 @@ def test_dataframe_set_returns(self):
def run(ctx):
df = ctx.get_dataframe(num_rows="all")
return np.asscalar(df.iloc[:, 0].sum())
return df.iloc[:, 0].sum().item()
/
''' % (self.col_defs_str))
print(udf_sql)
Expand Down Expand Up @@ -477,7 +481,7 @@ def run(ctx):
df = ctx.get_dataframe(num_rows=1)
if df is None:
break
ctx.emit(np.asscalar(df.C0))
ctx.emit(df.C0.item())
/
''')
print(udf_sql)
Expand All @@ -500,7 +504,7 @@ def run(ctx):
if df is None:
break
for i in range(df.shape[0]):
ctx.emit(np.asscalar(df.iloc[i, 0]))
ctx.emit(df.iloc[i, 0].item())
/
''')
print(udf_sql)
Expand Down Expand Up @@ -901,33 +905,6 @@ def run(ctx):
(234,)
], rows)

def test_dataframe_set_emits_double_pyfloat_only_todo(self):
import datetime
udf_sql = udf.fixindent('''
CREATE OR REPLACE PYTHON3 SET SCRIPT foo(sec int) EMITS (ts double) AS
def run(ctx):
import pandas as pd
import numpy as np
import datetime
c1=np.empty(shape=(2),dtype=np.object_)
c1[:]=234.5
df=pd.DataFrame({0:c1})
ctx.emit(df)
/
''')
print(udf_sql)
self.query(udf_sql)
select_sql = 'SELECT foo(1)'
print(select_sql)
#TODO implement support
with self.assertRaisesRegex(Exception, 'F-UDF-CL-SL-PYTHON-1056'):
rows = self.query(select_sql)

def test_dataframe_set_emits_double_npfloat32_only(self):
import datetime
udf_sql = udf.fixindent('''
Expand Down Expand Up @@ -1015,7 +992,6 @@ def run(ctx):
print(select_sql)
rows = self.query(select_sql)


if __name__ == '__main__':
udf.main()

Loading

0 comments on commit 433f064

Please sign in to comment.