23
23
24
24
25
25
class NumPyCompatTest(ReusedSQLTestCase, SQLTestUtils):
    # NumPy ufunc names that the spark-compat tests below must skip,
    # shared by the Series and DataFrame variants via ``self.blacklist``.
    blacklist = [
        # Koalas does not currently support
        "conj",
        "conjugate",
        "isnat",
        "matmul",
        "frexp",

        # Values are close enough but tests failed.
        "arccos",
        "exp",
        "expm1",
        "log",  # flaky
        "log10",  # flaky
        "log1p",  # flaky
        "modf",
        "floor_divide",  # flaky

        # Results seem inconsistent in a different version of, I (Hyukjin) suspect, PyArrow.
        # From PyArrow 0.15, seems it returns the correct results via PySpark. Probably we
        # can enable it later when Koalas switches to PyArrow 0.15 completely.
        "left_shift",
    ]
26
49
27
50
@property
28
51
def pdf (self ):
@@ -49,12 +72,17 @@ def test_np_add_index(self):
49
72
p_index = self .pdf .index
50
73
self .assert_eq (np .add (k_index , k_index ), np .add (p_index , p_index ))
51
74
52
- def test_np_unsupported (self ):
75
+ def test_np_unsupported_series (self ):
53
76
kdf = self .kdf
54
77
with self .assertRaisesRegex (NotImplementedError , "Koalas.*not.*support.*sqrt.*" ):
55
78
np .sqrt (kdf .a , kdf .b )
56
79
57
- def test_np_spark_compat (self ):
80
+ def test_np_unsupported_frame (self ):
81
+ kdf = self .kdf
82
+ with self .assertRaisesRegex (NotImplementedError , "Koalas.*not.*support.*sqrt.*" ):
83
+ np .sqrt (kdf , kdf )
84
+
85
+ def test_np_spark_compat_series (self ):
58
86
# Use randomly generated dataFrame
59
87
pdf = pd .DataFrame (
60
88
np .random .randint (- 100 , 100 , size = (np .random .randint (100 ), 2 )), columns = ['a' , 'b' ])
@@ -63,33 +91,9 @@ def test_np_spark_compat(self):
63
91
kdf = ks .from_pandas (pdf )
64
92
kdf2 = ks .from_pandas (pdf2 )
65
93
66
- blacklist = [
67
- # Koalas does not currently support
68
- "conj" ,
69
- "conjugate" ,
70
- "isnat" ,
71
- "matmul" ,
72
- "frexp" ,
73
-
74
- # Values are close enough but tests failed.
75
- "arccos" ,
76
- "exp" ,
77
- "expm1" ,
78
- "log" , # flaky
79
- "log10" , # flaky
80
- "log1p" , # flaky
81
- "modf" ,
82
- "floor_divide" , # flaky
83
-
84
- # Results seem inconsistent in a different version of, I (Hyukjin) suspect, PyArrow.
85
- # From PyArrow 0.15, seems it returns the correct results via PySpark. Probably we
86
- # can enable it later when Koalas switches to PyArrow 0.15 completely.
87
- "left_shift" ,
88
- ]
89
-
90
94
for np_name , spark_func in unary_np_spark_mappings .items ():
91
95
np_func = getattr (np , np_name )
92
- if np_name not in blacklist :
96
+ if np_name not in self . blacklist :
93
97
try :
94
98
# unary ufunc
95
99
self .assert_eq (np_func (pdf .a ), np_func (kdf .a ), almost = True )
@@ -98,7 +102,7 @@ def test_np_spark_compat(self):
98
102
99
103
for np_name , spark_func in binary_np_spark_mappings .items ():
100
104
np_func = getattr (np , np_name )
101
- if np_name not in blacklist :
105
+ if np_name not in self . blacklist :
102
106
try :
103
107
# binary ufunc
104
108
self .assert_eq (
@@ -113,7 +117,7 @@ def test_np_spark_compat(self):
113
117
set_option ('compute.ops_on_diff_frames' , True )
114
118
for np_name , spark_func in list (binary_np_spark_mappings .items ())[:5 ]:
115
119
np_func = getattr (np , np_name )
116
- if np_name not in blacklist :
120
+ if np_name not in self . blacklist :
117
121
try :
118
122
# binary ufunc
119
123
self .assert_eq (
@@ -123,3 +127,50 @@ def test_np_spark_compat(self):
123
127
raise AssertionError ("Test in '%s' function was failed." % np_name ) from e
124
128
finally :
125
129
reset_option ('compute.ops_on_diff_frames' )
130
+
131
+ def test_np_spark_compat_frame (self ):
132
+ # Use randomly generated dataFrame
133
+ pdf = pd .DataFrame (
134
+ np .random .randint (- 100 , 100 , size = (np .random .randint (100 ), 2 )), columns = ['a' , 'b' ])
135
+ pdf2 = pd .DataFrame (
136
+ np .random .randint (- 100 , 100 , size = (len (pdf ), len (pdf .columns ))), columns = ['a' , 'b' ])
137
+ kdf = ks .from_pandas (pdf )
138
+ kdf2 = ks .from_pandas (pdf2 )
139
+
140
+ for np_name , spark_func in unary_np_spark_mappings .items ():
141
+ np_func = getattr (np , np_name )
142
+ if np_name not in self .blacklist :
143
+ try :
144
+ # unary ufunc
145
+ self .assert_eq (np_func (pdf ), np_func (kdf ), almost = True )
146
+ except Exception as e :
147
+ raise AssertionError ("Test in '%s' function was failed." % np_name ) from e
148
+
149
+ for np_name , spark_func in binary_np_spark_mappings .items ():
150
+ np_func = getattr (np , np_name )
151
+ if np_name not in self .blacklist :
152
+ try :
153
+ # binary ufunc
154
+ self .assert_eq (
155
+ np_func (pdf , pdf ), np_func (kdf , kdf ), almost = True )
156
+ self .assert_eq (
157
+ np_func (pdf , 1 ), np_func (kdf , 1 ), almost = True )
158
+ except Exception as e :
159
+ raise AssertionError ("Test in '%s' function was failed." % np_name ) from e
160
+
161
+ # Test only top 5 for now. 'compute.ops_on_diff_frames' option increases too much time.
162
+ try :
163
+ set_option ('compute.ops_on_diff_frames' , True )
164
+ for np_name , spark_func in list (binary_np_spark_mappings .items ())[:5 ]:
165
+ np_func = getattr (np , np_name )
166
+ if np_name not in self .blacklist :
167
+ try :
168
+ # binary ufunc
169
+ self .assert_eq (
170
+ np_func (pdf , pdf2 ).sort_index (),
171
+ np_func (kdf , kdf2 ).sort_index (), almost = True )
172
+
173
+ except Exception as e :
174
+ raise AssertionError ("Test in '%s' function was failed." % np_name ) from e
175
+ finally :
176
+ reset_option ('compute.ops_on_diff_frames' )
0 commit comments