Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions ibis/expr/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ def get_result(self):
return shape_like_args(self.args, promoted_type)

def _get_type(self):
    """Promote the operand types of a binary numeric expression.

    Any decimal operand promotes the whole result to decimal; otherwise
    floating operands win over integers, and all-integer args defer to
    the integer-specific promotion.
    """
    # Decimal takes precedence over floating point and integers.
    if util.any_of(self.args, ir.DecimalValue):
        return _decimal_promoted_type(self.args)
    elif util.any_of(self.args, ir.FloatingValue):
        # double wins over float when both widths appear
        return 'double' if util.any_of(self.args, ir.DoubleValue) else 'float'
    elif util.all_of(self.args, ir.IntegerValue):
        return self._get_int_type()
    else:
        raise NotImplementedError

Expand Down
15 changes: 15 additions & 0 deletions ibis/expr/tests/test_value_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -890,3 +890,18 @@ def test_fillna_null(value, expected):
def test_string_temporal_compare(op, left, right):
result = op(left, right)
assert result.type().equals(dt.boolean)


@pytest.mark.parametrize(
    ('value', 'type_name', 'expected_type_class'),
    [
        (2.21, 'decimal', dt.Decimal),
        (3.14, 'double', dt.Double),
        (4.2, 'int64', dt.Double),
        (4, 'int64', dt.Int64),
    ]
)
def test_decimal_modulo_output_type(value, type_name, expected_type_class):
    # `type_name` (renamed from `type`, which shadowed the builtin) is the
    # declared ibis type of column `a`; modulo with a Python literal must
    # follow the numeric promotion rules (decimal % float -> Decimal, etc.).
    t = ibis.table([('a', type_name)])
    expr = t.a % value
    assert isinstance(expr.type(), expected_type_class)
3 changes: 3 additions & 0 deletions ibis/pandas/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from __future__ import absolute_import

from ibis.pandas.client import PandasClient
from ibis.pandas.decimal import execute_node # noqa: F401
from ibis.pandas.execution import execute # noqa: F401


Expand Down
28 changes: 21 additions & 7 deletions ibis/pandas/client.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import absolute_import

import six

import numpy as np
Expand Down Expand Up @@ -30,7 +32,7 @@
}


def pandas_dtypes_to_ibis_schema(df):
def pandas_dtypes_to_ibis_schema(df, schema):
dtypes = df.dtypes

pairs = []
Expand All @@ -41,10 +43,20 @@ def pandas_dtypes_to_ibis_schema(df):
'Column names must be strings to use the pandas backend'
)

if dtype == np.object_:
ibis_type = _INFERRED_DTYPE_TO_IBIS_TYPE[
infer_dtype(df[column_name].dropna())
]
if column_name in schema:
ibis_type = dt.validate_type(schema[column_name])
elif dtype == np.object_:
inferred_dtype = infer_dtype(df[column_name].dropna())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yikes. I guess we should make a NaN-friendly type inference function someplace (seems like an oversight in infer_dtype originally)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you post an issue in the pandas tracker about this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I PR'd it :) pandas-dev/pandas#17066.


if inferred_dtype == 'mixed':
raise TypeError(
'Unable to infer type of column {0!r}. Try instantiating '
'your table from the client with client.table('
"'my_table', schema={{{0!r}: <explicit type>}})".format(
column_name
)
)
ibis_type = _INFERRED_DTYPE_TO_IBIS_TYPE[inferred_dtype]
elif hasattr(dtype, 'tz'):
ibis_type = dt.Timestamp(str(dtype.tz))
else:
Expand All @@ -60,9 +72,11 @@ class PandasClient(client.Client):
def __init__(self, dictionary):
self.dictionary = dictionary

def table(self, name, schema=None):
    """Return the DataFrame registered under `name` as a table expression.

    `schema` optionally maps column names to explicit ibis types,
    overriding dtype-based inference for those columns.
    """
    df = self.dictionary[name]
    overrides = schema if schema is not None else {}
    schema = pandas_dtypes_to_ibis_schema(df, overrides)
    return ops.DatabaseTable(name, schema, self).to_expr()

def execute(self, query, *args, **kwargs):
Expand Down
2 changes: 2 additions & 0 deletions ibis/pandas/core.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import absolute_import

import collections
import numbers
import datetime
Expand Down
121 changes: 121 additions & 0 deletions ibis/pandas/decimal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
from __future__ import absolute_import

import decimal
import math
import numbers

import numpy as np
import pandas as pd
import six

import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
from ibis.pandas.dispatch import execute_node


@execute_node.register(ops.Ln, decimal.Decimal)
def execute_decimal_natural_log(op, data, scope=None):
    """Natural log of a decimal scalar; NaN where ln is undefined (e.g. ln(-1))."""
    try:
        result = data.ln()
    except decimal.InvalidOperation:
        result = decimal.Decimal('NaN')
    return result


@execute_node.register(ops.Log, decimal.Decimal, decimal.Decimal)
def execute_decimal_log_with_decimal_base(op, data, base, scope=None):
    """Arbitrary-base log via the change-of-base identity ln(data)/ln(base)."""
    try:
        # both ln() calls and the division stay inside the try: any of
        # them can signal InvalidOperation (e.g. negative operands, 0/0)
        result = data.ln() / base.ln()
    except decimal.InvalidOperation:
        result = decimal.Decimal('NaN')
    return result


@execute_node.register(ops.Log, decimal.Decimal, type(None))
def execute_decimal_log_with_no_base(op, data, _, scope=None):
    """A null base means natural log; delegate to the Ln implementation."""
    result = execute_decimal_natural_log(op, data, scope=scope)
    return result


@execute_node.register(ops.Log, decimal.Decimal, numbers.Real)
def execute_decimal_log_with_real_base(op, data, base, scope=None):
    """Coerce a real-valued base to Decimal and re-dispatch."""
    decimal_base = decimal.Decimal(base)
    return execute_node(op, data, decimal_base, scope=scope)


@execute_node.register(ops.Log, decimal.Decimal, np.integer)
def execute_decimal_log_with_np_integer_base(op, data, base, scope=None):
    """Unbox a numpy integer base to a plain int and re-dispatch."""
    plain_base = int(base)
    return execute_node(op, data, plain_base, scope=scope)


@execute_node.register(ops.Log2, decimal.Decimal)
def execute_decimal_log2(op, data, scope=None):
    """Base-2 log computed as ln(data) / ln(2); NaN on invalid operations."""
    two = decimal.Decimal(2)
    try:
        result = data.ln() / two.ln()
    except decimal.InvalidOperation:
        result = decimal.Decimal('NaN')
    return result


@execute_node.register(ops.UnaryOp, decimal.Decimal)
def execute_decimal_unary(op, data, scope=None):
    """Generic unary fallback for decimal scalars.

    Prefer a `Decimal` method named after the op class (lowercased,
    e.g. Sqrt -> Decimal.sqrt); otherwise fall back to the same-named
    `math` function and re-wrap the float result in a Decimal.
    """
    name = type(op).__name__.lower()
    math_fallback = getattr(math, name, None)
    # the lambda is only invoked when Decimal lacks the method, so a
    # missing math function surfaces lazily, as in the eager lookup case
    func = getattr(
        decimal.Decimal,
        name,
        lambda value: decimal.Decimal(math_fallback(value)),
    )
    try:
        return func(data)
    except decimal.InvalidOperation:
        return decimal.Decimal('NaN')


@execute_node.register(ops.Sign, decimal.Decimal)
def execute_decimal_sign(op, data, scope=None):
    """Return the input for zero, otherwise +/-1 with the sign of `data`."""
    if not data:
        # zero is falsy; return it unchanged
        return data
    return decimal.Decimal(1).copy_sign(data)


@execute_node.register(ops.Abs, decimal.Decimal)
def execute_decimal_abs(op, data, scope=None):
    """Absolute value via the builtin, which honors the decimal context."""
    magnitude = abs(data)
    return magnitude


@execute_node.register(
    ops.Round, decimal.Decimal, (np.integer,) + six.integer_types
)
def execute_round_decimal(op, data, places, scope=None):
    """Round a Decimal to `places` digits by building a quantize() exemplar.

    Negative `places` rounds to the left of the decimal point, which is
    expressed via an exponent-form exemplar ('0.0...E+n').
    """
    # If we only allowed Python 3, we wouldn't have to implement any of this;
    # we could just call round(data, places) :(
    tuple_value = data.as_tuple()
    # total significant digits, and (via the exponent) how many of them
    # lie to the left of the decimal point
    precision = len(tuple_value.digits)
    integer_part_length = precision + min(tuple_value.exponent, 0)

    if places < 0:
        # exponent-form exemplar so quantize() zeroes out the last
        # abs(places) integer digits
        decimal_format_string = '0.{}E+{:d}'.format(
            '0' * (integer_part_length - 1 + places),
            max(integer_part_length + places, abs(places))
        )
    else:
        # plain exemplar: one '0' per integer digit, `places` fractional
        # zeros (e.g. '000.00' for a 3-digit value rounded to 2 places)
        decimal_format_string = '{}.{}'.format(
            '0' * integer_part_length, '0' * places
        )

    # quantize rounds `data` to the same exponent as the exemplar
    places = decimal.Decimal(decimal_format_string)
    return data.quantize(places)


@execute_node.register(ops.Round, decimal.Decimal, type(None))
def execute_round_decimal_no_places(op, data, _, scope=None):
    """Round to the nearest integer, returned as a numpy int64."""
    nearest = round(data)
    return np.int64(nearest)


@execute_node.register(ops.Cast, pd.Series, dt.Decimal)
def execute_cast_series_to_decimal(op, data, type, scope=None):
    """Cast every element of a series to a Decimal with the target type's
    precision and scale."""
    context = decimal.Context(prec=type.precision)
    # exemplar such as '00.00' for precision=4, scale=2; quantize()
    # rounds each converted value to this exponent
    quantum = context.create_decimal(
        '{}.{}'.format(
            '0' * (type.precision - type.scale), '0' * type.scale
        )
    )

    def element_to_decimal(value, context=context, quantum=quantum):
        return context.create_decimal(value).quantize(quantum)

    return data.apply(element_to_decimal)
2 changes: 2 additions & 0 deletions ibis/pandas/dispatch.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import absolute_import

from multipledispatch import Dispatcher


Expand Down
46 changes: 46 additions & 0 deletions ibis/pandas/execution.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from __future__ import absolute_import

import numbers
import operator
import datetime
import functools
import decimal

import six

Expand Down Expand Up @@ -135,6 +138,38 @@ def execute_cast_series_date(op, data, type, scope=None):
}


@execute_node.register(ops.UnaryOp, pd.Series)
def execute_series_unary_op(op, data, scope=None):
    """Apply a unary op to a series: the same-named numpy ufunc for native
    dtypes, element-wise re-dispatch for object-dtype columns (e.g. Decimal).
    """
    # looked up unconditionally so a missing numpy function fails the
    # same way regardless of dtype
    ufunc = getattr(np, type(op).__name__.lower())
    if data.dtype == np.dtype(np.object_):
        return data.apply(functools.partial(execute_node, op, scope=scope))
    return ufunc(data)


def vectorize_object(op, arg, *args, **kwargs):
    """Evaluate `op` element-wise over an object-dtype series, preserving
    the index and name of `arg`."""
    elementwise = np.vectorize(functools.partial(execute_node, op, **kwargs))
    values = elementwise(arg, *args)
    return pd.Series(values, index=arg.index, name=arg.name)


@execute_node.register(
    ops.Log, pd.Series, (pd.Series, numbers.Real, decimal.Decimal, type(None))
)
def execute_series_log_with_base(op, data, base, scope=None):
    """Log of a series; a null base means natural log. Object-dtype
    columns fall back to element-wise dispatch."""
    if data.dtype == np.dtype(np.object_):
        return vectorize_object(op, data, base, scope=scope)

    natural = np.log(data)
    if base is None:
        return natural
    # change of base: log_b(x) = ln(x) / ln(b)
    return natural / np.log(base)


@execute_node.register(ops.Ln, pd.Series)
def execute_series_natural_log(op, data, scope=None):
    """Natural log of a series; object dtype re-dispatches per element."""
    if data.dtype != np.dtype(np.object_):
        return np.log(data)
    per_element = functools.partial(execute_node, op, scope=scope)
    return data.apply(per_element)


@execute_node.register(ops.Cast, datetime.datetime, dt.String)
def execute_cast_datetime_or_timestamp_to_string(op, data, type, scope=None):
"""Cast timestamps to strings"""
Expand Down Expand Up @@ -213,6 +248,17 @@ def execute_cast_string_literal(op, data, type, scope=None):
return cast_function(data)


@execute_node.register(
    ops.Round,
    pd.Series,
    (pd.Series, np.integer, type(None)) + six.integer_types
)
def execute_round_series(op, data, places, scope=None):
    """Round a series; object dtype vectorizes element-wise, and a null
    `places` means round to zero decimal places."""
    if data.dtype == np.dtype(np.object_):
        return vectorize_object(op, data, places, scope=scope)
    digits = 0 if places is None else places
    return data.round(digits)


@execute_node.register(ops.TableColumn, (pd.DataFrame, DataFrameGroupBy))
def execute_table_column_dataframe_or_dataframe_groupby(op, data, scope=None):
    """Select the column named by the op from a frame or grouped frame."""
    column_name = op.name
    return data[column_name]
Expand Down
Loading