Skip to content
This repository has been archived by the owner on Dec 11, 2023. It is now read-only.

Commit

Permalink
pandas out_flavor for ctable
Browse files Browse the repository at this point in the history
* introduction of an abstraction layer for the "output array"
* implementation of a numpy specialisation of the abstraction layer
* implementation of a pandas specialisation of the abstraction layer
  • Loading branch information
ARF committed May 3, 2015
1 parent 5ebe7c1 commit 5766048
Show file tree
Hide file tree
Showing 2 changed files with 253 additions and 15 deletions.
245 changes: 230 additions & 15 deletions bcolz/ctable.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import numpy as np
import bcolz
from bcolz import utils, attrs, array2string
if bcolz.pandas_here:
import pandas as pd
import itertools
from collections import namedtuple
import json
Expand Down Expand Up @@ -1013,11 +1015,9 @@ def _where(self, boolarr, colnames=None):

if colnames is None:
colnames = self.names
cols = [self.cols[name][boolarr] for name in colnames]
dtype = np.dtype([(name, self.cols[name].dtype) for name in colnames])
result = np.rec.fromarrays(cols, dtype=dtype).view(np.ndarray)
result = OutputStructure.fromboolarr(self, boolarr, colnames)

return result
return result.ra

def __getitem__(self, key):
"""Returns values based on `key`.
Expand All @@ -1043,10 +1043,10 @@ def __getitem__(self, key):
# First, check for integer
if isinstance(key, _inttypes):
# Get a copy of the len-1 array
ra = self._arr1.copy()
result = OutputStructure(1, self.dtype)
# Fill it
ra[0] = tuple([self.cols[name][key] for name in self.names])
return ra[0]
result[0] = tuple([self.cols[name][key] for name in self.names])
return result.ra
# Slices
elif type(key) == slice:
(start, stop, step) = key.start, key.stop, key.step
Expand All @@ -1060,7 +1060,7 @@ def __getitem__(self, key):
# List of integers (case of fancy indexing), or list of column names
elif type(key) is list:
if len(key) == 0:
return np.empty(0, self.dtype)
return OutputStructure(0, self.dtype).ra
strlist = [type(v) for v in key] == [str for v in key]
# Range of column names
if strlist:
Expand All @@ -1072,15 +1072,16 @@ def __getitem__(self, key):
except:
raise IndexError(
"key cannot be converted to an array of indices")
return np.fromiter((self[i] for i in key),
dtype=self.dtype, count=len(key))
result = OutputStructure.fromindices(self, key)
return result.ra
# A boolean array (case of fancy indexing)
elif hasattr(key, "dtype"):
if key.dtype.type == np.bool_:
return self._where(key)
elif np.issubsctype(key, np.int_):
# An integer array
return np.array([self[i] for i in key], dtype=self.dtype)
result = OutputStructure.fromindices(self, key)
return result.ra
else:
raise IndexError(
"arrays used as indices must be integer (or boolean)")
Expand All @@ -1105,12 +1106,12 @@ def __getitem__(self, key):
(start, stop, step) = slice(start, stop, step).indices(self.len)
# Build a numpy container
n = utils.get_len_of_range(start, stop, step)
ra = np.empty(shape=(n,), dtype=self.dtype)
result = OutputStructure(n, self.dtype)
# Fill it
for name in self.names:
ra[name][:] = self.cols[name][start:stop:step]
result[name] = self.cols[name][start:stop:step]

return ra
return result.ra

def __setitem__(self, key, value):
"""Sets values based on `key`.
Expand Down Expand Up @@ -1247,7 +1248,221 @@ def __repr__(self):
return fullrepr


# Local Variables:

class OutputStructureEngine(object):
    """Flavor-dispatching base class for the object returned by ctable reads.

    Every public entry point looks up an implementation method whose name is
    built from the operation and the current value of
    ``bcolz.defaults.ctable_out_flavor`` (e.g. ``_allocate_numpy``,
    ``_setitem_pandas``).  Subclasses provide those methods; when one is
    missing, `_fallback` raises ``NotImplementedError``.

    The wrapped output container is stored in the ``ra`` attribute.
    """

    # holds the return array
    ra = None

    # poor-man's cache, better would be a LRU cache from python3 or its
    # backport.  Eviction is first-in-first-out: `template_order` records the
    # insertion order of the keys held in `template_cache`.
    template_cache = {}
    template_order = []
    # out flavor the cached templates were built for (None: nothing cached)
    template_type = None
    template_maxsize = 10

    @classmethod
    def _push_to_cache(cls, key, value):
        """Store `value` under `key`, evicting the oldest entries if full.

        Re-inserting a key that is already cached only replaces its value;
        it must not be appended to `template_order` a second time, otherwise
        a later eviction would try to delete the same key twice.
        """
        if key in cls.template_cache:
            cls.template_cache[key] = value
            return

        while (cls.template_order and
               len(cls.template_cache) >= cls.template_maxsize):
            # remove first inserted
            oldest = cls.template_order.pop(0)
            cls.template_cache.pop(oldest, None)

        cls.template_order.append(key)
        cls.template_cache[key] = value

    @classmethod
    def _try_cache(cls, key):
        """Return the template cached under `key`, or None on a miss.

        Templates are only valid for the out flavor they were created for.
        When the configured flavor differs from the one the cache was filled
        under, the cache is emptied and re-tagged with the current flavor.
        """
        flavor = bcolz.defaults.ctable_out_flavor
        if cls.template_type != flavor:
            cls.template_cache.clear()
            del cls.template_order[:]
            cls.template_type = flavor
            return None

        return cls.template_cache.get(key)

    # dispatcher functions
    def __init__(self, size, dtype):
        """Allocate an output array and return it encapsulated in a class
        abstracting data access.

        `size` and `dtype` are forwarded unchanged to the flavor-specific
        ``_allocate_<flavor>`` method; its result is stored in ``self.ra``.
        """

        method = '_allocate_' + bcolz.defaults.ctable_out_flavor
        allocate = getattr(self, method, self._fallback)
        self.ra = allocate(size, dtype)

    @classmethod
    def fromindices(cls, ctable_, iter):
        """Create an output array from an iterable of row indices and return
        it encapsulated in a class abstracting data access."""

        method = '_fromindices_' + bcolz.defaults.ctable_out_flavor
        fromindices = getattr(cls, method, cls._fallback)
        return fromindices(ctable_, iter)

    @classmethod
    def fromboolarr(cls, ctable_, boolarr, colnames):
        """Create an output array from a boolean row selector array and
        return it encapsulated in a class abstracting data access."""

        method = '_fromboolarr_' + bcolz.defaults.ctable_out_flavor
        fromboolarr = getattr(cls, method, cls._fallback)
        return fromboolarr(ctable_, boolarr, colnames)

    def __setitem__(self, key, value):
        """Abstract data access to an output array."""

        method = '_setitem_' + bcolz.defaults.ctable_out_flavor
        setitem = getattr(self, method, self._fallback)
        return setitem(key, value)

    @classmethod
    def _fallback(cls, *args, **kwargs):
        """Raise NotImplementedError for a missing flavor implementation.

        The operation named in the message is taken from the calling
        dispatcher's function name with the underscores stripped (so a call
        coming from ``__init__`` is reported as ``init``).
        """
        import inspect
        raise NotImplementedError('_%s_%s not implemented.' %
                                  (inspect.stack()[1][3].strip('_'),
                                   bcolz.defaults.ctable_out_flavor)
                                  )


class OutputStructure(OutputStructureEngine):
    """numpy and pandas implementations of the output-array abstraction.

    Each ``_<operation>_<flavor>`` method below is picked up by the
    dispatchers of `OutputStructureEngine` according to
    ``bcolz.defaults.ctable_out_flavor``.
    """

    ### numpy implementation ###
    @classmethod
    def _allocate_numpy(cls, size, dtype):
        """Return an uninitialised numpy array of `size` rows of `dtype`."""
        if size == 1:
            # only cache size-1 numpy arrays
            result = cls._try_cache(dtype)
            if result is None:
                result = np.empty(shape=(1,), dtype=dtype)
                cls._push_to_cache(dtype, result)

            # hand out a copy so that the cached template is never modified
            return result.copy()

        else:
            return np.empty(size, dtype)

    @classmethod
    def _fromindices_numpy(cls, ctable_, iter):
        """Build a numpy array holding the rows of `ctable_` listed in `iter`.

        `iter` must support ``len()`` because the row count is passed to
        ``np.fromiter``.
        """
        # bypass __init__: no separate allocation step is needed here
        result = object.__new__(cls)
        result.ra = np.fromiter((ctable_[i] for i in iter),
                                dtype=ctable_.dtype, count=len(iter))
        return result

    @classmethod
    def _fromboolarr_numpy(cls, ctable_, boolarr, colnames):
        """Build a numpy array from the rows of `ctable_` selected by
        `boolarr`, restricted to the columns named in `colnames`."""
        result = object.__new__(cls)

        dtype = np.dtype([(name, ctable_.cols[name].dtype)
                          for name in colnames])
        cols = [ctable_.cols[name][boolarr] for name in colnames]
        result.ra = np.rec.fromarrays(cols, dtype=dtype).view(np.ndarray)
        return result

    def _setitem_numpy(self, key, value):
        """Assign a whole row (integer `key`) or a whole column (any other
        `key`, i.e. a field name) of the wrapped numpy array."""
        if isinstance(key, (int, np.integer)):
            self.ra[key] = value
        else:
            self.ra[key][:] = value

    ### pandas implementation ###
    @classmethod
    def _allocate_pandas(cls, size, dtype):
        """Return a DataFrame with `size` rows and the columns of `dtype`."""
        # cache templates of pandas dataframes for faster instantiation
        template = cls._try_cache(dtype)
        if template is None:
            template = pd.DataFrame(np.empty(shape=(0,), dtype=dtype))
            cls._push_to_cache(dtype, template)

        return allocate_like(template, size)

    @classmethod
    def _fromindices_pandas(cls, ctable_, iter):
        """Build a DataFrame holding the rows of `ctable_` listed in `iter`.

        `iter` must support ``len()``; every column of `ctable_` is copied.
        """
        result = object.__new__(cls)
        result.ra = cls._allocate_pandas(len(iter), ctable_.dtype)

        # all columns of the table are returned, in table order
        for name in ctable_.names:
            result[name] = ctable_.cols[name][iter]

        return result

    @classmethod
    def _fromboolarr_pandas(cls, ctable_, boolarr, colnames):
        """Build a DataFrame from the rows of `ctable_` selected by
        `boolarr`, restricted to the columns named in `colnames`."""
        dtype = np.dtype([(name, ctable_.cols[name].dtype)
                          for name in colnames])
        result = object.__new__(cls)
        # len(boolarr[boolarr]) is the number of selected (True) rows
        result.ra = cls._allocate_pandas(len(boolarr[boolarr]), dtype)

        for name in colnames:
            result[name] = ctable_.cols[name][boolarr]

        return result

    def _setitem_pandas(self, key, value):
        """Assign a whole row (integer `key`) or a whole column (any other
        `key`, i.e. a column label) of the wrapped DataFrame.

        The data is written straight into the arrays of the frame's internal
        blocks, which are indexed as ``values[position_in_block, row]``.
        NOTE(review): this relies on private pandas attributes (``_data``,
        ``_blknos``, ``_blklocs``) -- confirm against the supported pandas
        versions.
        """
        if isinstance(key, (int, np.integer)):
            # value[i] goes to the i-th column of the frame; look up the
            # block and the position inside that block for each column
            blknos = self.ra._data._blknos[range(len(value))]
            blklocs = self.ra._data._blklocs[range(len(value))]
            for i, (blkno, blkloc) in enumerate(zip(blknos, blklocs)):
                self.ra._data.blocks[blkno].values[blkloc, key] = value[i]
        else:
            # efficiently setting pandas columns
            loc = self.ra._data.items.get_loc(key)
            blkno = self.ra._data._blknos[loc]
            blkloc = self.ra._data._blklocs[loc]
            self.ra._data.blocks[blkno].values[blkloc, :] = value


# avoid making pandas a requirement for bcolz
# needs a more polished solution eventually
try:
from pandas.core.internals import BlockManager
from pandas.core.frame import DataFrame
from pandas.core.common import CategoricalDtype
from pandas.core.categorical import Categorical
except ImportError:
pass

try:
from pandas.core.index import RangeIndex
except ImportError:
try:
from pandas.core.index import Int64Index
def RangeIndex(start, stop, step, **kwargs):
return Int64Index(np.arange(start, stop, step), **kwargs)
except ImportError:
pass

def allocate_like(df, size, keep_categories=False):
    """High-performance pandas dataframe constructor for numpy dtype
    columns + categoricals working from a template dataframe.

    This significantly speeds up dataframe instantiation for dataframes
    with only a few rows, gains for large dataframes are minimal.

    Parameters
    ----------
    df : DataFrame
        Template whose columns and internal block layout are replicated.
    size : int
        Number of rows of the new dataframe.
    keep_categories : bool
        If True, categorical columns reuse the categories of the template;
        otherwise they start with an empty set of categories.

    Returns
    -------
    DataFrame
        A new dataframe with `size` rows whose contents are uninitialised.
    """
    # imported locally so that pandas does not become a hard requirement
    from pandas import Index

    # define axes (ideally uses PR #9977 for MUCH better performance)
    axes = [df.columns.values.tolist(),
            RangeIndex(0, size, 1, fastpath=True)]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        # special treatment for non-ordinary block types
        if isinstance(block.dtype, CategoricalDtype):
            if keep_categories:
                categories = block.values.categories
            else:
                categories = Index([])
            # one code per row of the *new* frame (not of the template)
            codes = np.empty(shape=(size,),
                             dtype=block.values.codes.dtype)
            values = Categorical(values=codes,
                                 categories=categories,
                                 fastpath=True)
        # ordinary block types
        else:
            # block arrays are laid out as (columns in block, rows)
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.dtype)

        new_block = block.make_block_same_class(
            values=values, placement=block.mgr_locs.as_array)
        blocks.append(new_block)

    # create block manager
    mgr = BlockManager(blocks, axes)

    # create dataframe
    return DataFrame(mgr)


# mode: python
# tab-width: 4
# fill-column: 78
Expand Down
23 changes: 23 additions & 0 deletions bcolz/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from __future__ import absolute_import

import bcolz
import warnings


class Defaults(object):
Expand All @@ -22,6 +23,7 @@ def __init__(self):

# Choices setup
self.choices['eval_out_flavor'] = ("carray", "numpy")
self.choices['ctable_out_flavor'] = ("numpy", "pandas")
self.choices['eval_vm'] = ("numexpr", "python")

def check_choices(self, name, value):
Expand Down Expand Up @@ -70,6 +72,21 @@ def eval_out_flavor(self, value):
self.check_choices('eval_out_flavor', value)
self.__eval_out_flavor = value

@property
def ctable_out_flavor(self):
    """Name of the flavor used for the output objects of ctable reads."""
    return self.__ctable_out_flavor

@ctable_out_flavor.setter
def ctable_out_flavor(self, value):
    # Unlike the other defaults, a value outside the registered choices is
    # accepted (with a warning) so that users can plug in their own
    # OutputStructure implementation for a custom flavor.
    # NOTE(review): assumes check_choices() raises ValueError for an
    # unknown value -- its body is not visible here.
    try:
        self.check_choices('ctable_out_flavor', value)
    except ValueError:
        warnings.warn(
            "'%s' is not implemented out of the box for '%s' default."
            % (value, 'ctable_out_flavor')
            + " Provide your own OutputStructure implementation.")
    self.__ctable_out_flavor = value

@property
def cparams(self):
return self.__cparams
Expand All @@ -90,6 +107,12 @@ def cparams(self, value):
'numpy'. Default is 'carray'.
"""

defaults.ctable_out_flavor = "numpy"
"""
The flavor for the output object returned when reading from a `ctable`
(indexing and boolean selection). It can be 'numpy' or 'pandas'.
Default is 'numpy'.
"""

defaults.eval_vm = "numexpr" if bcolz.numexpr_here else "python"
"""
The virtual machine to be used in computations (via `eval`). It can
Expand Down

0 comments on commit 5766048

Please sign in to comment.