Skip to content

Commit

Permalink
Kill IntColumn and FloatColumn. Closes #64.
Browse files Browse the repository at this point in the history
  • Loading branch information
onyxfish committed May 1, 2014
1 parent f17d88f commit 29902d7
Show file tree
Hide file tree
Showing 12 changed files with 154 additions and 310 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
0.3.0
-----

* Collapse IntColumn and FloatColumn into NumberColumn. (#64)
* Table.outliers_mad implemented. (#93)
* Column.mad implemented. (#93)
* Table.outliers implemented. (#86)
Expand Down
10 changes: 5 additions & 5 deletions docs/cookbook.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ If your file does not have headers:

.. code-block:: python
from journalism import Table, TextColumn, IntColumn, FloatColumn
from journalism import Table, TextColumn, NumberColumn
column_names = ('city', 'area', 'population')
column_types = (TextColumn, FloatColumn, IntColumn)
column_types = (TextColumn, NumberColumn, NumberColumn)
with open('population.csv') as f:
rows = list(csv.reader(f))
Expand All @@ -28,7 +28,7 @@ If your file does have headers:
.. code-block:: python
column_types = (TextColumn, FloatColumn, IntColumn)
column_types = (TextColumn, NumberColumn, NumberColumn)
with open('population.csv') as f:
rows = list(csv.reader(f))
Expand All @@ -46,7 +46,7 @@ Of course, cool kids use `csvkit <http://csvkit.rtfd.org/>`_. (Hint: it supports
import csvkit
column_types = (TextColumn, FloatColumn, IntColumn)
column_types = (TextColumn, NumberColumn, NumberColumn)
with open('population.csv') as f:
rows = list(csvkit.reader(f))
Expand Down Expand Up @@ -190,7 +190,7 @@ This will compute the percent change between the :code:`july` and :code:`august`
Rounding to two decimal places
------------------------------
journalism stores fractional values using Python's :class:`decimal.Decimal` type. This data type ensures numerical precision beyond what is supported by the native :func:`float` type, however, because of this we can not use Python's builtin :func:`round` function. Instead we must use :meth:`decimal.Decimal.quantize`.
journalism stores numerical values using Python's :class:`decimal.Decimal` type. This data type ensures numerical precision beyond what is supported by the native :func:`float` type. However, because of this we cannot use Python's builtin :func:`round` function; instead we must use :meth:`decimal.Decimal.quantize`.
We can use :meth:`.Table.compute` to apply ``quantize`` and generate a rounded column from an existing one:
Expand Down
22 changes: 11 additions & 11 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,18 +89,18 @@ Here is an example of how to use journalism, using financial aid data from data.
import csv
from journalism import Table, TextColumn, IntColumn
from journalism import Table, TextColumn, NumberColumn
COLUMNS = (
('state', TextColumn),
('state_abbr', TextColumn),
('9_11_gi_bill1', IntColumn),
('montogomery_gi_bill_active', IntColumn),
('montgomery_gi_bill_reserve', IntColumn),
('dependants', IntColumn),
('reserve', IntColumn),
('vietnam', IntColumn),
('total', IntColumn)
('9_11_gi_bill1', NumberColumn),
('montogomery_gi_bill_active', NumberColumn),
('montgomery_gi_bill_reserve', NumberColumn),
('dependants', NumberColumn),
('reserve', NumberColumn),
('vietnam', NumberColumn),
('total', NumberColumn)
)
COLUMN_NAMES = tuple(c[0] for c in COLUMNS)
Expand All @@ -118,7 +118,7 @@ Here is an example of how to use journalism, using financial aid data from data.
rows = rows[:-2]
# Create the table
table = Table(rows, COLUMN_TYPES, COLUMN_NAMES, cast=True)
table = Table(rows, COLUMN_TYPES, COLUMN_NAMES)
# Remove Philippines and Puerto Rico
states = table.where(lambda r: r['state_abbr'] not in ('PR', 'PH'))
Expand All @@ -127,7 +127,7 @@ Here is an example of how to use journalism, using financial aid data from data.
print('Total of all states: %i' % states.columns['total'].sum())
# Sort state total, descending
order_by_total_desc = states.order_by(lambda r: r['total'], reverse=True)
order_by_total_desc = states.order_by('total', reverse=True)
# Grab just the top 5 states
top_five = order_by_total_desc.rows[:5]
Expand All @@ -149,7 +149,7 @@ Here is an example of how to use journalism, using financial aid data from data.
# Calculate the standard deviation for the state totals
stdev = states.columns['total'].stdev()
print('Standard deviation of totals: %.2f' % stdev) print 'Standard deviation of totals: %.2f' % stdev
print('Standard deviation of totals: %.2f' % stdev)
Cookbook
========
Expand Down
18 changes: 9 additions & 9 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,18 @@

import csv

from journalism import Table, TextColumn, IntColumn
from journalism import Table, TextColumn, NumberColumn

COLUMNS = (
('state', TextColumn),
('state_abbr', TextColumn),
('9_11_gi_bill1', IntColumn),
('montogomery_gi_bill_active', IntColumn),
('montgomery_gi_bill_reserve', IntColumn),
('dependants', IntColumn),
('reserve', IntColumn),
('vietnam', IntColumn),
('total', IntColumn)
('9_11_gi_bill1', NumberColumn),
('montogomery_gi_bill_active', NumberColumn),
('montgomery_gi_bill_reserve', NumberColumn),
('dependants', NumberColumn),
('reserve', NumberColumn),
('vietnam', NumberColumn),
('total', NumberColumn)
)

COLUMN_NAMES = tuple(c[0] for c in COLUMNS)
Expand All @@ -31,7 +31,7 @@
rows = rows[:-2]

# Create the table
table = Table(rows, COLUMN_TYPES, COLUMN_NAMES, cast=True)
table = Table(rows, COLUMN_TYPES, COLUMN_NAMES)

# Remove Philippines and Puerto Rico
states = table.where(lambda r: r['state_abbr'] not in ('PR', 'PH'))
Expand Down
2 changes: 1 addition & 1 deletion journalism/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

from journalism.columns import TextColumn, IntColumn, DecimalColumn
from journalism.columns import TextColumn, NumberColumn
from journalism.exceptions import *
from journalism.table import Table

Expand Down
159 changes: 50 additions & 109 deletions journalism/columns.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
#!/usr/bin/env python

from collections import Mapping, Sequence, defaultdict
from decimal import Decimal
from decimal import Decimal, InvalidOperation
from functools import wraps
import warnings

try:
from collections import OrderedDict
except ImportError: #pragma: nocover
except ImportError: #pragma: no cover
from ordereddict import OrderedDict

import six

from journalism.exceptions import ColumnDoesNotExistError, ColumnValidationError, NullComputationError
from journalism.exceptions import ColumnDoesNotExistError, NullComputationError, CastError

class ColumnIterator(six.Iterator):
"""
Expand Down Expand Up @@ -69,14 +68,41 @@ def check(c, *args, **kwargs):

return check

def cast_text(d):
    """
    Normalize one raw value for storage in a :class:`TextColumn`.

    Empty strings and ``None`` are both treated as null and come back as
    ``None``; every other value is converted with ``six.text_type``.
    """
    is_null = d == '' or d is None

    return None if is_null else six.text_type(d)

def cast_number(d):
    """
    Cast a single value to a type appropriate for :class:`NumberColumn`.

    String input has commas removed and surrounding whitespace stripped
    before parsing. Empty strings and ``None`` are returned as ``None``.
    All other values are converted to :class:`decimal.Decimal`.

    Raises :exc:`CastError` if the value is a float (convert it to a string
    first) or if it can not be converted to a Decimal.
    """
    if isinstance(d, six.string_types):
        d = d.replace(',', '').strip()

    if d == '' or d is None:
        return None

    if isinstance(d, float):
        raise CastError('Can not convert float to Decimal for NumberColumn. Convert data to string first!')

    try:
        return Decimal(d)
    except (InvalidOperation, TypeError, ValueError):
        # Decimal() signals unparseable strings with InvalidOperation, but
        # unsupported types raise TypeError and malformed tuples raise
        # ValueError. Report all of them uniformly as a CastError.
        raise CastError('Can not convert %s to Decimal for NumberColumn.' % repr(d))

def median(data_sorted):
"""
Compute the median value of this column.
"""
length = len(data_sorted)

if length % 2 == 1:
return data_sorted[((length + 1) / 2) - 1]
return data_sorted[((length + 1) // 2) - 1]
else:
half = length // 2
a = data_sorted[half - 1]
Expand All @@ -99,11 +125,12 @@ def __init__(self, table, index):
def __unicode__(self):
data = self._data()

sample = repr(data[:5])
sample = ', '.join(six.text_type(d) for d in data[:5])

if len(data) > 5:
last = sample[-1]
sample = sample[:-1] + ', ...' + last
sample = '%s, ...' % sample

sample = '(%s)' % sample

return '<journalism.columns.%s: %s>' % (self.__class__.__name__, sample)

Expand Down Expand Up @@ -146,17 +173,19 @@ def __ne__(self, other):
"""
return not self.__eq__(other)

def validate(self):
def _get_cast_func(self):
"""
Verify values in this column are of an appropriate type.
Return the function used to cast values in this column.
"""
raise NotImplementedError
raise NotImplementedError # pragma: no cover

def _cast(self):
"""
Cast values in this column to an appropriate type, if possible.
"""
raise NotImplementedError
cast_func = self._get_cast_func()

return tuple(cast_func(d) for d in self._data())

def has_nulls(self):
"""
Expand Down Expand Up @@ -194,7 +223,7 @@ def counts(self):
column.
Returns a new :class:`.Table`, with two columns,
one containing the values and a second, :class:`IntColumn`
one containing the values and a second, :class:`NumberColumn`
containing the counts.
Resulting table will be sorted by descending count.
Expand All @@ -208,7 +237,7 @@ def counts(self):
counts[d] += 1

column_names = (self._table._column_names[self._index], 'count')
column_types = (self._table._column_types[self._index], IntColumn)
column_types = (self._table._column_types[self._index], NumberColumn)
data = (tuple(i) for i in counts.items())

rows = sorted(data, key=lambda r: r[1], reverse=True)
Expand All @@ -219,37 +248,18 @@ class TextColumn(Column):
"""
A column containing unicode/string data.
"""
def validate(self):
"""
Verify all values in this column are string/unicode or null.
Will raise :exc:`.ColumnValidationError`
if validation fails.
"""
for d in self._data():
if not isinstance(d, six.string_types) and d is not None:
raise ColumnValidationError(d, self)

def _cast(self):
"""
Cast values to unicode.
"""
casted = []

for d in self._data():
if d == '' or d is None:
casted.append(None)
else:
casted.append(six.text_type(d))

return casted
def _get_cast_func(self):
return cast_text

class NumberColumn(Column):
"""
A column containing numeric data.
Base class for :class:`IntColumn` and :class:`DecimalColumn`.
All data is represented by the :class:`decimal.Decimal` class.
"""
def _get_cast_func(self):
return cast_number

def sum(self):
"""
Compute the sum of this column.
Expand Down Expand Up @@ -335,72 +345,3 @@ def mad(self):

return median(tuple(abs(n - m) for n in data))

class IntColumn(NumberColumn):
    """
    A column containing integer data.
    """
    def validate(self):
        """
        Check that every value in this column is an ``int`` or null.

        Raises :exc:`.ColumnValidationError` for the first value that is
        neither.
        """
        for value in self._data():
            if value is None or isinstance(value, int):
                continue

            raise ColumnValidationError(value, self)

    def _cast(self):
        """
        Return this column's values converted to integers.

        String values have commas removed and surrounding whitespace
        stripped first; empty strings and ``None`` become ``None``.
        """
        def to_int(raw):
            if isinstance(raw, six.string_types):
                raw = raw.replace(',', '').strip()

            if raw == '' or raw is None:
                return None

            return int(raw)

        return [to_int(raw) for raw in self._data()]

class DecimalColumn(NumberColumn):
    """
    A column containing decimal data.
    """
    def validate(self):
        """
        Verify all values in this column are Decimal or null.

        NB: We never use floats because of rounding error.

        Will raise :exc:`.ColumnValidationError` if validation fails.
        """
        for d in self._data():
            if not isinstance(d, Decimal) and d is not None:
                raise ColumnValidationError(d, self)

    def _cast(self):
        """
        Cast values in this column to Decimal.

        String values have commas removed and surrounding whitespace
        stripped first; empty strings and ``None`` become ``None``.

        NB: casting from float will introduce precision
        errors. Always cast from string, e.g. '3.14'.
        A warning is issued for every float encountered.

        Returns a list with exactly one entry per input value.
        """
        casted = []

        for d in self._data():
            if isinstance(d, six.string_types):
                d = d.replace(',', '').strip()

            if d == '' or d is None:
                casted.append(None)
                continue

            if isinstance(d, float):
                warnings.warn('Casting float to Decimal! Precision lost. Cast from string instead!')
                # Previously the float was dropped after warning, which made
                # the result shorter than the column. Go through repr() so
                # the conversion also works where Decimal() rejects floats.
                d = repr(d)

            casted.append(Decimal(d))

        return casted

Loading

0 comments on commit 29902d7

Please sign in to comment.