Kill IntColumn and FloatColumn. Closes #64.

wireservice · May 1, 2014 · 29902d7 · 29902d7
1 parent f17d88f
commit 29902d7
Show file tree

Hide file tree

Showing 12 changed files with 154 additions and 310 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,6 +1,7 @@
 0.3.0
 -----
 
+* Collapse IntColumn and FloatColumn into NumberColumn. (#64)
 * Table.outliers_mad implemented. (#93)
 * Column.mad implemented. (#93)
 * Table.outliers implemented. (#86)

diff --git a/docs/cookbook.rst b/docs/cookbook.rst
@@ -14,10 +14,10 @@ If your file does not have headers:
 
 .. code-block:: python
 
-    from journalism import Table, TextColumn, IntColumn, FloatColumn
+    from journalism import Table, TextColumn, NumberColumn
 
     column_names = ('city', 'area', 'population')
-    column_types = (TextColumn, FloatColumn, IntColumn)
+    column_types = (TextColumn, NumberColumn, NumberColumn)
 
     with open('population.csv') as f:
         rows = list(csv.reader(f) 
@@ -28,7 +28,7 @@ If your file does have headers:
 
 .. code-block:: python
 
-    column_types = (TextColumn, FloatColumn, IntColumn)
+    column_types = (TextColumn, NumberColumn, NumberColumn)
 
     with open('population.csv') as f:
         rows = list(csv.reader(f))
@@ -46,7 +46,7 @@ Of course, cool kids use `csvkit <http://csvkit.rtfd.org/>`_. (Hint: it supports
 
     import csvkit
 
-    column_types = (TextColumn, FloatColumn, IntColumn)
+    column_types = (TextColumn, NumberColumn, NumberColumn)
 
     with open('population.csv') as f:
         rows = list(csvkit.reader(f))
@@ -190,7 +190,7 @@ This will compute the percent change between the :code:`july` and :code:`august`
 Rounding to two decimal places
 ------------------------------
 
-journalism stores fractional values using Python's :class:`decimal.Decimal` type. This data type ensures numerical precision beyond what is supported by the native :func:`float` type, however, because of this we can not use Python's builtin :func:`round` function. Instead we must use :meth:`decimal.Decimal.quantize`.
+journalism stores numerical values using Python's :class:`decimal.Decimal` type. This data type ensures numerical precision beyond what is supported by the native :func:`float` type, however, because of this we can not use Python's builtin :func:`round` function. Instead we must use :meth:`decimal.Decimal.quantize`.
 
 We can use :meth:`.Table.compute` to apply the quantize to generate a rounded column from an existing one:
 

diff --git a/docs/index.rst b/docs/index.rst
@@ -89,18 +89,18 @@ Here is an example of how to use journalism, using financial aid data from data.
 
     import csv
 
-    from journalism import Table, TextColumn, IntColumn
+    from journalism import Table, TextColumn, NumberColumn
 
     COLUMNS = ( 
         ('state', TextColumn),
         ('state_abbr', TextColumn),
-        ('9_11_gi_bill1', IntColumn),
-        ('montogomery_gi_bill_active', IntColumn),
-        ('montgomery_gi_bill_reserve', IntColumn),
-        ('dependants', IntColumn),
-        ('reserve', IntColumn),
-        ('vietnam', IntColumn),
-        ('total', IntColumn)
+        ('9_11_gi_bill1', NumberColumn),
+        ('montogomery_gi_bill_active', NumberColumn),
+        ('montgomery_gi_bill_reserve', NumberColumn),
+        ('dependants', NumberColumn),
+        ('reserve', NumberColumn),
+        ('vietnam', NumberColumn),
+        ('total', NumberColumn)
     )
 
     COLUMN_NAMES = tuple(c[0] for c in COLUMNS)
@@ -118,7 +118,7 @@ Here is an example of how to use journalism, using financial aid data from data.
     rows = rows[:-2]
 
     # Create the table
-    table = Table(rows, COLUMN_TYPES, COLUMN_NAMES, cast=True)
+    table = Table(rows, COLUMN_TYPES, COLUMN_NAMES)
 
     # Remove Phillipines and Puerto Rico
     states = table.where(lambda r: r['state_abbr'] not in ('PR', 'PH'))
@@ -127,7 +127,7 @@ Here is an example of how to use journalism, using financial aid data from data.
     print('Total of all states: %i' % states.columns['total'].sum())
 
     # Sort state total, descending
-    order_by_total_desc = states.order_by(lambda r: r['total'], reverse=True)
+    order_by_total_desc = states.order_by('total', reverse=True)
 
     # Grab just the top 5 states
     top_five = order_by_total_desc.rows[:5]
@@ -149,7 +149,7 @@ Here is an example of how to use journalism, using financial aid data from data.
     # Calculate the standard of deviation for the state totals
     stdev = states.columns['total'].stdev()
 
-    print('Standard deviation of totals: %.2f' % stdev)    print 'Standard deviation of totals: %.2f' % stdev
+    print('Standard deviation of totals: %.2f' % stdev)
 
 Cookbook
 ========

diff --git a/example.py b/example.py
@@ -2,18 +2,18 @@
 
 import csv
 
-from journalism import Table, TextColumn, IntColumn
+from journalism import Table, TextColumn, NumberColumn
 
 COLUMNS = ( 
     ('state', TextColumn),
     ('state_abbr', TextColumn),
-    ('9_11_gi_bill1', IntColumn),
-    ('montogomery_gi_bill_active', IntColumn),
-    ('montgomery_gi_bill_reserve', IntColumn),
-    ('dependants', IntColumn),
-    ('reserve', IntColumn),
-    ('vietnam', IntColumn),
-    ('total', IntColumn)
+    ('9_11_gi_bill1', NumberColumn),
+    ('montogomery_gi_bill_active', NumberColumn),
+    ('montgomery_gi_bill_reserve', NumberColumn),
+    ('dependants', NumberColumn),
+    ('reserve', NumberColumn),
+    ('vietnam', NumberColumn),
+    ('total', NumberColumn)
 )
 
 COLUMN_NAMES = tuple(c[0] for c in COLUMNS)
@@ -31,7 +31,7 @@
 rows = rows[:-2]
 
 # Create the table
-table = Table(rows, COLUMN_TYPES, COLUMN_NAMES, cast=True)
+table = Table(rows, COLUMN_TYPES, COLUMN_NAMES)
 
 # Remove Phillipines and Puerto Rico
 states = table.where(lambda r: r['state_abbr'] not in ('PR', 'PH'))

diff --git a/journalism/__init__.py b/journalism/__init__.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-from journalism.columns import TextColumn, IntColumn, DecimalColumn
+from journalism.columns import TextColumn, NumberColumn
 from journalism.exceptions import *
 from journalism.table import Table
 

diff --git a/journalism/columns.py b/journalism/columns.py
@@ -1,18 +1,17 @@
 #!/usr/bin/env python
 
 from collections import Mapping, Sequence, defaultdict
-from decimal import Decimal
+from decimal import Decimal, InvalidOperation
 from functools import wraps
-import warnings
 
 try:
     from collections import OrderedDict
-except ImportError: #pragma: nocover
+except ImportError: #pragma: no cover
     from ordereddict import OrderedDict
 
 import six
 
-from journalism.exceptions import ColumnDoesNotExistError, ColumnValidationError, NullComputationError
+from journalism.exceptions import ColumnDoesNotExistError, NullComputationError, CastError
 
 class ColumnIterator(six.Iterator):
     """
@@ -69,14 +68,41 @@ def check(c, *args, **kwargs):
 
     return check
 
+def cast_text(d):
+    """
+    Cast a single value to a type appropriate for :class:`TextColumn`.
+    """
+    if d == '' or d is None:
+        return None 
+
+    return six.text_type(d)
+
+def cast_number(d):
+    """
+    Cast a single value to a type appropriate for :class:`NumberColumn`.
+    """
+    if isinstance(d, six.string_types):
+        d = d.replace(',' ,'').strip()
+
+    if d == '' or d is None:
+        return None
+
+    if isinstance(d, float):
+        raise CastError('Can not convert float to Decimal for NumberColumn. Convert data to string first!')
+
+    try:
+        return Decimal(d)
+    except InvalidOperation:
+        raise CastError('Can not convert %s to Decimal for NumberColumn.' % repr(d)) 
+
 def median(data_sorted):
     """
     Compute the median value of this column.
     """
     length = len(data_sorted)
 
     if length % 2 == 1:
-        return data_sorted[((length + 1) / 2) - 1]
+        return data_sorted[((length + 1) // 2) - 1]
     else:
         half = length // 2
         a = data_sorted[half - 1]
@@ -99,11 +125,12 @@ def __init__(self, table, index):
     def __unicode__(self):
         data = self._data()
 
-        sample = repr(data[:5])
+        sample = ', '.join(six.text_type(d) for d in data[:5])
 
         if len(data) > 5:
-            last = sample[-1]
-            sample = sample[:-1] + ', ...' + last
+            sample = '%s, ...' % sample
+
+        sample = '(%s)' % sample
 
         return '<journalism.columns.%s: %s>' % (self.__class__.__name__, sample)
 
@@ -146,17 +173,19 @@ def __ne__(self, other):
         """
         return not self.__eq__(other)
 
-    def validate(self):
+    def _get_cast_func(self):
         """
-        Verify values in this column are of an appopriate type.
+        Return the function used to cast values in this column.
         """
-        raise NotImplementedError
+        raise NotImplementedError   # pragma: no cover
 
     def _cast(self):
         """
         Cast values in this column to an appropriate type, if possible.
         """
-        raise NotImplementedError
+        cast_func = self._get_cast_func()
+
+        return tuple(cast_func(d) for d in self._data())
 
     def has_nulls(self):
         """
@@ -194,7 +223,7 @@ def counts(self):
         column.
 
         Returns a new :class:`.Table`, with two columns,
-        one containing the values and a a second, :class:`IntColumn`
+        one containing the values and a second, :class:`NumberColumn`
         containing the counts.
 
         Resulting table will be sorted by descending count.
@@ -208,7 +237,7 @@ def counts(self):
             counts[d] += 1
 
         column_names = (self._table._column_names[self._index], 'count')
-        column_types = (self._table._column_types[self._index], IntColumn)
+        column_types = (self._table._column_types[self._index], NumberColumn)
         data = (tuple(i) for i in counts.items())
 
         rows = sorted(data, key=lambda r: r[1], reverse=True)
@@ -219,37 +248,18 @@ class TextColumn(Column):
     """
     A column containing unicode/string data.
     """
-    def validate(self):
-        """
-        Verify all values in this column are string/unicode or null.
-
-        Will raise :exc:`.ColumnValidationError`
-        if validation fails.
-        """
-        for d in self._data():
-            if not isinstance(d, six.string_types) and d is not None:
-                raise ColumnValidationError(d, self)
-
-    def _cast(self):
-        """
-        Cast values to unicode.
-        """
-        casted = []
-
-        for d in self._data():
-            if d == '' or d is None:
-                casted.append(None)
-            else:
-                casted.append(six.text_type(d))
-
-        return casted
+    def _get_cast_func(self):
+        return cast_text 
 
 class NumberColumn(Column):
     """
     A column containing numeric data.
-
-    Base class for :class:`IntColumn` and :class:`DecimalColumn`.
+    
+    All data is represented by the :class:`decimal.Decimal` class.
     """
+    def _get_cast_func(self):
+        return cast_number
+
     def sum(self):
         """
         Compute the sum of this column.
@@ -335,72 +345,3 @@ def mad(self):
 
         return median(tuple(abs(n - m) for n in data))
 
-class IntColumn(NumberColumn):
-    """
-    A column containing integer data.
-    """
-    def validate(self):
-        """
-        Verify all values in this column are int or null.
-
-        Will raise :exc:`.ColumnValidationError` if validation fails.
-        """
-        for d in self._data():
-            if not isinstance(d, int) and d is not None:
-                raise ColumnValidationError(d, self)
-
-    def _cast(self):
-        """
-        Cast values in this column to integer.
-        """
-        casted = []
-
-        for d in self._data():
-            if isinstance(d, six.string_types):
-                d = d.replace(',' ,'').strip()
-
-            if d == '' or d is None:
-                casted.append(None)
-            else:
-                casted.append(int(d))
-
-        return casted
-
-class DecimalColumn(NumberColumn):
-    """
-    A column containing decimal data.
-    """
-    def validate(self):
-        """
-        Verify all values in this column are Decimal or null.
-
-        NB: We never use floats because of rounding error.
-
-        Will raise :exc:`.ColumnValidationError` if validation fails.
-        """
-        for d in self._data():
-            if not isinstance(d, Decimal) and d is not None:
-                raise ColumnValidationError(d, self)
-
-    def _cast(self):
-        """
-        Cast values in this column to Decimal.
-
-        NB: casting from float will introduce precision
-        errors. Always cast from string, e.g. '3.14'.
-        """
-        casted = []
-
-        for d in self._data():
-            if isinstance(d, six.string_types):
-                d = d.replace(',' ,'').strip()
-
-            if d == '' or d is None:
-                casted.append(None)
-            elif isinstance(d, float):
-                warnings.warn('Casting float to Decimal! Precision lost. Cast from string instead!')
-            else:
-                casted.append(Decimal(d))
-
-        return casted
-