Skip to content
11 changes: 8 additions & 3 deletions cenpy/moe/replicate_table_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ def read_replicate_file(fname):
table = table.drop(["TBLID", "NAME", "ORDER", "CME", "TITLE"], axis=1)
table = table.pivot(index="GEOID", columns="variable")
table.columns.names = ["categories", "variables"]
# Standardize the names of the columns because the ACB's 2014 tables have
# lowercase titles while others are uppercase.
table = table.rename(columns = {"estimate":"ESTIMATE",
"moe": "MOE",
"se": "SE"})
return table


Expand Down Expand Up @@ -472,9 +477,9 @@ def apply_func(func, data, params={}):
Pandas 81 column dataframe, where the first column is the estimates and
the remaining columns are the replicates.
"""
estimates = func(data.estimate, **params)
estimates = func(data.ESTIMATE, **params)
# subset just the replicates
replicates = data.drop(["estimate", "moe", "SE"], axis=1, level=0)
replicates = data.drop(["ESTIMATE", "MOE", "SE"], axis=1, level=0)
# clean out unused column names
replicates.columns = replicates.columns.remove_unused_levels()
# apply the user function to each replicate
Expand All @@ -488,7 +493,7 @@ def apply_func(func, data, params={}):
]
rep_results = pd.concat(rep_results, axis=1, keys=replicates.columns.levels[0])
# cleanup
rep_results["estimate"] = estimates
rep_results["ESTIMATE"] = estimates
rep_results = rep_results.replace([np.inf, -np.inf], 0) # per census documentation
return rep_results

Expand Down
135 changes: 5 additions & 130 deletions cenpy/products.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from .utilities import _replace_missing
from .utilities import _fuzzy_match
from .utilities import _coerce
from .utilities import _can_int
from .remote import APIConnection
from .explorer import fips_table as _ft
from shapely import geometry
from fuzzywuzzy import fuzz
from warnings import warn
import geopandas
import pandas
Expand All @@ -16,8 +19,6 @@

__all__ = ["Decennial2010", "ACS"]

_ACS_MISSING = (-999999999, -888888888, -666666666, -555555555, -333333333, -222222222)


class _Product(object):
"""The fundamental building block to make pre-configured Census Products, like ACS or Decennial2010."""
Expand Down Expand Up @@ -852,7 +853,7 @@ def tables(self):
result = stems.drop("GEO", axis=0, errors="ignore")
self._stems = result
# keep around the main tables only if they're not crosstabs (ending in alphanumeric)
self._tables = result.loc[[ix for ix in result.index if _can_int(ix[-1])]]
self._tables = result.loc[[ix for ix in result.index if _can_int(ix[-1])]]
return self._tables

@property
Expand Down Expand Up @@ -882,129 +883,3 @@ def crosstab_tables(self):
]
return self._crosstabs


#############
# UTILITIES #
#############


def _fuzzy_match(matchtarget, matchlist, return_table=False):
    """
    Conduct a fuzzy match with matchtarget, within the list of possible match candidates in matchlist.

    Parameters
    ----------
    matchtarget : str
        a string to be matched to a set of possible candidates
    matchlist : list of str
        a list (or iterable) containing strings we are interested in matching
    return_table : bool
        whether to return the full table of scored candidates, or to return only the single
        best match. If False (the default), only the best match is returned.

    Returns
    -------
    the best-matching row of the candidate table (a pandas Series), plus the
    full scored table sorted by score when return_table is True.

    Raises
    ------
    AssertionError
        if the place identifier has more than one comma, or if the requested
        state suffix matches no candidate.

    Notes
    -----
    consult the docstring for Product.check_match for more information on how the actual matching
    algorithm works.
    """
    split = matchtarget.split(",")
    if len(split) == 2:
        target, state = split
    elif len(split) == 1:
        target = split[0]
    else:
        # BUG FIX: the original formatted this message with `target`, which is
        # never bound on this branch (len(split) > 2), so a NameError masked
        # the intended AssertionError. Use the full `matchtarget` instead.
        raise AssertionError(
            "Uncertain place identifier {}. The place identifier should "
            'look something like "placename, state" or, for larger areas, '
            "like Combined Statistical Areas or Metropolitan Statistical Areas,"
            "placename1-placename2, state1-state2-state3".format(matchtarget)
        )

    # Score every candidate against the placename with a partial-ratio match.
    table = pandas.DataFrame({"target": matchlist})
    table["score"] = table.target.apply(
        lambda x: fuzz.partial_ratio(target.strip().lower(), x.lower())
    )
    if len(split) == 1:
        # No state qualifier: resolve ties (if any) and return immediately.
        if (table.score == table.score.max()).sum() > 1:
            ixmax, rowmax = _break_ties(matchtarget, table)
        else:
            ixmax = table.score.idxmax()
            rowmax = table.loc[ixmax]
        if return_table:
            return rowmax, table.sort_values("score")
        return rowmax

    # Restrict candidates to those ending with the requested state suffix.
    in_state = table.target.str.lower().str.endswith(state.strip().lower())

    assert any(in_state), (
        "State {} is not found from place {}. "
        "Should be a standard Census abbreviation, like"
        " CA, AZ, NC, or PR".format(state, matchtarget)
    )
    table = table[in_state]
    if (table.score == table.score.max()).sum() > 1:
        ixmax, rowmax = _break_ties(matchtarget, table)
    else:
        ixmax = table.score.idxmax()
        rowmax = table.loc[ixmax]
    if return_table:
        return rowmax, table.sort_values("score")
    return rowmax


def _coerce(column, kind):
"""
Converty type of column to kind, or keep column unchanged
if that conversion fails.
"""
try:
return column.astype(kind)
except ValueError:
return column


def _replace_missing(column, missings=_ACS_MISSING):
"""
replace ACS missing values using numpy.nan.
"""
for val in _ACS_MISSING:
column.replace(val, numpy.nan, inplace=True)
return column


def _break_ties(matchtarget, table):
    """
    break ties in the fuzzy matching algorithm using a second scoring method
    which prioritizes full string matches over substring matches.
    """
    # Only the placename portion (before any ", state" suffix) is rescored.
    needle = matchtarget.split(",")[0].strip().lower()
    # NOTE: this intentionally adds a "score2" column to the caller's table.
    table["score2"] = table.target.apply(lambda cand: fuzz.ratio(needle, cand.lower()))
    among_winners = table[table.score == table.score.max()]
    double_winners = among_winners[among_winners.score2 == among_winners.score2.max()]
    ixmax = double_winners.score2.idxmax()
    ixmax_row = double_winners.loc[ixmax]
    if double_winners.shape[0] > 1:
        # Still tied after the second pass: warn and pick the idxmax row.
        warn(
            "Cannot disambiguate placename {}. Picking the shortest, best "
            "matched placename, {}, from {}".format(
                matchtarget, ixmax_row.target, ", ".join(double_winners.target.tolist())
            )
        )
    return ixmax, ixmax_row


def _can_int(char):
"""check if a character can be turned into an integer"""
try:
int(char)
return True
except ValueError:
return False
3 changes: 2 additions & 1 deletion cenpy/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from . import tiger as tig
import math
from six import iteritems, PY3
from .utilities import _coerce

if PY3:
unicode = str
Expand Down Expand Up @@ -220,7 +221,7 @@ def query(self, cols=None, geo_unit="", geo_filter={}, apikey="", **kwargs):
df = pd.DataFrame().from_records(json_content[1:], columns=json_content[0])
assert all([col in df.columns for col in cols])
if convert_numeric:
df = df.infer_objects()
df[cols] = _coerce(df[cols], int)
if index != "":
df.index = df[index]
return df
Expand Down
55 changes: 55 additions & 0 deletions cenpy/tests/test_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import unittest
import pandas
import numpy
from cenpy.utilities import _coerce as coerce
from cenpy.utilities import _replace_missing as replace_missing

class TestUtilities(unittest.TestCase):
    """Unit tests for the cenpy.utilities coercion/replacement helpers."""

    def test_coerce(self):
        # Casting an int Series to float64 changes its dtype, so it should
        # no longer compare equal to the original.
        ints = pandas.Series([3, 4, 5])
        as_floats = coerce(ints, cast_to=numpy.float64)
        self.assertFalse(ints.equals(as_floats))

        # Coercing a frame converts the columns it can while leaving the
        # source frame's data unchanged.
        frame = pandas.DataFrame({"ints": [1, 2, 3],
                                  "floats": [0.1, 3.79, 14.9],
                                  "strings": ["fst", "sec", "thd"]})
        frame_floats = coerce(frame, cast_to=numpy.float64)
        # Expected dtypes after the float64 coercion:
        expected_float_dtypes = pandas.Series(
            ["float64", "float64", "object"],
            index=["ints", "floats", "strings"])
        self.assertFalse(frame.equals(frame_floats))
        self.assertTrue(expected_float_dtypes.equals(frame_floats.dtypes))

        # Casting to str yields pandas "object" dtype in every column.
        frame_strs = coerce(frame, cast_to=str)
        expected_object_dtypes = pandas.Series(
            ["object", "object", "object"],
            index=["ints", "floats", "strings"])
        self.assertTrue(expected_object_dtypes.equals(frame_strs.dtypes))

        # Non-Series/DataFrame inputs must be rejected with a TypeError.
        self.assertRaises(TypeError, coerce, numpy.zeros((2, 2)))


    def test_replace_missing(self):
        source = pandas.DataFrame({"ints": [-888888888, 2, 3],
                                   "floats": [-555555555, 3.79, -333333333]})
        cleaned = replace_missing(source)
        # Sentinel codes should have been replaced with NaN.
        expected = pandas.DataFrame({"ints": [numpy.nan, 2, 3],
                                     "floats": [numpy.nan, 3.79, numpy.nan]})
        self.assertTrue(cleaned.equals(expected))

        # Non-Series/DataFrame inputs must be rejected with a TypeError.
        self.assertRaises(TypeError, replace_missing, numpy.zeros((2, 2)))


# Allow running this test module directly, e.g. `python test_utilities.py`.
if __name__ == "__main__":
    unittest.main()
4 changes: 2 additions & 2 deletions cenpy/tiger.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,9 @@ def query(self, **kwargs):
"""
layer_result = kwargs.pop("layer", None)
if isinstance(layer_result, str):
from .products import _fuzzy_match
from .utilities import _fuzzy_match as fuzzy_match

layer_result = _fuzzy_match(
layer_result = fuzzy_match(
layer_result, [f.__repr__() for f in self.layers]
).index
if layer_result is None:
Expand Down
Loading