Skip to content
11 changes: 8 additions & 3 deletions cenpy/moe/replicate_table_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ def read_replicate_file(fname):
table = table.drop(["TBLID", "NAME", "ORDER", "CME", "TITLE"], axis=1)
table = table.pivot(index="GEOID", columns="variable")
table.columns.names = ["categories", "variables"]
# Standardize the names of the columns because the ACB's 2014 tables have
# lowercase titles while others are uppercase.
table = table.rename(columns = {"estimate":"ESTIMATE",
"moe": "MOE",
"se": "SE"})
return table


Expand Down Expand Up @@ -472,9 +477,9 @@ def apply_func(func, data, params={}):
Pandas 81 column dataframe, where the first column is the estimates and
the remaining columns are the replicates.
"""
estimates = func(data.estimate, **params)
estimates = func(data.ESTIMATE, **params)
# subset just the replicates
replicates = data.drop(["estimate", "moe", "SE"], axis=1, level=0)
replicates = data.drop(["ESTIMATE", "MOE", "SE"], axis=1, level=0)
# clean out unused column names
replicates.columns = replicates.columns.remove_unused_levels()
# apply the user function to each replicate
Expand All @@ -488,7 +493,7 @@ def apply_func(func, data, params={}):
]
rep_results = pd.concat(rep_results, axis=1, keys=replicates.columns.levels[0])
# cleanup
rep_results["estimate"] = estimates
rep_results["ESTIMATE"] = estimates
rep_results = rep_results.replace([np.inf, -np.inf], 0) # per census documentation
return rep_results

Expand Down
135 changes: 5 additions & 130 deletions cenpy/products.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from .utilities import _replace_missing
from .utilities import _fuzzy_match
from .utilities import _coerce
from .utilities import _can_int
from .remote import APIConnection
from .explorer import fips_table as _ft
from shapely import geometry
from fuzzywuzzy import fuzz
from warnings import warn
import geopandas
import pandas
Expand All @@ -16,8 +19,6 @@

__all__ = ["Decennial2010", "ACS"]

_ACS_MISSING = (-999999999, -888888888, -666666666, -555555555, -333333333, -222222222)


class _Product(object):
"""The fundamental building block to make pre-configured Census Products, like ACS or Decennial2010."""
Expand Down Expand Up @@ -852,7 +853,7 @@ def tables(self):
result = stems.drop("GEO", axis=0, errors="ignore")
self._stems = result
# keep around the main tables only if they're not crosstabs (ending in alphanumeric)
self._tables = result.loc[[ix for ix in result.index if _can_int(ix[-1])]]
self._tables = result.loc[[ix for ix in result.index if _can_int(ix[-1])]]
return self._tables

@property
Expand Down Expand Up @@ -882,129 +883,3 @@ def crosstab_tables(self):
]
return self._crosstabs


#############
# UTILITIES #
#############


def _fuzzy_match(matchtarget, matchlist, return_table=False):
    """
    Conduct a fuzzy match with matchtarget, within the list of possible match candidates in matchlist.

    Parameters
    ----------
    matchtarget : str
        a string to be matched to a set of possible candidates
    matchlist : list of str
        a list (or iterable) containing strings we are interested in matching
    return_table : bool
        whether to return the full table of scored candidates, or to return only the single
        best match. If False (the default), only the best match is returned.

    Returns
    -------
    the best-matching row of the candidate table (a pandas Series), plus the
    full scored table sorted by score when return_table is True.

    Raises
    ------
    AssertionError
        if the place identifier has more than one comma, or if the requested
        state suffix matches no candidate.

    Notes
    -----
    consult the docstring for Product.check_match for more information on how the actual matching
    algorithm works.
    """
    split = matchtarget.split(",")
    if len(split) == 2:
        target, state = split
    elif len(split) == 1:
        target = split[0]
    else:
        # BUG FIX: the original formatted this message with `target`, which is
        # never bound on this branch (len(split) > 2), so a NameError masked
        # the intended AssertionError. Use the full `matchtarget` instead.
        raise AssertionError(
            "Uncertain place identifier {}. The place identifier should "
            'look something like "placename, state" or, for larger areas, '
            "like Combined Statistical Areas or Metropolitan Statistical Areas,"
            "placename1-placename2, state1-state2-state3".format(matchtarget)
        )

    # Score every candidate against the placename with a partial-ratio match.
    table = pandas.DataFrame({"target": matchlist})
    table["score"] = table.target.apply(
        lambda x: fuzz.partial_ratio(target.strip().lower(), x.lower())
    )
    if len(split) == 1:
        # No state qualifier: resolve ties (if any) and return immediately.
        if (table.score == table.score.max()).sum() > 1:
            ixmax, rowmax = _break_ties(matchtarget, table)
        else:
            ixmax = table.score.idxmax()
            rowmax = table.loc[ixmax]
        if return_table:
            return rowmax, table.sort_values("score")
        return rowmax

    # Restrict candidates to those ending with the requested state suffix.
    in_state = table.target.str.lower().str.endswith(state.strip().lower())

    assert any(in_state), (
        "State {} is not found from place {}. "
        "Should be a standard Census abbreviation, like"
        " CA, AZ, NC, or PR".format(state, matchtarget)
    )
    table = table[in_state]
    if (table.score == table.score.max()).sum() > 1:
        ixmax, rowmax = _break_ties(matchtarget, table)
    else:
        ixmax = table.score.idxmax()
        rowmax = table.loc[ixmax]
    if return_table:
        return rowmax, table.sort_values("score")
    return rowmax


def _coerce(column, kind):
"""
Converty type of column to kind, or keep column unchanged
if that conversion fails.
"""
try:
return column.astype(kind)
except ValueError:
return column


def _replace_missing(column, missings=_ACS_MISSING):
"""
replace ACS missing values using numpy.nan.
"""
for val in _ACS_MISSING:
column.replace(val, numpy.nan, inplace=True)
return column


def _break_ties(matchtarget, table):
    """
    break ties in the fuzzy matching algorithm using a second scoring method
    which prioritizes full string matches over substring matches.
    """
    # Only the placename portion (before any ", state" suffix) is rescored.
    needle = matchtarget.split(",")[0].strip().lower()
    # NOTE: this intentionally adds a "score2" column to the caller's table.
    table["score2"] = table.target.apply(lambda cand: fuzz.ratio(needle, cand.lower()))
    among_winners = table[table.score == table.score.max()]
    double_winners = among_winners[among_winners.score2 == among_winners.score2.max()]
    ixmax = double_winners.score2.idxmax()
    ixmax_row = double_winners.loc[ixmax]
    if double_winners.shape[0] > 1:
        # Still tied after the second pass: warn and pick the idxmax row.
        warn(
            "Cannot disambiguate placename {}. Picking the shortest, best "
            "matched placename, {}, from {}".format(
                matchtarget, ixmax_row.target, ", ".join(double_winners.target.tolist())
            )
        )
    return ixmax, ixmax_row


def _can_int(char):
"""check if a character can be turned into an integer"""
try:
int(char)
return True
except ValueError:
return False
3 changes: 2 additions & 1 deletion cenpy/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from . import tiger as tig
import math
from six import iteritems, PY3
from .utilities import _coerce

if PY3:
unicode = str
Expand Down Expand Up @@ -220,7 +221,7 @@ def query(self, cols=None, geo_unit="", geo_filter={}, apikey="", **kwargs):
df = pd.DataFrame().from_records(json_content[1:], columns=json_content[0])
assert all([col in df.columns for col in cols])
if convert_numeric:
df = df.infer_objects()
df[cols] = _coerce(df[cols], int)
if index != "":
df.index = df[index]
return df
Expand Down
55 changes: 55 additions & 0 deletions cenpy/tests/test_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import unittest
import pandas
import numpy
from cenpy.utilities import _coerce as coerce
from cenpy.utilities import _replace_missing as replace_missing

class TestUtilities(unittest.TestCase):
    """Unit tests for the cenpy.utilities coercion/replacement helpers."""

    def test_coerce(self):
        # Casting an int Series to float64 changes its dtype, so it should
        # no longer compare equal to the original.
        ints = pandas.Series([3, 4, 5])
        as_floats = coerce(ints, cast_to=numpy.float64)
        self.assertFalse(ints.equals(as_floats))

        # Coercing a frame converts the columns it can while leaving the
        # source frame's data unchanged.
        frame = pandas.DataFrame({"ints": [1, 2, 3],
                                  "floats": [0.1, 3.79, 14.9],
                                  "strings": ["fst", "sec", "thd"]})
        frame_floats = coerce(frame, cast_to=numpy.float64)
        # Expected dtypes after the float64 coercion:
        expected_float_dtypes = pandas.Series(
            ["float64", "float64", "object"],
            index=["ints", "floats", "strings"])
        self.assertFalse(frame.equals(frame_floats))
        self.assertTrue(expected_float_dtypes.equals(frame_floats.dtypes))

        # Casting to str yields pandas "object" dtype in every column.
        frame_strs = coerce(frame, cast_to=str)
        expected_object_dtypes = pandas.Series(
            ["object", "object", "object"],
            index=["ints", "floats", "strings"])
        self.assertTrue(expected_object_dtypes.equals(frame_strs.dtypes))

        # Non-Series/DataFrame inputs must be rejected with a TypeError.
        self.assertRaises(TypeError, coerce, numpy.zeros((2, 2)))


    def test_replace_missing(self):
        source = pandas.DataFrame({"ints": [-888888888, 2, 3],
                                   "floats": [-555555555, 3.79, -333333333]})
        cleaned = replace_missing(source)
        # Sentinel codes should have been replaced with NaN.
        expected = pandas.DataFrame({"ints": [numpy.nan, 2, 3],
                                     "floats": [numpy.nan, 3.79, numpy.nan]})
        self.assertTrue(cleaned.equals(expected))

        # Non-Series/DataFrame inputs must be rejected with a TypeError.
        self.assertRaises(TypeError, replace_missing, numpy.zeros((2, 2)))


# Allow running this test module directly, e.g. `python test_utilities.py`.
if __name__ == "__main__":
    unittest.main()
4 changes: 2 additions & 2 deletions cenpy/tiger.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,9 @@ def query(self, **kwargs):
"""
layer_result = kwargs.pop("layer", None)
if isinstance(layer_result, str):
from .products import _fuzzy_match
from .utilities import _fuzzy_match as fuzzy_match

layer_result = _fuzzy_match(
layer_result = fuzzy_match(
layer_result, [f.__repr__() for f in self.layers]
).index
if layer_result is None:
Expand Down
Loading