Skip to content

Commit

Permalink
all mismatches (#64)
Browse files Browse the repository at this point in the history
* [WIP] adding all_mismatch

* adding tests for mismatches

* tweaking docstr
  • Loading branch information
fdosani authored May 17, 2020
1 parent b9c2a0e commit 1ae017a
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 0 deletions.
20 changes: 20 additions & 0 deletions datacompy/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import logging
import os


import numpy as np
import pandas as pd

Expand Down Expand Up @@ -432,6 +433,25 @@ def sample_mismatch(self, column, sample_count=10, for_display=False):
]
return to_return

def all_mismatch(self):
    """Return every intersecting row that has at least one mismatched column.

    A column of ``self.intersect_rows`` is treated as a comparison flag only
    when its name ends in ``_match`` *and* the corresponding ``<name>_df1``
    and ``<name>_df2`` columns are both present. This keeps unrelated
    columns whose name merely ends in ``_match`` (for example a join column
    called ``acct_match``) from being mistaken for a flag, which would
    otherwise raise ``KeyError`` when the non-existent value columns are
    selected.

    Returns
    -------
    pandas.DataFrame
        The rows of ``self.intersect_rows`` for which at least one
        comparison flag is falsy, restricted to the join columns followed
        by the ``_df1``/``_df2`` version of every compared column.
    """
    suffix = "_match"
    available = set(self.intersect_rows.columns)
    match_list = []
    return_list = []
    for col in self.intersect_rows.columns:
        if not col.endswith(suffix):
            continue
        base = col[: -len(suffix)]
        df1_col = base + "_df1"
        df2_col = base + "_df2"
        # Only a genuine comparison flag has both value columns beside it.
        if df1_col in available and df2_col in available:
            match_list.append(col)
            return_list.extend([df1_col, df2_col])

    # True where every compared column matched; the mismatches are the rest.
    mm_bool = self.intersect_rows[match_list].all(axis="columns")
    return self.intersect_rows[~mm_bool][self.join_columns + return_list]

def report(self, sample_count=10):
"""Returns a string representation of a report. The representation can
then be printed or saved to a file.
Expand Down
70 changes: 70 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,76 @@ def test_integers_with_ignore_spaces_and_join_columns():
assert compare.count_matching_rows() == 2


def test_sample_mismatch():
    """sample_mismatch returns at most ``sample_count`` rows, all of them mismatched.

    The two frames below disagree on ``name`` for exactly two of the
    accounts they share, so asking for three samples still yields two rows.
    """
    base_csv = """acct_id,dollar_amt,name,float_fld,date_fld
10000001234,123.45,George Maharis,14530.1555,2017-01-01
10000001235,0.45,Michael Bluth,1,2017-01-01
10000001236,1345,George Bluth,,2017-01-01
10000001237,123456,Bob Loblaw,345.12,2017-01-01
10000001239,1.05,Lucille Bluth,,2017-01-01
10000001240,123.45,George Maharis,14530.1555,2017-01-02
"""

    compare_csv = """acct_id,dollar_amt,name,float_fld,date_fld
10000001234,123.4,George Michael Bluth,14530.155,
10000001235,0.45,Michael Bluth,,
10000001236,1345,George Bluth,1,
10000001237,123456,Robert Loblaw,345.12,
10000001238,1.05,Loose Seal Bluth,111,
10000001240,123.45,George Maharis,14530.1555,2017-01-02
"""
    left = pd.read_csv(io.StringIO(base_csv), sep=",")
    right = pd.read_csv(io.StringIO(compare_csv), sep=",")
    compare = datacompy.Compare(left, right, "acct_id")

    # (requested sample_count, number of rows expected back)
    expectations = ((1, 1), (2, 2), (3, 2))
    for requested, expected_rows in expectations:
        sample = compare.sample_mismatch(column="name", sample_count=requested)
        assert sample.shape[0] == expected_rows
        assert (sample.name_df1 != sample.name_df2).all()


def test_all_mismatch():
    """all_mismatch returns every shared row with at least one differing column.

    Five accounts are present in both frames; four of them differ in some
    column. The per-column counts below are taken over those four rows.
    """
    base_csv = """acct_id,dollar_amt,name,float_fld,date_fld
10000001234,123.45,George Maharis,14530.1555,2017-01-01
10000001235,0.45,Michael Bluth,1,2017-01-01
10000001236,1345,George Bluth,,2017-01-01
10000001237,123456,Bob Loblaw,345.12,2017-01-01
10000001239,1.05,Lucille Bluth,,2017-01-01
10000001240,123.45,George Maharis,14530.1555,2017-01-02
"""

    compare_csv = """acct_id,dollar_amt,name,float_fld,date_fld
10000001234,123.4,George Michael Bluth,14530.155,
10000001235,0.45,Michael Bluth,,
10000001236,1345,George Bluth,1,
10000001237,123456,Robert Loblaw,345.12,
10000001238,1.05,Loose Seal Bluth,111,
10000001240,123.45,George Maharis,14530.1555,2017-01-02
"""
    left = pd.read_csv(io.StringIO(base_csv), sep=",")
    right = pd.read_csv(io.StringIO(compare_csv), sep=",")
    compare = datacompy.Compare(left, right, "acct_id")

    output = compare.all_mismatch()
    assert output.shape[0] == 4

    # (column, rows where df1 != df2, rows where they do not differ)
    expectations = (
        ("name", 2, 2),
        ("dollar_amt", 1, 3),
        ("float_fld", 3, 1),
    )
    for field, n_differing, n_equal in expectations:
        differs = output[field + "_df1"] != output[field + "_df2"]
        assert differs.values.sum() == n_differing
        assert (~differs).values.sum() == n_equal


MAX_DIFF_DF = pd.DataFrame(
{
"base": [1, 1, 1, 1, 1],
Expand Down

0 comments on commit 1ae017a

Please sign in to comment.