all mismatches (#64)

* [WIP] adding all_mismatch * adding tests for mismatches * tweaking docstr
capitalone · May 17, 2020 · 1ae017a · 1ae017a
1 parent b9c2a0e
commit 1ae017a
Show file tree

Hide file tree

Showing 2 changed files with 90 additions and 0 deletions.
diff --git a/datacompy/core.py b/datacompy/core.py
@@ -24,6 +24,7 @@
 import logging
 import os
 
+
 import numpy as np
 import pandas as pd
 
@@ -432,6 +433,25 @@ def sample_mismatch(self, column, sample_count=10, for_display=False):
             ]
         return to_return
 
+    def all_mismatch(self):
+        """Return every intersect row in which at least one compared column differs.
+
+        A ``<col>_match`` column is used only when ``<col>_df1`` and ``<col>_df2`` both exist,
+        so a join column whose own name ends in ``_match`` is not mistaken for a comparison flag.
+
+        Returns
+        -------
+        pandas.DataFrame
+            Join columns plus the ``_df1``/``_df2`` versions of each compared column.
+        """
+        cols = self.intersect_rows.columns
+        bases = [c[: -len("_match")] for c in cols if c.endswith("_match")]
+        bases = [b for b in bases if b + "_df1" in cols and b + "_df2" in cols]
+        match_list = [b + "_match" for b in bases]
+        return_list = [name for b in bases for name in (b + "_df1", b + "_df2")]
+        mm_bool = self.intersect_rows[match_list].all(axis="columns")
+        return self.intersect_rows[~mm_bool][self.join_columns + return_list]
+
     def report(self, sample_count=10):
         """Returns a string representation of a report.  The representation can
         then be printed or saved to a file.

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -851,6 +851,76 @@ def test_integers_with_ignore_spaces_and_join_columns():
     assert compare.count_matching_rows() == 2
 
 
+def test_sample_mismatch():
+    """Check that sample_mismatch caps its output and returns only mismatching rows.
+
+    Among the shared acct_ids only 10000001234 and 10000001237 differ on ``name``.
+    """
+    left_csv = """acct_id,dollar_amt,name,float_fld,date_fld
+    10000001234,123.45,George Maharis,14530.1555,2017-01-01
+    10000001235,0.45,Michael Bluth,1,2017-01-01
+    10000001236,1345,George Bluth,,2017-01-01
+    10000001237,123456,Bob Loblaw,345.12,2017-01-01
+    10000001239,1.05,Lucille Bluth,,2017-01-01
+    10000001240,123.45,George Maharis,14530.1555,2017-01-02
+    """
+
+    right_csv = """acct_id,dollar_amt,name,float_fld,date_fld
+    10000001234,123.4,George Michael Bluth,14530.155,
+    10000001235,0.45,Michael Bluth,,
+    10000001236,1345,George Bluth,1,
+    10000001237,123456,Robert Loblaw,345.12,
+    10000001238,1.05,Loose Seal Bluth,111,
+    10000001240,123.45,George Maharis,14530.1555,2017-01-02
+    """
+    left = pd.read_csv(io.StringIO(left_csv), sep=",")
+    right = pd.read_csv(io.StringIO(right_csv), sep=",")
+    compare = datacompy.Compare(left, right, "acct_id")
+
+    # Each pair is (requested sample_count, number of rows expected back).
+    # Asking for three samples still yields two: that is all the name mismatches there are.
+    cases = ((1, 1), (2, 2), (3, 2))
+    for requested, expected_rows in cases:
+        output = compare.sample_mismatch(column="name", sample_count=requested)
+        assert output.shape[0] == expected_rows
+        assert (output.name_df1 != output.name_df2).all()
+
+
+def test_all_mismatch():
+    """Check that all_mismatch returns each shared row with any differing column.
+
+    Five acct_ids appear in both inputs; only 10000001240 is identical in both.
+    """
+    left_csv = """acct_id,dollar_amt,name,float_fld,date_fld
+    10000001234,123.45,George Maharis,14530.1555,2017-01-01
+    10000001235,0.45,Michael Bluth,1,2017-01-01
+    10000001236,1345,George Bluth,,2017-01-01
+    10000001237,123456,Bob Loblaw,345.12,2017-01-01
+    10000001239,1.05,Lucille Bluth,,2017-01-01
+    10000001240,123.45,George Maharis,14530.1555,2017-01-02
+    """
+
+    right_csv = """acct_id,dollar_amt,name,float_fld,date_fld
+    10000001234,123.4,George Michael Bluth,14530.155,
+    10000001235,0.45,Michael Bluth,,
+    10000001236,1345,George Bluth,1,
+    10000001237,123456,Robert Loblaw,345.12,
+    10000001238,1.05,Loose Seal Bluth,111,
+    10000001240,123.45,George Maharis,14530.1555,2017-01-02
+    """
+    left = pd.read_csv(io.StringIO(left_csv), sep=",")
+    right = pd.read_csv(io.StringIO(right_csv), sep=",")
+    compare = datacompy.Compare(left, right, "acct_id")
+
+    output = compare.all_mismatch()
+    assert output.shape[0] == 4
+    # Each triple is (column, returned rows that differ, returned rows that agree) under ``!=``.
+    for column, n_diff, n_same in (("name", 2, 2), ("dollar_amt", 1, 3), ("float_fld", 3, 1)):
+        differs = output[column + "_df1"] != output[column + "_df2"]
+        assert differs.values.sum() == n_diff
+        assert (~differs).values.sum() == n_same
+
+
 MAX_DIFF_DF = pd.DataFrame(
     {
         "base": [1, 1, 1, 1, 1],