From 0d3333f5dba438dc8cfd5d40bdd5dd45f7ad2bc0 Mon Sep 17 00:00:00 2001 From: Andrew Sanchez Date: Wed, 12 Feb 2020 16:34:46 -0700 Subject: [PATCH 1/4] Use MultiIndex.codes instead of labels (no longer supported) Please see these links for the rationale if interested: https://github.com/pandas-dev/pandas/issues/13443 https://github.com/pandas-dev/pandas/pull/23752 --- q2_diversity/tests/test_alpha_rarefaction.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/q2_diversity/tests/test_alpha_rarefaction.py b/q2_diversity/tests/test_alpha_rarefaction.py index 22603115..1531ecdc 100644 --- a/q2_diversity/tests/test_alpha_rarefaction.py +++ b/q2_diversity/tests/test_alpha_rarefaction.py @@ -421,7 +421,7 @@ def test_unique_metadata_groups(self): obs = _reindex_with_metadata('pet', ['pet'], data) exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=['depth', 'iter']) exp_ind = pd.Index(['milo', 'peanut', 'russ'], name='pet') exp = pd.DataFrame(data=[[5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4]], @@ -445,7 +445,7 @@ def test_some_duplicates_in_column(self): obs = _reindex_with_metadata('pet', ['pet'], data) exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=['depth', 'iter']) exp_ind = pd.Index(['milo', 'russ'], name='pet') exp = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]], @@ -469,7 +469,7 @@ def test_all_identical(self): obs = _reindex_with_metadata('pet', ['pet'], data) exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=['depth', 'iter']) exp_ind = pd.Index(['russ'], name='pet') exp = pd.DataFrame(data=[[5, 6, 7, 8]], @@ -495,7 +495,7 @@ def test_multiple_columns(self): obs = _reindex_with_metadata('pet', ['pet', 'toy'], data) exp_col = pd.MultiIndex(levels=[[1, 200, 
'pet', 'toy'], [1, 2, '']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=['depth', 'iter']) exp_ind = pd.Index(['milo', 'peanut', 'russ'], name='pet') exp = pd.DataFrame(data=[[5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4]], From b0cc0df0679e3b220549f1c26e5cd48581394254 Mon Sep 17 00:00:00 2001 From: Andrew Sanchez Date: Wed, 12 Feb 2020 16:41:46 -0700 Subject: [PATCH 2/4] Unpack and name values returned by _reindex_with_metadata This makes it easier to see what's going on below with the values returned by this function. --- q2_diversity/tests/test_alpha_rarefaction.py | 30 ++++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/q2_diversity/tests/test_alpha_rarefaction.py b/q2_diversity/tests/test_alpha_rarefaction.py index 1531ecdc..cef2c10f 100644 --- a/q2_diversity/tests/test_alpha_rarefaction.py +++ b/q2_diversity/tests/test_alpha_rarefaction.py @@ -418,7 +418,7 @@ def test_unique_metadata_groups(self): [9, 10, 11, 12, 'peanut']], columns=columns, index=['S1', 'S2', 'S3']) - obs = _reindex_with_metadata('pet', ['pet'], data) + median, counts = _reindex_with_metadata('pet', ['pet'], data) exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], @@ -427,12 +427,12 @@ def test_unique_metadata_groups(self): exp = pd.DataFrame(data=[[5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4]], columns=exp_col, index=exp_ind) - pdt.assert_frame_equal(exp, obs[0]) + pdt.assert_frame_equal(exp, median) exp = pd.DataFrame(data=[[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], columns=exp_col, index=exp_ind) - pdt.assert_frame_equal(exp, obs[1]) + pdt.assert_frame_equal(exp, counts) def test_some_duplicates_in_column(self): columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1), @@ -442,7 +442,7 @@ def test_some_duplicates_in_column(self): [9, 10, 11, 12, 'russ']], columns=columns, index=['S1', 'S2', 'S3']) - obs = _reindex_with_metadata('pet', ['pet'], data) + median, counts = 
_reindex_with_metadata('pet', ['pet'], data) exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], @@ -451,12 +451,12 @@ def test_some_duplicates_in_column(self): exp = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]], columns=exp_col, index=exp_ind) - pdt.assert_frame_equal(exp, obs[0]) + pdt.assert_frame_equal(exp, median) exp = pd.DataFrame(data=[[1, 1, 1, 1], [2, 2, 2, 2]], columns=exp_col, index=exp_ind) - pdt.assert_frame_equal(exp, obs[1]) + pdt.assert_frame_equal(exp, counts) def test_all_identical(self): columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1), @@ -466,7 +466,7 @@ def test_all_identical(self): [9, 10, 11, 12, 'russ']], columns=columns, index=['S1', 'S2', 'S3']) - obs = _reindex_with_metadata('pet', ['pet'], data) + median, counts = _reindex_with_metadata('pet', ['pet'], data) exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], @@ -475,12 +475,12 @@ def test_all_identical(self): exp = pd.DataFrame(data=[[5, 6, 7, 8]], columns=exp_col, index=exp_ind) - pdt.assert_frame_equal(exp, obs[0]) + pdt.assert_frame_equal(exp, median) exp = pd.DataFrame(data=[[3, 3, 3, 3]], columns=exp_col, index=exp_ind) - pdt.assert_frame_equal(exp, obs[1]) + pdt.assert_frame_equal(exp, counts) def test_multiple_columns(self): columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1), @@ -492,7 +492,7 @@ def test_multiple_columns(self): [9, 10, 11, 12, 'peanut', 'stick']], columns=columns, index=['S1', 'S2', 'S3']) - obs = _reindex_with_metadata('pet', ['pet', 'toy'], data) + median, counts = _reindex_with_metadata('pet', ['pet', 'toy'], data) exp_col = pd.MultiIndex(levels=[[1, 200, 'pet', 'toy'], [1, 2, '']], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], @@ -501,25 +501,25 @@ def test_multiple_columns(self): exp = pd.DataFrame(data=[[5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4]], columns=exp_col, index=exp_ind) - pdt.assert_frame_equal(exp, obs[0]) + pdt.assert_frame_equal(exp, 
median) exp = pd.DataFrame(data=[[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], columns=exp_col, index=exp_ind) - pdt.assert_frame_equal(exp, obs[1]) + pdt.assert_frame_equal(exp, counts) - obs = _reindex_with_metadata('toy', ['pet', 'toy'], data) + median, counts = _reindex_with_metadata('toy', ['pet', 'toy'], data) exp_ind = pd.Index(['stick', 'yeti'], name='toy') exp = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]], columns=exp_col, index=exp_ind) - pdt.assert_frame_equal(exp, obs[0]) + pdt.assert_frame_equal(exp, median) exp = pd.DataFrame(data=[[2, 2, 2, 2], [1, 1, 1, 1]], columns=exp_col, index=exp_ind) - pdt.assert_frame_equal(exp, obs[1]) + pdt.assert_frame_equal(exp, counts) class AlphaRarefactionJSONPTests(unittest.TestCase): From 552152f0948effc6bcd9e5e4939917ee6062dc0e Mon Sep 17 00:00:00 2001 From: Andrew Sanchez Date: Wed, 12 Feb 2020 16:43:08 -0700 Subject: [PATCH 3/4] Initial patch to handle new pandas error This prevents attempting to drop columns that don't exist in merged.columns after setting the index, while still dropping columns that are present in merged.columns. Attempting to do so raises an exception in pandas >= 1. Please see https://github.com/pandas-dev/pandas/issues/8594 for details. 
--- q2_diversity/_alpha/_visualizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/q2_diversity/_alpha/_visualizer.py b/q2_diversity/_alpha/_visualizer.py index 25387553..fbc0a3e2 100644 --- a/q2_diversity/_alpha/_visualizer.py +++ b/q2_diversity/_alpha/_visualizer.py @@ -241,7 +241,10 @@ def _reindex_with_metadata(column, columns, merged): merged.sort_index(axis=0, ascending=True, inplace=True) merged = merged.groupby(level=[column]) counts = merged.count() - counts.drop(columns, axis=1, inplace=True, level=0) + # Removes the column name used to set the index of `merged` above + col_diff = set(columns) - set([column]) + if col_diff: + counts.drop(col_diff, axis=1, inplace=True, level=0) median_ = merged.median() return median_, counts From b9ddead466654806adfcebeae3c824847dda5114 Mon Sep 17 00:00:00 2001 From: Andrew Sanchez Date: Thu, 13 Feb 2020 12:20:24 -0700 Subject: [PATCH 4/4] Avoid mutating `merged` in place by assigning to new variable This avoids attempting to drop columns that had already been dropped in previous calls to _reindex_with_metadata in the for loop in `alpha_rarefaction`. 
Co-authored-by: Matthew Dillon --- q2_diversity/_alpha/_visualizer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/q2_diversity/_alpha/_visualizer.py b/q2_diversity/_alpha/_visualizer.py index fbc0a3e2..19446ad3 100644 --- a/q2_diversity/_alpha/_visualizer.py +++ b/q2_diversity/_alpha/_visualizer.py @@ -237,15 +237,15 @@ def alpha_correlation(output_dir: str, def _reindex_with_metadata(column, columns, merged): - merged.set_index(column, inplace=True) - merged.sort_index(axis=0, ascending=True, inplace=True) - merged = merged.groupby(level=[column]) - counts = merged.count() + reindexed = merged.set_index(column) + reindexed.sort_index(axis=0, ascending=True, inplace=True) + grouped = reindexed.groupby(level=[column]) + counts = grouped.count() # Removes the column name used to set the index of `merged` above col_diff = set(columns) - set([column]) if col_diff: counts.drop(col_diff, axis=1, inplace=True, level=0) - median_ = merged.median() + median_ = grouped.median() return median_, counts