import pandas as pd
from string_grouper import match_strings, match_most_similar, group_similar_strings
test_series_1_nameless = pd.Series(['foo', 'bar', 'baz', 'foo'])
test_series_1_nameless
0 foo
1 bar
2 baz
3 foo
dtype: object
match_strings(test_series_1_nameless)
 | left_index | left_side | similarity | right_side | right_index |
---|---|---|---|---|---|
0 | 0 | foo | 1.0 | foo | 0 |
1 | 0 | foo | 1.0 | foo | 3 |
2 | 1 | bar | 1.0 | bar | 1 |
3 | 2 | baz | 1.0 | baz | 2 |
4 | 3 | foo | 1.0 | foo | 0 |
5 | 3 | foo | 1.0 | foo | 3 |
match_strings(test_series_1_nameless, ignore_index=True)
 | left_side | similarity | right_side |
---|---|---|---|
0 | foo | 1.0 | foo |
1 | foo | 1.0 | foo |
2 | bar | 1.0 | bar |
3 | baz | 1.0 | baz |
4 | foo | 1.0 | foo |
5 | foo | 1.0 | foo |
test_series_1_named = pd.Series(['foo', 'bar', 'baz', 'foo'], name='wow')
test_series_1_named
0 foo
1 bar
2 baz
3 foo
Name: wow, dtype: object
match_strings(test_series_1_named)
 | left_index | left_wow | similarity | right_wow | right_index |
---|---|---|---|---|---|
0 | 0 | foo | 1.0 | foo | 0 |
1 | 0 | foo | 1.0 | foo | 3 |
2 | 1 | bar | 1.0 | bar | 1 |
3 | 2 | baz | 1.0 | baz | 2 |
4 | 3 | foo | 1.0 | foo | 0 |
5 | 3 | foo | 1.0 | foo | 3 |
match_strings(test_series_1_named, ignore_index=True)
 | left_wow | similarity | right_wow |
---|---|---|---|
0 | foo | 1.0 | foo |
1 | foo | 1.0 | foo |
2 | bar | 1.0 | bar |
3 | baz | 1.0 | baz |
4 | foo | 1.0 | foo |
5 | foo | 1.0 | foo |
test_series_1_nameless_index = pd.Series(['foo', 'bar', 'baz', 'foo'], name='wow', index=list('ABCD'))
test_series_1_nameless_index
A foo
B bar
C baz
D foo
Name: wow, dtype: object
match_strings(test_series_1_nameless_index)
 | left_index | left_wow | similarity | right_wow | right_index |
---|---|---|---|---|---|
0 | A | foo | 1.0 | foo | A |
1 | A | foo | 1.0 | foo | D |
2 | B | bar | 1.0 | bar | B |
3 | C | baz | 1.0 | baz | C |
4 | D | foo | 1.0 | foo | A |
5 | D | foo | 1.0 | foo | D |
match_strings(test_series_1_nameless_index, ignore_index=True)
 | left_wow | similarity | right_wow |
---|---|---|---|
0 | foo | 1.0 | foo |
1 | foo | 1.0 | foo |
2 | bar | 1.0 | bar |
3 | baz | 1.0 | baz |
4 | foo | 1.0 | foo |
5 | foo | 1.0 | foo |
test_series_1_named_index = pd.Series(['foo', 'bar', 'baz', 'foo'], name='wow', index=list('ABCD')).rename_axis('id')
test_series_1_named_index
id
A foo
B bar
C baz
D foo
Name: wow, dtype: object
match_strings(test_series_1_named_index)
 | left_id | left_wow | similarity | right_wow | right_id |
---|---|---|---|---|---|
0 | A | foo | 1.0 | foo | A |
1 | A | foo | 1.0 | foo | D |
2 | B | bar | 1.0 | bar | B |
3 | C | baz | 1.0 | baz | C |
4 | D | foo | 1.0 | foo | A |
5 | D | foo | 1.0 | foo | D |
match_strings(test_series_1_named_index, ignore_index=True)
 | left_wow | similarity | right_wow |
---|---|---|---|
0 | foo | 1.0 | foo |
1 | foo | 1.0 | foo |
2 | bar | 1.0 | bar |
3 | baz | 1.0 | baz |
4 | foo | 1.0 | foo |
5 | foo | 1.0 | foo |
test_df = pd.DataFrame(
{
'label': ['foo', 'bar', 'baz', 'foo'],
'LVL1': list('ABCD'),
'LVL2': [0, 1, 2, 3]
}
).set_index(['LVL1', 'LVL2']).squeeze()
test_df
LVL1 LVL2
A 0 foo
B 1 bar
C 2 baz
D 3 foo
Name: label, dtype: object
match_strings(test_df)
 | left_LVL1 | left_LVL2 | left_label | similarity | right_label | right_LVL2 | right_LVL1 |
---|---|---|---|---|---|---|---|
0 | A | 0 | foo | 1.0 | foo | 0 | A |
1 | A | 0 | foo | 1.0 | foo | 3 | D |
2 | B | 1 | bar | 1.0 | bar | 1 | B |
3 | C | 2 | baz | 1.0 | baz | 2 | C |
4 | D | 3 | foo | 1.0 | foo | 0 | A |
5 | D | 3 | foo | 1.0 | foo | 3 | D |
match_strings(test_df, ignore_index=True)
 | left_label | similarity | right_label |
---|---|---|---|
0 | foo | 1.0 | foo |
1 | foo | 1.0 | foo |
2 | bar | 1.0 | bar |
3 | baz | 1.0 | baz |
4 | foo | 1.0 | foo |
5 | foo | 1.0 | foo |
test_df2 = pd.Series(
['foo', 'bar', 'baz', 'foo'],
index=pd.MultiIndex.from_tuples(list(zip(list('ABCD'), [0, 1, 2, 3])))
)
test_df2
A 0 foo
B 1 bar
C 2 baz
D 3 foo
dtype: object
match_strings(test_df2)
 | left_level_0 | left_level_1 | left_side | similarity | right_side | right_level_1 | right_level_0 |
---|---|---|---|---|---|---|---|
0 | A | 0 | foo | 1.0 | foo | 0 | A |
1 | A | 0 | foo | 1.0 | foo | 3 | D |
2 | B | 1 | bar | 1.0 | bar | 1 | B |
3 | C | 2 | baz | 1.0 | baz | 2 | C |
4 | D | 3 | foo | 1.0 | foo | 0 | A |
5 | D | 3 | foo | 1.0 | foo | 3 | D |
match_strings(test_df2, ignore_index=True)
 | left_side | similarity | right_side |
---|---|---|---|
0 | foo | 1.0 | foo |
1 | foo | 1.0 | foo |
2 | bar | 1.0 | bar |
3 | baz | 1.0 | baz |
4 | foo | 1.0 | foo |
5 | foo | 1.0 | foo |
match_strings(test_df2, test_series_1_named)
 | left_level_0 | left_level_1 | left_side | similarity | right_wow | right_index |
---|---|---|---|---|---|---|
0 | A | 0 | foo | 1.0 | foo | 3 |
1 | A | 0 | foo | 1.0 | foo | 0 |
2 | B | 1 | bar | 1.0 | bar | 1 |
3 | C | 2 | baz | 1.0 | baz | 2 |
4 | D | 3 | foo | 1.0 | foo | 3 |
5 | D | 3 | foo | 1.0 | foo | 0 |
match_strings(test_df2, test_series_1_named, ignore_index=True)
 | left_side | similarity | right_wow |
---|---|---|---|
0 | foo | 1.0 | foo |
1 | foo | 1.0 | foo |
2 | bar | 1.0 | bar |
3 | baz | 1.0 | baz |
4 | foo | 1.0 | foo |
5 | foo | 1.0 | foo |
Let's import some data:
companies_df = pd.read_csv('data/sec__edgar_company_info.csv')[0:50000]
companies_df.squeeze()
 | Line Number | Company Name | Company CIK Key |
---|---|---|---|
0 | 1 | !J INC | 1438823 |
1 | 2 | #1 A LIFESAFER HOLDINGS, INC. | 1509607 |
2 | 3 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 1457512 |
3 | 4 | #1 PAINTBALL CORP | 1433777 |
4 | 5 | $ LLC | 1427189 |
... | ... | ... | ... |
49995 | 49996 | BABB DOUGLAS J | 1190359 |
49996 | 49997 | BABB HENRY C | 1193948 |
49997 | 49998 | BABB INTERNATIONAL INC | 1139504 |
49998 | 49999 | BABB JACK J | 1280368 |
49999 | 50000 | BABB JAMES G. III | 1575424 |
50000 rows × 3 columns
Let's give the data a different (unique-valued) index as is commonly done:
companies_df.set_index(['Line Number', 'Company CIK Key'], inplace=True, verify_integrity=True)
companies_df
 | | Company Name |
---|---|---|
Line Number | Company CIK Key | |
1 | 1438823 | !J INC |
2 | 1509607 | #1 A LIFESAFER HOLDINGS, INC. |
3 | 1457512 | #1 ARIZONA DISCOUNT PROPERTIES LLC |
4 | 1433777 | #1 PAINTBALL CORP |
5 | 1427189 | $ LLC |
... | ... | ... |
49996 | 1190359 | BABB DOUGLAS J |
49997 | 1193948 | BABB HENRY C |
49998 | 1139504 | BABB INTERNATIONAL INC |
49999 | 1280368 | BABB JACK J |
50000 | 1575424 | BABB JAMES G. III |
50000 rows × 1 columns
Now let's do some grouping as usual:
companies = companies_df.copy()
group_similar_strings(companies['Company Name'])
 | | group_rep_Line Number | group_rep_Company CIK Key | group_rep_Company Name |
---|---|---|---|---|
Line Number | Company CIK Key | |||
1 | 1438823 | 1 | 1438823 | !J INC |
2 | 1509607 | 2 | 1509607 | #1 A LIFESAFER HOLDINGS, INC. |
3 | 1457512 | 3 | 1457512 | #1 ARIZONA DISCOUNT PROPERTIES LLC |
4 | 1433777 | 4 | 1433777 | #1 PAINTBALL CORP |
5 | 1427189 | 5 | 1427189 | $ LLC |
... | ... | ... | ... | ... |
49996 | 1190359 | 49996 | 1190359 | BABB DOUGLAS J |
49997 | 1193948 | 49997 | 1193948 | BABB HENRY C |
49998 | 1139504 | 49998 | 1139504 | BABB INTERNATIONAL INC |
49999 | 1280368 | 49999 | 1280368 | BABB JACK J |
50000 | 1575424 | 50000 | 1575424 | BABB JAMES G. III |
50000 rows × 3 columns
Notice that `group_similar_strings` preserves the index of the input Series while also showing the index(es) of the group-representatives in new columns whose names are prefixed with the string "group_rep_".
To ignore the indexes of the group-representatives, simply set the keyword argument `ignore_index=True`:
group_similar_strings(companies['Company Name'], ignore_index=True)
Line Number Company CIK Key
1 1438823 !J INC
2 1509607 #1 A LIFESAFER HOLDINGS, INC.
3 1457512 #1 ARIZONA DISCOUNT PROPERTIES LLC
4 1433777 #1 PAINTBALL CORP
5 1427189 $ LLC
...
49996 1190359 BABB DOUGLAS J
49997 1193948 BABB HENRY C
49998 1139504 BABB INTERNATIONAL INC
49999 1280368 BABB JACK J
50000 1575424 BABB JAMES G. III
Name: group_rep_Company Name, Length: 50000, dtype: object
Because the output always inherits the index of the input Series, it is possible to directly assign it to new columns of the `companies` DataFrame (which has the exact same index) while also giving them new column names, as in the following:
companies[['Group Line Number', 'Group CIK Key', 'Group']] = \
group_similar_strings(companies['Company Name'], min_similarity=0.70)
companies
 | | Company Name | Group Line Number | Group CIK Key | Group |
---|---|---|---|---|---|
Line Number | Company CIK Key | ||||
1 | 1438823 | !J INC | 1 | 1438823 | !J INC |
2 | 1509607 | #1 A LIFESAFER HOLDINGS, INC. | 2 | 1509607 | #1 A LIFESAFER HOLDINGS, INC. |
3 | 1457512 | #1 ARIZONA DISCOUNT PROPERTIES LLC | 3 | 1457512 | #1 ARIZONA DISCOUNT PROPERTIES LLC |
4 | 1433777 | #1 PAINTBALL CORP | 4 | 1433777 | #1 PAINTBALL CORP |
5 | 1427189 | $ LLC | 5 | 1427189 | $ LLC |
... | ... | ... | ... | ... | ... |
49996 | 1190359 | BABB DOUGLAS J | 49996 | 1190359 | BABB DOUGLAS J |
49997 | 1193948 | BABB HENRY C | 49997 | 1193948 | BABB HENRY C |
49998 | 1139504 | BABB INTERNATIONAL INC | 49998 | 1139504 | BABB INTERNATIONAL INC |
49999 | 1280368 | BABB JACK J | 49999 | 1280368 | BABB JACK J |
50000 | 1575424 | BABB JAMES G. III | 50000 | 1575424 | BABB JAMES G. III |
50000 rows × 4 columns
The grouping is not readily seen in the above display. So let us determine the number of members of each group (the group size) and then sort by group size:
companies['Group Size'] = \
companies\
.groupby(['Group Line Number', 'Group CIK Key'], as_index=False)['Company Name']\
.transform('count')
companies.sort_values('Group Size', ascending=False)
 | | Company Name | Group Line Number | Group CIK Key | Group | Group Size |
---|---|---|---|---|---|---|
Line Number | Company CIK Key | |||||
14060 | 1425318 | ADVISORS DISCIPLINED TRUST 241 | 14940 | 1297377 | ADVISORS DISCIPLINED TRUST | 1866 |
13845 | 1662291 | ADVISORS DISCIPLINED TRUST 1674 | 14940 | 1297377 | ADVISORS DISCIPLINED TRUST | 1866 |
13821 | 1662315 | ADVISORS DISCIPLINED TRUST 1652 | 14940 | 1297377 | ADVISORS DISCIPLINED TRUST | 1866 |
13822 | 1662313 | ADVISORS DISCIPLINED TRUST 1653 | 14940 | 1297377 | ADVISORS DISCIPLINED TRUST | 1866 |
13823 | 1662312 | ADVISORS DISCIPLINED TRUST 1654 | 14940 | 1297377 | ADVISORS DISCIPLINED TRUST | 1866 |
... | ... | ... | ... | ... | ... | ... |
20536 | 1529695 | ALENTUS CORP | 20536 | 1529695 | ALENTUS CORP | 1 |
20535 | 1585882 | ALENT PLC/ADR | 20535 | 1585882 | ALENT PLC/ADR | 1 |
20534 | 1688865 | ALENSON CARMAN | 20534 | 1688865 | ALENSON CARMAN | 1 |
20533 | 3433 | ALENICK JEROME B | 20533 | 3433 | ALENICK JEROME B | 1 |
50000 | 1575424 | BABB JAMES G. III | 50000 | 1575424 | BABB JAMES G. III | 1 |
50000 rows × 5 columns
Let's see where the largest group 'ADVISORS DISCIPLINED TRUST' ends:
companies.sort_values('Group Size', ascending=False).iloc[(start:=1861):(start + 10)]
 | | Company Name | Group Line Number | Group CIK Key | Group | Group Size |
---|---|---|---|---|---|---|
Line Number | Company CIK Key | |||||
13248 | 1578605 | ADVISORS DISCIPLINED TRUST 1131 | 14940 | 1297377 | ADVISORS DISCIPLINED TRUST | 1866 |
13265 | 1582225 | ADVISORS DISCIPLINED TRUST 1147 | 14940 | 1297377 | ADVISORS DISCIPLINED TRUST | 1866 |
13250 | 1578617 | ADVISORS DISCIPLINED TRUST 1133 | 14940 | 1297377 | ADVISORS DISCIPLINED TRUST | 1866 |
13249 | 1578604 | ADVISORS DISCIPLINED TRUST 1132 | 14940 | 1297377 | ADVISORS DISCIPLINED TRUST | 1866 |
13264 | 1582226 | ADVISORS DISCIPLINED TRUST 1146 | 14940 | 1297377 | ADVISORS DISCIPLINED TRUST | 1866 |
32937 | 1643285 | ANGELLIST-SSTY-FUND, A SERIES OF ANGELLIST-FG-... | 32535 | 1600532 | ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F... | 485 |
32552 | 1677061 | ANGELLIST-BDID-PR-FUND, A SERIES OF ANGELLIST-... | 32535 | 1600532 | ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F... | 485 |
32943 | 1623545 | ANGELLIST-STHE-FUND, A SERIES OF ANGELLIST-GP-... | 32535 | 1600532 | ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F... | 485 |
32944 | 1680681 | ANGELLIST-STHE-PR-FUND, A SERIES OF ANGELLIST-... | 32535 | 1600532 | ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F... | 485 |
32947 | 1610185 | ANGELLIST-SUIT-FUND, A SERIES OF ANGELLIST-FGR... | 32535 | 1600532 | ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-F... | 485 |
Now let's create a 'master' Series of group-representatives (we will use it later in the function `match_most_similar`):
master = companies.groupby(['Group Line Number', 'Group CIK Key'])['Group'].first().head(-5)
Notice that we have intentionally excluded the last five groups.
master.index.rename(['Line Number', 'Company CIK Key'], inplace=True)
master.rename('Company Name', inplace=True)
master
Line Number Company CIK Key
1 1438823 !J INC
2 1509607 #1 A LIFESAFER HOLDINGS, INC.
3 1457512 #1 ARIZONA DISCOUNT PROPERTIES LLC
4 1433777 #1 PAINTBALL CORP
5 1427189 $ LLC
...
49991 1615648 BABACAN THOMAS
49992 1443093 BABAD SHOLOM CHAIM
49993 1208255 BABALU LLC
49994 1270229 BABANI SUSIE
49995 1660243 BABAY KARIM
Name: Company Name, Length: 34249, dtype: object
Let's examine the neighbourhood of the largest group 'ADVISORS DISCIPLINED TRUST':
master.iloc[(start:=(master.index.get_loc((14940, 1297377)) - 5)):(start + 10)]
Line Number Company CIK Key
13087 1029068 ADVISORONE FUNDS
13089 1313535 ADVISORPORT, INC.
13090 1559198 ADVISORS 999
13094 789623 ADVISORS CAPITAL INVESTMENTS INC
13096 932536 ADVISORS CLEARING NETWORK, INC.
14940 1297377 ADVISORS DISCIPLINED TRUST
14954 1313525 ADVISORS EDGE SECURITIES, LLC
14955 825201 ADVISORS FUND L P
14957 1267654 ADVISORS GENPAR INC
14958 1016084 ADVISORS GROUP INC /DC
Name: Company Name, dtype: object
Now let's use `master` in the function `match_most_similar`:
companies = companies_df.copy()
grouped_data = match_most_similar(
master,
companies['Company Name'],
min_similarity=0.55,
max_n_matches=2000
)
grouped_data
 | | most_similar_Line Number | most_similar_Company CIK Key | most_similar_Company Name |
---|---|---|---|---|
Line Number | Company CIK Key | |||
1 | 1438823 | 1.0 | 1438823.0 | !J INC |
2 | 1509607 | 2.0 | 1509607.0 | #1 A LIFESAFER HOLDINGS, INC. |
3 | 1457512 | 3.0 | 1457512.0 | #1 ARIZONA DISCOUNT PROPERTIES LLC |
4 | 1433777 | 4.0 | 1433777.0 | #1 PAINTBALL CORP |
5 | 1427189 | 5.0 | 1427189.0 | $ LLC |
... | ... | ... | ... | ... |
49996 | 1190359 | 31995.0 | 1336287.0 | ANDREA DOUGLAS J |
49997 | 1193948 | NaN | NaN | BABB HENRY C |
49998 | 1139504 | 19399.0 | 1569329.0 | AL INTERNATIONAL, INC. |
49999 | 1280368 | NaN | NaN | BABB JACK J |
50000 | 1575424 | NaN | NaN | BABB JAMES G. III |
50000 rows × 3 columns
Notice that `match_most_similar` also preserves the indexes of the input `duplicates` Series. Also notice that the indexes of the matched strings in `master` (together with the matched strings themselves) appear in the output as columns whose names are prefixed with the string "most_similar_". Finally, notice the 'NaN' values in the index columns corresponding to those strings in `duplicates` that have no match in `master` above the similarity threshold specified by the keyword argument setting `min_similarity=0.55`. Recall that we earlier intentionally excluded certain groups from `master`; these 'NaN' values are a consequence of that exclusion. They are also the reason these index columns have been converted to float data types.
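By way of illustration, here is a minimal sketch (using the output column names shown above; the variable names `unmatched` and `restored` are merely illustrative) of how the unmatched rows could be isolated, and how the float index columns could be cast back to integer-like columns using pandas' nullable `Int64` dtype:
# Strings in duplicates that found no match in master
# (their 'most_similar_' index columns are NaN):
unmatched = grouped_data[grouped_data['most_similar_Line Number'].isna()]
# Optionally restore integer-like dtypes in spite of the 'NaN's,
# using pandas' nullable integer dtype:
restored = grouped_data.astype({'most_similar_Line Number': 'Int64',
                                'most_similar_Company CIK Key': 'Int64'})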
We can replace the 'NaN' values with their corresponding index values by setting the keyword argument `replace_na=True`. (The function `match_most_similar` does not do this by default because, in general, `duplicates` may have a different index from `master`, possibly with a different number of index levels.)
grouped_data = match_most_similar(
master,
companies['Company Name'],
min_similarity=0.55,
max_n_matches=2000,
replace_na=True
)
grouped_data
 | | most_similar_Line Number | most_similar_Company CIK Key | most_similar_Company Name |
---|---|---|---|---|
Line Number | Company CIK Key | |||
1 | 1438823 | 1 | 1438823 | !J INC |
2 | 1509607 | 2 | 1509607 | #1 A LIFESAFER HOLDINGS, INC. |
3 | 1457512 | 3 | 1457512 | #1 ARIZONA DISCOUNT PROPERTIES LLC |
4 | 1433777 | 4 | 1433777 | #1 PAINTBALL CORP |
5 | 1427189 | 5 | 1427189 | $ LLC |
... | ... | ... | ... | ... |
49996 | 1190359 | 31995 | 1336287 | ANDREA DOUGLAS J |
49997 | 1193948 | 49997 | 1193948 | BABB HENRY C |
49998 | 1139504 | 19399 | 1569329 | AL INTERNATIONAL, INC. |
49999 | 1280368 | 49999 | 1280368 | BABB JACK J |
50000 | 1575424 | 50000 | 1575424 | BABB JAMES G. III |
50000 rows × 3 columns
Let's inspect the result by determining the group sizes:
grouped_data.groupby('most_similar_Company Name')['most_similar_Line Number'].count()\
.rename('Size')\
.sort_values(ascending=False)
most_similar_Company Name
ADVISORS DISCIPLINED TRUST 1866
ANGELLIST-ART-FUND, A SERIES OF ANGELLIST-FG-FUNDS, LLC 279
ALTERNATIVE LOAN TRUST 2005-4 193
AGL LIFE ASSURANCE CO SEPARATE ACCOUNT 188
AMERICREDIT AUTOMOBILE RECEIVABLES TRUST 2001-1 89
...
AIM SAFETY CO INC 1
AIM REAL ESTATE HEDGED EQUITY (U.S.) FUND, LP 1
AIM QUANTITATIVE GLOBAL SF LP 1
AIM OXFORD HOLDINGS, LLC 1
TALISMAN ENERGY SWEDEN AB 1
Name: Size, Length: 34496, dtype: int64
Just as we did for the function `group_similar_strings`, we can ignore the indexes of `master` if we choose, by setting `ignore_index=True`:
grouped_data_dropped = match_most_similar(
master,
companies['Company Name'],
ignore_index=True,
min_similarity=0.55,
max_n_matches=2000
)
grouped_data_dropped
Line Number Company CIK Key
1 1438823 !J INC
2 1509607 #1 A LIFESAFER HOLDINGS, INC.
3 1457512 #1 ARIZONA DISCOUNT PROPERTIES LLC
4 1433777 #1 PAINTBALL CORP
5 1427189 $ LLC
...
49996 1190359 ANDREA DOUGLAS J
49997 1193948 BABB HENRY C
49998 1139504 AL INTERNATIONAL, INC.
49999 1280368 BABB JACK J
50000 1575424 BABB JAMES G. III
Name: most_similar_Company Name, Length: 50000, dtype: object
As before, we can directly assign the output to a new column of the `companies` DataFrame (because it has the exact same index) while also giving it a new column name, as in the following:
companies['Most Similar Name'] = grouped_data_dropped
companies
 | | Company Name | Most Similar Name |
---|---|---|---|
Line Number | Company CIK Key | ||
1 | 1438823 | !J INC | !J INC |
2 | 1509607 | #1 A LIFESAFER HOLDINGS, INC. | #1 A LIFESAFER HOLDINGS, INC. |
3 | 1457512 | #1 ARIZONA DISCOUNT PROPERTIES LLC | #1 ARIZONA DISCOUNT PROPERTIES LLC |
4 | 1433777 | #1 PAINTBALL CORP | #1 PAINTBALL CORP |
5 | 1427189 | $ LLC | $ LLC |
... | ... | ... | ... |
49996 | 1190359 | BABB DOUGLAS J | ANDREA DOUGLAS J |
49997 | 1193948 | BABB HENRY C | BABB HENRY C |
49998 | 1139504 | BABB INTERNATIONAL INC | AL INTERNATIONAL, INC. |
49999 | 1280368 | BABB JACK J | BABB JACK J |
50000 | 1575424 | BABB JAMES G. III | BABB JAMES G. III |
50000 rows × 2 columns
The options `ignore_index` and `replace_na` can be passed directly to a `StringGrouper` object during instantiation; they will then be used by its methods `StringGrouper.get_groups` and `StringGrouper.get_matches`. The same options can also, where applicable, be passed directly to `StringGrouper.get_groups` and `StringGrouper.get_matches` themselves to temporarily override the `StringGrouper` instance's defaults.
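The following is a minimal sketch of this usage (reusing the `master` and `companies` data from above; the variable name `sg` is merely illustrative), in which `replace_na=True` is set as an instance-wide default and `ignore_index` is overridden in a single call:
from string_grouper import StringGrouper
# Instantiate with replace_na set as an instance-wide default, together with
# the matching options used earlier, then fit the StringGrouper:
sg = StringGrouper(
    master,
    companies['Company Name'],
    replace_na=True,
    min_similarity=0.55,
    max_n_matches=2000
)
sg = sg.fit()
# These calls use the defaults given at instantiation:
grouped_data = sg.get_groups()
matches = sg.get_matches()
# ...whereas a keyword argument passed to the method itself temporarily
# overrides the instance default for that call only:
grouped_data_no_index = sg.get_groups(ignore_index=True)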