-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathheader_analysis.py
128 lines (94 loc) · 3.28 KB
/
header_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
'''
Header level analysis
'''
#%%
# NOTE(review): notebook-style script (#%% cell markers). Loads the merged
# headings dataset produced by the ETL step, then explores how headings
# differ between sections labelled Person vs Job.
import utilities.utils
from utilities.etl import IndeedETL
# import importlib
# importlib.reload(etl)
import pandas as pd
from utilities.utils import to_wcdf
# Load the merged dataset; the first CSV column holds the index.
# Path comes from the project ETL object — assumes the ETL has already
# been run so 'merged_with_headings' exists on disk.
data = pd.read_csv(IndeedETL().paths['merged_with_headings'], index_col=0)
# %%
# checking distribution of job vs person
data['Person/Job/Org/None'].value_counts()
# %%
# creating respective dataframes
person = data[data['Person/Job/Org/None'] == 'Person']
job = data[data['Person/Job/Org/None'] == 'Job']
# %%
# Display the Person subset (notebook cell output).
person
# %%
# Display the Job subset (notebook cell output).
job
# %%
from utilities.visualize import bar, distplot
# visualizing
# %%
import utilities.utils
# import importlib
# importlib.reload(utils)
from utilities.utils import word_cloud
# distribution of headings in Person sections
word_cloud(person['Heading'])
# distribution of headings in Job sections
word_cloud(job['Heading'])
# %%
# set analysis
to_wcdf(list(person['Heading']))
# %%
# Raw heading values for the Person subset (notebook cell output).
person['Heading']
# %%
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
def set_diagnostics(df_a, df_b, name_a, name_b):
    '''
    Compare the tokens from two sequences.

    Takes the 'Heading Text' column from two dataframes, df_a and df_b,
    counts the tokens in each, plots the top tokens of each side, and then
    plots a grouped bar chart of the tokens that appear in both top-50 lists.

    Input
    -----
    df_a: Dataframe with a column containing
          tokens you want to compare in the 'Heading Text' column
    df_b: Same thing but for the other df you want to compare with
    name_a: label used in chart titles for df_a
    name_b: label used in chart titles for df_b

    Output
    ------
    3 graphs: A, B, and A n B tokens
    '''
    def topn(frame, n=None):
        # BUG FIX: the original signature `def topn(data, n = data.shape[0])`
        # evaluated the default in the enclosing scope at definition time,
        # where `data` is the module-level dataframe — so "all rows" was
        # taken from the wrong dataframe. Use a None sentinel and resolve
        # against the frame actually passed in.
        if n is None:
            n = frame.shape[0]
        stop = stopwords.words("english") + ["data"]
        # Initializing the Count Vectorizer, excluding words
        # that appear less than 5 times
        bagofwords = CountVectorizer(min_df=5, stop_words=stop)
        words = bagofwords.fit_transform(frame['Heading Text'].dropna())
        # get_feature_names() was removed in scikit-learn 1.2;
        # get_feature_names_out() is the supported replacement.
        counts = pd.DataFrame(columns=bagofwords.get_feature_names_out(),
                              data=words.toarray())
        top = counts.sum().sort_values(ascending=False)[:n]
        return top

    # Hoist the repeated topn() calls — each one re-fits a CountVectorizer,
    # and the original recomputed topn(df_a, 50) three times.
    top_a_50 = topn(df_a, 50)
    top_b_50 = topn(df_b, 50)
    top_a_all = topn(df_a)
    top_b_all = topn(df_b)
    # visualizing top words in each side
    bar(x=top_a_50.index, y=top_a_50.values,
        title="Top in {}".format(name_a))
    bar(x=top_b_50.index, y=top_b_50.values,
        title="Top in {}".format(name_b))
    # tokens that rank in the top 50 on both sides
    intersection = set(top_a_50.index).intersection(set(top_b_50.index))
    # visualizing words that appear in both
    import plotly.express as px
    # grouped bar chart of counts for the shared tokens
    a_mask = [w in intersection for w in top_a_all.index]
    b_mask = [w in intersection for w in top_b_all.index]
    a_counts = top_a_all[a_mask].reset_index()
    a_counts["title"] = [name_a] * a_counts.shape[0]
    b_counts = top_b_all[b_mask].reset_index()
    b_counts["title"] = [name_b] * b_counts.shape[0]
    combined = pd.concat([b_counts, a_counts], axis=0)
    combined = combined.rename(columns={"index": "word",
                                        0: "counts"}).sort_values("counts", ascending=False)
    fig = px.bar(combined, x="word", y="counts",
                 color="title", title=f"Comparing Tokens That Appear in {name_a} and {name_b}",
                 barmode="group")
    fig.show()
# Run the token comparison between Person-labelled and Job-labelled sections.
set_diagnostics(person, job, "Person", "Job")
# %%