# baseline.py
# source
# https://www.analyticsvidhya.com/blog/2022/12/build-accurate-job-resume-matching-algorithm-using-doc2vec/
#
####
# Implement Job Resume Matching Algorithm using Doc2Vec
####
# import libraries
import re

import nltk
import numpy as np
import pandas as pd
import requests
import PyPDF2
from bs4 import BeautifulSoup
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# download the Punkt tokenizer data that word_tokenize depends on
nltk.download('punkt')
# load the training corpus of articles (one document per row in the 'data' column)
articles = pd.read_csv('data.csv')
print(articles.head())
# tag data using TaggedDocument
data = list(articles['data'])
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
               for i, _d in enumerate(data)]
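# A quick inspection (not in the original source): each TaggedDocument pairs a
# token list with a string tag, which Doc2Vec uses as the document id.
print(tagged_data[0].tags, tagged_data[0].words[:10])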
# initialize the model: 50-dimensional vectors, ignore words seen fewer
# than 10 times, train for 50 epochs
model = Doc2Vec(vector_size=50, min_count=10, epochs=50)
# vocabulary building
model.build_vocab(tagged_data)
# in gensim 4.x the vocabulary is exposed via model.wv.index_to_key
# (the older model.wv.vocab attribute was removed)
k = list(model.wv.index_to_key)
print(len(k))  # vocabulary size
# model training
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
model.save('doc2Vec.model')
print("Model saved")
# Matching the JD and the resume
resume_path = 'resume1.pdf'
resume = ''
pdfReader = PyPDF2.PdfReader(resume_path)
for page in pdfReader.pages:
    resume += page.extract_text() or ''  # extract_text() can return None on some pages
# normalize before tokenization
resume = resume.lower()
resume = re.sub('[^a-z]', ' ', resume)  # replace punctuation and digits with spaces
# load job description and process it
jd_links = ['https://datahack.analyticsvidhya.com/jobathon/clix-capital/senior-manager-growthrisk-analytics-2',
'https://datahack.analyticsvidhya.com/jobathon/clix-capital/manager-growth-analytics-2',
'https://datahack.analyticsvidhya.com/jobathon/clix-capital/manager-risk-analytics-2',
'https://datahack.analyticsvidhya.com/jobathon/cropin/data-scientist-85']
jd_df = pd.DataFrame(columns=['links', 'data'])
jd_df['links'] = jd_links
def extract_data(url):
    """Scrape the job-description text (paragraphs and list items) from a JD page."""
    resp = requests.get(url)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.text, 'html.parser')
        description = soup.find(class_='av-company-description-page mb-2')
        return ''.join(tag.text for tag in description.find_all(['p', 'li']))
    print(f"ERROR extracting data from {url}")
    return ''
# extract the JD text from each link
for i in range(len(jd_df)):
    jd_df.loc[i, 'data'] = extract_data(jd_df.loc[i, 'links'])
## Preprocessing the JD data
# convert the text to lower case
jd_df.loc[:, 'data'] = jd_df['data'].apply(lambda x: x.lower())
# strip punctuation by keeping only word characters
jd_df.loc[:, 'data'] = jd_df['data'].apply(lambda x: ' '.join(re.findall(r'\w+', x)))
# remove the digits that remain
jd_df.loc[:, 'data'] = jd_df['data'].apply(lambda x: re.sub(r'\d+', '', x))
# load the saved model and infer vectors for the resume and a JD
model = Doc2Vec.load('doc2Vec.model')
v1 = model.infer_vector(resume.split())
v2 = model.infer_vector(jd_df['data'][0].split())  # cosine similarity of .499
# v2 = model.infer_vector(jd_df['data'][1].split())  # .502
# v2 = model.infer_vector(jd_df['data'][2].split())  # .506
# v2 = model.infer_vector(jd_df['data'][3].split())  # .507
# infer_vector already returns numpy arrays, so no extra conversion is needed
cosine_similarity = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(round(float(cosine_similarity), 3))
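# A minimal extension (not in the original source): score the resume against
# every scraped JD in one pass and print the matches ranked by similarity,
# instead of swapping the v2 line by hand.
scores = []
for idx in range(len(jd_df)):
    v_jd = model.infer_vector(jd_df.loc[idx, 'data'].split())
    sim = np.dot(v1, v_jd) / (np.linalg.norm(v1) * np.linalg.norm(v_jd))
    scores.append((jd_df.loc[idx, 'links'], round(float(sim), 3)))
for link, score in sorted(scores, key=lambda pair: pair[1], reverse=True):
    print(score, link)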