Skip to content

Commit b74be01

Browse files
authored
Merge pull request #18 from hvudeshi/het#1
Added scraper for Indeed
2 parents ba5d373 + cc83540 commit b74be01

File tree

1 file changed

+179
-0
lines changed

1 file changed

+179
-0
lines changed

Code/Scrapper/scrapper_indeed.py

+179
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
from selenium import webdriver
2+
import time
3+
import keyword_extraction_modules as ke
4+
from email.mime.multipart import MIMEMultipart
5+
from email.mime.text import MIMEText
6+
from socket import gaierror
7+
from webdriver_manager.chrome import ChromeDriverManager
8+
import smtplib
9+
from selenium.webdriver.chrome.options import Options
10+
import urllib
11+
12+
13+
# ===============Database Connector Script ==============================================================
14+
def db_connect(properties):
    """Open a MySQL connection from a JSON properties file.

    Args:
        properties: Readable file object containing a JSON document with the
            keys ``server_name``, ``user_name``, ``password`` and ``db_name``.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    import mysql.connector
    from mysql.connector import Error
    import json

    settings = json.load(properties)
    # Keys are read in a fixed order so the first missing one is reported.
    host, user, secret, schema = (
        settings[field]
        for field in ('server_name', 'user_name', 'password', 'db_name')
    )
    return mysql.connector.connect(
        host=host,
        database=schema,
        user=user,
        password=secret,
    )
29+
30+
31+
# ================= fetch total skills database ==========================================================
32+
def get_total_skills(connection):
    """Return every row of ``skill_master`` as ``{skill_id: skill_title}``.

    Args:
        connection: Open database connection exposing ``cursor()``.

    Returns:
        dict mapping the first selected column (skill_id) to the second
        (skill_title); empty when the table has no rows.
    """
    query = "select skill_id,skill_title from skill_master"
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        table = cursor.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cursor.close()
    return {row[0]: row[1] for row in table}
41+
42+
43+
# ========================= fetch resume id and corresponding skills ==============================================
44+
def get_resume_id_skills(connection):
    """Group the active rows of ``resume_skills`` by resume id.

    Args:
        connection: Open database connection exposing ``cursor()``.

    Returns:
        dict mapping each resume_id to the list of skill_ids selected for it,
        in the order the rows were returned.
    """
    query1 = "select r.resume_id,r.skill_id from resume_skills r where is_active='1'"
    cursor = connection.cursor()
    try:
        cursor.execute(query1)
        records = cursor.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cursor.close()

    mapping_dict = {}
    for row in records:
        mapping_dict.setdefault(row[0], []).append(row[1])
    return mapping_dict
56+
57+
58+
# ======================= fetch user email ids ==================================
59+
def get_email_id_users(connection):
    """Group user e-mail addresses by resume id.

    Joins ``user_resume`` with ``user_master`` on ``user_id``.

    Args:
        connection: Open database connection exposing ``cursor()``.

    Returns:
        dict mapping each resume_id to the list of user_email values selected
        for it, in the order the rows were returned.
    """
    query2 = "select r.resume_id,u.user_email from user_resume r join user_master u on r.user_id=u.user_id"
    cursor = connection.cursor()
    try:
        cursor.execute(query2)
        details = cursor.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cursor.close()

    email_dict = {}
    for row in details:
        email_dict.setdefault(row[0], []).append(row[1])
    return email_dict
71+
72+
# =========================== get job description =================================================================
73+
# Threshold value handed to ke.get_user_id_to_list_of_job_ids by the script entry point.
threshold = 1

# Job posting URL -> job description text; filled in by get_job_description.
final_dict = dict()
75+
76+
77+
def _build_indeed_search_url(keyword):
    """Return the Indeed search URL for ``keyword`` plus the fixed job preferences."""
    # A bare ``import urllib`` does not guarantee the ``parse`` submodule is loaded.
    import urllib.parse

    url = "https://www.indeed.com/jobs?"
    # -------------------Job preferences (input from user)-------------------
    data = {
        # Fall back to the previous hard-coded query when no keyword is given.
        "q": keyword or "Software Developer",
        "l": "New York",
        "jt": "parttime",
        "explvl": "senior_level",
    }
    url_parts = list(urllib.parse.urlparse(url))
    query = dict(urllib.parse.parse_qsl(url_parts[4]))
    query.update(data)
    url_parts[4] = urllib.parse.urlencode(query, quote_via=urllib.parse.quote_plus)
    return urllib.parse.urlunparse(url_parts)


def get_job_description(keyword, num_jobs, verbose):
    """Scrape Indeed job postings and collect their description text.

    Opens a headless Chrome session, loads the search results for ``keyword``,
    gathers up to ``num_jobs`` posting links from the job-card container, then
    visits each link and reads the element with id ``jobDescriptionText``.

    Args:
        keyword: Search phrase sent as the ``q`` query parameter.
        num_jobs: Maximum number of posting URLs to visit.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        The module-level ``final_dict``, updated in place with
        ``{job_url: job_description_text}`` entries.
    """
    # ``find_element(By...)`` works on Selenium 3 and 4; the ``find_element_by_*``
    # helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    options = Options()
    options.add_argument("--window-size=1920,1200")
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options, executable_path=ChromeDriverManager().install())
    try:
        driver.get(_build_indeed_search_url(keyword))
        jobcards = driver.find_element(By.ID, 'mosaic-provider-jobcards')
        jobs = jobcards.find_elements(By.XPATH, "./*")
        print(len(jobs))

        # Collect the posting URLs: only children carrying an href are job links.
        job_urls = []
        for card in jobs:
            if len(job_urls) >= num_jobs:
                break
            href = card.get_attribute('href')
            if href:
                job_urls.append(href)

        # ========== Iterate through each url and get the job description ==========
        for job_url in job_urls:
            time.sleep(5)  # pause between page loads
            driver.get(job_url)
            job_description = driver.find_element(By.XPATH, '//*[@id="jobDescriptionText"]').text
            final_dict[job_url] = job_description
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()
    return final_dict
119+
120+
121+
122+
def _build_job_email(sender, recipients, job_links):
    """Return the serialized plain-text e-mail listing ``job_links``.

    Args:
        sender: Address placed in the ``From`` header.
        recipients: List of addresses placed, comma-separated, in ``To``.
        job_links: Iterable of job links, numbered from 1 in the body.
    """
    msg = MIMEMultipart()
    msg['From'] = sender
    msg['To'] = ', '.join(recipients)
    msg['Subject'] = 'JOB Listing'
    body = """Hi \n PFA the attached list of jobs that match your resume \n """
    body += ''.join(
        "{}. {}\n".format(position, link)
        for position, link in enumerate(job_links, start=1)
    )
    msg.attach(MIMEText(body, 'plain'))
    return msg.as_string()


if __name__ == '__main__':
    # Close the properties file as soon as the connection settings are read.
    with open('parameters.json') as properties:
        connection = db_connect(properties)
    try:
        final_skills = get_total_skills(connection)
        print(final_skills)
        mapping_dict = get_resume_id_skills(connection)
        print(mapping_dict)
        email_dict = get_email_id_users(connection)
        print(email_dict)
        final_dict = get_job_description("Software Engineer", 5, False)
        print(final_dict)

        # Presumably maps resume ids to lists of matching job links --
        # verify against keyword_extraction_modules.
        total = ke.get_user_id_to_list_of_job_ids(mapping_dict, final_dict, connection, final_skills, threshold)
        print(total)
    finally:
        connection.close()

    # ================= send email to users ===================================
    port = 587
    smtp_server = "smtp.gmail.com"

    # NOTE(review): credentials are hard-coded in source; move them to
    # configuration or the environment before deploying.
    password = "SRIJASGMAILPWD"
    # NOTE(review): this address looks like an obfuscated placeholder -- confirm
    # the real sender address.
    sender = "[email protected]"
    for key in total:
        if key not in email_dict:
            continue
        recipients = email_dict[key]
        receiver = ', '.join(recipients)
        print(receiver)
        text = _build_job_email(sender, recipients, total[key])

        try:
            # SMTP(host, port) already connects; the context manager closes
            # the session even when a later step fails.
            with smtplib.SMTP(smtp_server, port) as server:
                server.ehlo()
                server.starttls()
                server.ehlo()
                server.login(sender, password)
                server.sendmail(sender, recipients, text)
            print('Sent')
        except (gaierror, ConnectionRefusedError):
            print('Failed to connect to the server. Bad connection settings?')
        except smtplib.SMTPServerDisconnected as e:
            print('Failed to connect to the server. Wrong user/password?')
            print(str(e))
        except smtplib.SMTPException as e:
            print('SMTP error occurred: ' + str(e))

0 commit comments

Comments
 (0)