-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
150 lines (125 loc) · 5.26 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import csv
import re
from urllib.parse import quote

import urllib3
from bs4 import BeautifulSoup
import pandas as pd

import currency_converter
kzt = currency_converter.convertTenge()
rub = currency_converter.convertRuble()
eur = currency_converter.convertEuro()
class resume:
    """Plain data holder for one scraped hh.kz CV.

    Fields are stored exactly as parsed; any of them may be None when the
    corresponding element was missing from the resume page.
    """

    def __init__(self, title, specialization, salary, age, employment, work_schedule, experience_years, experience_month, citizenship, sex):
        self.title = title
        self.specialization = specialization
        self.salary = salary
        self.age = age
        self.employment = employment
        self.work_schedule = work_schedule
        self.experience_years = experience_years
        self.experience_month = experience_month
        self.citizenship = citizenship
        self.sex = sex

    def __repr__(self):
        # Debug-friendly summary; not part of the CSV output.
        return (f"resume(title={self.title!r}, specialization={self.specialization!r}, "
                f"salary={self.salary!r}, age={self.age!r})")
def parse_number(text):
    """Return the first run of digits in *text* as an int, ignoring whitespace.

    Whitespace (spaces, tabs, newlines) is stripped first so grouped numbers
    like "150 000" parse as 150000.

    Raises:
        ValueError: if *text* contains no digits (the original code crashed
            here with an opaque AttributeError on ``None.group()``).
    """
    match = re.search(r'\d+', re.sub(r'\s+', '', text))
    if match is None:
        raise ValueError(f"no number found in {text!r}")
    return int(match.group())
def convert_salary(text):
    """Parse a salary string and normalize it using the module-level rates.

    Currency is guessed from a marker character in the raw text:
    'K' -> tenge (KZT), Cyrillic 'р' -> ruble, 'E' -> euro; anything else is
    returned unconverted. NOTE(review): this heuristic is fragile (e.g. any
    'K' or 'E' elsewhere in the string triggers a conversion) -- confirm
    against the actual salary strings hh.kz emits.

    Args:
        text: raw salary text from the resume page, or None.

    Returns:
        The numeric salary divided by the matching rate, or None when
        *text* is None (BUG FIX: previously crashed on ``None.replace``).
    """
    if text is None:
        return None
    num = parse_number(text)
    if 'K' in text:    # tenge
        return num / kzt
    elif 'р' in text:  # ruble
        return num / rub
    elif 'E' in text:  # euro
        return num / eur
    return num
def get_max_page(http, search_text):
    """Return the number of result pages (at least 1) for *search_text*.

    Fetches the first search-results page and reads the highest page number
    from the pager buttons.

    Args:
        http: a urllib3.PoolManager used for the GET request.
        search_text: raw user query; URL-encoded here before use.
    """
    # BUG FIX: the original URL contained "¤cy_code" -- a mangled
    # "&currency_code" (the "&curren" prefix was collapsed into the HTML
    # entity for the ¤ sign). Also URL-encode the user-supplied query.
    url = ("https://hh.kz/search/resume?text=" + quote(search_text)
           + "&area=40&isDefaultArea=true&pos=full_text&logic=normal"
           + "&exp_period=all_time&currency_code=KZT&ored_clusters=true"
           + "&order_by=relevance")
    res = http.request('GET', url)
    soup = BeautifulSoup(res.data, "html.parser")
    # Pager buttons; [2:-1] skips leading controls and the trailing
    # "next" button -- presumably matching hh.kz's pager markup (confirm).
    pages = soup.find_all('a', attrs={'class': 'bloko-button'})[2:-1]
    max_page = 0
    for page in pages:
        try:
            max_page = max(int(page.text), max_page)
        except ValueError:
            continue  # non-numeric pager button; ignore
    # BUG FIX: was min(1, max_page), which capped every search at one page.
    return max(1, max_page)
def parse_resume(http, url):
    """Fetch one resume page and scrape its fields into a ``resume`` object.

    Args:
        http: a urllib3.PoolManager used for the GET request.
        url: absolute URL of the resume page on hh.kz.

    Returns:
        A ``resume`` instance; any field whose element could not be located
        in the page is set to None.
    """
    res = http.request('GET', url)
    soup = BeautifulSoup(res.data, "html.parser")

    # The experience header holds two <span>s (years, months).
    # BUG FIX: soup.find may return None; the original called .find_all on it
    # unconditionally and crashed on pages without this header.
    experience_node = soup.find('span', attrs={
        'class': 'resume-block__title-text resume-block__title-text_sub'})
    resume_experience = experience_node.find_all('span') if experience_node else []

    # Narrowed excepts throughout: only the failures we expect (missing node
    # -> AttributeError, too few items -> IndexError), instead of bare
    # ``except:`` which also swallowed KeyboardInterrupt and real bugs.
    try:
        resume_title = soup.find(
            'span', attrs={'class': 'resume-block__title-text'}).text
    except AttributeError:
        resume_title = None
    try:
        resume_gender = soup.find(
            'span', attrs={'data-qa': 'resume-personal-gender'}).text
    except AttributeError:
        resume_gender = None
    try:
        resume_citizenship = soup.find(
            'div', attrs={'data-qa': 'resume-block-additional'}).find('p').text
        # Keep only the second word -- presumably drops a label prefix
        # before the country name; confirm against the page markup.
        resume_citizenship = resume_citizenship.split()[1]
    except (AttributeError, IndexError):
        resume_citizenship = None
    try:
        resume_experience_year = resume_experience[0].text
    except IndexError:
        resume_experience_year = None
    try:
        resume_experience_month = resume_experience[1].text
    except IndexError:
        resume_experience_month = None
    try:
        resume_work_schedule = soup.find(
            'div', attrs={'class': 'resume-block-item-gap'}).find_all('p')[1].text
        # Drop the first two words -- presumably a field label; confirm.
        resume_work_schedule = ' '.join(resume_work_schedule.split()[2:])
    except (AttributeError, IndexError):
        resume_work_schedule = None
    try:
        resume_employment = soup.find(
            'div', attrs={'class': 'resume-block-item-gap'}).find('p').text
        # Drop the first word -- presumably a field label; confirm.
        resume_employment = ' '.join(resume_employment.split()[1:])
    except AttributeError:
        resume_employment = None
    try:
        resume_age = soup.find(
            'span', attrs={'data-qa': 'resume-personal-age'}).text
    except AttributeError:
        resume_age = None
    try:
        resume_salary = soup.find(
            'span', attrs={'class': 'resume-block__salary'}).text
    except AttributeError:
        resume_salary = None
    # find_all never raises, so no try needed; yields "" when no items match
    # (same as the original).
    resume_specialization = ' '.join(i.text for i in soup.find_all(
        'li', attrs={'class': 'resume-block__specialization'}))

    # BUG FIX: converting an absent salary used to crash inside
    # convert_salary (None.replace); only convert when one was found.
    if resume_salary is not None:
        resume_salary = convert_salary(resume_salary)
    return resume(resume_title, resume_specialization, resume_salary, resume_age,
                  resume_employment, resume_work_schedule, resume_experience_year,
                  resume_experience_month, resume_citizenship, resume_gender)
def parse_resumes(search_text):
    """Crawl all hh.kz resume search results for *search_text*.

    Scrapes every resume linked from every results page, writes them to
    ``CV.csv`` in the working directory, prints the resulting table, and
    returns the list of ``resume`` objects.

    Args:
        search_text: raw user query; URL-encoded here before use.
    """
    http = urllib3.PoolManager()
    resumes = []
    # BUG FIX: the original URL contained "¤cy_code" -- a mangled
    # "&currency_code" (collapsed into the HTML entity for ¤).
    base_url = ("https://hh.kz/search/resume?text=" + quote(search_text)
                + "&area=40&isDefaultArea=true&pos=full_text&logic=normal"
                + "&exp_period=all_time&currency_code=KZT&ored_clusters=true"
                + "&order_by=relevance")
    for page in range(get_max_page(http, search_text)):
        res = http.request('GET', f"{base_url}&page={page}")
        soup = BeautifulSoup(res.data, "html.parser")
        # BUG FIX: attrs was the SET literal {'class', 'serp-item__title'};
        # it must be a dict mapping attribute name to value.
        resume_links = soup.find_all('a', attrs={'class': 'serp-item__title'})
        for resume_link in resume_links:
            href = "https://hh.kz" + resume_link.get("href")
            resumes.append(parse_resume(http, href))
    # utf-8 so Cyrillic fields survive the round-trip through the CSV.
    with open('CV.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Specialization", "Salary", "Age", "Employment",
                         "Work Schedule", "Experience Year", "Experience Month", "Citizenship", "Sex"])
        # Loop variable renamed: the original used ``resume``, shadowing the
        # resume class defined above.
        for cv in resumes:
            writer.writerow([cv.title, cv.specialization, cv.salary, cv.age, cv.employment,
                             cv.work_schedule, cv.experience_years, cv.experience_month, cv.citizenship, cv.sex])
    df = pd.read_csv('CV.csv')
    print(df.to_string())
    return resumes
if __name__ == "__main__":
    # Script entry point: prompt for a query, then crawl matching resumes
    # into CV.csv and keep the parsed list around for interactive use.
    query = input("Search for: ")
    resumes = parse_resumes(query)