import requests
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Location of the chromedriver binary used to start Chrome.
# The CHROMEDRIVER_PATH environment variable overrides the original hard-coded
# path, so the script can run on machines other than the author's; when the
# variable is unset the original path is used unchanged.
import os
chrome_service = Service(
    os.environ.get('CHROMEDRIVER_PATH', '/Users/rcsousa/Downloads/chromedriver')
)
import json
from bs4 import BeautifulSoup as soup
import random
import matplotlib.pyplot as plt
import requests
import json
import unidecode
import folium
from folium import plugins
import numpy as np
Gera lista de estados e coordenadas
# Build one record per Brazilian state (abbreviation, name, centroid) from the
# IBGE public APIs and expose it as df_coord.
#
# Every request goes over HTTPS with a timeout and an explicit status check, so
# a stalled connection or an HTTP error page fails loudly here instead of
# hanging or surfacing later as a confusing JSON/KeyError.
with requests.Session() as ibge_session:
    resposta_estados = ibge_session.get(
        "https://servicodados.ibge.gov.br/api/v1/localidades/estados/",
        timeout=30,
    )
    resposta_estados.raise_for_status()
    estados = resposta_estados.json()

    UF = [estado['sigla'] for estado in estados]
    estado_nome = [estado['nome'] for estado in estados]

    # One metadata document per state, kept in the same order as `estados`
    # so the parallel lists below stay aligned.
    coordenadas = []
    for estado in estados:
        resposta_malha = ibge_session.get(
            "https://servicodados.ibge.gov.br/api/v3/malhas/estados/"
            + estado['sigla']
            + "/metadados",
            timeout=30,
        )
        resposta_malha.raise_for_status()
        coordenadas.append(resposta_malha.json())

# The code reads the centroid from the first element of each metadata document.
latitude = [metadados[0]['centroide']['latitude'] for metadados in coordenadas]
longitude = [metadados[0]['centroide']['longitude'] for metadados in coordenadas]

coord = list(zip(UF, estado_nome, latitude, longitude))
# The state name is stored under 'Location' so it can be joined against the
# postings frame, which uses the same column name.
df_coord = pd.DataFrame(coord, columns=['UF', 'Location', 'latitude', 'longitude'])
coord_plot = list(zip(estado_nome, latitude, longitude))
Define parâmetros de busca
# Public LinkedIn job-search results page that will be scraped.
url = 'https://www.linkedin.com/jobs/search?keywords=Site%20Reliability%20Engineer&location=Brasil&geoId=106057199&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'

# Chrome is started with a visible window (headless is explicitly off).
options = Options()
options.headless = False

# `driver` is an alias for the Chrome class itself; `s` is the live session.
driver = webdriver.Chrome
s = driver(options=options, service=chrome_service)

# Open the page, then pause for a random 3-7 seconds before touching it.
s.get(url)
n = random.randint(3, 7)
time.sleep(n)

# State used by the scrolling loop: step counter, pause between steps, and the
# height of one screen as reported by the browser.
i = 1
scroll_pause_time = 1
screen_height = s.execute_script("return window.screen.height;")
Faz loop automático até o último registro público
# Imported here because this loop is the only place that needs the exception.
from selenium.common.exceptions import WebDriverException

# Scroll the results page one screen height per iteration, clicking the
# "show more" button whenever it is present, until the next scroll target lies
# beyond the bottom of the document.
while True:
    # scroll one screen height each time
    s.execute_script(f"window.scrollTo(0, {screen_height}*{i});")
    i += 1
    time.sleep(scroll_pause_time)
    # update scroll height each time after scrolled, as the scroll height can
    # change after we scrolled the page
    scroll_height = s.execute_script("return document.body.scrollHeight;")
    try:
        s.find_element(By.XPATH, "//button[@class='infinite-scroller__show-more-button infinite-scroller__show-more-button--visible']").click()
    except WebDriverException:
        # Best effort: the button is usually absent or not clickable. Only
        # Selenium errors are ignored, so Ctrl-C and programming errors still
        # propagate (the original bare `except` swallowed those as well).
        pass
    # NOTE(review): scroll_height was measured before the click above, so the
    # check below may use a stale height right after more results are
    # requested - confirm whether an early exit is acceptable.
    # Break the loop when the height we need to scroll to is larger than the
    # total scroll height
    if screen_height * i > scroll_height:
        break
# Parse the rendered page and collect one list of stripped texts per field.
bsobj = soup(s.page_source, 'html.parser')


def _stripped_texts(tag, css_class):
    """Return the stripped text of every `tag` element carrying `css_class`."""
    return [node.get_text().strip() for node in bsobj.find_all(tag, {'class': css_class})]


job_title = _stripped_texts('h3', 'base-search-card__title')
company = _stripped_texts('a', 'hidden-nested-link')
location = _stripped_texts('span', 'job-search-card__location')
age = _stripped_texts('time', 'job-search-card__listdate')
link = []  # declared but never filled in this section

# zip() pairs the lists by position and stops at the shortest one, so a row is
# only meaningful when every posting produced all four fields.
postings = list(zip(job_title, company, location, age))
df = pd.DataFrame(postings, columns=['Job Opening', 'Company', 'Location', 'Age'])

# The browser is no longer needed once the page source has been parsed.
s.quit()
Consolida estados
# Replace each free-text df['Location'] with the name of the Brazilian state it
# refers to, using the IBGE municipality list as the lookup table.
#
# Differences from the original row-by-row overwrite:
#   * every row is matched against its ORIGINAL text and is assigned once, so a
#     later, shorter name can no longer clobber a correct result (e.g.
#     "Mato Grosso" matching an already resolved "Mato Grosso do Sul", or the
#     municipality "Rio Grande" matching "Rio Grande do Norte");
#   * state names are tried before municipality names, longest needle first;
#   * names are matched as literal text (regex=False), not as regex patterns;
#   * the request has a timeout and its HTTP status is checked.
resposta_municipios = requests.get(
    "https://servicodados.ibge.gov.br/api/v1/localidades/municipios/",
    timeout=60,
)
resposta_municipios.raise_for_status()
localidades = resposta_municipios.json()

# Dicts (not sets) keep insertion order, which makes tie-breaking deterministic.
uf_names = {}
uf_by_municipality = {}
for municipio in localidades:
    try:
        uf_nome = municipio['microrregiao']['mesorregiao']['UF']['nome']
    except (KeyError, TypeError):
        # Entry without the nested UF record: there is no state to map it to.
        continue
    uf_names[uf_nome] = uf_nome
    # As in the original, municipality names are compared without accents.
    # For a name shared by several states the last entry wins, as before.
    uf_by_municipality[unidecode.unidecode(municipio['nome'])] = uf_nome


def _longest_first(pairs):
    """Return (needle, state) pairs ordered from longest to shortest needle."""
    return sorted(pairs, key=lambda pair: len(pair[0]), reverse=True)


candidates = _longest_first(uf_names.items()) + _longest_first(uf_by_municipality.items())

location_text = df['Location'].copy()
pending = pd.Series(True, index=df.index)
for needle, uf_nome in candidates:
    if not pending.any():
        break
    if not needle:
        # An empty needle would match every row.
        continue
    hit = pending & location_text.str.contains(needle, case=False, regex=False, na=False)
    if hit.any():
        df.loc[hit, 'Location'] = uf_nome
        pending &= ~hit
# Rows that matched nothing keep their original Location text.
Tratamento de exceção
# Manual overrides: rows whose Location mentions 'Federal District' or
# 'Ribeirão Preto' are mapped to the corresponding state name.
df.loc[df['Location'].str.contains('Federal District', case=False), 'Location'] = 'Distrito Federal'
df.loc[df['Location'].str.contains('Ribeirão Preto', case=False), 'Location'] = 'São Paulo'

# Per-company counts are taken before rows without coordinates are discarded.
jobs_per_company = df.groupby('Company').count()

# Attach state coordinates and keep only the rows that resolved to a state.
# .copy() makes the filtered result an independent frame, so the column
# assignments performed on df afterwards do not act on a view of the merge
# result (which triggers SettingWithCopyWarning).
df = df.merge(df_coord, on='Location', how='left')
df = df[df.latitude.notnull()].copy()

# Horizontal bar chart of the ten companies with the most postings.
prep_chart_company = jobs_per_company.sort_values("Job Opening", ascending=False).head(10)
df2 = pd.DataFrame(prep_chart_company['Job Opening'])
# `sort_columns` was dropped from the call: it was passed with its default
# value (False) and is no longer accepted by DataFrame.plot in pandas 2.x.
df2.plot(
    kind="barh",
    legend=False,
    figsize=(24, 12),
    rot=0,
    fontsize=16,
)
<AxesSubplot:ylabel='Company'>
Gera Total de vagas por localidade
# Give every row the number of rows that share its Location.
per_location = df.groupby('Location')['Location']
df['Openings'] = per_location.transform('count')
Remove colunas desnecessárias
# Drop the per-posting columns in a single call; only the per-location columns
# remain afterwards.
df.drop(columns=['Company', 'Age', 'UF', 'Job Opening'], inplace=True)
Define HTTP Header
# HTTP headers sent with the IBGE requests further down, built from ordered
# (name, value) pairs.
# NOTE(review): 'Accept-Encoding' advertises 'br'; presumably the client can
# decode Brotli responses in this environment - confirm.
headers = dict([
    ('Content-Type', 'application/json;charset=UTF-8'),
    ('User-Agent', 'google-colab'),
    ('Accept', 'application/json, text/plain, */*'),
    ('Accept-Encoding', 'gzip, deflate, br'),
    ('Accept-Language', 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7'),
    ('Connection', 'keep-alive'),
])
Coleta GeoJSON do IBGE e atribui a uma variável
# Download the IBGE mesh as GeoJSON. The timeout prevents an indefinite hang,
# and the status check turns an HTTP error into an immediate exception instead
# of a JSON decoding failure on an error page.
meshes_url = 'https://servicodados.ibge.gov.br/api/v2/malhas/?resolucao=2&formato=application/vnd.geo+json'
meshes_response = requests.get(meshes_url, headers=headers, timeout=60)
meshes_response.raise_for_status()
meshes_data = meshes_response.json()
Coleta informações dos estados (com id) do IBGE e atribui a uma variável
# Download the list of states (including their numeric ids) from IBGE. The
# timeout prevents an indefinite hang, and the status check turns an HTTP error
# into an immediate exception instead of a JSON decoding failure.
states_url = 'https://servicodados.ibge.gov.br/api/v1/localidades/estados'
states_response = requests.get(states_url, headers=headers, timeout=60)
states_response.raise_for_status()
states_data = states_response.json()
# Build the `states` frame (id, nome, sigla, lat, lng) from the IBGE state list
# and the centroids stored in the mesh GeoJSON.

# ids of every mesh feature, as sorted strings
meshes_ids = sorted(
    str(feature['properties']['codarea']) for feature in meshes_data['features']
)

# Sort whole state records by id BEFORE splitting them into parallel lists.
# The original sorted only the id list, which pairs ids with the wrong names
# and abbreviations whenever the API response is not already ordered by id.
ordered_states = sorted(states_data, key=lambda state: str(state['id']))
states_ids = [str(state['id']) for state in ordered_states]
states_names = [state['nome'] for state in ordered_states]
states_codes = [state['sigla'] for state in ordered_states]

# creating a dataframe of Federative Units to be merged
states = pd.DataFrame({'id': states_ids, 'nome': states_names, 'sigla': states_codes})

# Centroid columns start as floats (0.0) so that writing float coordinates
# below does not change the column dtype.
states['lat'] = 0.0
states['lng'] = 0.0
states.set_index('id', inplace=True)

# Copy each feature's centroid onto the row whose id equals its codarea.
# The centroid is indexed as [lng, lat], as in the original code.
for feature in meshes_data['features']:
    centroid = feature['properties']['centroide']
    cod = str(feature['properties']['codarea'])
    states.loc[cod, 'lat'] = centroid[1]
    states.loc[cod, 'lng'] = centroid[0]

states.reset_index(inplace=True)
Faz merge dos dataframes df e localidades com ids
# Inner-join the postings with the state table (Location == nome), then discard
# the duplicated name column brought in by the join.
df = pd.merge(df, states, how='inner', left_on='Location', right_on='nome')
df = df.drop(columns=['nome'])
Remove entradas duplicadas
# Keep only the first occurrence of each fully identical row.
df = df[~df.duplicated()]
Gera Mapa
# Coordinates of Brasília, used as the initial centre of the map
federal_district = [-15.7757875,-48.0778477]

# Base map object
basemap = folium.Map(location=federal_district, zoom_start=4, tiles='openstreetmap')

# Extra tile layers so the base layout can be switched from the layer control
tiles = ['openstreetmap', 'cartodbpositron']
for tile in tiles:
    tile_layer = folium.TileLayer(tile)
    tile_layer.add_to(basemap)

# Choropleth: mesh features are coloured by the 'Openings' value of the df row
# whose 'id' equals the feature's codarea property.
legends = 'SRE Jobs Opening in Brazil 2021'
choropleth_options = dict(
    geo_data=meshes_data,
    data=df,
    name=legends,
    columns=['id','Openings'],
    key_on='feature.properties.codarea',
    fill_color='YlOrRd',
    fill_opacity=1.0,
    line_opacity=0.7,
    legend_name=legends,
)
openings_layer = folium.Choropleth(**choropleth_options)
openings_layer.add_to(basemap)

# Map controls (layer switcher)
folium.LayerControl().add_to(basemap)

# Render the map (displayed when this is the last expression of a notebook cell)
basemap