-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathall_def.py
56 lines (41 loc) · 1.81 KB
/
all_def.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from urllib.request import urlretrieve
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from my_options import HEADER, CONTAINER_MAIN_CARDS_FIRST, CONTAINER_MAIN_CARDS_SEC
import pandas as pd
def search_items(result_pages, result_anuncio):
    """Scrape ad cards from the first *result_pages* OLX listing pages.

    Parameters
    ----------
    result_pages : int
        Number of listing pages to fetch.
    result_anuncio :
        Kept for backward compatibility with existing callers; no longer
        used — the cards are taken from each freshly scraped page instead
        of this stale argument (see bug-fix note below).

    Returns
    -------
    pandas.DataFrame
        One row per ad card with value/title/location/category columns,
        built via create_dataset().
    """
    cards = []
    for page in range(result_pages):
        # BUG FIX: previously called interat_pages(result_pages), fetching
        # the same page on every iteration; pass the loop index so each
        # successive page is actually visited.
        soup = interat_pages(page)
        anuncios = soup.find(
            'div', class_=CONTAINER_MAIN_CARDS_FIRST).findAll('div', class_=CONTAINER_MAIN_CARDS_SEC)
        # BUG FIX: iterate the freshly scraped cards; the original looped
        # over the stale result_anuncio argument and silently discarded
        # the scraped list it had just assigned to __result_anuncio.
        for item in anuncios:
            # Class strings below are OLX's generated CSS class names —
            # site-specific and liable to break if the site changes.
            card = {
                'value': item.find('p', 'fnmrjs-16 jqSHIm').getText(),
                'title': item.find('div', 'fnmrjs-8 kRlFBv').getText(),
                'location': item.find('p', 'fnmrjs-13 hdwqVC').getText(),
                'category': item.find('p', 'fnmrjs-14 iIprpQ').getText(),
            }
            cards.append(card)
    return create_dataset(cards)
def interat_pages(result_pages):
    """Fetch one OLX listing page and return its parsed BeautifulSoup tree.

    The page offset sent in the `o` query parameter is result_pages + 1,
    so passing 0 requests the first page. HEADER (from my_options) is sent
    with the request — presumably to look like a browser; confirm against
    its definition.
    """
    url = 'https://mg.olx.com.br/moda-e-beleza?o={}&sp=1'.format(str(result_pages + 1))
    page = urlopen(Request(url, headers=HEADER))
    markup = page.read().decode('utf-8')
    return BeautifulSoup(markup, 'html.parser')
def search_image(result_pages, result_anuncio):
    """Download the thumbnail image of every ad card into ./output/img/.

    Parameters
    ----------
    result_pages : int
        Kept for backward compatibility; no longer used. The original
        outer ``for interat in range(result_pages)`` loop never used its
        index and re-downloaded the exact same images result_pages times.
    result_anuncio : iterable
        The card elements (bs4 tags) whose images are saved.
    """
    # BUG FIX: redundant outer loop removed — one pass over the cards
    # downloads each image exactly once instead of result_pages times.
    for item in result_anuncio:
        # The file name is the last path segment of the image URL.
        image = item.find(
            'div', class_='fnmrjs-5 jksoiN').img.get('src')
        urlretrieve(image, './output/img/' + image.split('/')[-1])
def create_dataset(cards):
    """Build a pandas DataFrame from a list of card dicts.

    Each dict becomes one row; its keys become the columns.
    """
    return pd.DataFrame(cards)
def handling_url(response):
    """Read an HTTP response body and return it as normalized UTF-8 HTML.

    Collapses every run of whitespace (including newlines) to a single
    space, then removes the space left between adjacent tags
    ('> <' -> '><').
    """
    tokens = response.read().decode('utf-8').split()
    return " ".join(tokens).replace('> <', '><')