# main.py
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import requests
import time
import csv

# Path to the local chromedriver binary (a raw string so the backslashes are not read as escapes)
PATH = r"C:\Program Files (x86)\chromedriver.exe"
url = "https://www.jumia.com.ng/flash-sales/"
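# A possible alternative setup (assumption: Selenium 4.6+ so Selenium Manager can resolve the
# driver automatically), handy when no local chromedriver path or no display is available:
#     options = webdriver.ChromeOptions()
#     options.add_argument("--headless=new")
#     driver = webdriver.Chrome(options=options)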
def main():
    # URL = 'https://www.jumia.com.ng/flash-sales/'
    # A fresh browser session is started on every run so that driver.quit() at the end of this
    # function does not break the next run triggered by the loop under __main__.
    driver = webdriver.Chrome(service=Service(PATH))
    driver.get(url)
    categories = [
        'Computing',
        'Electronics',
        'Sporting Goods',
        'Phones & Tablets',
        'Fashion',
        'Home & Office',
        'Grocery',
        'Health & Beauty']
    # oldCsv = []
    # with open('headers.csv', 'r', encoding="utf-8") as csvfile:
    #     csvReader = csv.reader(csvfile, delimiter=',')
    #     for row in csvReader:
    #         oldCsv.append(row)

    # Generate the page source for each category by typing it into the site search box.
    for category in categories:
        search = driver.find_element(By.ID, 'fi-q')
        search.clear()  # clear the box first, then type the query
        search.send_keys(category)
        search.send_keys(Keys.RETURN)
        time.sleep(2)  # give the results page a moment to load before reading the page source
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Load what has already been written so the same product is not appended twice.
        oldCsv = []
        try:
            with open('dataset3.csv', 'r', encoding="utf-8") as csvfile:
                csvReader = csv.reader(csvfile, delimiter=',')
                for row in csvReader:
                    oldCsv.append(row)
        except FileNotFoundError:
            pass  # first run: the dataset file does not exist yet

        for entry in soup.find_all('div', {'class': 'info'}):
            title = entry.find('h3', {'class': 'name'}).text.strip()
            discount_price = entry.find('div', {'class': 'prc'}).text.strip()
            original_price = getattr(entry.find('div', {'class': 's-prc-w'}), 'text', 'None').strip()
            percentage_dsc = getattr(entry.find('div', {'class': 'bdg _dsct _sm'}), 'text', 'None').strip()
            rating = getattr(entry.find('div', {'class': 'stars _s'}), 'text', 'None').strip()
            # The original rater-count lookup was not a valid BeautifulSoup filter; fall back to
            # the 'rev' block (also used in scrape_pages below), assuming it carries the review count.
            raters = getattr(entry.find('div', {'class': 'rev'}), 'text', 'None').strip()
            data = [title, discount_price, original_price, percentage_dsc, rating, raters]

            # Append the row only if this product title has not been written before.
            addData = True
            for row in oldCsv:
                if row and row[0] == title:
                    addData = False
            if addData:
                with open('dataset3.csv', 'a+', newline='', encoding='UTF8') as f:
                    writer = csv.writer(f)
                    writer.writerow(data)
            # with open('dataset1.csv', 'a+', newline='', encoding='UTF8') as f:
            #     writer = csv.writer(f)
            #     writer.writerow(data)
    driver.quit()
def scrape_pages(url) -> None:
    max_pages = 3
    current_page = 1
    # Loop through the pages dynamically, building each URL with the ?page= suffix the website
    # uses, e.g. https://www.jumia.com.ng/<category>/?page=<n>#catalog-listing
    while current_page <= max_pages:
        page_url = f'{url}?page={current_page}'
        print(page_url)
        # Get each page's html (fetched with requests rather than the Selenium driver)
        raw_html = requests.get(page_url)
        soup = BeautifulSoup(raw_html.text, 'html.parser')
        # Find every product card and pull the needed fields out of each one
        for entry in soup.find_all('div', {'class': 'info'}):
            title = entry.find('h3', {'class': 'name'}).text.strip()
            discount_price = entry.find('div', {'class': 'prc'}).text.strip('₦ ')
            original_price = getattr(entry.find('div', {'class': 's-prc-w'}), 'text', 'None')
            original_price = original_price.strip('₦ %')
            percentage_dsc = getattr(entry.find('div', {'class': 'bdg _dsct _sm'}), 'text', 'None')
            rating = getattr(entry.find('div', {'class': 'rev'}), 'text', 'None')
            data = [title, discount_price, original_price, percentage_dsc]
            with open('dataset1.csv', 'a+', newline='', encoding='UTF8') as f:
                writer = csv.writer(f)
                writer.writerow(data)
        # time.sleep(10)  # sleep before scraping the next page so too many requests are not sent at once
        current_page += 1
        print('\n\n')  # clear up the console
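# Note: scrape_pages() is defined above but never called from the __main__ loop below. If the
# paginated category listings should also be captured, one possible call (the category URL shape
# here is an assumption based on the pattern noted above) would be:
#     scrape_pages("https://www.jumia.com.ng/computing/")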
if __name__ == '__main__':
    while True:
        main()
        time_wait = 60
        print(f'Waiting {time_wait} seconds. . .')
        time.sleep(time_wait)