-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
81 lines (67 loc) · 2.47 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import time
import httpx
import logging
import importlib
from bs4 import BeautifulSoup
from modules.data_pipelines import DataProc
# # configures root logger
# main_path = os.path.dirname(os.path.abspath(__file__))
# logging.basicConfig(
# filename = os.path.join(main_path, 'logs', 'met_scrape.log'),
# filemode = 'a',
# format = '%(asctime)s | %(levelname)s | %(name)s | %(message)s',
# datefmt = '%y/%m/%d %H:%M:%S',
# encoding = 'utf-8',
# level = logging.DEBUG
# )
# # adds module loggers to root logger
# module_loggers = [
# 'modules.data_pipelines',
# 'modules.get_geography'
# ]
# for module in module_loggers:
# mod = importlib.import_module(module)
# logger = getattr(mod, 'logger')
# logging.getLogger().addHandler(logger)
def main():
'''scrapes data from weather station table via urls'''
# requesting html from weather station page
page_url = (
'https://www.metoffice.gov.uk/'
'research/climate/maps-and-data/historic-station-data'
)
page_response = httpx.get(page_url)
soup = BeautifulSoup(page_response, 'html.parser')
# gets name and url for each station
table_rows = soup.find_all('tr')
station_dict = dict()
for row in table_rows:
# gets formatted station from table
column_0 = row.find('td')
if column_0:
station_name = column_0.text.lower().strip().replace(' ', '_')
# stores requested station data from url
for tag in row.find_all('a'):
# logger.info(f'Requesting data from {station_name.title()}...')
time.sleep(0.1)
station_url = tag.get('href')
station_response = httpx.get(station_url)
station_dict[station_name] = DataProc.clean_data(
station_response,
station_name
)
# writes combines station data
combined_data = str()
for (index, data_lines) in enumerate(station_dict.values()):
if index > 0:
data_lines = data_lines[1:]
data_lines[0] = '\n' + data_lines[0] # review this part
station_data = '\n'.join(data_lines)
combined_data += station_data
with open(f'data/combined_data.csv', 'w') as file:
file.write(combined_data)
# calls pre-processing pipeline
DataProc.pre_processing(combined_data)
if __name__ == '__main__':
main()