-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_locations.py
52 lines (41 loc) · 1.63 KB
/
get_locations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import re
import requests
from bs4 import BeautifulSoup
def load_and_preprocess_locations():
    """Read ``data/locations`` and return its lines in normalized form.

    The file is decoded as UTF-8. Every line of the file produces exactly one
    entry in the result, in file order, obtained by passing the raw line to
    ``preprocess``.

    Returns:
        A list with one preprocessed string per line of the file.
    """
    with open("data/locations", encoding="utf-8") as source:
        return [preprocess(raw_line) for raw_line in source]
# Raw string: a plain "\s" is an invalid string escape and triggers a
# SyntaxWarning on recent Python versions.
_WHITESPACE_RE = re.compile(r"\s")

# Single-pass replacement table: punctuation is dropped, German umlauts and
# sharp s are transliterated to their ASCII spellings.
_CHAR_TABLE = str.maketrans({
    '"': "",
    ")": "",
    "(": "",
    ".": "",
    ",": "",
    "-": "",
    "/": "",
    "ä": "ae",
    "ö": "oe",
    "ü": "ue",
    "ß": "ss",
})


def preprocess(word):
    """Normalize a location name into a compact lowercase key.

    The input is stripped and lowercased, all whitespace is removed, the
    characters ``" ( ) . , - /`` are dropped, and ``ä``/``ö``/``ü``/``ß`` are
    replaced by ``ae``/``oe``/``ue``/``ss``.

    Args:
        word: The raw string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    word = word.strip().lower()
    word = _WHITESPACE_RE.sub("", word)
    # None of the replacement strings contains a character that is itself
    # mapped, so one translate pass equals the chain of substitutions.
    return word.translate(_CHAR_TABLE)
def fetch_all_locations():
    """Collect the locations for every letter from ``A`` through ``Z``.

    Calls ``fetch_locations_for_letter`` once per uppercase ASCII letter, in
    alphabetical order, and concatenates the results.

    Returns:
        A list of all locations returned for the 26 letters.
    """
    collected = []
    for code_point in range(ord("A"), ord("Z") + 1):
        collected += fetch_locations_for_letter(chr(code_point))
    return collected
def fetch_locations_for_letter(letter, timeout=30):
    """Scrape the location names listed on the page for one letter.

    Args:
        letter: Value appended to the ``city`` query parameter of the URL.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the script forever.

    Returns:
        A generator over the stripped, non-empty texts of the links found in
        the page's location list. It is empty when the page has no such list.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
    """
    print(f"getting locations for letter {letter}")
    response = requests.get(
        "http://www.deutsche-staedte.de/staedte.php?city=" + letter,
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")
    ul = soup.find("ul", style="list-style-type: square; list-style-position: outside; padding-left: 25px;")
    # A page without the expected list is treated as having no locations.
    links = ul.find_all("a") if ul is not None else []
    names = [link.text.strip() for link in links]
    return (name for name in names if name != "")
if __name__ == "__main__":
    # Script entry point: scrape every letter, then write one location per
    # line to data/locations. The scrape finishes before the file is opened.
    locations = fetch_all_locations()
    with open("data/locations", mode="w", encoding="UTF-8") as sink:
        # Entries that are empty after stripping are skipped.
        sink.writelines(f"{entry}\n" for entry in locations if entry.strip())