# scrape_data_saveCsv.py
# forked from WoutervdVijver/challenge-collecting-data
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from typing import List
# Work in progress: some fields (locality, area, swimming pool, ...) are still missing from the scrape.
# Note: some fields default to empty strings because a reliable selector for them has not been found yet.
def getDataFrame(my_urls: List[str]) -> pd.DataFrame:
    """
    Scrape the required property data from the given urls and return it as a DataFrame.
    :param my_urls: list of property-page urls
    """
    # Column headers for the dataframe of all scraped data
    headers_df = [
        "Locality",
        "Type of property",
        "Subtype of property",
        "Price",
        "Type of sale",
        "Number of rooms",
        "Area",
        "Fully equipped kitchen",
        "Furnished",
        "Open fire",
        "Terrace",
        "Garden",
        "Surface of the land",
        "Surface area of the plot of land",
        "Number of facades",
        "Swimming pool",
        "State of the building",
    ]
    # Collect one row dict per url; the dataframe is built once at the end
    # (appending row by row with DataFrame.append is deprecated and slow)
    rows = []
    # We run through all urls
    for url in my_urls:
        r = requests.get(url, timeout=10)  # timeout so one slow page cannot hang the run
        soup = BeautifulSoup(r.text, "html.parser")
        # We retrieve the area
        area = None
        more_info = soup.find("p", attrs={"class": "classified__information--property"})
        if more_info is not None:
            info_words = more_info.text.split()
            if len(info_words) > 3:
                area = info_words[3]
        # We retrieve the property type from the page title
        pre_type_property = soup.find("h1", {"class": "classified__title"})
        if pre_type_property:
            type_property = pre_type_property.text.strip().replace("\n", "").split()[0]
        else:
            type_property = None
        subtype_property = type_property
        # Price is retrieved
        price = soup.find("span", {"class": "sr-only"})
        price = "None" if price is None else price.text.strip().replace("€", "")
        # We limit type_property to house or apartment
        if type_property and type_property.lower() not in ["house", "apartment"]:
            type_property = "House"
        # Dictionary containing the scraped header/value pairs from the details table
        tables = soup.find_all("table", {"class": "classified-table"})
        my_dict = {}
        for table_item in tables:
            for row in table_item.find_all("tr", class_="classified-table__row"):
                header_row = row.find("th", class_="classified-table__header", string=True)
                data_row = row.find("td", class_="classified-table__data", string=True)
                if header_row is not None and data_row is not None:
                    header_name = str(header_row.string).strip()
                    column_name = str(data_row.string).strip()
                    my_dict[header_name] = column_name
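        # Illustrative only: after the loops above, my_dict may hold entries such as
        # {"Bedrooms": "3", "Kitchen type": "Installed", "Terrace": "Yes",
        #  "Building condition": "Good", "Number of frontages": "2"}
        # (the exact keys depend on what each listing page exposes)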
        locality = my_dict.get("Neighbourhood or locality")
        type_sale = my_dict.get("Tenement building")
        numbers_rooms = my_dict.get("Bedrooms")
        # An "Installed" kitchen counts as fully equipped; anything else does not
        kitchen = 1 if my_dict.get("Kitchen type") == "Installed" else 0
        furnished = my_dict.get("Furnished")
        # No open-fire information found on the Immoweb page so far
        open_fire = ""
        # The page does not confirm a garden directly, but it does expose a garden
        # surface, which could be used to infer one (see the sketch just below)
        garden = ""
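        # Hedged sketch, not verified against the live page: if the details table
        # exposes a "Garden surface" row (assumed key), its mere presence could
        # confirm the garden, e.g.:
        #   garden = 1 if my_dict.get("Garden surface") else ""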
        surface_of_land = ""
        terrace = my_dict.get("Terrace")
        surface_plot_land = ""
        number_facades = my_dict.get("Number of frontages")
        swimming_pool = ""
        state_building = my_dict.get("Building condition")
        my_list = [
            locality,
            type_property,
            subtype_property,
            price,
            type_sale,
            numbers_rooms,
            area,
            kitchen,
            furnished,
            open_fire,
            terrace,
            garden,
            surface_of_land,
            surface_plot_land,
            number_facades,
            swimming_pool,
            state_building,
        ]
        # Map the scraped values onto the column headers and store the row
        rows.append(dict(zip(headers_df, my_list)))
    # Build the dataframe in one go from the collected rows
    df = pd.DataFrame(rows, columns=headers_df)
    # Replace Yes/No with 1/0
    for col in ["Terrace", "Furnished"]:
        df[col] = df[col].replace({"Yes": "1", "No": "0"})
    # Replace empty strings with the string "None", matching the price default above
    df = df.replace(r"^\s*$", "None", regex=True)
    return df
# list of urls to test
# my_url_list = [
# "https://www.immoweb.be/en/classified/house/for-sale/merelbeke/9820/9768055?searchId=621c8a1b690a0",
# "https://www.immoweb.be/en/classified/apartment-block/for-sale/aarschot/3200/9781041?searchId=621e8012c1135",
# "https://www.immoweb.be/en/classified/house/for-sale/de-panne/8660/9781188?searchId=621e8012c1135",
# "https://www.immoweb.be/en/classified/house/for-sale/brasschaat/2930/9781340?searchId=621e8012c1135",
# "https://www.immoweb.be/en/classified/house/for-sale/hoogstraten/2320/9780998?searchId=621e81bd33bc2",
# ]
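# Quick smoke test (uncomment the list above first); prints the first rows
# of the scraped dataframe without writing any file:
# print(getDataFrame(my_url_list).head())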
def get_url_list(name: str) -> List[str]:
    """
    Take a name and return the list of urls read from the url file with that name.
    :param name: str, suffix of the url file
    """
    prop = pd.read_csv(f"url files/urls_{name}")
    # Flatten the frame and keep every second value: the file appears to store
    # index/url pairs, so this slice keeps only the urls
    return np.array(prop).reshape(prop.size)[1::2]
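# Assumed layout of "url files/urls_<name>" (illustrative, inferred from the
# [1::2] slice above rather than from an actual file): one index/url pair per row,
#   0,https://www.immoweb.be/en/classified/...
#   1,https://www.immoweb.be/en/classified/...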
def create_new_csv(name: str):
    """
    Take a name referring to one of the url files and create a .csv file with the related data.
    :param name: str
    """
    my_url_list = get_url_list(name)
    df = getDataFrame(my_url_list)
    df.to_csv(f"property_files/property_{name}.csv")
create_new_csv("flat-studio")
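# Other property types could be scraped the same way, provided matching url
# files exist under "url files/" (hypothetical names):
# create_new_csv("house")
# create_new_csv("apartment")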