1
- import re
2
-
3
1
from utils import CanadianPerson as Person
4
2
from utils import CanadianScraper
5
3
6
- COUNCIL_PAGE = (
7
- "https://www.markham.ca/wps/portal/home/about/city-hall/regional-ward-councillors/02-regional-ward-councillors"
8
- )
9
- MAYOR_PAGE = "https://www.markham.ca/wps/portal/home/about/city-hall/mayor/00-mayors-office"
4
+ COUNCIL_PAGE = "https://www.markham.ca/about-city-markham/city-hall/regional-ward-councillors"
5
+ MAYOR_PAGE = "https://www.markham.ca/about-city-markham/city-hall/mayors-office"
10
6
11
7
12
8
class MarkhamPersonScraper (CanadianScraper ):
@@ -17,10 +13,15 @@ def scrape(self):
17
13
18
14
yield self .scrape_mayor (MAYOR_PAGE )
19
15
20
- councillors = page .xpath ('//div[@class="col-sm-3 col-xs-6"]' )
16
+ councillors = page .xpath (
17
+ '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]/div'
18
+ )
21
19
assert len (councillors ), "No councillors found"
20
+
22
21
for councillor in councillors :
23
- name , district = councillor .xpath (".//h4/text()" )[0 ].split (", " )
22
+ name = councillor .xpath (".//h3/text()" )[0 ].strip ()
23
+ district = councillor .xpath (".//p/text()" )[0 ].strip ()
24
+
24
25
if "Ward" in district :
25
26
district = district .replace ("Councillor" , "" ).strip ()
26
27
role = "Councillor"
@@ -33,9 +34,7 @@ def scrape(self):
33
34
district = "Markham"
34
35
35
36
image = councillor .xpath (".//img/@src" )[0 ]
36
- url = "https://www.markham.ca/wps/portal/home/about" + re .search (
37
- r"(?<=about).*(?='\))" , councillor .xpath (".//a/@href" )[0 ]
38
- ).group (0 )
37
+ url = councillor .xpath (".//a/@href" )[0 ]
39
38
40
39
address , phone , email , links = self .get_contact (url )
41
40
@@ -56,10 +55,27 @@ def scrape(self):
56
55
def get_contact (self , url ):
57
56
page = self .lxmlize (url )
58
57
59
- contact_node = page .xpath ('//div[@class="vcard col-sm-6"]' )[0 ]
58
+ contact_node = page .xpath (
59
+ '//div[@class="pd-x-16 pd-y-32 bg-white committee-right-info-section layout__region layout__region--second"]'
60
+ )[0 ]
60
61
links = []
61
62
62
- address = contact_node .xpath (".//p/text()" )[:2 ]
63
+ if contact_node .xpath ('.//span[@class="address-line1"]/text()' ):
64
+ address = " " .join (
65
+ (
66
+ contact_node .xpath ('.//span[@class="address-line1"]/text()' )[0 ],
67
+ contact_node .xpath ('.//span[@class="locality"]/text()' )[0 ],
68
+ contact_node .xpath ('.//span[@class="administrative-area"]/text()' )[0 ],
69
+ contact_node .xpath ('.//span[@class="postal-code"]/text()' )[0 ],
70
+ contact_node .xpath ('.//span[@class="country"]/text()' )[0 ],
71
+ )
72
+ )
73
+ else :
74
+ contact_node = page .xpath (
75
+ '//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]'
76
+ )[0 ]
77
+ address = f'{ contact_node .xpath (".//p/text()" )[0 ]} { contact_node .xpath (".//p/text()" )[1 ]} '
78
+
63
79
links = get_links (contact_node )
64
80
phone = self .get_phone (contact_node )
65
81
email = self .get_email (contact_node )
@@ -68,12 +84,15 @@ def get_contact(self, url):
68
84
69
85
def scrape_mayor (self , url ):
70
86
page = self .lxmlize (url )
71
- name = page .xpath ('//img/@alt[contains(., "Mayor")]' )[0 ].split (", " , 1 )[1 ]
72
- email = self .get_email (page )
73
- phone = self .get_phone (page )
87
+ name = page .xpath (
88
+ './/div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]/h1/span/span/text()'
89
+ )[0 ]
90
+ contact_node = page .xpath ('.//div[@class="dept-contact-info--block"]' )[0 ]
91
+ email = self .get_email (contact_node )
92
+ phone = self .get_phone (contact_node )
74
93
75
94
p = Person (primary_org = "legislature" , name = name , district = "Markham" , role = "Mayor" )
76
- p .image = page .xpath ('//img[contains(./@alt, "Mayor")] /@src' )[0 ]
95
+ p .image = page .xpath ('.//div[@class="align-right media--image"]/div/img /@src' )[0 ]
77
96
p .add_contact ("email" , email )
78
97
p .add_contact ("voice" , phone , "legislature" )
79
98
p .add_source (url )
@@ -86,6 +105,6 @@ def get_links(elem):
86
105
links = elem .xpath (".//a" )
87
106
for link in links :
88
107
link = link .attrib ["href" ]
89
- if "http://www.markham.ca" not in link and "mail" not in link :
108
+ if "http://www.markham.ca" not in link and "mail" not in link and "tel" not in link :
90
109
links_r .append (link )
91
110
return links_r
0 commit comments