Skip to content

Commit 29c7f83

Browse files
authored
Merge branch 'opencivicdata:master' into montreal_est_scraper
2 parents f8d2789 + 26b8e72 commit 29c7f83

File tree

8 files changed

+73
-63
lines changed

8 files changed

+73
-63
lines changed

.python-version

+1
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
3.9

ca_bc_victoria/people.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,9 @@ def scrape(self):
3636
'//ul[@class="menu menu--level-0"]//a[contains(., "Mayor") and not(contains(., "Council"))]/@href'
3737
)[0]
3838
page = self.lxmlize(mayor_url)
39-
role, name = page.xpath("//h1/span")[0].text_content().split(" ", 1)
39+
role, name = page.xpath(
40+
'//ul[@class="menu menu--level-0"]//a[contains(., "Mayor") and not(contains(., "Council"))]/text()'
41+
)[0].split(" ", 1)
4042
photo = councillor.xpath('//div[@class="field__item"]/img/@src')[0]
4143
email = self.get_email(page)
4244
phone = self.get_phone(page)

ca_on_guelph/people.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,6 @@
22

33

44
class GuelphPersonScraper(CSVScraper):
5-
# http://data.open.guelph.ca/dataset/city-of-guelph-contacts
6-
csv_url = "http://data.open.guelph.ca/datafiles/guelph-mayor-and-councillors-contact-information-2018-2022.csv"
5+
# https://explore.guelph.ca/documents/5ec8d85028c94e83be12a9f01d14eb7f/about
6+
csv_url = "https://gismaps.guelph.ca/OpenData/guelph-city-council.csv"
77
many_posts_per_area = True

ca_on_markham/people.py

+37-18
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,8 @@
1-
import re
2-
31
from utils import CanadianPerson as Person
42
from utils import CanadianScraper
53

6-
COUNCIL_PAGE = (
7-
"https://www.markham.ca/wps/portal/home/about/city-hall/regional-ward-councillors/02-regional-ward-councillors"
8-
)
9-
MAYOR_PAGE = "https://www.markham.ca/wps/portal/home/about/city-hall/mayor/00-mayors-office"
4+
COUNCIL_PAGE = "https://www.markham.ca/about-city-markham/city-hall/regional-ward-councillors"
5+
MAYOR_PAGE = "https://www.markham.ca/about-city-markham/city-hall/mayors-office"
106

117

128
class MarkhamPersonScraper(CanadianScraper):
@@ -17,10 +13,15 @@ def scrape(self):
1713

1814
yield self.scrape_mayor(MAYOR_PAGE)
1915

20-
councillors = page.xpath('//div[@class="col-sm-3 col-xs-6"]')
16+
councillors = page.xpath(
17+
'//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]/div'
18+
)
2119
assert len(councillors), "No councillors found"
20+
2221
for councillor in councillors:
23-
name, district = councillor.xpath(".//h4/text()")[0].split(", ")
22+
name = councillor.xpath(".//h3/text()")[0].strip()
23+
district = councillor.xpath(".//p/text()")[0].strip()
24+
2425
if "Ward" in district:
2526
district = district.replace("Councillor", "").strip()
2627
role = "Councillor"
@@ -33,9 +34,7 @@ def scrape(self):
3334
district = "Markham"
3435

3536
image = councillor.xpath(".//img/@src")[0]
36-
url = "https://www.markham.ca/wps/portal/home/about" + re.search(
37-
r"(?<=about).*(?='\))", councillor.xpath(".//a/@href")[0]
38-
).group(0)
37+
url = councillor.xpath(".//a/@href")[0]
3938

4039
address, phone, email, links = self.get_contact(url)
4140

@@ -56,10 +55,27 @@ def scrape(self):
5655
def get_contact(self, url):
5756
page = self.lxmlize(url)
5857

59-
contact_node = page.xpath('//div[@class="vcard col-sm-6"]')[0]
58+
contact_node = page.xpath(
59+
'//div[@class="pd-x-16 pd-y-32 bg-white committee-right-info-section layout__region layout__region--second"]'
60+
)[0]
6061
links = []
6162

62-
address = contact_node.xpath(".//p/text()")[:2]
63+
if contact_node.xpath('.//span[@class="address-line1"]/text()'):
64+
address = " ".join(
65+
(
66+
contact_node.xpath('.//span[@class="address-line1"]/text()')[0],
67+
contact_node.xpath('.//span[@class="locality"]/text()')[0],
68+
contact_node.xpath('.//span[@class="administrative-area"]/text()')[0],
69+
contact_node.xpath('.//span[@class="postal-code"]/text()')[0],
70+
contact_node.xpath('.//span[@class="country"]/text()')[0],
71+
)
72+
)
73+
else:
74+
contact_node = page.xpath(
75+
'//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]'
76+
)[0]
77+
address = f'{contact_node.xpath(".//p/text()")[0]} {contact_node.xpath(".//p/text()")[1]}'
78+
6379
links = get_links(contact_node)
6480
phone = self.get_phone(contact_node)
6581
email = self.get_email(contact_node)
@@ -68,12 +84,15 @@ def get_contact(self, url):
6884

6985
def scrape_mayor(self, url):
7086
page = self.lxmlize(url)
71-
name = page.xpath('//img/@alt[contains(., "Mayor")]')[0].split(", ", 1)[1]
72-
email = self.get_email(page)
73-
phone = self.get_phone(page)
87+
name = page.xpath(
88+
'.//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]/h1/span/span/text()'
89+
)[0]
90+
contact_node = page.xpath('.//div[@class="dept-contact-info--block"]')[0]
91+
email = self.get_email(contact_node)
92+
phone = self.get_phone(contact_node)
7493

7594
p = Person(primary_org="legislature", name=name, district="Markham", role="Mayor")
76-
p.image = page.xpath('//img[contains(./@alt, "Mayor")]/@src')[0]
95+
p.image = page.xpath('.//div[@class="align-right media--image"]/div/img/@src')[0]
7796
p.add_contact("email", email)
7897
p.add_contact("voice", phone, "legislature")
7998
p.add_source(url)
@@ -86,6 +105,6 @@ def get_links(elem):
86105
links = elem.xpath(".//a")
87106
for link in links:
88107
link = link.attrib["href"]
89-
if "http://www.markham.ca" not in link and "mail" not in link:
108+
if "http://www.markham.ca" not in link and "mail" not in link and "tel" not in link:
90109
links_r.append(link)
91110
return links_r

ca_on_mississauga/people.py

+9-14
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
1+
import re
2+
13
from utils import CanadianPerson as Person
24
from utils import CanadianScraper
35

46
COUNCIL_PAGE = "http://www.mississauga.ca/portal/cityhall/mayorandcouncil"
5-
MAYOR_PAGE = "http://www.mississauga.ca/portal/cityhall/mayorsoffice"
67
CONTACT_PAGE = "http://www.mississauga.ca/portal/helpfeedback/contactus"
78

89

@@ -16,21 +17,15 @@ def scrape(self):
1617
if "vacant" not in councillor_url.xpath(".//div//div[1]/text()")[0].lower():
1718
yield self.councillor_data(councillor_url.attrib["href"])
1819

19-
mayor_page = self.lxmlize(MAYOR_PAGE)
20-
mayor_name = mayor_page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
21-
if "vacant" not in mayor_name.lower():
22-
yield self.mayor_data(MAYOR_PAGE)
20+
mayor_url = page.xpath('//li/a[contains(@href, "mayor")]')[0]
21+
if "vacant" not in mayor_url.xpath(".//div//div[1]/text()")[0].lower():
22+
yield self.mayor_data(mayor_url.attrib["href"])
2323

2424
def councillor_data(self, url):
2525
page = self.lxmlize(url)
2626

2727
name_district = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
28-
hyphen = name_district.find("Councillor")
29-
district = name_district[: hyphen - 3]
30-
name = name_district[hyphen:]
31-
bracket = name.find("(")
32-
if bracket != -1:
33-
name = name[:bracket]
28+
district, name = re.split(r" – (?:Councillor (?:and Deputy Mayor )?)?", name_district) # n-dash
3429
email = self.get_email(page, '//section[contains(@class, "module-content")]')
3530
photo = page.xpath(
3631
'//section[contains(@class, "module-content")]/p[1]/img/@src|//section[contains(@class, "module-content")]/p[1]/b/img/@src|//section[contains(@class, "module-content")]/p[1]/strong/img/@src'
@@ -47,9 +42,9 @@ def councillor_data(self, url):
4742
def mayor_data(self, url):
4843
page = self.lxmlize(url)
4944

50-
name_text = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
51-
name = name_text.split(",")[0]
52-
photo = page.xpath('//img[contains(@src, "mayor")]/@src')[0]
45+
name = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
46+
name = name.replace("Mayor – ", "")
47+
photo = page.xpath('//*[@id="65a01af8598b7"]/p[1]/img/@src')[0]
5348

5449
p = Person(primary_org="legislature", name=name, district="Mississauga", role="Mayor")
5550
p.add_source(url)

ca_on_thunder_bay/people.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -45,4 +45,4 @@ def scrape(self):
4545

4646
def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False):
4747
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL" # site uses a weak DH key
48-
return super().lxmlize(url, encoding, user_agent, cookies, xml)
48+
return super().lxmlize(url, encoding, user_agent=user_agent, cookies=cookies, xml=xml)

ca_on_wilmot/people.py

+17-25
Original file line number | Diff line number | Diff line change
@@ -1,39 +1,31 @@
11
from utils import CanadianPerson as Person
22
from utils import CanadianScraper
33

4-
COUNCIL_PAGE = "https://www.wilmot.ca/Modules/contact/search.aspx?s=EFHOVXSi8AOIMKMStZMNvAeQuAleQuAl"
4+
COUNCIL_PAGE = "https://www.wilmot.ca/en/township-office/council.aspx"
55

66

77
class WilmotPersonScraper(CanadianScraper):
88
def scrape(self):
99
page = self.lxmlize(COUNCIL_PAGE)
1010

11-
councillors = page.xpath('//table[@class="contactList"]//tr')
11+
councillors = page.xpath('//table[@class="icrtAccordion"]//tr')
1212
assert len(councillors), "No councillors found"
13-
for councillor in councillors:
14-
name, role_district = councillor.xpath(".//button/text()")[0].split(" - ", 1)
15-
if "Mayor" in role_district:
16-
yield scrape_mayor(councillor, name)
17-
continue
18-
role, district = role_district.split(" - ")
19-
13+
for i in range(0, len(councillors), 2):
14+
role_name, contact_info = councillors[i], councillors[i + 1]
15+
role, name = role_name.text_content().strip().replace("\xa0", " ").split("— ")
16+
17+
# "Ward 1 Councillor"
18+
if "Councillor" in role:
19+
district = role.split(" Councillor")[0]
20+
role = "Councillor"
21+
# "Mayor", "Executive Officer to the Mayor and Council"
22+
else:
23+
district = "Wilmot"
24+
25+
phone = self.get_phone(contact_info)
26+
email = self.get_email(contact_info)
2027
p = Person(primary_org="legislature", name=name, district=district, role=role)
2128
p.add_source(COUNCIL_PAGE)
22-
23-
phone = self.get_phone(councillor).replace("/", "")
2429
p.add_contact("voice", phone, "legislature")
30+
p.add_contact("email", email)
2531
yield p
26-
27-
28-
def scrape_mayor(div, name):
29-
p = Person(primary_org="legislature", name=name, district="Wilmot", role="Mayor")
30-
p.add_source(COUNCIL_PAGE)
31-
32-
address = div.xpath('.//div[@class="contactListAddress"]')[0].text_content()
33-
phone = div.xpath('.//div[@class="contactListMainNumber"]/a/text()')[0]
34-
other_phone = div.xpath('.//div[@class="contactListPhNumber"]/a/text()')[0]
35-
p.add_contact("address", address, "legislature")
36-
p.add_contact("voice", phone, "legislature")
37-
p.add_contact("voice", other_phone, "office")
38-
39-
return p

ca_qc_dollard_des_ormeaux/people.py

+3-2
Original file line number | Diff line number | Diff line change
@@ -29,8 +29,9 @@ def scrape(self):
2929

3030
p = Person(primary_org="legislature", name=name, district=district, role=role)
3131
p.add_source(COUNCIL_PAGE)
32-
p.image = councillor.xpath(".//@data-src")[0]
33-
32+
image = councillor.xpath(".//@data-src")
33+
if image:
34+
p.image = image[0]
3435
p.add_contact("email", email)
3536
p.add_contact("voice", general_phone, "legislature")
3637
p.add_contact("fax", general_fax, "legislature")

0 commit comments

Comments
 (0)