Merge branch 'opencivicdata:master' into montreal_est_scraper

iepmas · web-flow · commit 29c7f83333d4 · 2024-11-06T11:37:28.000-05:00
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.9
diff --git a/ca_bc_victoria/people.py b/ca_bc_victoria/people.py
@@ -36,7 +36,9 @@ def scrape(self):
             '//ul[@class="menu menu--level-0"]//a[contains(., "Mayor") and not(contains(., "Council"))]/@href'
         )[0]
         page = self.lxmlize(mayor_url)
-        role, name = page.xpath("//h1/span")[0].text_content().split(" ", 1)
+        role, name = page.xpath(
+            '//ul[@class="menu menu--level-0"]//a[contains(., "Mayor") and not(contains(., "Council"))]/text()'
+        )[0].split(" ", 1)
         photo = councillor.xpath('//div[@class="field__item"]/img/@src')[0]
         email = self.get_email(page)
         phone = self.get_phone(page)
diff --git a/ca_on_guelph/people.py b/ca_on_guelph/people.py
@@ -2,6 +2,6 @@
 
 
 class GuelphPersonScraper(CSVScraper):
-    # http://data.open.guelph.ca/dataset/city-of-guelph-contacts
-    csv_url = "http://data.open.guelph.ca/datafiles/guelph-mayor-and-councillors-contact-information-2018-2022.csv"
+    # https://explore.guelph.ca/documents/5ec8d85028c94e83be12a9f01d14eb7f/about
+    csv_url = "https://gismaps.guelph.ca/OpenData/guelph-city-council.csv"
     many_posts_per_area = True
diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py
@@ -1,12 +1,8 @@
-import re
-
 from utils import CanadianPerson as Person
 from utils import CanadianScraper
 
-COUNCIL_PAGE = (
-    "https://www.markham.ca/wps/portal/home/about/city-hall/regional-ward-councillors/02-regional-ward-councillors"
-)
-MAYOR_PAGE = "https://www.markham.ca/wps/portal/home/about/city-hall/mayor/00-mayors-office"
+COUNCIL_PAGE = "https://www.markham.ca/about-city-markham/city-hall/regional-ward-councillors"
+MAYOR_PAGE = "https://www.markham.ca/about-city-markham/city-hall/mayors-office"
 
 
 class MarkhamPersonScraper(CanadianScraper):
@@ -17,10 +13,15 @@ def scrape(self):
 
         yield self.scrape_mayor(MAYOR_PAGE)
 
-        councillors = page.xpath('//div[@class="col-sm-3 col-xs-6"]')
+        councillors = page.xpath(
+            '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]/div'
+        )
         assert len(councillors), "No councillors found"
+
         for councillor in councillors:
-            name, district = councillor.xpath(".//h4/text()")[0].split(", ")
+            name = councillor.xpath(".//h3/text()")[0].strip()
+            district = councillor.xpath(".//p/text()")[0].strip()
+
             if "Ward" in district:
                 district = district.replace("Councillor", "").strip()
                 role = "Councillor"
@@ -33,9 +34,7 @@ def scrape(self):
                 district = "Markham"
 
             image = councillor.xpath(".//img/@src")[0]
-            url = "https://www.markham.ca/wps/portal/home/about" + re.search(
-                r"(?<=about).*(?='\))", councillor.xpath(".//a/@href")[0]
-            ).group(0)
+            url = councillor.xpath(".//a/@href")[0]
 
             address, phone, email, links = self.get_contact(url)
 
@@ -56,10 +55,27 @@ def scrape(self):
     def get_contact(self, url):
         page = self.lxmlize(url)
 
-        contact_node = page.xpath('//div[@class="vcard col-sm-6"]')[0]
+        contact_node = page.xpath(
+            '//div[@class="pd-x-16 pd-y-32 bg-white committee-right-info-section layout__region layout__region--second"]'
+        )[0]
         links = []
 
-        address = contact_node.xpath(".//p/text()")[:2]
+        if contact_node.xpath('.//span[@class="address-line1"]/text()'):
+            address = " ".join(
+                (
+                    contact_node.xpath('.//span[@class="address-line1"]/text()')[0],
+                    contact_node.xpath('.//span[@class="locality"]/text()')[0],
+                    contact_node.xpath('.//span[@class="administrative-area"]/text()')[0],
+                    contact_node.xpath('.//span[@class="postal-code"]/text()')[0],
+                    contact_node.xpath('.//span[@class="country"]/text()')[0],
+                )
+            )
+        else:
+            contact_node = page.xpath(
+                '//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]'
+            )[0]
+            address = f'{contact_node.xpath(".//p/text()")[0]} {contact_node.xpath(".//p/text()")[1]}'
+
         links = get_links(contact_node)
         phone = self.get_phone(contact_node)
         email = self.get_email(contact_node)
@@ -68,12 +84,15 @@ def get_contact(self, url):
 
     def scrape_mayor(self, url):
         page = self.lxmlize(url)
-        name = page.xpath('//img/@alt[contains(., "Mayor")]')[0].split(", ", 1)[1]
-        email = self.get_email(page)
-        phone = self.get_phone(page)
+        name = page.xpath(
+            './/div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]/h1/span/span/text()'
+        )[0]
+        contact_node = page.xpath('.//div[@class="dept-contact-info--block"]')[0]
+        email = self.get_email(contact_node)
+        phone = self.get_phone(contact_node)
 
         p = Person(primary_org="legislature", name=name, district="Markham", role="Mayor")
-        p.image = page.xpath('//img[contains(./@alt, "Mayor")]/@src')[0]
+        p.image = page.xpath('.//div[@class="align-right media--image"]/div/img/@src')[0]
         p.add_contact("email", email)
         p.add_contact("voice", phone, "legislature")
         p.add_source(url)
@@ -86,6 +105,6 @@ def get_links(elem):
     links = elem.xpath(".//a")
     for link in links:
         link = link.attrib["href"]
-        if "http://www.markham.ca" not in link and "mail" not in link:
+        if "http://www.markham.ca" not in link and "mail" not in link and "tel" not in link:
             links_r.append(link)
     return links_r
diff --git a/ca_on_mississauga/people.py b/ca_on_mississauga/people.py
@@ -1,8 +1,9 @@
+import re
+
 from utils import CanadianPerson as Person
 from utils import CanadianScraper
 
 COUNCIL_PAGE = "http://www.mississauga.ca/portal/cityhall/mayorandcouncil"
-MAYOR_PAGE = "http://www.mississauga.ca/portal/cityhall/mayorsoffice"
 CONTACT_PAGE = "http://www.mississauga.ca/portal/helpfeedback/contactus"
 
 
@@ -16,21 +17,15 @@ def scrape(self):
             if "vacant" not in councillor_url.xpath(".//div//div[1]/text()")[0].lower():
                 yield self.councillor_data(councillor_url.attrib["href"])
 
-        mayor_page = self.lxmlize(MAYOR_PAGE)
-        mayor_name = mayor_page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
-        if "vacant" not in mayor_name.lower():
-            yield self.mayor_data(MAYOR_PAGE)
+        mayor_url = page.xpath('//li/a[contains(@href, "mayor")]')[0]
+        if "vacant" not in mayor_url.xpath(".//div//div[1]/text()")[0].lower():
+            yield self.mayor_data(mayor_url.attrib["href"])
 
     def councillor_data(self, url):
         page = self.lxmlize(url)
 
         name_district = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
-        hyphen = name_district.find("Councillor")
-        district = name_district[: hyphen - 3]
-        name = name_district[hyphen:]
-        bracket = name.find("(")
-        if bracket != -1:
-            name = name[:bracket]
+        district, name = re.split(r" – (?:Councillor (?:and Deputy Mayor )?)?", name_district)  # separator is an en dash (U+2013), not a hyphen
         email = self.get_email(page, '//section[contains(@class, "module-content")]')
         photo = page.xpath(
             '//section[contains(@class, "module-content")]/p[1]/img/@src|//section[contains(@class, "module-content")]/p[1]/b/img/@src|//section[contains(@class, "module-content")]/p[1]/strong/img/@src'
@@ -47,9 +42,9 @@ def councillor_data(self, url):
     def mayor_data(self, url):
         page = self.lxmlize(url)
 
-        name_text = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
-        name = name_text.split(",")[0]
-        photo = page.xpath('//img[contains(@src, "mayor")]/@src')[0]
+        name = page.xpath('//*[@id="com-main"]/div/div/div/h1/text()')[0]
+        name = name.replace("Mayor – ", "")
+        photo = page.xpath('//*[@id="65a01af8598b7"]/p[1]/img/@src')[0]
 
         p = Person(primary_org="legislature", name=name, district="Mississauga", role="Mayor")
         p.add_source(url)
diff --git a/ca_on_thunder_bay/people.py b/ca_on_thunder_bay/people.py
@@ -45,4 +45,4 @@ def scrape(self):
 
     def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False):
         requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL"  # site uses a weak DH key
-        return super().lxmlize(url, encoding, user_agent, cookies, xml)
+        return super().lxmlize(url, encoding, user_agent=user_agent, cookies=cookies, xml=xml)
diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py
@@ -1,39 +1,31 @@
 from utils import CanadianPerson as Person
 from utils import CanadianScraper
 
-COUNCIL_PAGE = "https://www.wilmot.ca/Modules/contact/search.aspx?s=EFHOVXSi8AOIMKMStZMNvAeQuAleQuAl"
+COUNCIL_PAGE = "https://www.wilmot.ca/en/township-office/council.aspx"
 
 
 class WilmotPersonScraper(CanadianScraper):
     def scrape(self):
         page = self.lxmlize(COUNCIL_PAGE)
 
-        councillors = page.xpath('//table[@class="contactList"]//tr')
+        councillors = page.xpath('//table[@class="icrtAccordion"]//tr')
         assert len(councillors), "No councillors found"
-        for councillor in councillors:
-            name, role_district = councillor.xpath(".//button/text()")[0].split(" - ", 1)
-            if "Mayor" in role_district:
-                yield scrape_mayor(councillor, name)
-                continue
-            role, district = role_district.split(" - ")
-
+        for i in range(0, len(councillors), 2):
+            role_name, contact_info = councillors[i], councillors[i + 1]
+            role, name = role_name.text_content().strip().replace("\xa0", " ").split("— ")
+
+            # "Ward 1 Councillor"
+            if "Councillor" in role:
+                district = role.split(" Councillor")[0]
+                role = "Councillor"
+            # "Mayor", "Executive Officer to the Mayor and Council"
+            else:
+                district = "Wilmot"
+
+            phone = self.get_phone(contact_info)
+            email = self.get_email(contact_info)
             p = Person(primary_org="legislature", name=name, district=district, role=role)
             p.add_source(COUNCIL_PAGE)
-
-            phone = self.get_phone(councillor).replace("/", "")
             p.add_contact("voice", phone, "legislature")
+            p.add_contact("email", email)
             yield p
-
-
-def scrape_mayor(div, name):
-    p = Person(primary_org="legislature", name=name, district="Wilmot", role="Mayor")
-    p.add_source(COUNCIL_PAGE)
-
-    address = div.xpath('.//div[@class="contactListAddress"]')[0].text_content()
-    phone = div.xpath('.//div[@class="contactListMainNumber"]/a/text()')[0]
-    other_phone = div.xpath('.//div[@class="contactListPhNumber"]/a/text()')[0]
-    p.add_contact("address", address, "legislature")
-    p.add_contact("voice", phone, "legislature")
-    p.add_contact("voice", other_phone, "office")
-
-    return p
diff --git a/ca_qc_dollard_des_ormeaux/people.py b/ca_qc_dollard_des_ormeaux/people.py
@@ -29,8 +29,9 @@ def scrape(self):
 
             p = Person(primary_org="legislature", name=name, district=district, role=role)
             p.add_source(COUNCIL_PAGE)
-            p.image = councillor.xpath(".//@data-src")[0]
-
+            image = councillor.xpath(".//@data-src")
+            if image:
+                p.image = image[0]
             p.add_contact("email", email)
             p.add_contact("voice", general_phone, "legislature")
             p.add_contact("fax", general_fax, "legislature")