Feature/point of interest structure upgrade #22

Merged
24 changes: 17 additions & 7 deletions adapters/spreadsheet_adapter.py
@@ -9,10 +9,11 @@
class SpreadsheetAdapter:
ALIASES = {
# Same: name, city, address, organizations, descriptions
"latitude": "lat",
"longitude": "lng",
"category": "categories",
"country_code": "country"
'latitude': 'lat',
'longitude': 'lng',
'category': 'categories',
'country_code': 'country',
'opening_hours': 'open_hours',
}

def transform(self, source: List[SpreadsheetRow]) -> List[PointOfInterest]:
@@ -23,7 +24,7 @@ def transform_row(self, row: SpreadsheetRow) -> PointOfInterest:

for name, value in row.dict().items():
field = self.ALIASES.get(name, name)
converter = getattr(self, f"convert_{field}", self.convert_noop)
converter = getattr(self, f'convert_{field}', self.convert_noop)
fields[field] = converter(value)

return PointOfInterest(**fields)
@@ -33,14 +34,23 @@ def convert_noop(self, value: str) -> str:

def convert_country(self, country_code: str) -> str:
country = pycountry.countries.get(alpha_2=country_code)
return country.name if country else ""
return country.name if country else ''

def convert_lat(self, latitude: str) -> str:
return self._convert_number(latitude)

def convert_lng(self, longitude: str) -> str:
return self._convert_number(longitude)

def convert_organizations(self, organizations: str) -> List[str]:
return organizations.split(', ')

def convert_categories(self, categories: str) -> List[str]:
return categories.split(', ')

def convert_tags(self, tags: str) -> List[str]:
return tags.split(', ')

@staticmethod
def _convert_number(number: str) -> str:
return number.replace(",", ".")
return number.replace(',', '.')
3 changes: 1 addition & 2 deletions config/constants.py
@@ -1,8 +1,6 @@
from typing import Set


DEFAULT_CATEGORY = 'Any help'

CATEGORIES: Set[str] = {
'Clothes',
'Accommodation',
@@ -21,4 +19,5 @@
'Disability support',
'Pets'
}

DEFAULT_CATEGORY: str = 'Any help'
29 changes: 20 additions & 9 deletions models/point_of_interest.py
@@ -1,13 +1,24 @@
from typing import List
from pydantic import BaseModel


class PointOfInterest(BaseModel):
name: str = ""
country: str = ""
city: str = ""
address: str = ""
lat: str = ""
lng: str = ""
categories: str = ""
organizations: str = ""
description: str = ""
name: str = ''
country: str = ''
city: str = ''
address: str = ''
categories: List[str] = []
Collaborator: Make sure that these convert correctly into CSV and don't break the existing unit tests for the spiders.

Collaborator (author): I did fix the tests, but the CSV export might actually work a bit differently. Let me poke around it a bit more…
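
A minimal regression test for that concern might look like the sketch below; the test module and fixture values are hypothetical, only convert_list and PointOfInterest come from this PR.

# Hypothetical tests/test_csv_lists.py
from models.point_of_interest import PointOfInterest
from repositories.csv import convert_list


def test_convert_list_joins_values():
    # Mirrors the ','.join() added in repositories/csv.py
    assert convert_list(["Clothes", "Accommodation"]) == "Clothes,Accommodation"


def test_list_fields_flatten_before_csv_write():
    poi = PointOfInterest(name="Shelter", categories=["Accommodation"])
    # Same per-type dispatch that CSVRepository.write() performs
    row = {
        k: convert_list(v) if isinstance(v, list) else v
        for k, v in poi.dict().items()
    }
    assert row["categories"] == "Accommodation"
    assert not any(isinstance(v, list) for v in row.values())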

organizations: List[str] = []
description: str = ''
lat: str = ''
lng: str = ''
phone: str = ''
email: str = ''
url: str = ''
socialmedia: str = ''
messenger: str = ''
Collaborator: While you were working on it, they've added two more fields to the schema 😂

messenger -> FB messenger
telegram -> telegram
whatsapp -> whatsapp
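
If those land, the model would presumably grow two more string fields next to messenger. A sketch only, assuming the same empty-string-default convention as the rest of the model:

from pydantic import BaseModel

class PointOfInterest(BaseModel):
    # ...existing fields elided...
    messenger: str = ''  # maps to "FB messenger" in the schema
    telegram: str = ''   # new field
    whatsapp: str = ''   # new field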

open_hours: str = ''
tags: List[str] = []
icon: str = ''
approved: bool = True
active: bool = True
13 changes: 12 additions & 1 deletion repositories/csv.py
@@ -18,6 +18,10 @@ def close_file(file: TextIO):
file.close()


def convert_list(data: list) -> str:
return ','.join(data)


@dataclass
class CSVRepository:

@@ -28,13 +32,20 @@ class CSVRepository:
close_file: Callable = close_file

def write(self, entries: List[PointOfInterest]) -> List[PointOfInterest]:
converters = {
list: convert_list,
}
fieldnames = list(PointOfInterest.schema()["properties"].keys())
file = self.open_file(self.settings.output_file)

writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
for entry in entries:
writer.writerow(entry.dict())
row = {
k: converters.get(type(v), lambda x: x)(v)
for k, v in entry.dict().items()
}
writer.writerow(row)

self.close_file(file)

10 changes: 4 additions & 6 deletions scraping/spiders/canada_immigration_services.py
@@ -63,11 +63,13 @@ def parse(self, response, **kwargs):
'city': point['City']['en'],
'address': f'Canada, {point["Province"]["en"]}, '
f'{point["City"]["en"]}, {point["Address"]["en"]}',
'categories': ','.join(relevant_categories),
'categories': relevant_categories,
'description': '\n'.join(description),
'organizations': '',
'organizations': ['Canada IRCC'],
'lat': point['Coordinates']['Latitude'],
'lng': point['Coordinates']['Longitude'],
'phone': point.get('Telephone', {}).get('en', ''),
'email': point.get('Email', {}).get('en', ''),
}


@@ -97,10 +99,6 @@ def build_description(point, other_categories):
description = []
if point.get('Note', {}).get('en'):
description.append(f'{point["Note"]["en"]}')
if point.get('Telephone', {}).get('en'):
description.append(f'Phone: {point["Telephone"]["en"]}')
if point.get('Email', {}).get('en'):
description.append(f'Email: {point["Email"]["en"]}')
if other_categories:
description.append(f'Other services: ' + ', '.join(other_categories))
return description
28 changes: 6 additions & 22 deletions scraping/spiders/france_red_cross.py
@@ -73,6 +73,9 @@ def parse_point(block):
point = {
'name': name,
'country': 'France',
'organizations': ['Red Cross France'],
'lat': '',
'lng': '',
}

# XXX: all info here is in French, might need to either make it explicit
@@ -83,12 +86,6 @@

normalize_point_data(point)

point.update({
"organizations": "",
"lat": "",
"lng": "",
})

return point


@@ -109,23 +106,10 @@ def normalize_point_data(point):
if not point['categories']:
point['categories'] = [DEFAULT_CATEGORY]

point['categories'] = ','.join(point['categories'])

if point.get('_other_services'):
services = ', '.join(point['_other_services'])
point['description'] += f'\nOther services: {services}'

if point.get('_website'):
point['description'] += f'\nWebsite: {point["_website"]}'

if point.get('_phone'):
point['description'] += f'\nContact information: {point["_phone"]}'
if point.get('_fax'):
point['description'] += f', fax {point["_phone"]}'

if point.get('_working_hours'):
point['description'] += f'\nWorking hours: {point["_working_hours"]}'

point['description'] = point['description'].strip()

# all keys starting with _ are for processing purposes only
@@ -139,10 +123,10 @@ def parse_point_keys(paragraph):
'adresse': (None, parse_address),
'actions': (None, parse_actions),
'filière ': (None, parse_categories),
'site web': ('_website', parse_website),
'téléphone': ('_phone', parse_default_key),
'site web': ('url', parse_website),
'téléphone': ('phone', parse_default_key),
'fax': ('_fax', parse_default_key),
"heures d'ouverture": ('_working_hours', parse_default_key),
"heures d'ouverture": ('open_hours', parse_default_key),
}

title = paragraph.css('strong::text').get().lower()
103 changes: 68 additions & 35 deletions scraping/spiders/moldova_dopomoga.py
@@ -1,8 +1,9 @@
import re
import logging
import json
from typing import Tuple, Union

from typing import List, Tuple, Union

from scrapy import Spider, Selector
from scrapy import Spider
from scrapy.http import Response


@@ -11,8 +12,69 @@
ADDRESS_PATTERN1 = r"(s.\s)(.+)(,\sstr.\s)(\D+)([\d\w/]+)"
ADDRESS_PATTERN2 = rf"{BASE_ADDRESS_PATTERN}(,|\()"
ADDRESS_PATTERN3 = rf"{BASE_ADDRESS_PATTERN}(,|[\d\w/]+|\()"
KNOWN_STREETS = {'Hristo Botev'}
VENUE_PREFIXES = {'IP', 'Centrul', 'Complexul', 'Biserica'}
KNOWN_STREETS = {"Hristo Botev"}
VENUE_PREFIXES = {"IP", "Centrul", "Complexul", "Biserica"}


log = logging.getLogger(__name__)


class DopomogaSpider(Spider):
name = "dopomoga"
start_urls = [
"https://www.google.com/maps/d/embed?"
"mid=1RiyRvxOE8q3Z-eo7e298nLn_I0nKXqq6&"
"ehbc=2E312F&ll=47.152161219376964%2C28.378650750000023&z=8",
]

def parse(self, response: Response):
json_xml = parse_response(response)
points = parse_points(json_xml)
for coords, meta in points:
name, address, city, capacity = parse_point_meta(meta)
yield {
"name": name,
"country": "Moldova",
"city": city,
"address": f"Republica Moldova, {city}, {address}",
"categories": ["Accommodation"],
"description": f"Capacity: {capacity}",
"organizations": ["Domomoga Moldova"],
"lat": coords[0][0][0],
"lng": coords[0][0][1],
}


def parse_response(response):
regexp = re.compile(r'var _pageData = "(.*)";', re.S)
json_str, = re.search(regexp, response.text).groups()
return json.loads(json_str.replace("\\", ""))


# XXX: there must be something that encodes/decodes this kind of structure
def parse_points(json_xml):
Collaborator: Maybe add a dedicated unit test for this function so that we can see what kind of JSON you're parsing? In theory, you could deserialize it into a pydantic model.
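
For illustration, such a test might pin down the nesting parse_points expects; the fixture below is hand-built and hypothetical, not the real _pageData payload.

# Hypothetical tests/test_moldova_dopomoga.py
from scraping.spiders.moldova_dopomoga import parse_points


def test_parse_points_yields_coords_and_meta():
    coords = [[["47.15", "28.37"]]]          # shape later read as coords[0][0][0/1]
    meta = ["name-blob", 0, 0, ["more"]]     # opaque blob, passed through as-is
    d = ["id", coords, 0, 0, 0, meta, 0, 0]  # d[1] and d[5] get yielded
    c = [d, 0, 0, 0, 0, 0, 0, 0]             # len(c) > 7, so it is scanned
    b = [c]
    a = [[b]]                                # a[0] is what the spider iterates
    json_xml = [0, [0, 0, 0, 0, 0, 0, [[a]]]]

    assert list(parse_points(json_xml)) == [(coords, meta)]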

for a in json_xml[1][6][0]:
if type(a) is not list or not a:
continue
for b in a[0]:
if type(b) is not list or not b:
continue
for c in b:
if len(c) <= 7:
continue
for d in c:
if type(d) is not list or not d:
continue
yield (d[1], d[5])


def parse_point_meta(meta):
name_blob, _, _, more = meta
_, city_blob, capacity_blob = more
name_address = name_blob[1].pop()
city = city_blob[1].pop()
name, address = parse_details(name_address, city=city)
return name, address, city, capacity_blob[1].pop()


def clean(value: str) -> str:
@@ -63,8 +125,7 @@ def parse_details(details: str, city: str = None) -> Tuple[str, str]:
address = strip_punctuation(parts[0])
return name, address or name


parts = details.split(',', 1)
parts = details.split(",", 1)
if len(parts) == 1:
value = clean(parts[0])
return value, value
@@ -104,31 +165,3 @@ def find_venue(details: str) -> Union[str, None]:
if venue_prefix:
name = sorted(details.split(venue_prefix), key=lambda x: -len(x))[0]
return f"{venue_prefix}{name}"


class DopomogaSpider(Spider):
name = "dopomoga"
start_urls = [
'https://dopomoga.gov.md/akkreditovannye-centry-dlya-bezhencev/'
]

def parse(self, response: Response):
rows: List[Selector] = response.css('.ty-wysiwyg-content table tr')
for row in rows[1:]:
_, city, details, capacity = [clean(cell) for cell in row.css('td::text').getall()]

name, address = parse_details(details, city=city)

point = {
"name": name,
"country": "Moldova",
"city": city,
"address": f"Republica Moldova, {city}, {address}",
"categories": "Accommodation",
"description": f"Capacity: {capacity}",
"organizations": "",
"lat": "",
"lng": "",
}

yield point
20 changes: 11 additions & 9 deletions scraping/spiders/poland_rjps.py
@@ -57,9 +57,12 @@ def parse(self, response, category: str):
'address': self._get_address(response),
'lat': '',
'lng': '',
'category': category or DEFAULT_CATEGORY,
'organizations': '',
'description': self._get_description(response)
'categories': [category or DEFAULT_CATEGORY],
'organizations': ['Poland RJPS'],
'description': self._get_description(response),
'phone': self._get_phone(response),
'email': self._get_email(response),
'url': self._get_website(response),
}

def _get_name(self, response):
@@ -74,11 +77,9 @@ def _get_address(self, response):
return ''.join(lines)

def _get_description(self, response):
rows = [self._get_email(response),
self._get_phone(response),
self._get_website(response),
self._get_update_date(response)]

rows = [
self._get_update_date(response),
]
return '\n'.join(map(self._clean_spaces, rows))

def _get_email(self, response):
@@ -88,7 +89,8 @@ def _get_phone(self, response):
return response.css('div[title=Telefon] > div > span.wrap-anywhere::text').get() or ''

def _get_website(self, response):
return response.css('div[title="Strona www"] > div > div::text').get() or ''
url = response.css('div[title="Strona www"] > div > div::text').get() or ''
return url.strip()

def _get_update_date(self, response):
data = self._clean_spaces(response.css('body > div > div > div > div.data-aktualizacji::text').get()) or ''