Skip to content

Commit

Permalink
Feature/point of interest structure upgrade (#22)
Browse files Browse the repository at this point in the history
* Upgrade PointOfInterest structure.

* Upgrade PointOfInterest structure, more changes after fresh rebase.

* Fix moldova_dopomoga crawler.

* moldova_dopomoga new source.

* A couple of fixes.

* More messenger fields.

* Tres commas fix.

Co-authored-by: murchik <[email protected]>
  • Loading branch information
moorchegue and murchik authored Jun 11, 2022
1 parent af62fae commit af0345e
Show file tree
Hide file tree
Showing 21 changed files with 193 additions and 131 deletions.
24 changes: 17 additions & 7 deletions adapters/spreadsheet_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
class SpreadsheetAdapter:
ALIASES = {
# Same: name, city, address, organizations, descriptions
"latitude": "lat",
"longitude": "lng",
"category": "categories",
"country_code": "country"
'latitude': 'lat',
'longitude': 'lng',
'category': 'categories',
'country_code': 'country',
'opening_hours': 'open_hours',
}

def transform(self, source: List[SpreadsheetRow]) -> List[PointOfInterest]:
Expand All @@ -23,7 +24,7 @@ def transform_row(self, row: SpreadsheetRow) -> PointOfInterest:

for name, value in row.dict().items():
field = self.ALIASES.get(name, name)
converter = getattr(self, f"convert_{field}", self.convert_noop)
converter = getattr(self, f'convert_{field}', self.convert_noop)
fields[field] = converter(value)

return PointOfInterest(**fields)
Expand All @@ -33,14 +34,23 @@ def convert_noop(self, value: str) -> str:

def convert_country(self, country_code: str) -> str:
    """Resolve an ISO 3166-1 alpha-2 code to the full country name.

    Returns '' when pycountry does not recognise the code.
    (Merge residue removed: the hunk carried both the old double-quoted
    and the new single-quoted return, leaving the second unreachable.)
    """
    country = pycountry.countries.get(alpha_2=country_code)
    return country.name if country else ''

def convert_lat(self, latitude: str) -> str:
    """Normalise the latitude string's decimal separator (',' -> '.')."""
    normalised = self._convert_number(latitude)
    return normalised

def convert_lng(self, longitude: str) -> str:
    """Normalise the longitude string's decimal separator (',' -> '.')."""
    normalised = self._convert_number(longitude)
    return normalised

def convert_organizations(self, organizations: str) -> List[str]:
    """Split the comma-separated organizations cell into a list.

    Guards the empty cell: bare str.split would return [''], which would
    serialise a phantom empty organization into the output model.
    """
    return organizations.split(', ') if organizations else []

def convert_categories(self, categories: str) -> List[str]:
    """Split the comma-separated categories cell into a list.

    Guards the empty cell: bare str.split would return [''] instead of [].
    """
    return categories.split(', ') if categories else []

def convert_tags(self, tags: str) -> List[str]:
    """Split the comma-separated tags cell into a list.

    Guards the empty cell: bare str.split would return [''] instead of [].
    """
    return tags.split(', ') if tags else []

@staticmethod
def _convert_number(number: str) -> str:
    """Normalise a numeric cell's decimal separator from ',' to '.'.

    (Merge residue removed: the hunk carried both the old double-quoted
    and the new single-quoted return, leaving the second unreachable.)
    """
    return number.replace(',', '.')
3 changes: 1 addition & 2 deletions config/constants.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from typing import Set


DEFAULT_CATEGORY = 'Any help'

CATEGORIES: Set[str] = {
'Clothes',
'Accommodation',
Expand All @@ -21,4 +19,5 @@
'Disability support',
'Pets'
}

DEFAULT_CATEGORY: str = 'Any help'
31 changes: 22 additions & 9 deletions models/point_of_interest.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,26 @@
from typing import List
from pydantic import BaseModel


class PointOfInterest(BaseModel):
# Canonical point-of-interest record shared by every crawler and adapter.
# NOTE(review): this span is a unified-diff view — the first nine field
# declarations are the pre-upgrade schema and each is re-declared (and
# therefore overridden) by the upgraded declarations that follow.
name: str = ""
country: str = ""
city: str = ""
address: str = ""
lat: str = ""
lng: str = ""
categories: str = ""
organizations: str = ""
description: str = ""
# --- upgraded schema starts here ---
name: str = ''
country: str = ''
city: str = ''
address: str = ''
# categories/organizations are now real lists (previously comma-joined strings)
categories: List[str] = []
organizations: List[str] = []
description: str = ''
# coordinates stay as strings; spreadsheet source uses ',' decimals
lat: str = ''
lng: str = ''
# contact fields promoted out of the free-text description
phone: str = ''
email: str = ''
url: str = ''
# messenger handles
socialmedia: str = ''
fb_messenger: str = ''
telegram: str = ''
whatsapp: str = ''
open_hours: str = ''
tags: List[str] = []
icon: str = ''
# moderation flags — new records default to visible
approved: bool = True
active: bool = True
13 changes: 12 additions & 1 deletion repositories/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ def close_file(file: TextIO):
file.close()


def convert_list(data: list) -> str:
    """Flatten a list-valued model field into a comma-separated string
    suitable for a single CSV cell."""
    parts = list(data)
    return ','.join(parts)


@dataclass
class CSVRepository:

Expand All @@ -28,13 +32,20 @@ class CSVRepository:
close_file: Callable = close_file

def write(self, entries: List[PointOfInterest]) -> List[PointOfInterest]:
converters = {
list: convert_list,
}
fieldnames = list(PointOfInterest.schema()["properties"].keys())
file = self.open_file(self.settings.output_file)

writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
for entry in entries:
writer.writerow(entry.dict())
row = {
k: converters.get(type(v), lambda x: x)(v)
for k, v in entry.dict().items()
}
writer.writerow(row)

self.close_file(file)

Expand Down
10 changes: 4 additions & 6 deletions scraping/spiders/canada_immigration_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,13 @@ def parse(self, response, **kwargs):
'city': point['City']['en'],
'address': f'Canada, {point["Province"]["en"]}, '
f'{point["City"]["en"]}, {point["Address"]["en"]}',
'categories': ','.join(relevant_categories),
'categories': relevant_categories,
'description': '\n'.join(description),
'organizations': '',
'organizations': ['Canada IRCC'],
'lat': point['Coordinates']['Latitude'],
'lng': point['Coordinates']['Longitude'],
'phone': point.get('Telephone', {}).get('en', ''),
'email': point.get('Email', {}).get('en', ''),
}


Expand Down Expand Up @@ -97,10 +99,6 @@ def build_description(point, other_categories):
description = []
if point.get('Note', {}).get('en'):
description.append(f'{point["Note"]["en"]}')
if point.get('Telephone', {}).get('en'):
description.append(f'Phone: {point["Telephone"]["en"]}')
if point.get('Email', {}).get('en'):
description.append(f'Email: {point["Email"]["en"]}')
if other_categories:
description.append(f'Other services: ' + ', '.join(other_categories))
return description
28 changes: 6 additions & 22 deletions scraping/spiders/france_red_cross.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ def parse_point(block):
point = {
'name': name,
'country': 'France',
'organizations': ['Red Cross France'],
'lat': '',
'lng': '',
}

# XXX: all info here is in French, might need to either make it explicit
Expand All @@ -83,12 +86,6 @@ def parse_point(block):

normalize_point_data(point)

point.update({
"organizations": "",
"lat": "",
"lng": "",
})

return point


Expand All @@ -109,23 +106,10 @@ def normalize_point_data(point):
if not point['categories']:
point['categories'] = [DEFAULT_CATEGORY]

point['categories'] = ','.join(point['categories'])

if point.get('_other_services'):
services = ', '.join(point['_other_services'])
point['description'] += f'\nOther services: {services}'

if point.get('_website'):
point['description'] += f'\nWebsite: {point["_website"]}'

if point.get('_phone'):
point['description'] += f'\nContact information: {point["_phone"]}'
if point.get('_fax'):
point['description'] += f', fax {point["_phone"]}'

if point.get('_working_hours'):
point['description'] += f'\nWorking hours: {point["_working_hours"]}'

point['description'] = point['description'].strip()

# all keys starting with _ are for processing purposes only
Expand All @@ -139,10 +123,10 @@ def parse_point_keys(paragraph):
'adresse': (None, parse_address),
'actions': (None, parse_actions),
'filière ': (None, parse_categories),
'site web': ('_website', parse_website),
'téléphone': ('_phone', parse_default_key),
'site web': ('url', parse_website),
'téléphone': ('phone', parse_default_key),
'fax': ('_fax', parse_default_key),
"heures d'ouverture": ('_working_hours', parse_default_key),
"heures d'ouverture": ('open_hours', parse_default_key),
}

title = paragraph.css('strong::text').get().lower()
Expand Down
103 changes: 68 additions & 35 deletions scraping/spiders/moldova_dopomoga.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re
import logging
import json
from typing import Tuple, Union

from typing import List, Tuple, Union

from scrapy import Spider, Selector
from scrapy import Spider
from scrapy.http import Response


Expand All @@ -11,8 +12,69 @@
ADDRESS_PATTERN1 = r"(s.\s)(.+)(,\sstr.\s)(\D+)([\d\w/]+)"
ADDRESS_PATTERN2 = rf"{BASE_ADDRESS_PATTERN}(,|\()"
ADDRESS_PATTERN3 = rf"{BASE_ADDRESS_PATTERN}(,|[\d\w/]+|\()"
KNOWN_STREETS = {'Hristo Botev'}
VENUE_PREFIXES = {'IP', 'Centrul', 'Complexul', 'Biserica'}
KNOWN_STREETS = {"Hristo Botev"}
VENUE_PREFIXES = {"IP", "Centrul", "Complexul", "Biserica"}


log = logging.getLogger(__name__)


class DopomogaSpider(Spider):
    """Scrape refugee accommodation centres from the Dopomoga Moldova
    Google My Maps embed, yielding PointOfInterest-shaped dicts."""

    name = "dopomoga"
    start_urls = [
        "https://www.google.com/maps/d/embed?"
        "mid=1RiyRvxOE8q3Z-eo7e298nLn_I0nKXqq6&"
        "ehbc=2E312F&ll=47.152161219376964%2C28.378650750000023&z=8",
    ]

    def parse(self, response: Response):
        """Yield one accommodation point per map marker.

        The embed page carries its data inside an inline `_pageData`
        JSON blob; the module helpers decode it and walk the nested
        arrays to recover (coordinates, metadata) pairs.
        """
        json_xml = parse_response(response)
        points = parse_points(json_xml)
        for coords, meta in points:
            name, address, city, capacity = parse_point_meta(meta)
            yield {
                "name": name,
                "country": "Moldova",
                "city": city,
                "address": f"Republica Moldova, {city}, {address}",
                "categories": ["Accommodation"],
                "description": f"Capacity: {capacity}",
                # Fix: was misspelled "Domomoga Moldova" (site: dopomoga.gov.md).
                "organizations": ["Dopomoga Moldova"],
                "lat": coords[0][0][0],
                "lng": coords[0][0][1],
            }


def parse_response(response):
    """Extract and decode the inline `_pageData` JSON blob from the
    Google My Maps embed page HTML.

    The blob is double-escaped in the page source, so every backslash
    is stripped before json.loads.
    """
    match = re.search(r'var _pageData = "(.*)";', response.text, re.S)
    payload = match.group(1)
    return json.loads(payload.replace("\\", ""))


# XXX: there must be some library that encodes/decodes this kind of structure
def parse_points(json_xml):
for a in json_xml[1][6][0]:
if type(a) is not list or not a:
continue
for b in a[0]:
if type(b) is not list or not b:
continue
for c in b:
if len(c) <= 7:
continue
for d in c:
if type(d) is not list or not d:
continue
yield (d[1], d[5])


def parse_point_meta(meta):
    """Split a raw marker meta blob into (name, address, city, capacity).

    Each *_blob is a two-element wrapper whose payload list ends with the
    value of interest; .pop() deliberately consumes it in place.
    """
    title_blob, _, _, extra = meta
    _, town_blob, cap_blob = extra
    raw_title = title_blob[1].pop()
    town = town_blob[1].pop()
    title, street = parse_details(raw_title, city=town)
    return title, street, town, cap_blob[1].pop()


def clean(value: str) -> str:
Expand Down Expand Up @@ -63,8 +125,7 @@ def parse_details(details: str, city: str = None) -> Tuple[str, str]:
address = strip_punctuation(parts[0])
return name, address or name


parts = details.split(',', 1)
parts = details.split(",", 1)
if len(parts) == 1:
value = clean(parts[0])
return value, value
Expand Down Expand Up @@ -104,31 +165,3 @@ def find_venue(details: str) -> Union[str, None]:
if venue_prefix:
name = sorted(details.split(venue_prefix), key=lambda x: -len(x))[0]
return f"{venue_prefix}{name}"


# Legacy scraper kept in this hunk: reads the accredited-centres HTML table
# on dopomoga.gov.md instead of the Google My Maps embed.
class DopomogaSpider(Spider):
name = "dopomoga"
start_urls = [
'https://dopomoga.gov.md/akkreditovannye-centry-dlya-bezhencev/'
]

# Yield one accommodation point per table row (header row skipped).
def parse(self, response: Response):
rows: List[Selector] = response.css('.ty-wysiwyg-content table tr')
for row in rows[1:]:
# Each row is: <ignored>, city, details (name + address), capacity.
_, city, details, capacity = [clean(cell) for cell in row.css('td::text').getall()]

name, address = parse_details(details, city=city)

# NOTE(review): pre-upgrade shape — categories/organizations are plain
# strings here and coordinates are left empty.
point = {
"name": name,
"country": "Moldova",
"city": city,
"address": f"Republica Moldova, {city}, {address}",
"categories": "Accommodation",
"description": f"Capacity: {capacity}",
"organizations": "",
"lat": "",
"lng": "",
}

yield point
20 changes: 11 additions & 9 deletions scraping/spiders/poland_rjps.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,12 @@ def parse(self, response, category: str):
'address': self._get_address(response),
'lat': '',
'lng': '',
'category': category or DEFAULT_CATEGORY,
'organizations': '',
'description': self._get_description(response)
'categories': [category or DEFAULT_CATEGORY],
'organizations': ['Poland RJPS'],
'description': self._get_description(response),
'phone': self._get_phone(response),
'email': self._get_email(response),
'url': self._get_website(response),
}

def _get_name(self, response):
Expand All @@ -74,11 +77,9 @@ def _get_address(self, response):
return ''.join(lines)

def _get_description(self, response):
    """Build the free-text description for a point.

    Contact details (email/phone/website) now live in dedicated fields,
    so only the last-update date remains here.
    (Merge residue removed: a dead first `rows` assignment — the old
    list including the contact getters — was immediately shadowed.)
    """
    rows = [
        self._get_update_date(response),
    ]
    return '\n'.join(map(self._clean_spaces, rows))

def _get_email(self, response):
Expand All @@ -88,7 +89,8 @@ def _get_phone(self, response):
return response.css('div[title=Telefon] > div > span.wrap-anywhere::text').get() or ''

def _get_website(self, response):
    """Return the organisation's website URL, stripped of whitespace;
    '' when the page has no website field.

    (Merge residue removed: the hunk's old unconditional return made
    the new strip step unreachable.)
    """
    url = response.css('div[title="Strona www"] > div > div::text').get() or ''
    return url.strip()

def _get_update_date(self, response):
data = self._clean_spaces(response.css('body > div > div > div > div.data-aktualizacji::text').get()) or ''
Expand Down
Loading

0 comments on commit af0345e

Please sign in to comment.