-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature/point of interest structure upgrade #22
Changes from 5 commits
034971b
d2cd13e
2d0304b
3296c96
19fc0f0
b36f777
55f5f43
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,24 @@ | ||
from typing import List | ||
from pydantic import BaseModel | ||
|
||
|
||
class PointOfInterest(BaseModel):
    """A single point of interest (POI) record produced by the spiders.

    Every field defaults to an empty value so partially-filled records
    from any spider still validate. This schema revision upgrades
    ``categories``/``organizations`` from comma-joined strings to lists.
    """

    # Identity / location.
    name: str = ''
    country: str = ''
    city: str = ''
    address: str = ''
    # Classification labels (lists as of this schema revision).
    categories: List[str] = []
    organizations: List[str] = []
    description: str = ''
    # Coordinates are kept as strings — spiders emit them verbatim.
    lat: str = ''
    lng: str = ''
    # Contact channels.
    phone: str = ''
    email: str = ''
    url: str = ''
    socialmedia: str = ''
    messenger: str = ''
    # NOTE(review): two more fields were added to the upstream schema
    # while this change was in flight (see PR discussion).
    open_hours: str = ''
    tags: List[str] = []
    icon: str = ''
    # Moderation flags.
    approved: bool = True
    active: bool = True
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,9 @@ | ||
import re | ||
import logging | ||
import json | ||
from typing import Tuple, Union | ||
|
||
from typing import List, Tuple, Union | ||
|
||
from scrapy import Spider, Selector | ||
from scrapy import Spider | ||
from scrapy.http import Response | ||
|
||
|
||
|
# Address-parsing patterns used by parse_details below.
# BASE_ADDRESS_PATTERN is defined earlier in this module — not visible
# in this hunk, TODO confirm.
ADDRESS_PATTERN1 = r"(s.\s)(.+)(,\sstr.\s)(\D+)([\d\w/]+)"
ADDRESS_PATTERN2 = rf"{BASE_ADDRESS_PATTERN}(,|\()"
ADDRESS_PATTERN3 = rf"{BASE_ADDRESS_PATTERN}(,|[\d\w/]+|\()"
# Known street names and venue-name prefixes used by the address
# heuristics when splitting a details string into name/address.
KNOWN_STREETS = {"Hristo Botev"}
VENUE_PREFIXES = {"IP", "Centrul", "Complexul", "Biserica"}


log = logging.getLogger(__name__)
|
||
class DopomogaSpider(Spider):
    """Scrape accommodation points for refugees in Moldova from the
    Google My Maps embed that backs dopomoga.gov.md.

    Yields one dict per mapped point, shaped to match the
    PointOfInterest schema (categories/organizations as lists).
    """

    name = "dopomoga"
    # Google My Maps embed URL (map id + viewport parameters).
    start_urls = [
        "https://www.google.com/maps/d/embed?"
        "mid=1RiyRvxOE8q3Z-eo7e298nLn_I0nKXqq6&"
        "ehbc=2E312F&ll=47.152161219376964%2C28.378650750000023&z=8",
    ]

    def parse(self, response: Response):
        # The embed ships its data as a JS `_pageData` blob; decode it
        # and walk the nested-list structure for individual points.
        json_xml = parse_response(response)
        points = parse_points(json_xml)
        for coords, meta in points:
            name, address, city, capacity = parse_point_meta(meta)
            yield {
                "name": name,
                "country": "Moldova",
                "city": city,
                "address": f"Republica Moldova, {city}, {address}",
                "categories": ["Accommodation"],
                "description": f"Capacity: {capacity}",
                # NOTE(review): "Domomoga" looks like a typo for
                # "Dopomoga" — confirm before changing (runtime value).
                "organizations": ["Domomoga Moldova"],
                # coords is a nested list; the first vertex holds lat/lng.
                "lat": coords[0][0][0],
                "lng": coords[0][0][1],
            }
|
||
|
||
def parse_response(response):
    """Extract and decode the embedded ``_pageData`` JSON blob.

    Args:
        response: any object with a ``.text`` attribute holding the
            My Maps embed page HTML.

    Returns:
        The decoded JSON structure (deeply nested lists).

    Raises:
        ValueError: if the page contains no ``_pageData`` assignment
            (previously this crashed with an opaque AttributeError).
    """
    match = re.search(r'var _pageData = "(.*)";', response.text, re.S)
    if match is None:
        raise ValueError("no _pageData blob found in response")
    json_str, = match.groups()
    # NOTE(review): stripping every backslash is lossy (it also removes
    # escapes inside string values), but it is what this scraper relies
    # on to undo the JS string escaping — TODO confirm acceptable.
    return json.loads(json_str.replace("\\", ""))
|
||
|
||
# TODO(review): add a dedicated unit test showing the JSON shape this
# parses; in theory the structure could be deserialized into a pydantic
# model instead of walked by hand.
def parse_points(json_xml):
    """Yield ``(coords, meta)`` pairs for every point in a My Maps blob.

    The decoded ``_pageData`` structure is a deeply nested list; this
    walks the known path (``[1][6][0]``), skips non-list padding at each
    level, and yields each point's geometry (``point[1]``) and metadata
    (``point[5]``). Entries of length <= 7 are padding, not points.
    """
    for layer in json_xml[1][6][0]:
        if not isinstance(layer, list) or not layer:
            continue
        for group in layer[0]:
            if not isinstance(group, list) or not group:
                continue
            for entry in group:
                # Short entries are structural padding, not points.
                if len(entry) <= 7:
                    continue
                for point in entry:
                    if not isinstance(point, list) or not point:
                        continue
                    yield (point[1], point[5])
|
||
|
||
def parse_point_meta(meta):
    """Unpack a raw My Maps metadata blob.

    Returns a ``(name, address, city, capacity)`` tuple. ``meta`` is a
    4-element nested list whose last element carries the city and
    capacity blobs. Note that ``pop()`` mutates the blobs in place.
    """
    name_blob, _, _, more = meta
    _, city_blob, capacity_blob = more
    city = city_blob[1].pop()
    name, address = parse_details(name_blob[1].pop(), city=city)
    capacity = capacity_blob[1].pop()
    return name, address, city, capacity
|
||
|
||
def clean(value: str) -> str: | ||
|
@@ -63,8 +125,7 @@ def parse_details(details: str, city: str = None) -> Tuple[str, str]: | |
address = strip_punctuation(parts[0]) | ||
return name, address or name | ||
|
||
|
||
parts = details.split(',', 1) | ||
parts = details.split(",", 1) | ||
if len(parts) == 1: | ||
value = clean(parts[0]) | ||
return value, value | ||
|
@@ -104,31 +165,3 @@ def find_venue(details: str) -> Union[str, None]: | |
if venue_prefix: | ||
name = sorted(details.split(venue_prefix), key=lambda x: -len(x))[0] | ||
return f"{venue_prefix}{name}" | ||
|
||
|
||
class DopomogaSpider(Spider):
    """Legacy spider (pre-change side of this diff): scraped the
    accredited refugee centres HTML table from dopomoga.gov.md.

    Replaced in this PR by the Google My Maps based spider above.
    """

    name = "dopomoga"
    start_urls = [
        'https://dopomoga.gov.md/akkreditovannye-centry-dlya-bezhencev/'
    ]

    def parse(self, response: Response):
        # Rows of the centres table; rows[0] is the header and skipped.
        rows: List[Selector] = response.css('.ty-wysiwyg-content table tr')
        for row in rows[1:]:
            # Column layout: (index, city, details, capacity) —
            # the first cell is discarded.
            _, city, details, capacity = [clean(cell) for cell in row.css('td::text').getall()]

            name, address = parse_details(details, city=city)

            point = {
                "name": name,
                "country": "Moldova",
                "city": city,
                "address": f"Republica Moldova, {city}, {address}",
                # Old schema: comma-joined strings, not lists.
                "categories": "Accommodation",
                "description": f"Capacity: {capacity}",
                "organizations": "",
                "lat": "",
                "lng": "",
            }

            yield point
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Make sure that these convert correctly into CSV and don't break existing unit tests for spiders
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did fix the tests, but CSV export might actually work a bit differently. Let me poke around it a bit more…