From 8d5cbfeb78e5a58ab057659929fda9f2382ab5b2 Mon Sep 17 00:00:00 2001 From: msj Date: Fri, 16 Dec 2022 16:25:35 -0500 Subject: [PATCH 01/29] Always record when objects are seen in scrapes --- pupa/importers/base.py | 5 +++- pupa/tests/importers/test_base_importer.py | 27 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pupa/importers/base.py b/pupa/importers/base.py index 0b46bab9..3fa0c40d 100644 --- a/pupa/importers/base.py +++ b/pupa/importers/base.py @@ -282,7 +282,10 @@ def import_item(self, data): what = 'update' if what == 'update': - obj.save() + obj.last_updated = utcnow() + + # Refresh the object's last_seen field whether or not we updated + obj.save() # need to create the data else: diff --git a/pupa/tests/importers/test_base_importer.py b/pupa/tests/importers/test_base_importer.py index 489f7ecb..a321b7d8 100644 --- a/pupa/tests/importers/test_base_importer.py +++ b/pupa/tests/importers/test_base_importer.py @@ -84,6 +84,33 @@ def test_apply_transformers(): # but for completeness maybe it is better to do these on each type? +@pytest.mark.django_db +def test_last_seen_updates_on_scrape(): + create_jurisdiction() + o = Organization.objects.create(name='WWE', jurisdiction_id='jid') + + p = Person.objects.create(name='George Washington', family_name='Washington') + p.memberships.create(organization=o) + + expected_updated_at = p.updated_at + last_seen_before_scrape = p.last_seen + + # Simulate no-op scrape + scraped_p = ScrapePerson('George Washington').as_dict() + PersonImporter('jid').import_data([scraped_p]) + + p.refresh_from_db() + + assert p.updated_at < p.last_seen, "Should refresh last_seen but not updated_at" + assert ( + p.updated_at == expected_updated_at + ), "Should not refresh updated_at when there's no update" + + assert ( + p.last_seen > last_seen_before_scrape + ), "Should refresh last_seen even when there's no update" + + @pytest.mark.django_db def test_deduplication_identical_object(): p1 = ScrapePerson('Dwayne').as_dict() From 21a967c9fe49809b2ac26f79e5c165ffdc5c7211 Mon Sep 17 00:00:00 2001 From: msj Date: Wed, 4 Jan 2023 11:19:25 -0500 Subject: [PATCH 02/29] Run black and fix flake8 issues --- pupa/__init__.py | 2 +- pupa/admin.py | 86 ++- pupa/cli/__main__.py | 42 +- pupa/cli/commands/base.py | 3 +- pupa/cli/commands/dbinit.py | 41 +- pupa/cli/commands/init.py | 169 +++-- pupa/cli/commands/party.py | 25 +- pupa/cli/commands/update.py | 324 +++++---- pupa/exceptions.py | 62 +- pupa/ext/ansistrm.py | 110 +-- pupa/importers/base.py | 161 +++-- pupa/importers/bills.py | 153 ++-- pupa/importers/events.py | 168 +++-- pupa/importers/jurisdiction.py | 17 +- pupa/importers/memberships.py | 69 +- pupa/importers/organizations.py | 75 +- pupa/importers/people.py | 74 +- pupa/importers/posts.py | 23 +- pupa/importers/vote_events.py | 131 ++-- pupa/migrations/0001_initial.py | 142 +++- pupa/migrations/0002_auto_20150906_1458.py | 42 +- pupa/migrations/0003_auto_20151118_0408.py | 30 +- pupa/migrations/0004_identifier.py | 28 +- pupa/migrations/0005_auto_20170522_1935.py | 10 +- .../0006_identifier_jurisdiction.py | 15 +- .../0007_sessiondataqualityreport.py | 52 +- pupa/models.py | 52 +- pupa/reports/__init__.py | 2 +- pupa/reports/session.py | 115 ++- pupa/scrape/base.py | 144 ++-- pupa/scrape/bill.py | 163 +++-- pupa/scrape/event.py | 176 +++-- pupa/scrape/jurisdiction.py | 38 +- pupa/scrape/popolo.py | 240 ++++-- pupa/scrape/schemas/bill.py | 46 +- pupa/scrape/schemas/common.py | 45 +- pupa/scrape/schemas/event.py | 85 +-- pupa/scrape/schemas/jurisdiction.py | 20 +- pupa/scrape/schemas/membership.py | 3 +- pupa/scrape/schemas/organization.py | 17 +- pupa/scrape/schemas/person.py | 13 +- pupa/scrape/schemas/post.py | 2 +- pupa/scrape/schemas/vote_event.py | 39 +- pupa/scrape/vote_event.py | 130 ++-- pupa/settings.py | 77 +- pupa/tests/importers/test_base_importer.py | 147 ++-- pupa/tests/importers/test_bill_importer.py | 428 ++++++----- pupa/tests/importers/test_event_importer.py | 271 ++++--- .../importers/test_jurisdiction_importer.py | 40 +- .../importers/test_membership_importer.py | 194 +++-- .../importers/test_organization_importer.py | 209 +++--- pupa/tests/importers/test_people_importer.py | 177 ++--- pupa/tests/importers/test_post_importer.py | 137 ++-- pupa/tests/importers/test_topsort.py | 8 +- .../importers/test_vote_event_importer.py | 683 +++++++++++------- pupa/tests/reports/test_session_report.py | 165 +++-- pupa/tests/scrape/test_bill_scrape.py | 182 +++-- pupa/tests/scrape/test_event_scrape.py | 102 +-- pupa/tests/scrape/test_jurisdiction_scrape.py | 42 +- pupa/tests/scrape/test_model_basics.py | 154 ++-- pupa/tests/scrape/test_people_org_scrape.py | 140 ++-- pupa/tests/scrape/test_scraper.py | 91 +-- pupa/tests/scrape/test_utils.py | 20 +- pupa/tests/scrape/test_vote_event_scrape.py | 148 ++-- pupa/utils/__init__.py | 11 +- pupa/utils/generic.py | 31 +- pupa/utils/topsort.py | 12 +- setup.py | 2 +- 68 files changed, 4097 insertions(+), 2758 deletions(-) diff --git a/pupa/__init__.py b/pupa/__init__.py index f70140c6..563e2bd6 100644 --- a/pupa/__init__.py +++ b/pupa/__init__.py @@ -1 +1 @@ -__version__ = '0.10.2' # pragma: no cover +__version__ = "0.10.2" # pragma: no cover diff --git a/pupa/admin.py b/pupa/admin.py index 34e40555..a627865d 100644 --- a/pupa/admin.py +++ b/pupa/admin.py @@ -4,25 +4,33 @@ class ScrapeReportInline(admin.TabularInline): model = models.ScrapeReport - readonly_fields = ('scraper', 'args', 'start_time', 'end_time', - 'get_object_list') + readonly_fields = ("scraper", "args", "start_time", "end_time", "get_object_list") def has_add_permission(self, request): return False + can_delete = False def get_object_list(self, obj): - return '\n'.join('{} ({})'.format(o.object_type, o.count) for o in - obj.scraped_objects.all()) + return "\n".join( + "{} ({})".format(o.object_type, o.count) for o in obj.scraped_objects.all() + ) class ImportObjectsInline(admin.TabularInline): model = models.ImportObjects - readonly_fields = ('object_type', 'insert_count', 'update_count', - 'noop_count', 'start_time', 'end_time') + readonly_fields = ( + "object_type", + "insert_count", + "update_count", + "noop_count", + "start_time", + "end_time", + ) def has_add_permission(self, request): return False + can_delete = False @@ -30,10 +38,16 @@ def has_add_permission(self, request): class RunPlanAdmin(admin.ModelAdmin): actions = None - readonly_fields = ('jurisdiction', 'success', 'start_time', 'end_time', - 'exception', 'traceback') - list_filter = ('jurisdiction__name', 'success') - list_display = ('jurisdiction', 'success', 'start_time') + readonly_fields = ( + "jurisdiction", + "success", + "start_time", + "end_time", + "exception", + "traceback", + ) + list_filter = ("jurisdiction__name", "success") + list_display = ("jurisdiction", "success", "start_time") inlines = [ ScrapeReportInline, ImportObjectsInline, @@ -50,31 +64,33 @@ def has_add_permission(self, request): class SessionDataQualityAdmin(admin.ModelAdmin): actions = None - readonly_fields = ('legislative_session', - 'bills_missing_actions', - 'bills_missing_sponsors', - 'bills_missing_versions', - 'votes_missing_voters', - 'votes_missing_bill', - 'votes_missing_yes_count', - 'votes_missing_no_count', - 'votes_with_bad_counts', - 'unmatched_sponsor_people', - 'unmatched_sponsor_organizations', - 'unmatched_voters', - ) - list_display = ('jurisdiction_name', - 'legislative_session', - 'bills_missing_actions', - 'bills_missing_sponsors', - 'bills_missing_versions', - 'votes_missing_voters', - 'votes_missing_bill', - 'votes_missing_yes_count', - 'votes_missing_no_count', - 'votes_with_bad_counts', - ) - list_filter = ('legislative_session__jurisdiction__name',) + readonly_fields = ( + "legislative_session", + "bills_missing_actions", + "bills_missing_sponsors", + "bills_missing_versions", + "votes_missing_voters", + "votes_missing_bill", + "votes_missing_yes_count", + "votes_missing_no_count", + "votes_with_bad_counts", + "unmatched_sponsor_people", + "unmatched_sponsor_organizations", + "unmatched_voters", + ) + list_display = ( + "jurisdiction_name", + "legislative_session", + "bills_missing_actions", + "bills_missing_sponsors", + "bills_missing_versions", + "votes_missing_voters", + "votes_missing_bill", + "votes_missing_yes_count", + "votes_missing_no_count", + "votes_with_bad_counts", + ) + list_filter = ("legislative_session__jurisdiction__name",) def jurisdiction_name(self, obj): return obj.legislative_session.jurisdiction.name diff --git a/pupa/cli/__main__.py b/pupa/cli/__main__.py index 98eafb7a..09cd450d 100644 --- a/pupa/cli/__main__.py +++ b/pupa/cli/__main__.py @@ -7,28 +7,33 @@ from django.conf import settings from pupa.exceptions import CommandError -logger = logging.getLogger('pupa') +logger = logging.getLogger("pupa") COMMAND_MODULES = ( - 'pupa.cli.commands.init', - 'pupa.cli.commands.dbinit', - 'pupa.cli.commands.update', - 'pupa.cli.commands.party', + "pupa.cli.commands.init", + "pupa.cli.commands.dbinit", + "pupa.cli.commands.update", + "pupa.cli.commands.party", ) def main(): - parser = argparse.ArgumentParser('pupa', description='pupa CLI') - parser.add_argument('--debug', action='store_true', - help='open debugger on error') - parser.add_argument('--loglevel', default='INFO', help=('set log level. options are: ' - 'DEBUG|INFO|WARNING|ERROR|CRITICAL ' - '(default is INFO)')) - subparsers = parser.add_subparsers(dest='subcommand') + parser = argparse.ArgumentParser("pupa", description="pupa CLI") + parser.add_argument("--debug", action="store_true", help="open debugger on error") + parser.add_argument( + "--loglevel", + default="INFO", + help=( + "set log level. options are: " + "DEBUG|INFO|WARNING|ERROR|CRITICAL " + "(default is INFO)" + ), + ) + subparsers = parser.add_subparsers(dest="subcommand") # configure Django before model imports if os.environ.get("DJANGO_SETTINGS_MODULE") is None: - os.environ['DJANGO_SETTINGS_MODULE'] = 'pupa.settings' + os.environ["DJANGO_SETTINGS_MODULE"] = "pupa.settings" subcommands = {} for mod in COMMAND_MODULES: @@ -42,16 +47,16 @@ def main(): args, other = parser.parse_known_args() # set log level from command line - handler_level = getattr(logging, args.loglevel.upper(), 'INFO') - settings.LOGGING['handlers']['default']['level'] = handler_level + handler_level = getattr(logging, args.loglevel.upper(), "INFO") + settings.LOGGING["handlers"]["default"]["level"] = handler_level logging.config.dictConfig(settings.LOGGING) # turn debug on if args.debug: try: - debug_module = importlib.import_module('ipdb') + debug_module = importlib.import_module("ipdb") except ImportError: - debug_module = importlib.import_module('pdb') + debug_module = importlib.import_module("pdb") # turn on PDB-on-error mode # stolen from http://stackoverflow.com/questions/1237379/ @@ -59,6 +64,7 @@ def main(): def _tb_info(type, value, tb): traceback.print_exception(type, value, tb) debug_module.pm() + sys.excepthook = _tb_info if not args.subcommand: @@ -71,5 +77,5 @@ def _tb_info(type, value, tb): sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/pupa/cli/commands/base.py b/pupa/cli/commands/base.py index 9abcfc87..88997402 100644 --- a/pupa/cli/commands/base.py +++ b/pupa/cli/commands/base.py @@ -1,5 +1,4 @@ class BaseCommand(object): - def __init__(self, subparsers): self.subparser = subparsers.add_parser(self.name, description=self.help) self.add_args() @@ -11,4 +10,4 @@ def add_argument(self, *args, **kwargs): self.subparser.add_argument(*args, **kwargs) def handle(self, args): - raise NotImplementedError('commands must implement handle(args)') + raise NotImplementedError("commands must implement handle(args)") diff --git a/pupa/cli/commands/dbinit.py b/pupa/cli/commands/dbinit.py index ade7b9ca..3615683b 100644 --- a/pupa/cli/commands/dbinit.py +++ b/pupa/cli/commands/dbinit.py @@ -7,14 +7,14 @@ def copy_tmp(tablename): cursor = connection.cursor() - print('copying data from table ' + tablename) + print("copying data from table " + tablename) cursor.execute("DROP TABLE IF EXISTS tmp_{t};".format(t=tablename)) cursor.execute("CREATE TABLE tmp_{t} (LIKE {t});".format(t=tablename)) cursor.execute("INSERT INTO tmp_{t} SELECT * FROM {t};".format(t=tablename)) def restore_from_tmp(tablename): - print('restoring data to table ' + tablename) + print("restoring data to table " + tablename) cursor = connection.cursor() cursor.execute("INSERT INTO {t} SELECT * FROM tmp_{t};".format(t=tablename)) cursor.execute("DROP TABLE IF EXISTS tmp_{t};".format(t=tablename)) @@ -24,8 +24,8 @@ def drop_tables(skip_divisions=False): tables = connection.introspection.table_names() cursor = connection.cursor() for table in tables: - if table.startswith(('opencivicdata_', 'pupa_')): - print('dropping table ' + table) + if table.startswith(("opencivicdata_", "pupa_")): + print("dropping table " + table) cursor.execute("DROP TABLE IF EXISTS {} CASCADE;".format(table)) cursor.execute("DELETE FROM django_migrations WHERE app='core';") cursor.execute("DELETE FROM django_migrations WHERE app='legislative';") @@ -33,32 +33,41 @@ def drop_tables(skip_divisions=False): class Command(BaseCommand): - name = 'dbinit' - help = 'initialize a pupa database' + name = "dbinit" + help = "initialize a pupa database" def add_args(self): - self.add_argument('--reset', action='store_true', default=False, - help='reset entire database - USE WITH CAUTION') - self.add_argument('--partial-reset', action='store_true', default=False, - help='reset entire database, except for divisions - USE WITH CAUTION') - self.add_argument(type=str, dest='country', nargs='+', - help='country to load divisions for') + self.add_argument( + "--reset", + action="store_true", + default=False, + help="reset entire database - USE WITH CAUTION", + ) + self.add_argument( + "--partial-reset", + action="store_true", + default=False, + help="reset entire database, except for divisions - USE WITH CAUTION", + ) + self.add_argument( + type=str, dest="country", nargs="+", help="country to load divisions for" + ) def handle(self, args, other): django.setup() if args.partial_reset: - copy_tmp('opencivicdata_division') + copy_tmp("opencivicdata_division") drop_tables() elif args.reset: drop_tables() else: pass - call_command('migrate', interactive=False) + call_command("migrate", interactive=False) if args.partial_reset: - restore_from_tmp('opencivicdata_division') + restore_from_tmp("opencivicdata_division") else: for country in args.country: - call_command('loaddivisions', country) + call_command("loaddivisions", country) diff --git a/pupa/cli/commands/init.py b/pupa/cli/commands/init.py index 6e45db9c..0e688970 100644 --- a/pupa/cli/commands/init.py +++ b/pupa/cli/commands/init.py @@ -6,101 +6,123 @@ from opencivicdata.divisions import Division -def prompt(ps, default=''): +def prompt(ps, default=""): return input(ps).strip() or default -CLASS_DICT = {'events': 'Event', - 'people': 'Person', - 'bills': 'Bill', - 'vote_events': 'VoteEvent'} +CLASS_DICT = { + "events": "Event", + "people": "Person", + "bills": "Bill", + "vote_events": "VoteEvent", +} -def write_jurisdiction_template(dirname, short_name, long_name, division_id, classification, url, - scraper_types): - camel_case = short_name.title().replace(' ', '') +def write_jurisdiction_template( + dirname, short_name, long_name, division_id, classification, url, scraper_types +): + camel_case = short_name.title().replace(" ", "") # write __init__ - lines = ['# encoding=utf-8', 'from pupa.scrape import Jurisdiction, Organization'] + lines = ["# encoding=utf-8", "from pupa.scrape import Jurisdiction, Organization"] for stype in scraper_types: - lines.append('from .{} import {}{}Scraper'.format(stype, camel_case, CLASS_DICT[stype])) - lines.append('') - lines.append('') - lines.append('class {}(Jurisdiction):'.format(camel_case)) + lines.append( + "from .{} import {}{}Scraper".format(stype, camel_case, CLASS_DICT[stype]) + ) + lines.append("") + lines.append("") + lines.append("class {}(Jurisdiction):".format(camel_case)) lines.append(' division_id = "{}"'.format(division_id)) lines.append(' classification = "{}"'.format(classification)) lines.append(' name = "{}"'.format(long_name)) lines.append(' url = "{}"'.format(url)) - lines.append(' scrapers = {') + lines.append(" scrapers = {") for stype in scraper_types: - lines.append(' "{}": {}{}Scraper,'.format(stype, camel_case, CLASS_DICT[stype])) - lines.append(' }') - lines.append('') - lines.append(' def get_organizations(self):') - lines.append(' #REQUIRED: define an organization using this format') - lines.append(' #where org_name is something like Seattle City Council') - lines.append(' #and classification is described here:') - - lines.append(' org = Organization(name="org_name", classification="legislature")') - lines.append('') - - lines.append(' # OPTIONAL: add posts to your organizaion using this format,') - lines.append(' # where label is a human-readable description of the post ' - '(eg "Ward 8 councilmember")') - lines.append(' # and role is the position type (eg councilmember, alderman, mayor...)') - lines.append(' # skip entirely if you\'re not writing a people scraper.') - lines.append(' org.add_post(label="position_description", role="position_type")') - lines.append('') - lines.append(' #REQUIRED: yield the organization') - lines.append(' yield org') - lines.append('') - - with open(os.path.join(dirname, '__init__.py'), 'w') as of: - of.write('\n'.join(lines)) + lines.append( + ' "{}": {}{}Scraper,'.format(stype, camel_case, CLASS_DICT[stype]) + ) + lines.append(" }") + lines.append("") + lines.append(" def get_organizations(self):") + lines.append(" #REQUIRED: define an organization using this format") + lines.append(" #where org_name is something like Seattle City Council") + lines.append(" #and classification is described here:") + + lines.append( + ' org = Organization(name="org_name", classification="legislature")' + ) + lines.append("") + + lines.append(" # OPTIONAL: add posts to your organizaion using this format,") + lines.append( + " # where label is a human-readable description of the post " + '(eg "Ward 8 councilmember")' + ) + lines.append( + " # and role is the position type (eg councilmember, alderman, mayor...)" + ) + lines.append(" # skip entirely if you're not writing a people scraper.") + lines.append( + ' org.add_post(label="position_description", role="position_type")' + ) + lines.append("") + lines.append(" #REQUIRED: yield the organization") + lines.append(" yield org") + lines.append("") + + with open(os.path.join(dirname, "__init__.py"), "w") as of: + of.write("\n".join(lines)) # write scraper files for stype in scraper_types: - lines = ['from pupa.scrape import Scraper'] - lines.append('from pupa.scrape import {}'.format(CLASS_DICT[stype])) - lines.append('') - lines.append('') - lines.append('class {}{}Scraper(Scraper):'.format(camel_case, CLASS_DICT[stype])) - lines.append('') - lines.append(' def scrape(self):') - lines.append(' # needs to be implemented') - lines.append(' pass') - lines.append('') - with open(os.path.join(dirname, stype + '.py'), 'w') as of: - of.write('\n'.join(lines)) + lines = ["from pupa.scrape import Scraper"] + lines.append("from pupa.scrape import {}".format(CLASS_DICT[stype])) + lines.append("") + lines.append("") + lines.append( + "class {}{}Scraper(Scraper):".format(camel_case, CLASS_DICT[stype]) + ) + lines.append("") + lines.append(" def scrape(self):") + lines.append(" # needs to be implemented") + lines.append(" pass") + lines.append("") + with open(os.path.join(dirname, stype + ".py"), "w") as of: + of.write("\n".join(lines)) class Command(BaseCommand): - name = 'init' - help = 'start a new pupa scraper' + name = "init" + help = "start a new pupa scraper" def add_args(self): - self.add_argument('module', type=str, help='name of the new scraper module') + self.add_argument("module", type=str, help="name of the new scraper module") def handle(self, args, other): if os.path.exists(args.module): - raise CommandError('Directory {} already exists'.format(repr(args.module))) + raise CommandError("Directory {} already exists".format(repr(args.module))) division = None while not division: - division = prompt('division id (see https://github.com/opencivicdata/' - 'ocd-division-ids/tree/master/identifiers): ') + division = prompt( + "division id (see https://github.com/opencivicdata/" + "ocd-division-ids/tree/master/identifiers): " + ) if not division: print("\nERROR: Division ID is required.\n") try: Division.get(division) except (ValueError, IndexError): - raise CommandError('Division ID {} is invalid'.format(repr(division))) + raise CommandError("Division ID {} is invalid".format(repr(division))) - name = prompt('jurisdiction name (e.g. City of Seattle): ') - classification = prompt('classification (can be: {}): ' - .format(', '.join(JURISDICTION_CLASSIFICATIONS))) - url = prompt('official url (e.g. http://www.seattle.gov/): ') + name = prompt("jurisdiction name (e.g. City of Seattle): ") + classification = prompt( + "classification (can be: {}): ".format( + ", ".join(JURISDICTION_CLASSIFICATIONS) + ) + ) + url = prompt("official url (e.g. http://www.seattle.gov/): ") os.makedirs(args.module) @@ -108,14 +130,23 @@ def handle(self, args, other): selected_scraper_types = [] for stype in CLASS_DICT.keys(): if selected_scraper_types: - default = 'N' - hint = '[y/N]' + default = "N" + hint = "[y/N]" else: - default = 'Y' - hint = '[Y/n]' - result = prompt('create {} scraper? {}: '.format(stype, hint), default).upper() - if result == 'Y': + default = "Y" + hint = "[Y/n]" + result = prompt( + "create {} scraper? {}: ".format(stype, hint), default + ).upper() + if result == "Y": selected_scraper_types.append(stype) - write_jurisdiction_template(args.module, args.module, name, division, classification, url, - selected_scraper_types) + write_jurisdiction_template( + args.module, + args.module, + name, + division, + classification, + url, + selected_scraper_types, + ) diff --git a/pupa/cli/commands/party.py b/pupa/cli/commands/party.py index aedbc392..1ce5a718 100644 --- a/pupa/cli/commands/party.py +++ b/pupa/cli/commands/party.py @@ -4,26 +4,29 @@ class Command(BaseCommand): - name = 'party' - help = 'command line tool to manage parties' + name = "party" + help = "command line tool to manage parties" def add_args(self): - self.add_argument('action', type=str, help='add|list') - self.add_argument('party_name', type=str, nargs='?') + self.add_argument("action", type=str, help="add|list") + self.add_argument("party_name", type=str, nargs="?") def handle(self, args, other): django.setup() from opencivicdata.core.models import Organization - if args.action == 'add': - o, created = Organization.objects.get_or_create(name=args.party_name, - classification='party') + if args.action == "add": + o, created = Organization.objects.get_or_create( + name=args.party_name, classification="party" + ) if created: - print('added {}'.format(o)) + print("added {}".format(o)) else: - print('{} already exists'.format(o)) - elif args.action == 'list': - for party in Organization.objects.filter(classification='party').order_by('name'): + print("{} already exists".format(o)) + elif args.action == "list": + for party in Organization.objects.filter(classification="party").order_by( + "name" + ): print(party.name) else: raise CommandError('party action must be "add" or "list"') diff --git a/pupa/cli/commands/update.py b/pupa/cli/commands/update.py index 033ca4e8..b9441202 100644 --- a/pupa/cli/commands/update.py +++ b/pupa/cli/commands/update.py @@ -17,7 +17,7 @@ from .base import BaseCommand -ALL_ACTIONS = ('scrape', 'import') +ALL_ACTIONS = ("scrape", "import") class _Unset: @@ -42,23 +42,26 @@ def override_settings(settings, overrides): def print_report(report): - plan = report['plan'] - print('{} ({})'.format(plan['module'], ', '.join(plan['actions']))) - for scraper, args in plan['scrapers'].items(): - print(' {}: {}'.format(scraper, args)) - if 'scrape' in report: - for type, details in sorted(report['scrape'].items()): - print(type + ' scrape:') - print(' duration: ', (details['end'] - details['start'])) - print(' objects:') - for objtype, num in sorted(details['objects'].items()): - print(' {}: {}'.format(objtype, num)) - if 'import' in report: - print('import:') - for type, changes in sorted(report['import'].items()): - if(changes['insert'] or changes['update'] or changes['noop']): - print(' {}: {} new {} updated {} noop'.format(type, changes['insert'], - changes['update'], changes['noop'])) + plan = report["plan"] + print("{} ({})".format(plan["module"], ", ".join(plan["actions"]))) + for scraper, args in plan["scrapers"].items(): + print(" {}: {}".format(scraper, args)) + if "scrape" in report: + for type, details in sorted(report["scrape"].items()): + print(type + " scrape:") + print(" duration: ", (details["end"] - details["start"])) + print(" objects:") + for objtype, num in sorted(details["objects"].items()): + print(" {}: {}".format(objtype, num)) + if "import" in report: + print("import:") + for type, changes in sorted(report["import"].items()): + if changes["insert"] or changes["update"] or changes["noop"]: + print( + " {}: {} new {} updated {} noop".format( + type, changes["insert"], changes["update"], changes["noop"] + ) + ) @transaction.atomic @@ -67,7 +70,7 @@ def save_report(report, jurisdiction): from opencivicdata.core.models import Jurisdiction as JurisdictionModel # set end time - report['end'] = utils.utcnow() + report["end"] = utils.utcnow() # if there's an error on the first run, the jurisdiction doesn't exist # yet, we opt for skipping creation of RunPlan until there's been at least @@ -76,81 +79,116 @@ def save_report(report, jurisdiction): JurisdictionModel.objects.get(pk=jurisdiction) except JurisdictionModel.DoesNotExist: logger = logging.getLogger("pupa") - logger.warning('could not save RunPlan, no successful runs of {} yet'.format( - jurisdiction) + logger.warning( + "could not save RunPlan, no successful runs of {} yet".format(jurisdiction) ) return - plan = RunPlan.objects.create(jurisdiction_id=jurisdiction, - success=report['success'], - start_time=report['start'], - end_time=report['end'], - exception=report.get('exception', ''), - traceback=report.get('traceback', ''), - ) - - for scraper, details in report.get('scrape', {}).items(): - args = ' '.join('{k}={v}'.format(k=k, v=v) - for k, v in report['plan']['scrapers'].get(scraper, {}).items()) - sr = plan.scrapers.create(scraper=scraper, args=args, - start_time=details['start'], end_time=details['end']) - for object_type, num in details['objects'].items(): + plan = RunPlan.objects.create( + jurisdiction_id=jurisdiction, + success=report["success"], + start_time=report["start"], + end_time=report["end"], + exception=report.get("exception", ""), + traceback=report.get("traceback", ""), + ) + + for scraper, details in report.get("scrape", {}).items(): + args = " ".join( + "{k}={v}".format(k=k, v=v) + for k, v in report["plan"]["scrapers"].get(scraper, {}).items() + ) + sr = plan.scrapers.create( + scraper=scraper, + args=args, + start_time=details["start"], + end_time=details["end"], + ) + for object_type, num in details["objects"].items(): sr.scraped_objects.create(object_type=object_type, count=num) - for object_type, changes in report.get('import', {}).items(): - if changes['insert'] or changes['update'] or changes['noop']: + for object_type, changes in report.get("import", {}).items(): + if changes["insert"] or changes["update"] or changes["noop"]: plan.imported_objects.create( object_type=object_type, - insert_count=changes['insert'], - update_count=changes['update'], - noop_count=changes['noop'], - start_time=changes['start'], - end_time=changes['end'], + insert_count=changes["insert"], + update_count=changes["update"], + noop_count=changes["noop"], + start_time=changes["start"], + end_time=changes["end"], ) class Command(BaseCommand): - name = 'update' - help = 'update pupa data' + name = "update" + help = "update pupa data" def add_args(self): # what to scrape - self.add_argument('module', type=str, help='path to scraper module') + self.add_argument("module", type=str, help="path to scraper module") for arg in ALL_ACTIONS: - self.add_argument('--' + arg, dest='actions', action='append_const', const=arg, - help='only run {} post-scrape step'.format(arg)) + self.add_argument( + "--" + arg, + dest="actions", + action="append_const", + const=arg, + help="only run {} post-scrape step".format(arg), + ) # scraper arguments - self.add_argument('--nonstrict', action='store_false', dest='strict', - help='skip validation on save') - self.add_argument('--fastmode', action='store_true', - help='use cache and turn off throttling') + self.add_argument( + "--nonstrict", + action="store_false", + dest="strict", + help="skip validation on save", + ) + self.add_argument( + "--fastmode", action="store_true", help="use cache and turn off throttling" + ) # settings overrides - self.add_argument('--datadir', help='data directory', dest='SCRAPED_DATA_DIR') - self.add_argument('--cachedir', help='cache directory', dest='CACHE_DIR') - self.add_argument('-r', '--rpm', help='scraper rpm', type=int, dest='SCRAPELIB_RPM') - self.add_argument('--timeout', help='scraper timeout', type=int, dest='SCRAPELIB_TIMEOUT') - self.add_argument('--no-verify', help='skip tls verification', - action='store_false', dest='SCRAPELIB_VERIFY') - self.add_argument('--retries', help='scraper retries', type=int, dest='SCRAPELIB_RETRIES') - self.add_argument('--retry_wait', help='scraper retry wait', type=int, - dest='SCRAPELIB_RETRY_WAIT_SECONDS') + self.add_argument("--datadir", help="data directory", dest="SCRAPED_DATA_DIR") + self.add_argument("--cachedir", help="cache directory", dest="CACHE_DIR") + self.add_argument( + "-r", "--rpm", help="scraper rpm", type=int, dest="SCRAPELIB_RPM" + ) + self.add_argument( + "--timeout", help="scraper timeout", type=int, dest="SCRAPELIB_TIMEOUT" + ) + self.add_argument( + "--no-verify", + help="skip tls verification", + action="store_false", + dest="SCRAPELIB_VERIFY", + ) + self.add_argument( + "--retries", help="scraper retries", type=int, dest="SCRAPELIB_RETRIES" + ) + self.add_argument( + "--retry_wait", + help="scraper retry wait", + type=int, + dest="SCRAPELIB_RETRY_WAIT_SECONDS", + ) def get_jurisdiction(self, module_name): # get the jurisdiction object module = importlib.import_module(module_name) for obj in module.__dict__.values(): # ensure we're dealing with a subclass of Jurisdiction - if (isinstance(obj, type) and - issubclass(obj, Jurisdiction) and - getattr(obj, 'division_id', None) and - obj.classification): + if ( + isinstance(obj, type) + and issubclass(obj, Jurisdiction) + and getattr(obj, "division_id", None) + and obj.classification + ): return obj(), module - raise CommandError('Unable to import Jurisdiction subclass from ' + - module_name + - '. Jurisdiction subclass may be missing a ' + - 'division_id or classification.') + raise CommandError( + "Unable to import Jurisdiction subclass from " + + module_name + + ". Jurisdiction subclass may be missing a " + + "division_id or classification." + ) def do_scrape(self, juris, args, scrapers): # make output and cache dirs @@ -158,70 +196,86 @@ def do_scrape(self, juris, args, scrapers): datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module) utils.makedirs(datadir) # clear json from data dir - for f in glob.glob(datadir + '/*.json'): + for f in glob.glob(datadir + "/*.json"): os.remove(f) report = {} # do jurisdiction - jscraper = JurisdictionScraper(juris, datadir, strict_validation=args.strict, - fastmode=args.fastmode) - report['jurisdiction'] = jscraper.do_scrape() + jscraper = JurisdictionScraper( + juris, datadir, strict_validation=args.strict, fastmode=args.fastmode + ) + report["jurisdiction"] = jscraper.do_scrape() for scraper_name, scrape_args in scrapers.items(): ScraperCls = juris.scrapers[scraper_name] - scraper = ScraperCls(juris, datadir, strict_validation=args.strict, - fastmode=args.fastmode) + scraper = ScraperCls( + juris, datadir, strict_validation=args.strict, fastmode=args.fastmode + ) report[scraper_name] = scraper.do_scrape(**scrape_args) return report def do_import(self, juris, args): # import inside here because to avoid loading Django code unnecessarily - from pupa.importers import (JurisdictionImporter, OrganizationImporter, PersonImporter, - PostImporter, MembershipImporter, BillImporter, - VoteEventImporter, EventImporter) + from pupa.importers import ( + JurisdictionImporter, + OrganizationImporter, + PersonImporter, + PostImporter, + MembershipImporter, + BillImporter, + VoteEventImporter, + EventImporter, + ) from pupa.reports import generate_session_report from pupa.models import SessionDataQualityReport + datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module) juris_importer = JurisdictionImporter(juris.jurisdiction_id) org_importer = OrganizationImporter(juris.jurisdiction_id) person_importer = PersonImporter(juris.jurisdiction_id) post_importer = PostImporter(juris.jurisdiction_id, org_importer) - membership_importer = MembershipImporter(juris.jurisdiction_id, person_importer, - org_importer, post_importer) - bill_importer = BillImporter(juris.jurisdiction_id, org_importer, person_importer) - vote_event_importer = VoteEventImporter(juris.jurisdiction_id, person_importer, - org_importer, bill_importer) - event_importer = EventImporter(juris.jurisdiction_id, - org_importer, - person_importer, - bill_importer, - vote_event_importer) + membership_importer = MembershipImporter( + juris.jurisdiction_id, person_importer, org_importer, post_importer + ) + bill_importer = BillImporter( + juris.jurisdiction_id, org_importer, person_importer + ) + vote_event_importer = VoteEventImporter( + juris.jurisdiction_id, person_importer, org_importer, bill_importer + ) + event_importer = EventImporter( + juris.jurisdiction_id, + org_importer, + person_importer, + bill_importer, + vote_event_importer, + ) report = {} with transaction.atomic(): - print('import jurisdictions...') + print("import jurisdictions...") report.update(juris_importer.import_directory(datadir)) if settings.ENABLE_PEOPLE_AND_ORGS: - print('import organizations...') + print("import organizations...") report.update(org_importer.import_directory(datadir)) - print('import people...') + print("import people...") report.update(person_importer.import_directory(datadir)) - print('import posts...') + print("import posts...") report.update(post_importer.import_directory(datadir)) - print('import memberships...') + print("import memberships...") report.update(membership_importer.import_directory(datadir)) if settings.ENABLE_BILLS: - print('import bills...') + print("import bills...") report.update(bill_importer.import_directory(datadir)) if settings.ENABLE_EVENTS: - print('import events...') + print("import events...") report.update(event_importer.import_directory(datadir)) if settings.ENABLE_VOTES: - print('import vote events...') + print("import vote events...") report.update(vote_event_importer.import_directory(datadir)) # compile info on all sessions that were updated in this run @@ -231,7 +285,9 @@ def do_import(self, juris, args): for session in seen_sessions: new_report = generate_session_report(session) with transaction.atomic(): - SessionDataQualityReport.objects.filter(legislative_session=session).delete() + SessionDataQualityReport.objects.filter( + legislative_session=session + ).delete() new_report.save() return report @@ -246,22 +302,23 @@ def check_session_list(self, juris): scraped_sessions = juris.get_session_list() if not scraped_sessions: - raise CommandError('no sessions from {}.get_session_list()'.format(scraper)) + raise CommandError("no sessions from {}.get_session_list()".format(scraper)) # copy the list to avoid modifying it sessions = set(juris.ignored_scraped_sessions) for session in juris.legislative_sessions: - sessions.add(session.get('_scraped_name', session['identifier'])) + sessions.add(session.get("_scraped_name", session["identifier"])) unaccounted_sessions = list(set(scraped_sessions) - sessions) if unaccounted_sessions: raise CommandError( ( - 'Session(s) {sessions} were reported by {scraper}.get_session_list() ' - 'but were not found in {scraper}.legislative_sessions or ' - '{scraper}.ignored_scraped_sessions.' + "Session(s) {sessions} were reported by " + "{scraper}.get_session_list() " + "but were not found in {scraper}.legislative_sessions or " + "{scraper}.ignored_scraped_sessions." ).format( - sessions=', '.join(unaccounted_sessions), + sessions=", ".join(unaccounted_sessions), scraper=scraper, ) ) @@ -269,38 +326,40 @@ def check_session_list(self, juris): def handle(self, args, other): juris, module = self.get_jurisdiction(args.module) overrides = {} - overrides.update(getattr(module, 'settings', {})) - overrides.update({ - key: value for key, value in vars(args).items() - if value is not None - }) + overrides.update(getattr(module, "settings", {})) + overrides.update( + {key: value for key, value in vars(args).items() if value is not None} + ) with override_settings(settings, overrides): return self.do_handle(args, other, juris) def do_handle(self, args, other, juris): - available_scrapers = getattr(juris, 'scrapers', {}) - default_scrapers = getattr(juris, 'default_scrapers', None) + available_scrapers = getattr(juris, "scrapers", {}) + default_scrapers = getattr(juris, "default_scrapers", None) scrapers = OrderedDict() if not available_scrapers: - raise CommandError('no scrapers defined on jurisdiction') + raise CommandError("no scrapers defined on jurisdiction") if other: # parse arg list in format: (scraper (k:v)+)+ cur_scraper = None for arg in other: - if '=' in arg: + if "=" in arg: if not cur_scraper: - raise CommandError('argument {} before scraper name'.format(arg)) - k, v = arg.split('=', 1) + raise CommandError( + "argument {} before scraper name".format(arg) + ) + k, v = arg.split("=", 1) scrapers[cur_scraper][k] = v elif arg in juris.scrapers: cur_scraper = arg scrapers[cur_scraper] = {} else: - raise CommandError('no such scraper: module={} scraper={}'.format(args.module, - arg)) + raise CommandError( + "no such scraper: module={} scraper={}".format(args.module, arg) + ) elif default_scrapers is not None: scrapers = {s: {} for s in default_scrapers} else: @@ -310,33 +369,38 @@ def do_handle(self, args, other, juris): if not args.actions: args.actions = ALL_ACTIONS - if 'import' in args.actions: + if "import" in args.actions: django.setup() # print the plan - report = {'plan': {'module': args.module, 'actions': args.actions, 'scrapers': scrapers}, - 'start': utils.utcnow(), - } + report = { + "plan": { + "module": args.module, + "actions": args.actions, + "scrapers": scrapers, + }, + "start": utils.utcnow(), + } print_report(report) - if 'scrape' in args.actions: + if "scrape" in args.actions: self.check_session_list(juris) try: - if 'scrape' in args.actions: - report['scrape'] = self.do_scrape(juris, args, scrapers) - if 'import' in args.actions: - report['import'] = self.do_import(juris, args) - report['success'] = True + if "scrape" in args.actions: + report["scrape"] = self.do_scrape(juris, args, scrapers) + if "import" in args.actions: + report["import"] = self.do_import(juris, args) + report["success"] = True except Exception as exc: - report['success'] = False - report['exception'] = exc - report['traceback'] = traceback.format_exc() - if 'import' in args.actions: + report["success"] = False + report["exception"] = exc + report["traceback"] = traceback.format_exc() + if "import" in args.actions: save_report(report, juris.jurisdiction_id) raise - if 'import' in args.actions: + if "import" in args.actions: save_report(report, juris.jurisdiction_id) print_report(report) diff --git a/pupa/exceptions.py b/pupa/exceptions.py index 042f967c..c35db220 100644 --- a/pupa/exceptions.py +++ b/pupa/exceptions.py @@ -1,81 +1,85 @@ - - class PupaError(Exception): - """ Base class for exceptions from within Pupa """ + """Base class for exceptions from within Pupa""" class PupaInternalError(PupaError): - """ Indication something went wrong inside of Pupa that never should happen """ + """Indication something went wrong inside of Pupa that never should happen""" class CommandError(PupaError): - """ Errors from within pupa CLI """ + """Errors from within pupa CLI""" # import-related errors class DataImportError(PupaError): - """ A generic error related to the import process. """ + """A generic error related to the import process.""" class InvalidVoteEventError(DataImportError): - """ Attempt to create a vote event without an identifier or bill_id """ + """Attempt to create a vote event without an identifier or bill_id""" class NoMembershipsError(DataImportError): - """ An attempt was made to import a person without any memberships. """ + """An attempt was made to import a person without any memberships.""" def __init__(self, ids): - super(NoMembershipsError, self).__init__('no memberships for {} people: \n{}'.format( - len(ids), ', '.join(ids)) + super(NoMembershipsError, self).__init__( + "no memberships for {} people: \n{}".format(len(ids), ", ".join(ids)) ) class SameNameError(DataImportError): - """ Attempt was made to import two people with the same name. """ + """Attempt was made to import two people with the same name.""" def __init__(self, name): - super(SameNameError, self).__init__('multiple people with same name "{}" in Jurisdiction ' - '- must provide birth_date to disambiguate' - .format(name)) + super(SameNameError, self).__init__( + 'multiple people with same name "{}" in Jurisdiction ' + "- must provide birth_date to disambiguate".format(name) + ) class SameOrgNameError(DataImportError): - """ Attempt was made to import two orgs with the same name. """ + """Attempt was made to import two orgs with the same name.""" def __init__(self, name): - super(SameOrgNameError, self).__init__('multiple orgs with same name "{}" in Jurisdiction ' - .format(name)) + super(SameOrgNameError, self).__init__( + 'multiple orgs with same name "{}" in Jurisdiction '.format(name) + ) class DuplicateItemError(DataImportError): - """ Attempt was made to import items that resolve to the same database item. """ + """Attempt was made to import items that resolve to the same database item.""" def __init__(self, data, obj, data_sources=None): super(DuplicateItemError, self).__init__( - 'attempt to import data that would conflict with ' - 'data already in the import: {} ' - '(already imported as {})\n' - 'obj1 sources: {}\nobj2 sources: {}'.format( + "attempt to import data that would conflict with " + "data already in the import: {} " + "(already imported as {})\n" + "obj1 sources: {}\nobj2 sources: {}".format( data, obj, - list(obj.sources.values_list('url', flat=True) - if hasattr(obj, 'sources') else []), - [s['url'] for s in data_sources or []] - )) + list( + obj.sources.values_list("url", flat=True) + if hasattr(obj, "sources") + else [] + ), + [s["url"] for s in data_sources or []], + ) + ) class UnresolvedIdError(DataImportError): - """ Attempt was made to resolve an id that has no result. """ + """Attempt was made to resolve an id that has no result.""" # scrape-related errors class ScrapeError(PupaError): - """ A generic error related to the scrape process. """ + """A generic error related to the scrape process.""" class ScrapeValueError(PupaError, ValueError): - """ An invalid value was passed to a pupa scrape object. """ + """An invalid value was passed to a pupa scrape object.""" diff --git a/pupa/ext/ansistrm.py b/pupa/ext/ansistrm.py index 1df58dfd..89aa937b 100644 --- a/pupa/ext/ansistrm.py +++ b/pupa/ext/ansistrm.py @@ -11,42 +11,42 @@ class ColorizingStreamHandler(logging.StreamHandler): # color names to indices color_map = { - 'black': 0, - 'red': 1, - 'green': 2, - 'yellow': 3, - 'blue': 4, - 'magenta': 5, - 'cyan': 6, - 'white': 7, + "black": 0, + "red": 1, + "green": 2, + "yellow": 3, + "blue": 4, + "magenta": 5, + "cyan": 6, + "white": 7, } # levels to (background, foreground, bold/intense) - if os.name == 'nt': + if os.name == "nt": level_map = { - logging.DEBUG: (None, 'blue', True), - logging.INFO: (None, 'white', False), - logging.WARNING: (None, 'yellow', True), - logging.ERROR: (None, 'red', True), - logging.CRITICAL: ('red', 'white', True), + logging.DEBUG: (None, "blue", True), + logging.INFO: (None, "white", False), + logging.WARNING: (None, "yellow", True), + logging.ERROR: (None, "red", True), + logging.CRITICAL: ("red", "white", True), } else: level_map = { - logging.DEBUG: (None, 'blue', False), - logging.INFO: (None, 'white', False), - logging.WARNING: (None, 'yellow', False), - logging.ERROR: (None, 'red', False), - logging.CRITICAL: ('red', 'white', True), + logging.DEBUG: (None, "blue", False), + logging.INFO: (None, "white", False), + logging.WARNING: (None, "yellow", False), + logging.ERROR: (None, "red", False), + logging.CRITICAL: ("red", "white", True), } - csi = '\x1b[' - reset = '\x1b[0m' + csi = "\x1b[" + reset = "\x1b[0m" @property def is_tty(self): # bluff for Jenkins - if os.environ.get('JENKINS_URL'): + if os.environ.get("JENKINS_URL"): return True - isatty = getattr(self.stream, 'isatty', None) + isatty = getattr(self.stream, "isatty", None) return isatty and isatty() def emit(self, record): @@ -57,39 +57,42 @@ def emit(self, record): stream.write(message) else: self.output_colorized(message) - stream.write(getattr(self, 'terminator', '\n')) + stream.write(getattr(self, "terminator", "\n")) self.flush() except (KeyboardInterrupt, SystemExit): raise except: self.handleError(record) - if os.name != 'nt': + if os.name != "nt": + def output_colorized(self, message): self.stream.write(message) + else: import re - ansi_esc = re.compile(r'\x1b\[((?:\d+)(?:;(?:\d+))*)m') + + ansi_esc = re.compile(r"\x1b\[((?:\d+)(?:;(?:\d+))*)m") nt_color_map = { - 0: 0x00, # black - 1: 0x04, # red - 2: 0x02, # green - 3: 0x06, # yellow - 4: 0x01, # blue - 5: 0x05, # magenta - 6: 0x03, # cyan - 7: 0x07, # white + 0: 0x00, # black + 1: 0x04, # red + 2: 0x02, # green + 3: 0x06, # yellow + 4: 0x01, # blue + 5: 0x05, # magenta + 6: 0x03, # cyan + 7: 0x07, # white } def output_colorized(self, message): parts = self.ansi_esc.split(message) write = self.stream.write h = None - fd = getattr(self.stream, 'fileno', None) + fd = getattr(self.stream, "fileno", None) if fd is not None: fd = fd() - if fd in (1, 2): # stdout or stderr + if fd in (1, 2): # stdout or stderr h = ctypes.windll.kernel32.GetStdHandle(-10 - fd) while parts: text = parts.pop(0) @@ -98,7 +101,7 @@ def output_colorized(self, message): if parts: params = parts.pop(0) if h is not None: - params = [int(p) for p in params.split(';')] + params = [int(p) for p in params.split(";")] color = 0 for p in params: if 40 <= p <= 47: @@ -106,13 +109,12 @@ def output_colorized(self, message): elif 30 <= p <= 37: color |= self.nt_color_map[p - 30] elif p == 1: - color |= 0x08 # foreground intensity on - elif p == 0: # reset to default color + color |= 0x08 # foreground intensity on + elif p == 0: # reset to default color color = 0x07 else: - pass # error condition ignored - ctypes.windll.kernel32.SetConsoleTextAttribute(h, - color) + pass # error condition ignored + ctypes.windll.kernel32.SetConsoleTextAttribute(h, color) def colorize(self, message, record): if record.levelno in self.level_map: @@ -123,19 +125,20 @@ def colorize(self, message, record): if fg in self.color_map: params.append(str(self.color_map[fg] + 30)) if bold: - params.append('1') + params.append("1") if params: - message = ''.join((self.csi, ';'.join(params), - 'm', message, self.reset)) + message = "".join( + (self.csi, ";".join(params), "m", message, self.reset) + ) return message def format(self, record): message = logging.StreamHandler.format(self, record) if self.is_tty: # Don't colorize any traceback - parts = message.split('\n', 1) + parts = message.split("\n", 1) parts[0] = self.colorize(parts[0], record) - message = '\n'.join(parts) + message = "\n".join(parts) return message @@ -143,11 +146,12 @@ def main(): root = logging.getLogger() root.setLevel(logging.DEBUG) root.addHandler(ColorizingStreamHandler()) - logging.debug('DEBUG') - logging.info('INFO') - logging.warning('WARNING') - logging.error('ERROR') - logging.critical('CRITICAL') + logging.debug("DEBUG") + logging.info("INFO") + logging.warning("WARNING") + logging.error("ERROR") + logging.critical("CRITICAL") + -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/pupa/importers/base.py b/pupa/importers/base.py index 3fa0c40d..4da6f45c 100644 --- a/pupa/importers/base.py +++ b/pupa/importers/base.py @@ -17,7 +17,7 @@ def omnihash(obj): - """ recursively hash unhashable objects """ + """recursively hash unhashable objects""" if isinstance(obj, set): return hash(frozenset(omnihash(e) for e in obj)) elif isinstance(obj, (tuple, list)): @@ -29,7 +29,7 @@ def omnihash(obj): def items_differ(jsonitems, dbitems, subfield_dict): - """ check whether or not jsonitems and dbitems differ """ + """check whether or not jsonitems and dbitems differ""" # short circuit common cases if len(jsonitems) == len(dbitems) == 0: @@ -45,12 +45,14 @@ def items_differ(jsonitems, dbitems, subfield_dict): # go over dbitems looking for matches for dbitem in dbitems: - order = getattr(dbitem, 'order', None) + order = getattr(dbitem, "order", None) match = None for i, jsonitem in enumerate(jsonitems): # check if all keys (excluding subfields) match for k in keys: - if k not in subfield_dict and getattr(dbitem, k) != jsonitem.get(k, None): + if k not in subfield_dict and getattr(dbitem, k) != jsonitem.get( + k, None + ): break else: # all fields match so far, possibly equal, just check subfields now @@ -61,7 +63,9 @@ def items_differ(jsonitems, dbitems, subfield_dict): break else: # if the dbitem sets 'order', then the order matters - if order is not None and int(order) != original_jsonitems.index(jsonitem): + if order is not None and int(order) != original_jsonitems.index( + jsonitem + ): break # these items are equal, so let's mark it for removal match = i @@ -74,16 +78,17 @@ def items_differ(jsonitems, dbitems, subfield_dict): # exists in db but not json return True - # if we get here, jsonitems has to be empty because we asserted that the length was - # the same and we found a match for each thing in dbitems, here's a safety check just in case - if jsonitems: # pragma: no cover + # if we get here, jsonitems has to be empty because we asserted that + # the length was the same and we found a match for each thing in dbitems, + # here's a safety check just in case + if jsonitems: # pragma: no cover return True return False class BaseImporter(object): - """ BaseImporter + """BaseImporter Override: get_object(data) @@ -91,6 +96,7 @@ class BaseImporter(object): prepare_for_db(data) [optional] postimport() [optional] """ + _type = None model_class = None related_models = {} @@ -118,7 +124,8 @@ def __init__(self, jurisdiction_id): def get_session_id(self, identifier): if identifier not in self.session_cache: self.session_cache[identifier] = LegislativeSession.objects.get( - identifier=identifier, jurisdiction_id=self.jurisdiction_id).id + identifier=identifier, jurisdiction_id=self.jurisdiction_id + ).id return self.session_cache[identifier] # no-ops to be overriden @@ -130,23 +137,24 @@ def postimport(self): def resolve_json_id(self, json_id, allow_no_match=False): """ - Given an id found in scraped JSON, return a DB id for the object. + Given an id found in scraped JSON, return a DB id for the object. - params: - json_id: id from json - allow_no_match: just return None if id can't be resolved + params: + json_id: id from json + allow_no_match: just return None if id can't be resolved - returns: - database id + returns: + database id - raises: - ValueError if id couldn't be resolved + raises: + ValueError if id couldn't be resolved """ if not json_id: return None - if json_id.startswith('~'): - # keep caches of all the pseudo-ids to avoid doing 1000s of lookups during import + if json_id.startswith("~"): + # keep caches of all the pseudo-ids to avoid doing 1000s of lookups + # during import if json_id not in self.pseudo_id_cache: spec = get_pseudo_id(json_id) spec = self.limit_spec(spec) @@ -160,11 +168,13 @@ def resolve_json_id(self, json_id, allow_no_match=False): self.pseudo_id_cache[json_id] = ids.pop() errmsg = None elif not ids: - errmsg = 'cannot resolve pseudo id to {}: {}'.format( - self.model_class.__name__, json_id) + errmsg = "cannot resolve pseudo id to {}: {}".format( + self.model_class.__name__, json_id + ) else: - errmsg = 'multiple objects returned for {} pseudo id {}: {}'.format( - self.model_class.__name__, json_id, ids) + errmsg = "multiple objects returned for {} pseudo id {}: {}".format( + self.model_class.__name__, json_id, ids + ) # either raise or log error if errmsg: @@ -183,14 +193,14 @@ def resolve_json_id(self, json_id, allow_no_match=False): try: return self.json_to_db_id[json_id] except KeyError: - raise UnresolvedIdError('cannot resolve id: {}'.format(json_id)) + raise UnresolvedIdError("cannot resolve id: {}".format(json_id)) def import_directory(self, datadir): - """ import a JSON directory into the database """ + """import a JSON directory into the database""" def json_stream(): # load all json, mapped by json_id - for fname in glob.glob(os.path.join(datadir, self._type + '_*.json')): + for fname in glob.glob(os.path.join(datadir, self._type + "_*.json")): with open(fname) as f: yield json.load(f) @@ -198,16 +208,16 @@ def json_stream(): def _prepare_imports(self, dicts): - """ filters the import stream to remove duplicates + """filters the import stream to remove duplicates - also serves as a good place to override if anything special has to be done to the - order of the import stream (see OrganizationImporter) + also serves as a good place to override if anything special has to be + done to the order of the import stream (see OrganizationImporter) """ # hash(json): id seen_hashes = {} for data in dicts: - json_id = data.pop('_id') + json_id = data.pop("_id") # map duplicates (using omnihash to tell if json dicts are identical-ish) objhash = omnihash(data) @@ -218,37 +228,40 @@ def _prepare_imports(self, dicts): self.duplicates[json_id] = seen_hashes[objhash] def import_data(self, data_items): - """ import a bunch of dicts together """ + """import a bunch of dicts together""" # keep counts of all actions record = { - 'insert': 0, 'update': 0, 'noop': 0, - 'start': utcnow(), - 'records': { - 'insert': [], - 'update': [], - 'noop': [], - } + "insert": 0, + "update": 0, + "noop": 0, + "start": utcnow(), + "records": { + "insert": [], + "update": [], + "noop": [], + }, } for json_id, data in self._prepare_imports(data_items): obj_id, what = self.import_item(data) self.json_to_db_id[json_id] = obj_id - record['records'][what].append(obj_id) + record["records"][what].append(obj_id) record[what] += 1 - # all objects are loaded, a perfect time to do inter-object resolution and other tasks + # all objects are loaded, a perfect time to do + # inter-object resolution and other tasks self.postimport() - record['end'] = utcnow() + record["end"] = utcnow() return {self._type: record} def import_item(self, data): - """ function used by import_data """ - what = 'noop' + """function used by import_data""" + what = "noop" # remove the JSON _id (may still be there if called directly) - data.pop('_id', None) + data.pop("_id", None) # add fields/etc. data = self.apply_transformers(data) @@ -260,7 +273,7 @@ def import_item(self, data): obj = None # remove pupa_id which does not belong in the OCD data models - pupa_id = data.pop('pupa_id', None) + pupa_id = data.pop("pupa_id", None) # pull related fields off related = {} @@ -270,18 +283,18 @@ def import_item(self, data): # obj existed, check if we need to do an update if obj: if obj.id in self.json_to_db_id.values(): - raise DuplicateItemError(data, obj, related.get('sources', [])) + raise DuplicateItemError(data, obj, related.get("sources", [])) # check base object for changes for key, value in data.items(): if getattr(obj, key) != value and key not in obj.locked_fields: setattr(obj, key, value) - what = 'update' + what = "update" updated = self._update_related(obj, related, self.related_models) if updated: - what = 'update' + what = "update" - if what == 'update': + if what == "update": obj.last_updated = utcnow() # Refresh the object's last_seen field whether or not we updated @@ -289,12 +302,13 @@ def import_item(self, data): # need to create the data else: - what = 'insert' + what = "insert" try: obj = self.model_class.objects.create(**data) except Exception as e: - raise DataImportError('{} while importing {} as {}'.format(e, data, - self.model_class)) + raise DataImportError( + "{} while importing {} as {}".format(e, data, self.model_class) + ) self._create_related(obj, related, self.related_models) # Fire post-save signal after related objects are created to allow @@ -302,9 +316,11 @@ def import_item(self, data): post_save.send(sender=self.model_class, instance=obj, created=True) if pupa_id: - Identifier.objects.get_or_create(identifier=pupa_id, - jurisdiction_id=self.jurisdiction_id, - defaults={'content_object': obj}) + Identifier.objects.get_or_create( + identifier=pupa_id, + jurisdiction_id=self.jurisdiction_id, + defaults={"content_object": obj}, + ) return obj.id, what @@ -331,11 +347,13 @@ def _update_related(self, obj, related, subfield_dict): # default to doing nothing do_delete = do_update = False - if items and dbitems_count: # we have items, so does db, check for conflict - do_delete = do_update = items_differ(items, dbitems, subfield_dict[field][2]) - elif items and not dbitems_count: # we have items, db doesn't, just update + if items and dbitems_count: # we have items, so does db, check for conflict + do_delete = do_update = items_differ( + items, dbitems, subfield_dict[field][2] + ) + elif items and not dbitems_count: # we have items, db doesn't, just update do_update = True - elif not items and dbitems_count: # db has items, we don't, just delete + elif not items and dbitems_count: # db has items, we don't, just delete do_delete = True # otherwise: no items or dbitems, so nothing is done @@ -344,8 +362,9 @@ def _update_related(self, obj, related, subfield_dict): new_items = [] # build a list of keyfields to existing database objects keylist = self.merge_related[field] - keyed_dbitems = {tuple(getattr(item, k) for k in keylist): - item for item in dbitems} + keyed_dbitems = { + tuple(getattr(item, k) for k in keylist): item for item in dbitems + } # go through 'new' items # if item with the same keyfields exists: @@ -394,7 +413,7 @@ def _create_related(self, obj, related, subfield_dict): subrelated[subfield] = item.pop(subfield) if field in self.preserve_order: - item['order'] = order + item["order"] = order item[reverse_id_field] = obj.id @@ -402,13 +421,17 @@ def _create_related(self, obj, related, subfield_dict): subobjects.append(Subtype(**item)) all_subrelated.append(subrelated) except Exception as e: - raise DataImportError('{} while importing {} as {}'.format(e, item, Subtype)) + raise DataImportError( + "{} while importing {} as {}".format(e, item, Subtype) + ) # add all subobjects at once (really great for actions & votes) try: Subtype.objects.bulk_create(subobjects) except Exception as e: - raise DataImportError('{} while importing {} as {}'.format(e, subobjects, Subtype)) + raise DataImportError( + "{} while importing {} as {}".format(e, subobjects, Subtype) + ) # after import the subobjects, import their subsubobjects for subobj, subrel in zip(subobjects, all_subrelated): @@ -417,9 +440,11 @@ def _create_related(self, obj, related, subfield_dict): def lookup_obj_id(self, pupa_id, model): content_type = ContentType.objects.get_for_model(model) try: - obj_id = Identifier.objects.get(identifier=pupa_id, - content_type=content_type, - jurisdiction_id=self.jurisdiction_id).object_id + obj_id = Identifier.objects.get( + identifier=pupa_id, + content_type=content_type, + jurisdiction_id=self.jurisdiction_id, + ).object_id except Identifier.DoesNotExist: obj_id = None diff --git a/pupa/importers/bills.py b/pupa/importers/bills.py index 32b715e5..0a732f15 100644 --- a/pupa/importers/bills.py +++ b/pupa/importers/bills.py @@ -1,28 +1,49 @@ -from opencivicdata.legislative.models import (Bill, RelatedBill, BillAbstract, BillTitle, - BillIdentifier, BillAction, BillActionRelatedEntity, - BillSponsorship, BillSource, BillDocument, - BillVersion, BillDocumentLink, BillVersionLink) +from opencivicdata.legislative.models import ( + Bill, + RelatedBill, + BillAbstract, + BillTitle, + BillIdentifier, + BillAction, + BillActionRelatedEntity, + BillSponsorship, + BillSource, + BillDocument, + BillVersion, + BillDocumentLink, + BillVersionLink, +) from .base import BaseImporter from ..exceptions import PupaInternalError class BillImporter(BaseImporter): - _type = 'bill' + _type = "bill" model_class = Bill - related_models = {'abstracts': (BillAbstract, 'bill_id', {}), - 'other_titles': (BillTitle, 'bill_id', {}), - 'other_identifiers': (BillIdentifier, 'bill_id', {}), - 'actions': (BillAction, 'bill_id', { - 'related_entities': (BillActionRelatedEntity, 'action_id', {})}), - 'related_bills': (RelatedBill, 'bill_id', {}), - 'sponsorships': (BillSponsorship, 'bill_id', {}), - 'sources': (BillSource, 'bill_id', {}), - 'documents': (BillDocument, 'bill_id', { - 'links': (BillDocumentLink, 'document_id', {})}), - 'versions': (BillVersion, 'bill_id', { - 'links': (BillVersionLink, 'version_id', {})}), - } - preserve_order = {'actions'} + related_models = { + "abstracts": (BillAbstract, "bill_id", {}), + "other_titles": (BillTitle, "bill_id", {}), + "other_identifiers": (BillIdentifier, "bill_id", {}), + "actions": ( + BillAction, + "bill_id", + {"related_entities": (BillActionRelatedEntity, "action_id", {})}, + ), + "related_bills": (RelatedBill, "bill_id", {}), + "sponsorships": (BillSponsorship, "bill_id", {}), + "sources": (BillSource, "bill_id", {}), + "documents": ( + BillDocument, + "bill_id", + {"links": (BillDocumentLink, "document_id", {})}, + ), + "versions": ( + BillVersion, + "bill_id", + {"links": (BillVersionLink, "version_id", {})}, + ), + } + preserve_order = {"actions"} def __init__(self, jurisdiction_id, org_importer, person_importer): super(BillImporter, self).__init__(jurisdiction_id) @@ -31,64 +52,78 @@ def __init__(self, jurisdiction_id, org_importer, person_importer): def get_object(self, bill): spec = { - 'legislative_session_id': bill['legislative_session_id'], - 'identifier': bill['identifier'], + "legislative_session_id": bill["legislative_session_id"], + "identifier": bill["identifier"], } - if 'from_organization_id' in bill: - spec['from_organization_id'] = bill['from_organization_id'] + if "from_organization_id" in bill: + spec["from_organization_id"] = bill["from_organization_id"] - return self.model_class.objects.prefetch_related('actions__related_entities', - 'versions__links', - 'documents__links', - ).get(**spec) + return self.model_class.objects.prefetch_related( + "actions__related_entities", + "versions__links", + "documents__links", + ).get(**spec) def limit_spec(self, spec): - spec['legislative_session__jurisdiction_id'] = self.jurisdiction_id + spec["legislative_session__jurisdiction_id"] = self.jurisdiction_id return spec def prepare_for_db(self, data): - data['legislative_session_id'] = self.get_session_id(data.pop('legislative_session')) + data["legislative_session_id"] = self.get_session_id( + data.pop("legislative_session") + ) - if data['from_organization']: - data['from_organization_id'] = self.org_importer.resolve_json_id( - data.pop('from_organization')) + if data["from_organization"]: + data["from_organization_id"] = self.org_importer.resolve_json_id( + data.pop("from_organization") + ) - for action in data['actions']: - action['organization_id'] = self.org_importer.resolve_json_id( - action['organization_id']) - for entity in action['related_entities']: - if 'organization_id' in entity: - entity['organization_id'] = self.org_importer.resolve_json_id( - entity['organization_id']) - elif 'person_id' in entity: - entity['person_id'] = self.person_importer.resolve_json_id( - entity['person_id']) + for action in data["actions"]: + action["organization_id"] = self.org_importer.resolve_json_id( + action["organization_id"] + ) + for entity in action["related_entities"]: + if "organization_id" in entity: + entity["organization_id"] = self.org_importer.resolve_json_id( + entity["organization_id"] + ) + elif "person_id" in entity: + entity["person_id"] = self.person_importer.resolve_json_id( + entity["person_id"] + ) - for sponsor in data['sponsorships']: - if 'person_id' in sponsor: - sponsor['person_id'] = self.person_importer.resolve_json_id( - sponsor['person_id'], allow_no_match=True) + for sponsor in data["sponsorships"]: + if "person_id" in sponsor: + sponsor["person_id"] = self.person_importer.resolve_json_id( + sponsor["person_id"], allow_no_match=True + ) - if 'organization_id' in sponsor: - sponsor['organization_id'] = self.org_importer.resolve_json_id( - sponsor['organization_id'], allow_no_match=True) + if "organization_id" in sponsor: + sponsor["organization_id"] = self.org_importer.resolve_json_id( + sponsor["organization_id"], allow_no_match=True + ) return data def postimport(self): - # go through all RelatedBill objs that are attached to a bill in this jurisdiction and - # are currently unresolved + # go through all RelatedBill objs that are attached to a bill in this + # jurisdiction and are currently unresolved for rb in RelatedBill.objects.filter( - bill__legislative_session__jurisdiction_id=self.jurisdiction_id, - related_bill=None): - candidates = list(Bill.objects.filter( - legislative_session__identifier=rb.legislative_session, - legislative_session__jurisdiction_id=self.jurisdiction_id, - identifier=rb.identifier) + bill__legislative_session__jurisdiction_id=self.jurisdiction_id, + related_bill=None, + ): + candidates = list( + Bill.objects.filter( + legislative_session__identifier=rb.legislative_session, + legislative_session__jurisdiction_id=self.jurisdiction_id, + identifier=rb.identifier, + ) ) if len(candidates) == 1: rb.related_bill = candidates[0] rb.save() - elif len(candidates) > 1: # pragma: no cover + elif len(candidates) > 1: # pragma: no cover # if we ever see this, we need to add additional fields on the relation - raise PupaInternalError('multiple related_bill candidates found for {}'.format(rb)) + raise PupaInternalError( + "multiple related_bill candidates found for {}".format(rb) + ) diff --git a/pupa/importers/events.py b/pupa/importers/events.py index 8d4e0dac..0558d66d 100644 --- a/pupa/importers/events.py +++ b/pupa/importers/events.py @@ -1,36 +1,66 @@ from .base import BaseImporter from pupa.utils import get_pseudo_id, _make_pseudo_id -from opencivicdata.legislative.models import (Event, EventLocation, EventSource, EventDocument, - EventDocumentLink, EventLink, EventParticipant, - EventMedia, EventMediaLink, EventAgendaItem, - EventRelatedEntity, EventAgendaMedia, - EventAgendaMediaLink) +from opencivicdata.legislative.models import ( + Event, + EventLocation, + EventSource, + EventDocument, + EventDocumentLink, + EventLink, + EventParticipant, + EventMedia, + EventMediaLink, + EventAgendaItem, + EventRelatedEntity, + EventAgendaMedia, + EventAgendaMediaLink, +) class EventImporter(BaseImporter): - _type = 'event' + _type = "event" model_class = Event related_models = { - 'sources': (EventSource, 'event_id', {}), - 'documents': (EventDocument, 'event_id', { - 'links': (EventDocumentLink, 'document_id', {}) - }), - 'links': (EventLink, 'event_id', {}), - 'participants': (EventParticipant, 'event_id', {}), - 'media': (EventMedia, 'event_id', { - 'links': (EventMediaLink, 'media_id', {}), - }), - 'agenda': (EventAgendaItem, 'event_id', { - 'related_entities': (EventRelatedEntity, 'agenda_item_id', {}), - 'media': (EventAgendaMedia, 'agenda_item_id', { - 'links': (EventAgendaMediaLink, 'media_id', {}), - }), - }) + "sources": (EventSource, "event_id", {}), + "documents": ( + EventDocument, + "event_id", + {"links": (EventDocumentLink, "document_id", {})}, + ), + "links": (EventLink, "event_id", {}), + "participants": (EventParticipant, "event_id", {}), + "media": ( + EventMedia, + "event_id", + { + "links": (EventMediaLink, "media_id", {}), + }, + ), + "agenda": ( + EventAgendaItem, + "event_id", + { + "related_entities": (EventRelatedEntity, "agenda_item_id", {}), + "media": ( + EventAgendaMedia, + "agenda_item_id", + { + "links": (EventAgendaMediaLink, "media_id", {}), + }, + ), + }, + ), } - preserve_order = ('agenda',) + preserve_order = ("agenda",) - def __init__(self, jurisdiction_id, org_importer, person_importer, bill_importer, - vote_event_importer): + def __init__( + self, + jurisdiction_id, + org_importer, + person_importer, + bill_importer, + vote_event_importer, + ): super(EventImporter, self).__init__(jurisdiction_id) self.org_importer = org_importer self.person_importer = person_importer @@ -38,68 +68,70 @@ def __init__(self, jurisdiction_id, org_importer, person_importer, bill_importer self.vote_event_importer = vote_event_importer def get_object(self, event): - if event.get('pupa_id'): - e_id = self.lookup_obj_id(event['pupa_id'], Event) + if event.get("pupa_id"): + e_id = self.lookup_obj_id(event["pupa_id"], Event) if e_id: - spec = {'id': e_id} + spec = {"id": e_id} else: return None else: spec = { - 'name': event['name'], - 'description': event['description'], - 'start_date': event['start_date'], - 'end_date': event['end_date'], - 'jurisdiction_id': self.jurisdiction_id + "name": event["name"], + "description": event["description"], + "start_date": event["start_date"], + "end_date": event["end_date"], + "jurisdiction_id": self.jurisdiction_id, } return self.model_class.objects.get(**spec) def get_location(self, location_data): - obj, created = EventLocation.objects.get_or_create(name=location_data['name'], - url=location_data.get('url', ''), - jurisdiction_id=self.jurisdiction_id) + obj, created = EventLocation.objects.get_or_create( + name=location_data["name"], + url=location_data.get("url", ""), + jurisdiction_id=self.jurisdiction_id, + ) # TODO: geocode here? return obj def prepare_for_db(self, data): - data['jurisdiction_id'] = self.jurisdiction_id - if data['location']: - data['location'] = self.get_location(data['location']) + data["jurisdiction_id"] = self.jurisdiction_id + if data["location"]: + data["location"] = self.get_location(data["location"]) - data['start_date'] = data['start_date'] - data['end_date'] = data.get('end_date', "") + data["start_date"] = data["start_date"] + data["end_date"] = data.get("end_date", "") - for participant in data['participants']: - if 'person_id' in participant: - participant['person_id'] = self.person_importer.resolve_json_id( - participant['person_id'], - allow_no_match=True) - elif 'organization_id' in participant: - participant['organization_id'] = self.org_importer.resolve_json_id( - participant['organization_id'], - allow_no_match=True) + for participant in data["participants"]: + if "person_id" in participant: + participant["person_id"] = self.person_importer.resolve_json_id( + participant["person_id"], allow_no_match=True + ) + elif "organization_id" in participant: + participant["organization_id"] = self.org_importer.resolve_json_id( + participant["organization_id"], allow_no_match=True + ) - for item in data['agenda']: - for entity in item['related_entities']: - if 'person_id' in entity: - entity['person_id'] = self.person_importer.resolve_json_id( - entity['person_id'], - allow_no_match=True) - elif 'organization_id' in entity: - entity['organization_id'] = self.org_importer.resolve_json_id( - entity['organization_id'], - allow_no_match=True) - elif 'bill_id' in entity: + for item in data["agenda"]: + for entity in item["related_entities"]: + if "person_id" in entity: + entity["person_id"] = self.person_importer.resolve_json_id( + entity["person_id"], allow_no_match=True + ) + elif "organization_id" in entity: + entity["organization_id"] = self.org_importer.resolve_json_id( + entity["organization_id"], allow_no_match=True + ) + elif "bill_id" in entity: # unpack and repack bill psuedo id in case filters alter it - bill = get_pseudo_id(entity['bill_id']) + bill = get_pseudo_id(entity["bill_id"]) self.bill_importer.apply_transformers(bill) bill = _make_pseudo_id(**bill) - entity['bill_id'] = self.bill_importer.resolve_json_id( - bill, - allow_no_match=True) - elif 'vote_event_id' in entity: - entity['vote_event_id'] = self.vote_event_importer.resolve_json_id( - entity['vote_event_id'], - allow_no_match=True) + entity["bill_id"] = self.bill_importer.resolve_json_id( + bill, allow_no_match=True + ) + elif "vote_event_id" in entity: + entity["vote_event_id"] = self.vote_event_importer.resolve_json_id( + entity["vote_event_id"], allow_no_match=True + ) return data diff --git a/pupa/importers/jurisdiction.py b/pupa/importers/jurisdiction.py index 43406c17..df4ccc3e 100644 --- a/pupa/importers/jurisdiction.py +++ b/pupa/importers/jurisdiction.py @@ -4,16 +4,19 @@ class JurisdictionImporter(BaseImporter): - _type = 'jurisdiction' + _type = "jurisdiction" model_class = Jurisdiction - related_models = {'legislative_sessions': (LegislativeSession, 'jurisdiction_id', {})} - merge_related = {'legislative_sessions': ['identifier']} + related_models = { + "legislative_sessions": (LegislativeSession, "jurisdiction_id", {}) + } + merge_related = {"legislative_sessions": ["identifier"]} def get_object(self, data): - return self.model_class.objects.get(division_id=data['division_id'], - classification=data['classification']) + return self.model_class.objects.get( + division_id=data["division_id"], classification=data["classification"] + ) def prepare_for_db(self, data): - for s in data['legislative_sessions']: - s.pop('_scraped_name', None) + for s in data["legislative_sessions"]: + s.pop("_scraped_name", None) return data diff --git a/pupa/importers/memberships.py b/pupa/importers/memberships.py index 744b0a2b..876c7b22 100644 --- a/pupa/importers/memberships.py +++ b/pupa/importers/memberships.py @@ -1,15 +1,20 @@ -from opencivicdata.core.models import Membership, MembershipContactDetail, MembershipLink +from opencivicdata.core.models import ( + Membership, + MembershipContactDetail, + MembershipLink, +) from .base import BaseImporter from ..utils import get_pseudo_id from ..exceptions import NoMembershipsError class MembershipImporter(BaseImporter): - _type = 'membership' + _type = "membership" model_class = Membership - related_models = {'contact_details': (MembershipContactDetail, 'membership_id', {}), - 'links': (MembershipLink, 'membership_id', {}) - } + related_models = { + "contact_details": (MembershipContactDetail, "membership_id", {}), + "links": (MembershipLink, "membership_id", {}), + } def __init__(self, jurisdiction_id, person_importer, org_importer, post_importer): super(MembershipImporter, self).__init__(jurisdiction_id) @@ -19,47 +24,57 @@ def __init__(self, jurisdiction_id, person_importer, org_importer, post_importer self.seen_person_ids = set() def get_object(self, membership): - spec = {'organization_id': membership['organization_id'], - 'person_id': membership['person_id'], - 'label': membership['label'], - 'role': membership['role']} + spec = { + "organization_id": membership["organization_id"], + "person_id": membership["person_id"], + "label": membership["label"], + "role": membership["role"], + } # post_id is optional - might exist in DB but not scraped here? - if membership['post_id']: - spec['post_id'] = membership['post_id'] + if membership["post_id"]: + spec["post_id"] = membership["post_id"] - if membership['person_name']: - spec['person_name'] = membership['person_name'] + if membership["person_name"]: + spec["person_name"] = membership["person_name"] - if membership['start_date']: - spec['start_date'] = membership['start_date'] + if membership["start_date"]: + spec["start_date"] = membership["start_date"] else: # if this is a historical role, only update historical roles - spec['end_date'] = membership['end_date'] + spec["end_date"] = membership["end_date"] return self.model_class.objects.get(**spec) def prepare_for_db(self, data): # check if the organization is not tied to a jurisdiction - if data['organization_id'].startswith('~'): - pseudo_id = get_pseudo_id(data['organization_id']) - is_party = (pseudo_id.get('classification') == 'party') + if data["organization_id"].startswith("~"): + pseudo_id = get_pseudo_id(data["organization_id"]) + is_party = pseudo_id.get("classification") == "party" else: - # we have to assume it is not a party if we want to avoid doing a lookup here + # we have to assume it is not a party if we want to avoid + # doing a lookup here is_party = False - data['organization_id'] = self.org_importer.resolve_json_id(data['organization_id']) - data['person_id'] = self.person_importer.resolve_json_id(data['person_id'], - allow_no_match=True) - data['post_id'] = self.post_importer.resolve_json_id(data['post_id']) + data["organization_id"] = self.org_importer.resolve_json_id( + data["organization_id"] + ) + data["person_id"] = self.person_importer.resolve_json_id( + data["person_id"], allow_no_match=True + ) + data["post_id"] = self.post_importer.resolve_json_id(data["post_id"]) if not is_party: # track that we had a membership for this person - self.seen_person_ids.add(data['person_id']) + self.seen_person_ids.add(data["person_id"]) return data def postimport(self): - person_ids = set(self.person_importer.json_to_db_id.values()) - self.seen_person_ids + person_ids = ( + set(self.person_importer.json_to_db_id.values()) - self.seen_person_ids + ) if person_ids: - reverse_id_dict = {v: k for k, v in self.person_importer.json_to_db_id.items()} + reverse_id_dict = { + v: k for k, v in self.person_importer.json_to_db_id.items() + } person_ids = [reverse_id_dict[pid] for pid in person_ids] raise NoMembershipsError(person_ids) diff --git a/pupa/importers/organizations.py b/pupa/importers/organizations.py index 2b6dcb0c..f8fafd25 100644 --- a/pupa/importers/organizations.py +++ b/pupa/importers/organizations.py @@ -1,7 +1,12 @@ from django.db.models import Q -from opencivicdata.core.models import (Organization, OrganizationIdentifier, OrganizationName, - OrganizationContactDetail, OrganizationLink, - OrganizationSource) +from opencivicdata.core.models import ( + Organization, + OrganizationIdentifier, + OrganizationName, + OrganizationContactDetail, + OrganizationLink, + OrganizationSource, +) from .base import BaseImporter from ..utils import get_pseudo_id from ..utils.topsort import Network @@ -9,57 +14,57 @@ class OrganizationImporter(BaseImporter): - _type = 'organization' + _type = "organization" model_class = Organization - related_models = {'identifiers': (OrganizationIdentifier, 'organization_id', {}), - 'other_names': (OrganizationName, 'organization_id', {}), - 'contact_details': (OrganizationContactDetail, 'organization_id', {}), - 'links': (OrganizationLink, 'organization_id', {}), - 'sources': (OrganizationSource, 'organization_id', {}), - } + related_models = { + "identifiers": (OrganizationIdentifier, "organization_id", {}), + "other_names": (OrganizationName, "organization_id", {}), + "contact_details": (OrganizationContactDetail, "organization_id", {}), + "links": (OrganizationLink, "organization_id", {}), + "sources": (OrganizationSource, "organization_id", {}), + } def get_object(self, org): - spec = {'classification': org['classification'], - 'parent_id': org['parent_id']} + spec = {"classification": org["classification"], "parent_id": org["parent_id"]} # add jurisdiction_id unless this is a party - jid = org.get('jurisdiction_id') + jid = org.get("jurisdiction_id") if jid: - spec['jurisdiction_id'] = jid + spec["jurisdiction_id"] = jid - all_names = [org['name']] + [o['name'] for o in org['other_names']] + all_names = [org["name"]] + [o["name"] for o in org["other_names"]] - query = (Q(**spec) & - (Q(name__in=all_names) | Q(other_names__name__in=all_names))) - matches = list(self.model_class.objects.filter(query).distinct('id')) + query = Q(**spec) & (Q(name__in=all_names) | Q(other_names__name__in=all_names)) + matches = list(self.model_class.objects.filter(query).distinct("id")) matches_length = len(matches) if matches_length == 1: return matches[0] elif matches_length == 0: raise self.model_class.DoesNotExist( - 'No Organization: {} in {}'.format(all_names, self.jurisdiction_id)) + "No Organization: {} in {}".format(all_names, self.jurisdiction_id) + ) else: - raise SameOrgNameError(org['name']) + raise SameOrgNameError(org["name"]) def prepare_for_db(self, data): - data['parent_id'] = self.resolve_json_id(data['parent_id']) + data["parent_id"] = self.resolve_json_id(data["parent_id"]) - if data['classification'] != 'party': - data['jurisdiction_id'] = self.jurisdiction_id + if data["classification"] != "party": + data["jurisdiction_id"] = self.jurisdiction_id return data def limit_spec(self, spec): - if spec.get('classification') != 'party': - spec['jurisdiction_id'] = self.jurisdiction_id + if spec.get("classification") != "party": + spec["jurisdiction_id"] = self.jurisdiction_id - name = spec.pop('name', None) + name = spec.pop("name", None) if name: - return (Q(**spec) & - (Q(name=name) | Q(other_names__name=name))) + return Q(**spec) & (Q(name=name) | Q(other_names__name=name)) return spec def _prepare_imports(self, dicts): - """ an override for prepare imports that sorts the imports by parent_id dependencies """ + """an override for prepare imports that sorts the imports + by parent_id dependencies""" # all pseudo parent ids we've seen pseudo_ids = set() # pseudo matches @@ -70,8 +75,8 @@ def _prepare_imports(self, dicts): # collect parent pseudo_ids for _, data in prepared.items(): - parent_id = data.get('parent_id', None) or '' - if parent_id.startswith('~'): + parent_id = data.get("parent_id", None) or "" + if parent_id.startswith("~"): pseudo_ids.add(parent_id) # turn pseudo_ids into a tuple of dictionaries @@ -88,7 +93,9 @@ def _prepare_imports(self, dicts): break if match: if ppid in pseudo_matches: - raise UnresolvedIdError('multiple matches for pseudo id: ' + ppid) + raise UnresolvedIdError( + "multiple matches for pseudo id: " + ppid + ) pseudo_matches[ppid] = json_id # toposort the nodes so parents are imported first @@ -97,7 +104,7 @@ def _prepare_imports(self, dicts): import_order = [] for json_id, data in prepared.items(): - parent_id = data.get('parent_id', None) + parent_id = data.get("parent_id", None) # resolve pseudo_ids to their json id before building the network if parent_id in pseudo_matches: @@ -116,7 +123,7 @@ def _prepare_imports(self, dicts): in_network.add(jid) # ensure all data made it into network (paranoid check, should never fail) - if in_network != set(prepared.keys()): # pragma: no cover + if in_network != set(prepared.keys()): # pragma: no cover raise PupaInternalError("import is missing nodes in network set") return import_order diff --git a/pupa/importers/people.py b/pupa/importers/people.py index 1b1370ae..15367c75 100644 --- a/pupa/importers/people.py +++ b/pupa/importers/people.py @@ -1,35 +1,42 @@ from collections import defaultdict from django.db.models import Q -from opencivicdata.core.models import (Person, PersonIdentifier, PersonName, PersonContactDetail, - PersonLink, PersonSource) +from opencivicdata.core.models import ( + Person, + PersonIdentifier, + PersonName, + PersonContactDetail, + PersonLink, + PersonSource, +) from .base import BaseImporter from ..exceptions import SameNameError class PersonImporter(BaseImporter): - _type = 'person' + _type = "person" model_class = Person - related_models = {'identifiers': (PersonIdentifier, 'person_id', {}), - 'other_names': (PersonName, 'person_id', {}), - 'contact_details': (PersonContactDetail, 'person_id', {}), - 'links': (PersonLink, 'person_id', {}), - 'sources': (PersonSource, 'person_id', {}), - } + related_models = { + "identifiers": (PersonIdentifier, "person_id", {}), + "other_names": (PersonName, "person_id", {}), + "contact_details": (PersonContactDetail, "person_id", {}), + "links": (PersonLink, "person_id", {}), + "sources": (PersonSource, "person_id", {}), + } def _prepare_imports(self, dicts): dicts = list(super(PersonImporter, self)._prepare_imports(dicts)) by_name = defaultdict(list) for _, person in dicts: - by_name[person['name']].append(person) - for other in person['other_names']: - by_name[other['name']].append(person) + by_name[person["name"]].append(person) + for other in person["other_names"]: + by_name[other["name"]].append(person) # check for duplicates for name, people in by_name.items(): if len(people) > 1: for person in people: - if person['birth_date'] == '': + if person["birth_date"] == "": raise SameNameError(name) return dicts @@ -40,38 +47,47 @@ def limit_spec(self, spec): based on the memberships -> organization -> jurisdiction, so we scope the resolution. """ - if list(spec.keys()) == ['name']: + if list(spec.keys()) == ["name"]: # if we're just resolving on name, include other names and family name - name = spec['name'] - return ((Q(name=name) | Q(other_names__name=name) | Q(family_name=name)) & - Q(memberships__organization__jurisdiction_id=self.jurisdiction_id)) - spec['memberships__organization__jurisdiction_id'] = self.jurisdiction_id + name = spec["name"] + return (Q(name=name) | Q(other_names__name=name) | Q(family_name=name)) & Q( + memberships__organization__jurisdiction_id=self.jurisdiction_id + ) + spec["memberships__organization__jurisdiction_id"] = self.jurisdiction_id return spec def get_object(self, person): - all_names = [person['name']] + [o['name'] for o in person['other_names']] + all_names = [person["name"]] + [o["name"] for o in person["other_names"]] - matches = list(self.model_class.objects.filter( - Q(memberships__organization__jurisdiction_id=self.jurisdiction_id), - (Q(name__in=all_names) | Q(other_names__name__in=all_names)) - ).distinct('id')) + matches = list( + self.model_class.objects.filter( + Q(memberships__organization__jurisdiction_id=self.jurisdiction_id), + (Q(name__in=all_names) | Q(other_names__name__in=all_names)), + ).distinct("id") + ) matches_length = len(matches) if matches_length == 1 and not matches[0].birth_date: return matches[0] elif matches_length == 0: raise self.model_class.DoesNotExist( - 'No Person: {} in {}'.format(all_names, self.jurisdiction_id)) + "No Person: {} in {}".format(all_names, self.jurisdiction_id) + ) else: # Try and match based on birth_date. - if person['birth_date']: + if person["birth_date"]: for match in matches: - if person['birth_date'] and match.birth_date == person['birth_date']: + if ( + person["birth_date"] + and match.birth_date == person["birth_date"] + ): return match # If we got here, no match based on birth_date, a new person? raise self.model_class.DoesNotExist( - 'No Person: {} in {} with birth_date {}'.format( - all_names, self.jurisdiction_id, person['birth_date'])) + "No Person: {} in {} with birth_date {}".format( + all_names, self.jurisdiction_id, person["birth_date"] + ) + ) - raise SameNameError(person['name']) + raise SameNameError(person["name"]) diff --git a/pupa/importers/posts.py b/pupa/importers/posts.py index 16b4f129..9715da5d 100644 --- a/pupa/importers/posts.py +++ b/pupa/importers/posts.py @@ -3,31 +3,34 @@ class PostImporter(BaseImporter): - _type = 'post' + _type = "post" model_class = Post - related_models = {'contact_details': (PostContactDetail, 'post_id', {}), - 'links': (PostLink, 'post_id', {}) - } + related_models = { + "contact_details": (PostContactDetail, "post_id", {}), + "links": (PostLink, "post_id", {}), + } def __init__(self, jurisdiction_id, org_importer): super(PostImporter, self).__init__(jurisdiction_id) self.org_importer = org_importer def prepare_for_db(self, data): - data['organization_id'] = self.org_importer.resolve_json_id(data['organization_id']) + data["organization_id"] = self.org_importer.resolve_json_id( + data["organization_id"] + ) return data def get_object(self, post): spec = { - 'organization_id': post['organization_id'], - 'label': post['label'], + "organization_id": post["organization_id"], + "label": post["label"], } - if post['role']: - spec['role'] = post['role'] + if post["role"]: + spec["role"] = post["role"] return self.model_class.objects.get(**spec) def limit_spec(self, spec): - spec['organization__jurisdiction_id'] = self.jurisdiction_id + spec["organization__jurisdiction_id"] = self.jurisdiction_id return spec diff --git a/pupa/importers/vote_events.py b/pupa/importers/vote_events.py index 614508ee..41fa15ac 100644 --- a/pupa/importers/vote_events.py +++ b/pupa/importers/vote_events.py @@ -1,19 +1,25 @@ -from opencivicdata.legislative.models import (VoteEvent, VoteCount, PersonVote, VoteSource, - BillAction) +from opencivicdata.legislative.models import ( + VoteEvent, + VoteCount, + PersonVote, + VoteSource, + BillAction, +) from pupa.utils import get_pseudo_id, _make_pseudo_id from .base import BaseImporter from ..exceptions import InvalidVoteEventError class VoteEventImporter(BaseImporter): - _type = 'vote_event' + _type = "vote_event" model_class = VoteEvent - related_models = {'counts': (VoteCount, 'vote_event_id', {}), - 'votes': (PersonVote, 'vote_event_id', {}), - 'sources': (VoteSource, 'vote_event_id', {})} + related_models = { + "counts": (VoteCount, "vote_event_id", {}), + "votes": (PersonVote, "vote_event_id", {}), + "sources": (VoteSource, "vote_event_id", {}), + } - def __init__(self, jurisdiction_id, person_importer, org_importer, - bill_importer): + def __init__(self, jurisdiction_id, person_importer, org_importer, bill_importer): super(VoteEventImporter, self).__init__(jurisdiction_id) self.person_importer = person_importer @@ -24,87 +30,108 @@ def __init__(self, jurisdiction_id, person_importer, org_importer, self.vote_events_to_delete = set() def get_object(self, vote_event): - spec = {'legislative_session_id': vote_event['legislative_session_id']} + spec = {"legislative_session_id": vote_event["legislative_session_id"]} - if not vote_event['identifier'] and not vote_event['bill_id']: + if not vote_event["identifier"] and not vote_event["bill_id"]: raise InvalidVoteEventError( - 'attempt to save a VoteEvent without an "identifier" or "bill_id"') + 'attempt to save a VoteEvent without an "identifier" or "bill_id"' + ) - if vote_event['bill_id']: - if vote_event['bill_id'] not in self.seen_bill_ids: - self.seen_bill_ids.add(vote_event['bill_id']) + if vote_event["bill_id"]: + if vote_event["bill_id"] not in self.seen_bill_ids: + self.seen_bill_ids.add(vote_event["bill_id"]) # keep a list of all the vote event ids that should be deleted self.vote_events_to_delete.update( - self.model_class.objects.filter(bill_id=vote_event['bill_id']).values_list( - 'id', flat=True) + self.model_class.objects.filter( + bill_id=vote_event["bill_id"] + ).values_list("id", flat=True) ) - spec['bill_id'] = vote_event['bill_id'] + spec["bill_id"] = vote_event["bill_id"] - if vote_event.get('pupa_id'): - ve_id = self.lookup_obj_id(vote_event['pupa_id'], VoteEvent) + if vote_event.get("pupa_id"): + ve_id = self.lookup_obj_id(vote_event["pupa_id"], VoteEvent) if ve_id: - spec = {'id': ve_id} + spec = {"id": ve_id} else: return None - elif vote_event['identifier']: + elif vote_event["identifier"]: # if there's an identifier, just use it and the bill_id and the session - spec['identifier'] = vote_event['identifier'] + spec["identifier"] = vote_event["identifier"] else: # otherwise use the motion, start_date, and org as well - spec.update({ - 'motion_text': vote_event['motion_text'], - 'start_date': vote_event['start_date'], - 'organization_id': vote_event['organization_id'] - }) + spec.update( + { + "motion_text": vote_event["motion_text"], + "start_date": vote_event["start_date"], + "organization_id": vote_event["organization_id"], + } + ) - return self.model_class.objects.prefetch_related('votes__voter').get(**spec) + return self.model_class.objects.prefetch_related("votes__voter").get(**spec) def limit_spec(self, spec): - spec['legislative_session__jurisdiction_id'] = self.jurisdiction_id + spec["legislative_session__jurisdiction_id"] = self.jurisdiction_id return spec def prepare_for_db(self, data): - data['legislative_session_id'] = self.get_session_id(data.pop('legislative_session')) - data['organization_id'] = self.org_importer.resolve_json_id(data.pop('organization')) + data["legislative_session_id"] = self.get_session_id( + data.pop("legislative_session") + ) + data["organization_id"] = self.org_importer.resolve_json_id( + data.pop("organization") + ) - bill = data.pop('bill') - if bill and bill.startswith('~'): + bill = data.pop("bill") + if bill and bill.startswith("~"): # unpack psuedo id and apply filter in case there are any that alter it bill = get_pseudo_id(bill) self.bill_importer.apply_transformers(bill) bill = _make_pseudo_id(**bill) - data['bill_id'] = self.bill_importer.resolve_json_id(bill) - bill_action = data.pop('bill_action') + data["bill_id"] = self.bill_importer.resolve_json_id(bill) + bill_action = data.pop("bill_action") if bill_action: try: - action = BillAction.objects.get(bill_id=data['bill_id'], - description=bill_action, - date=data['start_date'], - organization_id=data['organization_id'], - ) + action = BillAction.objects.get( + bill_id=data["bill_id"], + description=bill_action, + date=data["start_date"], + organization_id=data["organization_id"], + ) # seen_action_ids is for ones being added in this import # action.vote is already set if action was set on prior import - if action.id in self.seen_action_ids or hasattr(action, 'vote'): - self.warning('can not match two VoteEvents to %s: %s', - action.id, bill_action) + if action.id in self.seen_action_ids or hasattr(action, "vote"): + self.warning( + "can not match two VoteEvents to %s: %s", action.id, bill_action + ) else: - data['bill_action_id'] = action.id + data["bill_action_id"] = action.id self.seen_action_ids.add(action.id) except BillAction.DoesNotExist: - self.warning('could not match VoteEvent to %s %s %s', - bill, bill_action, data['start_date']) + self.warning( + "could not match VoteEvent to %s %s %s", + bill, + bill_action, + data["start_date"], + ) except BillAction.MultipleObjectsReturned as e: - self.warning('could not match VoteEvent to %s %s %s: %s', - bill, bill_action, data['start_date'], e) + self.warning( + "could not match VoteEvent to %s %s %s: %s", + bill, + bill_action, + data["start_date"], + e, + ) - for vote in data['votes']: - vote['voter_id'] = self.person_importer.resolve_json_id(vote['voter_id'], - allow_no_match=True) + for vote in data["votes"]: + vote["voter_id"] = self.person_importer.resolve_json_id( + vote["voter_id"], allow_no_match=True + ) return data def postimport(self): - # be sure not to delete vote events that were imported (meaning updated) this time through + # be sure not to delete vote events that were + # imported (meaning updated) this time through self.vote_events_to_delete.difference_update(self.json_to_db_id.values()) # everything remaining, goodbye self.model_class.objects.filter(id__in=self.vote_events_to_delete).delete() diff --git a/pupa/migrations/0001_initial.py b/pupa/migrations/0001_initial.py index 6ba30250..84b26e07 100644 --- a/pupa/migrations/0001_initial.py +++ b/pupa/migrations/0001_initial.py @@ -7,72 +7,136 @@ class Migration(migrations.Migration): dependencies = [ - ('core', '0001_initial'), - ('legislative', '0001_initial'), + ("core", "0001_initial"), + ("legislative", "0001_initial"), ] operations = [ migrations.CreateModel( - name='ImportObjects', + name="ImportObjects", fields=[ - ('id', models.AutoField(primary_key=True, auto_created=True, serialize=False, verbose_name='ID')), - ('object_type', models.CharField(max_length=20, choices=[('jurisdiction', 'Jurisdiction'), ('person', 'Person'), ('organization', 'Organization'), ('post', 'Post'), ('membership', 'Membership'), ('bill', 'Bill'), ('vote_event', 'VoteEvent'), ('event', 'Event')])), - ('insert_count', models.PositiveIntegerField()), - ('update_count', models.PositiveIntegerField()), - ('noop_count', models.PositiveIntegerField()), - ('start_time', models.DateTimeField()), - ('end_time', models.DateTimeField()), + ( + "id", + models.AutoField( + primary_key=True, + auto_created=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "object_type", + models.CharField( + max_length=20, + choices=[ + ("jurisdiction", "Jurisdiction"), + ("person", "Person"), + ("organization", "Organization"), + ("post", "Post"), + ("membership", "Membership"), + ("bill", "Bill"), + ("vote_event", "VoteEvent"), + ("event", "Event"), + ], + ), + ), + ("insert_count", models.PositiveIntegerField()), + ("update_count", models.PositiveIntegerField()), + ("noop_count", models.PositiveIntegerField()), + ("start_time", models.DateTimeField()), + ("end_time", models.DateTimeField()), ], - options={ - }, + options={}, bases=(models.Model,), ), migrations.CreateModel( - name='RunPlan', + name="RunPlan", fields=[ - ('id', models.AutoField(primary_key=True, auto_created=True, serialize=False, verbose_name='ID')), - ('success', models.BooleanField(default=True)), - ('jurisdiction', models.ForeignKey(to='core.Jurisdiction', on_delete=models.CASCADE)), + ( + "id", + models.AutoField( + primary_key=True, + auto_created=True, + serialize=False, + verbose_name="ID", + ), + ), + ("success", models.BooleanField(default=True)), + ( + "jurisdiction", + models.ForeignKey(to="core.Jurisdiction", on_delete=models.CASCADE), + ), ], - options={ - }, + options={}, bases=(models.Model,), ), migrations.AddField( - model_name='importobjects', - name='report', - field=models.ForeignKey(to='pupa.RunPlan', on_delete=models.CASCADE), + model_name="importobjects", + name="report", + field=models.ForeignKey(to="pupa.RunPlan", on_delete=models.CASCADE), preserve_default=True, ), migrations.CreateModel( - name='ScrapeObjects', + name="ScrapeObjects", fields=[ - ('id', models.AutoField(primary_key=True, auto_created=True, serialize=False, verbose_name='ID')), - ('object_type', models.CharField(max_length=20, choices=[('jurisdiction', 'Jurisdiction'), ('person', 'Person'), ('organization', 'Organization'), ('post', 'Post'), ('membership', 'Membership'), ('bill', 'Bill'), ('vote_event', 'VoteEvent'), ('event', 'Event')])), - ('count', models.PositiveIntegerField()), + ( + "id", + models.AutoField( + primary_key=True, + auto_created=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "object_type", + models.CharField( + max_length=20, + choices=[ + ("jurisdiction", "Jurisdiction"), + ("person", "Person"), + ("organization", "Organization"), + ("post", "Post"), + ("membership", "Membership"), + ("bill", "Bill"), + ("vote_event", "VoteEvent"), + ("event", "Event"), + ], + ), + ), + ("count", models.PositiveIntegerField()), ], - options={ - }, + options={}, bases=(models.Model,), ), migrations.CreateModel( - name='ScrapeReport', + name="ScrapeReport", fields=[ - ('id', models.AutoField(primary_key=True, auto_created=True, serialize=False, verbose_name='ID')), - ('scraper', models.CharField(max_length=300)), - ('args', models.CharField(max_length=300)), - ('start_time', models.DateTimeField()), - ('end_time', models.DateTimeField()), - ('plan', models.ForeignKey(to='pupa.RunPlan', on_delete=models.CASCADE)), + ( + "id", + models.AutoField( + primary_key=True, + auto_created=True, + serialize=False, + verbose_name="ID", + ), + ), + ("scraper", models.CharField(max_length=300)), + ("args", models.CharField(max_length=300)), + ("start_time", models.DateTimeField()), + ("end_time", models.DateTimeField()), + ( + "plan", + models.ForeignKey(to="pupa.RunPlan", on_delete=models.CASCADE), + ), ], - options={ - }, + options={}, bases=(models.Model,), ), migrations.AddField( - model_name='scrapeobjects', - name='report', - field=models.ForeignKey(to='pupa.ScrapeReport', on_delete=models.CASCADE), + model_name="scrapeobjects", + name="report", + field=models.ForeignKey(to="pupa.ScrapeReport", on_delete=models.CASCADE), preserve_default=True, ), ] diff --git a/pupa/migrations/0002_auto_20150906_1458.py b/pupa/migrations/0002_auto_20150906_1458.py index 005441d1..a00d18ee 100644 --- a/pupa/migrations/0002_auto_20150906_1458.py +++ b/pupa/migrations/0002_auto_20150906_1458.py @@ -9,28 +9,44 @@ class Migration(migrations.Migration): dependencies = [ - ('pupa', '0001_initial'), + ("pupa", "0001_initial"), ] operations = [ migrations.AlterField( - model_name='importobjects', - name='report', - field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='imported_objects', to='pupa.RunPlan'), + model_name="importobjects", + name="report", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="imported_objects", + to="pupa.RunPlan", + ), ), migrations.AlterField( - model_name='runplan', - name='jurisdiction', - field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='runs', to='core.Jurisdiction'), + model_name="runplan", + name="jurisdiction", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="runs", + to="core.Jurisdiction", + ), ), migrations.AlterField( - model_name='scrapeobjects', - name='report', - field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='scraped_objects', to='pupa.ScrapeReport'), + model_name="scrapeobjects", + name="report", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="scraped_objects", + to="pupa.ScrapeReport", + ), ), migrations.AlterField( - model_name='scrapereport', - name='plan', - field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='scrapers', to='pupa.RunPlan'), + model_name="scrapereport", + name="plan", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="scrapers", + to="pupa.RunPlan", + ), ), ] diff --git a/pupa/migrations/0003_auto_20151118_0408.py b/pupa/migrations/0003_auto_20151118_0408.py index 3de74070..3cf3ef5a 100644 --- a/pupa/migrations/0003_auto_20151118_0408.py +++ b/pupa/migrations/0003_auto_20151118_0408.py @@ -10,30 +10,34 @@ class Migration(migrations.Migration): dependencies = [ - ('pupa', '0002_auto_20150906_1458'), + ("pupa", "0002_auto_20150906_1458"), ] operations = [ migrations.AddField( - model_name='runplan', - name='end_time', - field=models.DateTimeField(default=datetime.datetime(2015, 1, 1, 0, 0, 0, 0, tzinfo=utc)), + model_name="runplan", + name="end_time", + field=models.DateTimeField( + default=datetime.datetime(2015, 1, 1, 0, 0, 0, 0, tzinfo=utc) + ), preserve_default=False, ), migrations.AddField( - model_name='runplan', - name='exception', - field=models.TextField(blank=True, default=''), + model_name="runplan", + name="exception", + field=models.TextField(blank=True, default=""), ), migrations.AddField( - model_name='runplan', - name='start_time', - field=models.DateTimeField(default=datetime.datetime(2015, 1, 1, 0, 0, 0, 0, tzinfo=utc)), + model_name="runplan", + name="start_time", + field=models.DateTimeField( + default=datetime.datetime(2015, 1, 1, 0, 0, 0, 0, tzinfo=utc) + ), preserve_default=False, ), migrations.AddField( - model_name='runplan', - name='traceback', - field=models.TextField(blank=True, default=''), + model_name="runplan", + name="traceback", + field=models.TextField(blank=True, default=""), ), ] diff --git a/pupa/migrations/0004_identifier.py b/pupa/migrations/0004_identifier.py index 023621ac..3de9876c 100644 --- a/pupa/migrations/0004_identifier.py +++ b/pupa/migrations/0004_identifier.py @@ -9,18 +9,32 @@ class Migration(migrations.Migration): dependencies = [ - ('contenttypes', '0002_remove_content_type_name'), - ('pupa', '0003_auto_20151118_0408'), + ("contenttypes", "0002_remove_content_type_name"), + ("pupa", "0003_auto_20151118_0408"), ] operations = [ migrations.CreateModel( - name='Identifier', + name="Identifier", fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('identifier', models.CharField(max_length=500)), - ('object_id', models.PositiveIntegerField()), - ('content_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='contenttypes.ContentType')), + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("identifier", models.CharField(max_length=500)), + ("object_id", models.PositiveIntegerField()), + ( + "content_type", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="contenttypes.ContentType", + ), + ), ], ), ] diff --git a/pupa/migrations/0005_auto_20170522_1935.py b/pupa/migrations/0005_auto_20170522_1935.py index e870ea26..9851fd2d 100644 --- a/pupa/migrations/0005_auto_20170522_1935.py +++ b/pupa/migrations/0005_auto_20170522_1935.py @@ -8,18 +8,18 @@ class Migration(migrations.Migration): dependencies = [ - ('pupa', '0004_identifier'), + ("pupa", "0004_identifier"), ] operations = [ migrations.AlterField( - model_name='identifier', - name='identifier', + model_name="identifier", + name="identifier", field=models.CharField(max_length=300), ), migrations.AlterField( - model_name='identifier', - name='object_id', + model_name="identifier", + name="object_id", field=models.CharField(max_length=300), ), ] diff --git a/pupa/migrations/0006_identifier_jurisdiction.py b/pupa/migrations/0006_identifier_jurisdiction.py index 9e01ed9e..157fc778 100644 --- a/pupa/migrations/0006_identifier_jurisdiction.py +++ b/pupa/migrations/0006_identifier_jurisdiction.py @@ -9,15 +9,20 @@ class Migration(migrations.Migration): dependencies = [ - ('core', '0001_initial'), - ('pupa', '0005_auto_20170522_1935'), + ("core", "0001_initial"), + ("pupa", "0005_auto_20170522_1935"), ] operations = [ migrations.AddField( - model_name='identifier', - name='jurisdiction', - field=models.ForeignKey(default='', on_delete=django.db.models.deletion.CASCADE, related_name='pupa_ids', to='core.Jurisdiction'), + model_name="identifier", + name="jurisdiction", + field=models.ForeignKey( + default="", + on_delete=django.db.models.deletion.CASCADE, + related_name="pupa_ids", + to="core.Jurisdiction", + ), preserve_default=False, ), ] diff --git a/pupa/migrations/0007_sessiondataqualityreport.py b/pupa/migrations/0007_sessiondataqualityreport.py index 57159a3c..726ab660 100644 --- a/pupa/migrations/0007_sessiondataqualityreport.py +++ b/pupa/migrations/0007_sessiondataqualityreport.py @@ -8,27 +8,47 @@ class Migration(migrations.Migration): dependencies = [ - ('legislative', '0005_auto_20171005_2028'), - ('pupa', '0006_identifier_jurisdiction'), + ("legislative", "0005_auto_20171005_2028"), + ("pupa", "0006_identifier_jurisdiction"), ] operations = [ migrations.CreateModel( - name='SessionDataQualityReport', + name="SessionDataQualityReport", fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('bills_missing_actions', models.PositiveIntegerField()), - ('bills_missing_sponsors', models.PositiveIntegerField()), - ('bills_missing_versions', models.PositiveIntegerField()), - ('votes_missing_voters', models.PositiveIntegerField()), - ('votes_missing_bill', models.PositiveIntegerField()), - ('votes_missing_yes_count', models.PositiveIntegerField()), - ('votes_missing_no_count', models.PositiveIntegerField()), - ('votes_with_bad_counts', models.PositiveIntegerField()), - ('unmatched_sponsor_people', django.contrib.postgres.fields.jsonb.JSONField()), - ('unmatched_sponsor_organizations', django.contrib.postgres.fields.jsonb.JSONField()), - ('unmatched_voters', django.contrib.postgres.fields.jsonb.JSONField()), - ('legislative_session', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='legislative.LegislativeSession')), + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("bills_missing_actions", models.PositiveIntegerField()), + ("bills_missing_sponsors", models.PositiveIntegerField()), + ("bills_missing_versions", models.PositiveIntegerField()), + ("votes_missing_voters", models.PositiveIntegerField()), + ("votes_missing_bill", models.PositiveIntegerField()), + ("votes_missing_yes_count", models.PositiveIntegerField()), + ("votes_missing_no_count", models.PositiveIntegerField()), + ("votes_with_bad_counts", models.PositiveIntegerField()), + ( + "unmatched_sponsor_people", + django.contrib.postgres.fields.jsonb.JSONField(), + ), + ( + "unmatched_sponsor_organizations", + django.contrib.postgres.fields.jsonb.JSONField(), + ), + ("unmatched_voters", django.contrib.postgres.fields.jsonb.JSONField()), + ( + "legislative_session", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="legislative.LegislativeSession", + ), + ), ], ), ] diff --git a/pupa/models.py b/pupa/models.py index a7288c53..5ca8c94e 100644 --- a/pupa/models.py +++ b/pupa/models.py @@ -7,28 +7,30 @@ OBJECT_TYPES = ( - ('jurisdiction', 'Jurisdiction'), - ('person', 'Person'), - ('organization', 'Organization'), - ('post', 'Post'), - ('membership', 'Membership'), - ('bill', 'Bill'), - ('vote_event', 'VoteEvent'), - ('event', 'Event'), + ("jurisdiction", "Jurisdiction"), + ("person", "Person"), + ("organization", "Organization"), + ("post", "Post"), + ("membership", "Membership"), + ("bill", "Bill"), + ("vote_event", "VoteEvent"), + ("event", "Event"), ) class RunPlan(models.Model): - jurisdiction = models.ForeignKey(Jurisdiction, related_name='runs', on_delete=models.CASCADE) + jurisdiction = models.ForeignKey( + Jurisdiction, related_name="runs", on_delete=models.CASCADE + ) success = models.BooleanField(default=True) start_time = models.DateTimeField() end_time = models.DateTimeField() - exception = models.TextField(blank=True, default='') - traceback = models.TextField(blank=True, default='') + exception = models.TextField(blank=True, default="") + traceback = models.TextField(blank=True, default="") class ScrapeReport(models.Model): - plan = models.ForeignKey(RunPlan, related_name='scrapers', on_delete=models.CASCADE) + plan = models.ForeignKey(RunPlan, related_name="scrapers", on_delete=models.CASCADE) scraper = models.CharField(max_length=300) args = models.CharField(max_length=300) start_time = models.DateTimeField() @@ -36,14 +38,17 @@ class ScrapeReport(models.Model): class ScrapeObjects(models.Model): - report = models.ForeignKey(ScrapeReport, related_name='scraped_objects', - on_delete=models.CASCADE) + report = models.ForeignKey( + ScrapeReport, related_name="scraped_objects", on_delete=models.CASCADE + ) object_type = models.CharField(max_length=20, choices=OBJECT_TYPES) count = models.PositiveIntegerField() class ImportObjects(models.Model): - report = models.ForeignKey(RunPlan, related_name='imported_objects', on_delete=models.CASCADE) + report = models.ForeignKey( + RunPlan, related_name="imported_objects", on_delete=models.CASCADE + ) object_type = models.CharField(max_length=20, choices=OBJECT_TYPES) insert_count = models.PositiveIntegerField() update_count = models.PositiveIntegerField() @@ -54,20 +59,23 @@ class ImportObjects(models.Model): class Identifier(models.Model): identifier = models.CharField(max_length=300) - jurisdiction = models.ForeignKey(Jurisdiction, - related_name='pupa_ids', - on_delete=models.CASCADE, - ) + jurisdiction = models.ForeignKey( + Jurisdiction, + related_name="pupa_ids", + on_delete=models.CASCADE, + ) content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE) object_id = models.CharField(max_length=300) - content_object = GenericForeignKey('content_type', 'object_id') + content_object = GenericForeignKey("content_type", "object_id") - def __str__(self): # __unicode__ on Python 2 + def __str__(self): # __unicode__ on Python 2 return self.identifier class SessionDataQualityReport(models.Model): - legislative_session = models.ForeignKey(LegislativeSession, on_delete=models.CASCADE) + legislative_session = models.ForeignKey( + LegislativeSession, on_delete=models.CASCADE + ) bills_missing_actions = models.PositiveIntegerField() bills_missing_sponsors = models.PositiveIntegerField() diff --git a/pupa/reports/__init__.py b/pupa/reports/__init__.py index edeb7003..088545d9 100644 --- a/pupa/reports/__init__.py +++ b/pupa/reports/__init__.py @@ -1 +1 @@ -from .session import generate_session_report # noqa +from .session import generate_session_report # noqa diff --git a/pupa/reports/session.py b/pupa/reports/session.py index 894e5a73..52d97ada 100644 --- a/pupa/reports/session.py +++ b/pupa/reports/session.py @@ -1,65 +1,102 @@ from django.db.models import Count, Subquery, OuterRef, Q, F -from opencivicdata.legislative.models import (Bill, VoteEvent, VoteCount, PersonVote, - BillSponsorship) +from opencivicdata.legislative.models import ( + Bill, + VoteEvent, + VoteCount, + PersonVote, + BillSponsorship, +) from ..models import SessionDataQualityReport def _simple_count(ModelCls, session, **filter): - return ModelCls.objects.filter(legislative_session_id=session).filter(**filter).count() + return ( + ModelCls.objects.filter(legislative_session_id=session).filter(**filter).count() + ) def generate_session_report(session): report = { - 'bills_missing_actions': _simple_count(Bill, session, actions__isnull=True), - 'bills_missing_sponsors': _simple_count(Bill, session, sponsorships__isnull=True), - 'bills_missing_versions': _simple_count(Bill, session, versions__isnull=True), - 'votes_missing_bill': _simple_count(VoteEvent, session, bill__isnull=True), - 'votes_missing_voters': _simple_count(VoteEvent, session, votes__isnull=True), - 'votes_missing_yes_count': 0, - 'votes_missing_no_count': 0, - 'votes_with_bad_counts': 0, + "bills_missing_actions": _simple_count(Bill, session, actions__isnull=True), + "bills_missing_sponsors": _simple_count( + Bill, session, sponsorships__isnull=True + ), + "bills_missing_versions": _simple_count(Bill, session, versions__isnull=True), + "votes_missing_bill": _simple_count(VoteEvent, session, bill__isnull=True), + "votes_missing_voters": _simple_count(VoteEvent, session, votes__isnull=True), + "votes_missing_yes_count": 0, + "votes_missing_no_count": 0, + "votes_with_bad_counts": 0, } voteevents = VoteEvent.objects.filter(legislative_session_id=session) queryset = voteevents.annotate( - yes_sum=Count('pk', filter=Q(votes__option='yes')), - no_sum=Count('pk', filter=Q(votes__option='no')), - other_sum=Count('pk', filter=Q(votes__option='other')), - yes_count=Subquery(VoteCount.objects.filter(vote_event=OuterRef('pk'), - option='yes').values('value')), - no_count=Subquery(VoteCount.objects.filter(vote_event=OuterRef('pk'), - option='no').values('value')), - other_count=Subquery(VoteCount.objects.filter(vote_event=OuterRef('pk'), - option='other').values('value')), + yes_sum=Count("pk", filter=Q(votes__option="yes")), + no_sum=Count("pk", filter=Q(votes__option="no")), + other_sum=Count("pk", filter=Q(votes__option="other")), + yes_count=Subquery( + VoteCount.objects.filter(vote_event=OuterRef("pk"), option="yes").values( + "value" + ) + ), + no_count=Subquery( + VoteCount.objects.filter(vote_event=OuterRef("pk"), option="no").values( + "value" + ) + ), + other_count=Subquery( + VoteCount.objects.filter(vote_event=OuterRef("pk"), option="other").values( + "value" + ) + ), ) for vote in queryset: if vote.yes_count is None: - report['votes_missing_yes_count'] += 1 + report["votes_missing_yes_count"] += 1 vote.yes_count = 0 if vote.no_count is None: - report['votes_missing_no_count'] += 1 + report["votes_missing_no_count"] += 1 vote.no_count = 0 if vote.other_count is None: vote.other_count = 0 - if (vote.yes_sum != vote.yes_count or - vote.no_sum != vote.no_count or - vote.other_sum != vote.other_count): - report['votes_with_bad_counts'] += 1 + if ( + vote.yes_sum != vote.yes_count + or vote.no_sum != vote.no_count + or vote.other_sum != vote.other_count + ): + report["votes_with_bad_counts"] += 1 # handle unmatched - queryset = BillSponsorship.objects.filter(bill__legislative_session_id=session, - entity_type='person', person_id=None - ).values('name').annotate(num=Count('name')) - report['unmatched_sponsor_people'] = {item['name']: item['num'] for item in queryset} - queryset = BillSponsorship.objects.filter(bill__legislative_session_id=session, - entity_type='organization', person_id=None - ).values('name').annotate(num=Count('name')) - report['unmatched_sponsor_organizations'] = {item['name']: item['num'] for item in queryset} - queryset = PersonVote.objects.filter(vote_event__legislative_session_id=session, - voter__isnull=True).values(name=F('voter_name')).annotate( - num=Count('voter_name') - ) - report['unmatched_voters'] = {item['name']: item['num'] for item in queryset} + queryset = ( + BillSponsorship.objects.filter( + bill__legislative_session_id=session, entity_type="person", person_id=None + ) + .values("name") + .annotate(num=Count("name")) + ) + report["unmatched_sponsor_people"] = { + item["name"]: item["num"] for item in queryset + } + queryset = ( + BillSponsorship.objects.filter( + bill__legislative_session_id=session, + entity_type="organization", + person_id=None, + ) + .values("name") + .annotate(num=Count("name")) + ) + report["unmatched_sponsor_organizations"] = { + item["name"]: item["num"] for item in queryset + } + queryset = ( + PersonVote.objects.filter( + vote_event__legislative_session_id=session, voter__isnull=True + ) + .values(name=F("voter_name")) + .annotate(num=Count("voter_name")) + ) + report["unmatched_voters"] = {item["name"]: item["num"] for item in queryset} return SessionDataQualityReport(legislative_session_id=session, **report) diff --git a/pupa/scrape/base.py b/pupa/scrape/base.py index f786d429..872b4d12 100644 --- a/pupa/scrape/base.py +++ b/pupa/scrape/base.py @@ -14,16 +14,14 @@ from pupa.exceptions import ScrapeError, ScrapeValueError -@FormatChecker.cls_checks('uri-blank') +@FormatChecker.cls_checks("uri-blank") def uri_blank(value): - return value == '' or FormatChecker().conforms(value, 'uri') + return value == "" or FormatChecker().conforms(value, "uri") -@FormatChecker.cls_checks('uri') +@FormatChecker.cls_checks("uri") def check_uri(val): - return val \ - and val.startswith(('http://', 'https://', 'ftp://')) \ - and ' ' not in val + return val and val.startswith(("http://", "https://", "ftp://")) and " " not in val def cleanup_list(obj, default): @@ -37,9 +35,11 @@ def cleanup_list(obj, default): class Scraper(scrapelib.Scraper): - """ Base class for all scrapers """ + """Base class for all scrapers""" - def __init__(self, jurisdiction, datadir, *, strict_validation=True, fastmode=False): + def __init__( + self, jurisdiction, datadir, *, strict_validation=True, fastmode=False + ): super(Scraper, self).__init__() # set options @@ -77,21 +77,27 @@ def __init__(self, jurisdiction, datadir, *, strict_validation=True, fastmode=Fa def save_object(self, obj): """ - Save object to disk as JSON. + Save object to disk as JSON. - Generally shouldn't be called directly. + Generally shouldn't be called directly. """ obj.pre_save(self.jurisdiction.jurisdiction_id) - filename = '{0}_{1}.json'.format(obj._type, obj._id).replace('/', '-') + filename = "{0}_{1}.json".format(obj._type, obj._id).replace("/", "-") - self.info('save %s %s as %s', obj._type, obj, filename) - self.debug(json.dumps(OrderedDict(sorted(obj.as_dict().items())), - cls=utils.JSONEncoderPlus, indent=4, separators=(',', ': '))) + self.info("save %s %s as %s", obj._type, obj, filename) + self.debug( + json.dumps( + OrderedDict(sorted(obj.as_dict().items())), + cls=utils.JSONEncoderPlus, + indent=4, + separators=(",", ": "), + ) + ) self.output_names[obj._type].add(filename) - with open(os.path.join(self.datadir, filename), 'w') as f: + with open(os.path.join(self.datadir, filename), "w") as f: json.dump(obj.as_dict(), f, cls=utils.JSONEncoderPlus) # validate after writing, allows for inspection on failure @@ -108,36 +114,41 @@ def save_object(self, obj): self.save_object(obj) def do_scrape(self, **kwargs): - record = {'objects': defaultdict(int)} + record = {"objects": defaultdict(int)} self.output_names = defaultdict(set) - record['start'] = utils.utcnow() + record["start"] = utils.utcnow() for obj in self.scrape(**kwargs) or []: - if hasattr(obj, '__iter__'): + if hasattr(obj, "__iter__"): for iterobj in obj: self.save_object(iterobj) else: self.save_object(obj) - record['end'] = utils.utcnow() - record['skipped'] = getattr(self, 'skipped', 0) + record["end"] = utils.utcnow() + record["skipped"] = getattr(self, "skipped", 0) if not self.output_names: - raise ScrapeError('no objects returned from {} scrape'.format(self.__class__.__name__)) + raise ScrapeError( + "no objects returned from {} scrape".format(self.__class__.__name__) + ) for _type, nameset in self.output_names.items(): - record['objects'][_type] += len(nameset) + record["objects"][_type] += len(nameset) return record def latest_session(self): - return self.jurisdiction.legislative_sessions[-1]['identifier'] + return self.jurisdiction.legislative_sessions[-1]["identifier"] def scrape(self, **kwargs): - raise NotImplementedError(self.__class__.__name__ + ' must provide a scrape() method') + raise NotImplementedError( + self.__class__.__name__ + " must provide a scrape() method" + ) class BaseBillScraper(Scraper): skipped = 0 class ContinueScraping(Exception): - """ indicate that scraping should continue without saving an object """ + """indicate that scraping should continue without saving an object""" + pass def scrape(self, legislative_session, **kwargs): @@ -146,7 +157,7 @@ def scrape(self, legislative_session, **kwargs): try: yield self.get_bill(bill_id, **extras) except self.ContinueScraping as exc: - self.warning('skipping %s: %r', bill_id, exc) + self.warning("skipping %s: %r", bill_id, exc) self.skipped += 1 continue @@ -189,35 +200,43 @@ def validate(self, schema=None): "datetime", lambda c, d: isinstance(d, (datetime.date, datetime.datetime)) ) type_checker = type_checker.redefine( - "date", lambda c, d: (isinstance(d, datetime.date) - and not isinstance(d, datetime.datetime)) + "date", + lambda c, d: ( + isinstance(d, datetime.date) and not isinstance(d, datetime.datetime) + ), ) - ValidatorCls = jsonschema.validators.extend(Draft3Validator, type_checker=type_checker) + ValidatorCls = jsonschema.validators.extend( + Draft3Validator, type_checker=type_checker + ) validator = ValidatorCls(schema, format_checker=FormatChecker()) errors = [str(error) for error in validator.iter_errors(self.as_dict())] if errors: - raise ScrapeValueError('validation of {} {} failed: {}'.format( - self.__class__.__name__, self._id, '\n\t'+'\n\t'.join(errors) - )) + raise ScrapeValueError( + "validation of {} {} failed: {}".format( + self.__class__.__name__, self._id, "\n\t" + "\n\t".join(errors) + ) + ) def pre_save(self, jurisdiction_id): pass def as_dict(self): d = {} - for attr in self._schema['properties'].keys(): + for attr in self._schema["properties"].keys(): if hasattr(self, attr): d[attr] = getattr(self, attr) - d['_id'] = self._id + d["_id"] = self._id return d # operators def __setattr__(self, key, val): - if key[0] != '_' and key not in self._schema['properties'].keys(): - raise ScrapeValueError('property "{}" not in {} schema'.format(key, self._type)) + if key[0] != "_" and key not in self._schema["properties"].keys(): + raise ScrapeValueError( + 'property "{}" not in {} schema'.format(key, self._type) + ) super(BaseModel, self).__setattr__(key, val) @@ -226,9 +245,9 @@ def __init__(self): super(SourceMixin, self).__init__() self.sources = [] - def add_source(self, url, *, note=''): - """ Add a source URL from which data was collected """ - new = {'url': url, 'note': note} + def add_source(self, url, *, note=""): + """Add a source URL from which data was collected""" + new = {"url": url, "note": note} self.sources.append(new) @@ -237,7 +256,7 @@ def __init__(self): super(ContactDetailMixin, self).__init__() self.contact_details = [] - def add_contact_detail(self, *, type, value, note=''): + def add_contact_detail(self, *, type, value, note=""): self.contact_details.append({"type": type, "value": value, "note": note}) @@ -246,7 +265,7 @@ def __init__(self): super(LinkMixin, self).__init__() self.links = [] - def add_link(self, url, *, note=''): + def add_link(self, url, *, note=""): self.links.append({"note": note, "url": url}) @@ -255,7 +274,7 @@ def __init__(self): super(IdentifierMixin, self).__init__() self.identifiers = [] - def add_identifier(self, identifier, *, scheme=''): + def add_identifier(self, identifier, *, scheme=""): self.identifiers.append({"identifier": identifier, "scheme": scheme}) @@ -264,21 +283,22 @@ def __init__(self): super(OtherNameMixin, self).__init__() self.other_names = [] - def add_name(self, name, *, start_date='', end_date='', note=''): - other_name = {'name': name} + def add_name(self, name, *, start_date="", end_date="", note=""): + other_name = {"name": name} if start_date: - other_name['start_date'] = start_date + other_name["start_date"] = start_date if end_date: - other_name['end_date'] = end_date + other_name["end_date"] = end_date if note: - other_name['note'] = note + other_name["note"] = note self.other_names.append(other_name) class AssociatedLinkMixin(object): - def _add_associated_link(self, collection, note, url, *, media_type, text, on_duplicate, - date=''): - if on_duplicate not in ['error', 'ignore']: + def _add_associated_link( + self, collection, note, url, *, media_type, text, on_duplicate, date="" + ): + if on_duplicate not in ["error", "ignore"]: raise ScrapeValueError("on_duplicate must be 'error' or 'ignore'") try: @@ -286,28 +306,30 @@ def _add_associated_link(self, collection, note, url, *, media_type, text, on_du except AttributeError: associated = self[collection] - ver = {'note': note, 'links': [], 'date': date} + ver = {"note": note, "links": [], "date": date} - # keep a list of the links we've seen, we need to iterate over whole list on each add - # unfortunately this means adds are O(n) + # keep a list of the links we've seen, we need to iterate over whole + # list on each add unfortunately this means adds are O(n) seen_links = set() matches = 0 for item in associated: - for link in item['links']: - seen_links.add(link['url']) + for link in item["links"]: + seen_links.add(link["url"]) if all(ver.get(x) == item.get(x) for x in ["note", "date"]): matches = matches + 1 ver = item - # it should be impossible to have multiple matches found unless someone is bypassing - # _add_associated_link + # it should be impossible to have multiple matches found unless someone + # is bypassing _add_associated_link assert matches <= 1, "multiple matches found in _add_associated_link" if url in seen_links: - if on_duplicate == 'error': - raise ScrapeValueError("Duplicate entry in '%s' - URL: '%s'" % (collection, url)) + if on_duplicate == "error": + raise ScrapeValueError( + "Duplicate entry in '%s' - URL: '%s'" % (collection, url) + ) else: # This means we're in ignore mode. This situation right here # means we should *skip* adding this link silently and continue @@ -319,9 +341,9 @@ def _add_associated_link(self, collection, note, url, *, media_type, text, on_du return None # OK. This is either new or old. Let's just go for it. - ret = {'url': url, 'media_type': media_type, 'text': text} + ret = {"url": url, "media_type": media_type, "text": text} - ver['links'].append(ret) + ver["links"].append(ret) if matches == 0: # in the event we've got a new entry; let's just insert it into diff --git a/pupa/scrape/bill.py b/pupa/scrape/bill.py index 7e3b3771..950ff4aa 100644 --- a/pupa/scrape/bill.py +++ b/pupa/scrape/bill.py @@ -5,14 +5,13 @@ class Action(dict): - def add_related_entity(self, name, entity_type, entity_id=None): ent = { - 'name': name, - 'entity_type': entity_type, - entity_type + '_id': entity_id, + "name": name, + "entity_type": entity_type, + entity_type + "_id": entity_id, } - self['related_entities'].append(ent) + self["related_entities"].append(ent) return ent @@ -21,18 +20,28 @@ class Bill(SourceMixin, AssociatedLinkMixin, BaseModel): An Open Civic Data bill. """ - _type = 'bill' + _type = "bill" _schema = schema - def __init__(self, identifier, legislative_session, title, *, chamber=None, - from_organization=None, classification=None): + def __init__( + self, + identifier, + legislative_session, + title, + *, + chamber=None, + from_organization=None, + classification=None + ): super(Bill, self).__init__() self.identifier = identifier self.legislative_session = legislative_session self.title = title - self.classification = cleanup_list(classification, ['bill']) - self.from_organization = pseudo_organization(from_organization, chamber, 'legislature') + self.classification = cleanup_list(classification, ["bill"]) + self.from_organization = pseudo_organization( + from_organization, chamber, "legislature" + ) self.actions = [] self.other_identifiers = [] @@ -44,31 +53,55 @@ def __init__(self, identifier, legislative_session, title, *, chamber=None, self.abstracts = [] self.versions = [] - def add_action(self, description, date, *, organization=None, chamber=None, - classification=None, related_entities=None, extras=None): - action = Action(description=description, date=date, - organization_id=pseudo_organization(organization, chamber, 'legislature'), - classification=cleanup_list(classification, []), related_entities=[], - extras=extras or {}) + def add_action( + self, + description, + date, + *, + organization=None, + chamber=None, + classification=None, + related_entities=None, + extras=None + ): + action = Action( + description=description, + date=date, + organization_id=pseudo_organization(organization, chamber, "legislature"), + classification=cleanup_list(classification, []), + related_entities=[], + extras=extras or {}, + ) self.actions.append(action) return action def add_related_bill(self, identifier, legislative_session, relation_type): # will we need jurisdiction, organization? - self.related_bills.append({ - "identifier": identifier, - "legislative_session": legislative_session, - "relation_type": relation_type - }) - - def add_sponsorship(self, name, classification, entity_type, primary, *, chamber=None, - entity_id=None): + self.related_bills.append( + { + "identifier": identifier, + "legislative_session": legislative_session, + "relation_type": relation_type, + } + ) + + def add_sponsorship( + self, + name, + classification, + entity_type, + primary, + *, + chamber=None, + entity_id=None + ): sp = { "name": name, "classification": classification, "entity_type": entity_type, "primary": primary, - # set these so that all JSON objects have the same keys, prevents import errors + # set these so that all JSON objects have the same keys, + # prevents import errors "person_id": None, "organization_id": None, } @@ -76,40 +109,70 @@ def add_sponsorship(self, name, classification, entity_type, primary, *, chamber if entity_type: if not entity_id: entity_id = _make_pseudo_id(name=name) - sp[entity_type + '_id'] = entity_id + sp[entity_type + "_id"] = entity_id self.sponsorships.append(sp) - def add_sponsorship_by_identifier(self, name, classification, entity_type, - primary, *, scheme, identifier, chamber=None): - return self.add_sponsorship(name, classification, entity_type, primary, - chamber=chamber, entity_id=_make_pseudo_id( - identifiers__scheme=scheme, - identifiers__identifier=identifier) - ) + def add_sponsorship_by_identifier( + self, + name, + classification, + entity_type, + primary, + *, + scheme, + identifier, + chamber=None + ): + return self.add_sponsorship( + name, + classification, + entity_type, + primary, + chamber=chamber, + entity_id=_make_pseudo_id( + identifiers__scheme=scheme, identifiers__identifier=identifier + ), + ) def add_subject(self, subject): self.subject.append(subject) - def add_abstract(self, abstract, note, date=''): + def add_abstract(self, abstract, note, date=""): self.abstracts.append({"note": note, "abstract": abstract, "date": date}) - def add_title(self, title, note=''): + def add_title(self, title, note=""): self.other_titles.append({"note": note, "title": title}) - def add_identifier(self, identifier, note='', scheme=''): - self.other_identifiers.append({"note": note, "identifier": identifier, 'scheme': scheme}) - - def add_document_link(self, note, url, *, date='', media_type='', text='', - on_duplicate='error'): - return self._add_associated_link(collection='documents', note=note, url=url, date=date, - text=text, media_type=media_type, - on_duplicate=on_duplicate) - - def add_version_link(self, note, url, *, date='', media_type='', text='', - on_duplicate='error'): - return self._add_associated_link(collection='versions', note=note, url=url, date=date, - text=text, media_type=media_type, - on_duplicate=on_duplicate) + def add_identifier(self, identifier, note="", scheme=""): + self.other_identifiers.append( + {"note": note, "identifier": identifier, "scheme": scheme} + ) + + def add_document_link( + self, note, url, *, date="", media_type="", text="", on_duplicate="error" + ): + return self._add_associated_link( + collection="documents", + note=note, + url=url, + date=date, + text=text, + media_type=media_type, + on_duplicate=on_duplicate, + ) + + def add_version_link( + self, note, url, *, date="", media_type="", text="", on_duplicate="error" + ): + return self._add_associated_link( + collection="versions", + note=note, + url=url, + date=date, + text=text, + media_type=media_type, + on_duplicate=on_duplicate, + ) def __str__(self): - return self.identifier + ' in ' + self.legislative_session + return self.identifier + " in " + self.legislative_session diff --git a/pupa/scrape/event.py b/pupa/scrape/event.py index 783baec5..bb6ec7f8 100644 --- a/pupa/scrape/event.py +++ b/pupa/scrape/event.py @@ -8,74 +8,89 @@ class EventAgendaItem(dict, AssociatedLinkMixin): event = None def __init__(self, description, event): - super(EventAgendaItem, self).__init__({ - "description": description, - "classification": [], - "related_entities": [], - "subjects": [], - "media": [], - "notes": [], - "order": str(len(event.agenda)), - "extras": {}, - }) + super(EventAgendaItem, self).__init__( + { + "description": description, + "classification": [], + "related_entities": [], + "subjects": [], + "media": [], + "notes": [], + "order": str(len(event.agenda)), + "extras": {}, + } + ) self.event = event def add_subject(self, what): - self['subjects'].append(what) + self["subjects"].append(what) def add_classification(self, what): - self['classification'].append(what) + self["classification"].append(what) - def add_vote_event(self, vote_event, *, id=None, note='consideration'): - self.add_entity(name=vote_event, entity_type='vote_event', id=id, note=note) + def add_vote_event(self, vote_event, *, id=None, note="consideration"): + self.add_entity(name=vote_event, entity_type="vote_event", id=id, note=note) - def add_committee(self, committee, *, id=None, note='participant'): - self.add_entity(name=committee, entity_type='organization', id=id, note=note) + def add_committee(self, committee, *, id=None, note="participant"): + self.add_entity(name=committee, entity_type="organization", id=id, note=note) - def add_bill(self, bill, *, id=None, note='consideration'): - self.add_entity(name=bill, entity_type='bill', id=id, note=note) + def add_bill(self, bill, *, id=None, note="consideration"): + self.add_entity(name=bill, entity_type="bill", id=id, note=note) - def add_person(self, person, *, id=None, note='participant'): - self.add_entity(name=person, entity_type='person', id=id, note=note) + def add_person(self, person, *, id=None, note="participant"): + self.add_entity(name=person, entity_type="person", id=id, note=note) - def add_media_link(self, note, url, media_type, *, text='', type='media', - on_duplicate='error'): - return self._add_associated_link(collection='media', note=note, url=url, text=text, - media_type=media_type, on_duplicate=on_duplicate) + def add_media_link( + self, note, url, media_type, *, text="", type="media", on_duplicate="error" + ): + return self._add_associated_link( + collection="media", + note=note, + url=url, + text=text, + media_type=media_type, + on_duplicate=on_duplicate, + ) def add_entity(self, name, entity_type, *, id, note): - ret = { - "name": name, - "entity_type": entity_type, - "note": note - } + ret = {"name": name, "entity_type": entity_type, "note": note} if id: - ret['id'] = id + ret["id"] = id elif entity_type: - if entity_type in ('organization', 'person'): + if entity_type in ("organization", "person"): id = _make_pseudo_id(name=name) - elif entity_type in ('bill', 'vote_event'): + elif entity_type in ("bill", "vote_event"): id = _make_pseudo_id(identifier=name) else: - raise ScrapeValueError('attempt to call add_entity with unsupported ' - 'entity type: {}'.format(entity_type)) - ret[entity_type + '_id'] = id + raise ScrapeValueError( + "attempt to call add_entity with unsupported " + "entity type: {}".format(entity_type) + ) + ret[entity_type + "_id"] = id - self['related_entities'].append(ret) + self["related_entities"].append(ret) class Event(BaseModel, SourceMixin, AssociatedLinkMixin, LinkMixin): """ Details for an event in .format """ - _type = 'event' + + _type = "event" _schema = schema - def __init__(self, name, start_date, *, - location_name=None, - all_day=False, description="", end_date="", - status="confirmed", classification="event" - ): + def __init__( + self, + name, + start_date, + *, + location_name=None, + all_day=False, + description="", + end_date="", + status="confirmed", + classification="event" + ): super(Event, self).__init__() self.start_date = start_date self.all_day = all_day @@ -94,50 +109,67 @@ def __init__(self, name, start_date, *, self.agenda = [] def __str__(self): - return '{} {}'.format(self.start_date, self.name.strip()) + return "{} {}".format(self.start_date, self.name.strip()) def set_location(self, name, *, note="", url="", coordinates=None): - self.location = {"name": name, "note": note, "url": url, "coordinates": coordinates} - - def add_participant(self, name, type, *, id=None, note='participant'): - p = { + self.location = { "name": name, - "entity_type": type, - "note": note + "note": note, + "url": url, + "coordinates": coordinates, } + + def add_participant(self, name, type, *, id=None, note="participant"): + p = {"name": name, "entity_type": type, "note": note} if id: - p['id'] = id + p["id"] = id elif type: id = _make_pseudo_id(name=name) - p[type + '_id'] = id + p[type + "_id"] = id self.participants.append(p) - def add_person(self, name, *, id=None, note='participant'): - return self.add_participant(name=name, type='person', id=id, note=note) + def add_person(self, name, *, id=None, note="participant"): + return self.add_participant(name=name, type="person", id=id, note=note) - def add_committee(self, name, *, id=None, note='participant'): - return self.add_participant(name=name, type='organization', id=id, note=note) + def add_committee(self, name, *, id=None, note="participant"): + return self.add_participant(name=name, type="organization", id=id, note=note) def add_agenda_item(self, description): obj = EventAgendaItem(description, self) self.agenda.append(obj) return obj - def add_media_link(self, note, url, media_type, *, text='', - type='media', on_duplicate='error', date=''): - return self._add_associated_link(collection='media', - note=note, - url=url, - text=text, - media_type=media_type, - on_duplicate=on_duplicate, - date=date) - - def add_document(self, note, url, *, text='', media_type='', on_duplicate='error', date=''): - return self._add_associated_link(collection='documents', - note=note, url=url, - text=text, - media_type=media_type, - on_duplicate=on_duplicate, - date=date) + def add_media_link( + self, + note, + url, + media_type, + *, + text="", + type="media", + on_duplicate="error", + date="" + ): + return self._add_associated_link( + collection="media", + note=note, + url=url, + text=text, + media_type=media_type, + on_duplicate=on_duplicate, + date=date, + ) + + def add_document( + self, note, url, *, text="", media_type="", on_duplicate="error", date="" + ): + return self._add_associated_link( + collection="documents", + note=note, + url=url, + text=text, + media_type=media_type, + on_duplicate=on_duplicate, + date=date, + ) diff --git a/pupa/scrape/jurisdiction.py b/pupa/scrape/jurisdiction.py index 2f3f37e0..bc76fd61 100644 --- a/pupa/scrape/jurisdiction.py +++ b/pupa/scrape/jurisdiction.py @@ -5,9 +5,9 @@ class Jurisdiction(BaseModel): - """ Base class for a jurisdiction """ + """Base class for a jurisdiction""" - _type = 'jurisdiction' + _type = "jurisdiction" _schema = schema # schema objects @@ -31,23 +31,33 @@ def __init__(self): @property def jurisdiction_id(self): - return '{}/{}'.format(self.division_id.replace('ocd-division', 'ocd-jurisdiction'), - self.classification) + return "{}/{}".format( + self.division_id.replace("ocd-division", "ocd-jurisdiction"), + self.classification, + ) _id = jurisdiction_id def as_dict(self): - return {'_id': self.jurisdiction_id, 'id': self.jurisdiction_id, - 'name': self.name, 'url': self.url, 'division_id': self.division_id, - 'classification': self.classification, - 'legislative_sessions': self.legislative_sessions, - 'feature_flags': self.feature_flags, 'extras': self.extras, } + return { + "_id": self.jurisdiction_id, + "id": self.jurisdiction_id, + "name": self.name, + "url": self.url, + "division_id": self.division_id, + "classification": self.classification, + "legislative_sessions": self.legislative_sessions, + "feature_flags": self.feature_flags, + "extras": self.extras, + } def __str__(self): return self.name def get_organizations(self): - raise NotImplementedError('get_organizations is not implemented') # pragma: no cover + raise NotImplementedError( + "get_organizations is not implemented" + ) # pragma: no cover class JurisdictionScraper(Scraper): @@ -60,8 +70,10 @@ def scrape(self): yield org if self.jurisdiction.parties: - warnings.warn('including parties on Jurisdiction is deprecated, ' - 'use "pupa party" command instead') + warnings.warn( + "including parties on Jurisdiction is deprecated, " + 'use "pupa party" command instead' + ) for party in self.jurisdiction.parties: - org = Organization(classification='party', name=party['name']) + org = Organization(classification="party", name=party["name"]) yield org diff --git a/pupa/scrape/popolo.py b/pupa/scrape/popolo.py index 57690e62..cee7e5e6 100644 --- a/pupa/scrape/popolo.py +++ b/pupa/scrape/popolo.py @@ -1,6 +1,12 @@ import copy -from .base import (BaseModel, SourceMixin, LinkMixin, ContactDetailMixin, OtherNameMixin, - IdentifierMixin) +from .base import ( + BaseModel, + SourceMixin, + LinkMixin, + ContactDetailMixin, + OtherNameMixin, + IdentifierMixin, +) from .schemas.post import schema as post_schema from .schemas.person import schema as person_schema from .schemas.membership import schema as membership_schema @@ -10,7 +16,7 @@ # a copy of the org schema without sources org_schema_no_sources = copy.deepcopy(org_schema) -org_schema_no_sources['properties'].pop('sources') +org_schema_no_sources["properties"].pop("sources") class Post(BaseModel, LinkMixin, ContactDetailMixin): @@ -18,12 +24,21 @@ class Post(BaseModel, LinkMixin, ContactDetailMixin): A popolo-style Post """ - _type = 'post' + _type = "post" _schema = post_schema - def __init__(self, *, label, role, organization_id=None, chamber=None, - division_id=None, start_date='', end_date='', - maximum_memberships=1): + def __init__( + self, + *, + label, + role, + organization_id=None, + chamber=None, + division_id=None, + start_date="", + end_date="", + maximum_memberships=1 + ): super(Post, self).__init__() self.label = label self.role = role @@ -42,13 +57,22 @@ class Membership(BaseModel, ContactDetailMixin, LinkMixin): A popolo-style Membership. """ - _type = 'membership' + _type = "membership" _schema = membership_schema - def __init__(self, *, person_id, organization_id, post_id=None, role='', label='', - start_date='', end_date='', on_behalf_of_id=None, - person_name='' - ): + def __init__( + self, + *, + person_id, + organization_id, + post_id=None, + role="", + label="", + start_date="", + end_date="", + on_behalf_of_id=None, + person_name="" + ): """ Constructor for the Membership object. @@ -68,23 +92,44 @@ def __init__(self, *, person_id, organization_id, post_id=None, role='', label=' self.on_behalf_of_id = on_behalf_of_id def __str__(self): - return self.person_id + ' membership in ' + self.organization_id + return self.person_id + " membership in " + self.organization_id -class Person(BaseModel, SourceMixin, ContactDetailMixin, LinkMixin, IdentifierMixin, - OtherNameMixin): +class Person( + BaseModel, + SourceMixin, + ContactDetailMixin, + LinkMixin, + IdentifierMixin, + OtherNameMixin, +): """ Details for a Person in Popolo format. """ - _type = 'person' + _type = "person" _schema = person_schema - def __init__(self, name, *, birth_date='', death_date='', biography='', summary='', image='', - gender='', national_identity='', - # specialty fields - district=None, party=None, primary_org='', role='', - start_date='', end_date='', primary_org_name=None): + def __init__( + self, + name, + *, + birth_date="", + death_date="", + biography="", + summary="", + image="", + gender="", + national_identity="", + # specialty fields + district=None, + party=None, + primary_org="", + role="", + start_date="", + end_date="", + primary_org_name=None + ): super(Person, self).__init__() self.name = name self.birth_date = birth_date @@ -95,27 +140,38 @@ def __init__(self, name, *, birth_date='', death_date='', biography='', summary= self.gender = gender self.national_identity = national_identity if primary_org: - self.add_term(role, primary_org, district=district, - start_date=start_date, end_date=end_date, - org_name=primary_org_name) + self.add_term( + role, + primary_org, + district=district, + start_date=start_date, + end_date=end_date, + org_name=primary_org_name, + ) if party: self.add_party(party) - def add_membership(self, name_or_org, role='member', **kwargs): + def add_membership(self, name_or_org, role="member", **kwargs): """ - add a membership in an organization and return the membership - object in case there are more details to add + add a membership in an organization and return the membership + object in case there are more details to add """ if isinstance(name_or_org, Organization): - membership = Membership(person_id=self._id, - person_name=self.name, - organization_id=name_or_org._id, - role=role, **kwargs) + membership = Membership( + person_id=self._id, + person_name=self.name, + organization_id=name_or_org._id, + role=role, + **kwargs + ) else: - membership = Membership(person_id=self._id, - person_name=self.name, - organization_id=_make_pseudo_id(name=name_or_org), - role=role, **kwargs) + membership = Membership( + person_id=self._id, + person_name=self.name, + organization_id=_make_pseudo_id(name=name_or_org), + role=role, + **kwargs + ) self._related.append(membership) return membership @@ -124,34 +180,55 @@ def add_party(self, party, **kwargs): person_id=self._id, person_name=self.name, organization_id=_make_pseudo_id(classification="party", name=party), - role='member', **kwargs) + role="member", + **kwargs + ) self._related.append(membership) - def add_term(self, role, org_classification, *, district=None, - start_date='', end_date='', label='', org_name=None, - appointment=False): + def add_term( + self, + role, + org_classification, + *, + district=None, + start_date="", + end_date="", + label="", + org_name=None, + appointment=False + ): if org_name: - org_id = _make_pseudo_id(classification=org_classification, - name=org_name) + org_id = _make_pseudo_id(classification=org_classification, name=org_name) else: org_id = _make_pseudo_id(classification=org_classification) if district: if role: - post_id = _make_pseudo_id(label=district, - role=role, - organization__classification=org_classification) + post_id = _make_pseudo_id( + label=district, + role=role, + organization__classification=org_classification, + ) else: - post_id = _make_pseudo_id(label=district, - organization__classification=org_classification) + post_id = _make_pseudo_id( + label=district, organization__classification=org_classification + ) elif appointment: - post_id = _make_pseudo_id(role=role, - organization__classification=org_classification) + post_id = _make_pseudo_id( + role=role, organization__classification=org_classification + ) else: post_id = None - membership = Membership(person_id=self._id, person_name=self.name, - organization_id=org_id, post_id=post_id, - role=role, start_date=start_date, end_date=end_date, label=label) + membership = Membership( + person_id=self._id, + person_name=self.name, + organization_id=org_id, + post_id=post_id, + role=role, + start_date=start_date, + end_date=end_date, + label=label, + ) self._related.append(membership) return membership @@ -159,18 +236,32 @@ def __str__(self): return self.name -class Organization(BaseModel, SourceMixin, ContactDetailMixin, LinkMixin, IdentifierMixin, - OtherNameMixin): +class Organization( + BaseModel, + SourceMixin, + ContactDetailMixin, + LinkMixin, + IdentifierMixin, + OtherNameMixin, +): """ A single popolo-style Organization """ - _type = 'organization' + _type = "organization" _schema = org_schema - def __init__(self, name, *, classification='', parent_id=None, - founding_date='', dissolution_date='', image='', - chamber=None): + def __init__( + self, + name, + *, + classification="", + parent_id=None, + founding_date="", + dissolution_date="", + image="", + chamber=None + ): """ Constructor for the Organization object. """ @@ -188,7 +279,13 @@ def __str__(self): def validate(self): schema = None # these are implicitly declared & do not require sources - if self.classification in ('party', 'legislature', 'upper', 'lower', 'executive'): + if self.classification in ( + "party", + "legislature", + "upper", + "lower", + "executive", + ): schema = org_schema_no_sources return super(Organization, self).validate(schema=schema) @@ -197,24 +294,31 @@ def add_post(self, label, role, **kwargs): self._related.append(post) return post - def add_member(self, name_or_person, role='member', **kwargs): + def add_member(self, name_or_person, role="member", **kwargs): if isinstance(name_or_person, Person): - membership = Membership(person_id=name_or_person._id, - person_name=name_or_person.name, - organization_id=self._id, - role=role, **kwargs) + membership = Membership( + person_id=name_or_person._id, + person_name=name_or_person.name, + organization_id=self._id, + role=role, + **kwargs + ) else: - membership = Membership(person_id=_make_pseudo_id(name=name_or_person), - person_name=name_or_person, - organization_id=self._id, role=role, **kwargs) + membership = Membership( + person_id=_make_pseudo_id(name=name_or_person), + person_name=name_or_person, + organization_id=self._id, + role=role, + **kwargs + ) self._related.append(membership) return membership def pseudo_organization(organization, classification, default=None): - """ helper for setting an appropriate ID for organizations """ + """helper for setting an appropriate ID for organizations""" if organization and classification: - raise ScrapeValueError('cannot specify both classification and organization') + raise ScrapeValueError("cannot specify both classification and organization") elif classification: return _make_pseudo_id(classification=classification) elif organization: diff --git a/pupa/scrape/schemas/bill.py b/pupa/scrape/schemas/bill.py index 58b8fe85..16662e40 100644 --- a/pupa/scrape/schemas/bill.py +++ b/pupa/scrape/schemas/bill.py @@ -16,12 +16,12 @@ "media_type": {"type": "string"}, "url": {"type": "string", "format": "uri"}, }, - "type": "object" + "type": "object", }, "type": "array", }, }, - "type": "object" + "type": "object", }, "type": "array", } @@ -33,8 +33,10 @@ "identifier": {"type": "string", "minLength": 1}, "title": {"type": "string", "minLength": 1}, "from_organization": {"type": ["string", "null"]}, - "classification": {"items": {"type": "string", "enum": common.BILL_CLASSIFICATIONS}, - "type": "array"}, + "classification": { + "items": {"type": "string", "enum": common.BILL_CLASSIFICATIONS}, + "type": "array", + }, "subject": {"items": {"type": "string", "minLength": 1}, "type": "array"}, "abstracts": { "items": { @@ -43,7 +45,8 @@ "note": {"type": "string"}, "date": {"type": "string"}, }, - "type": "object"}, + "type": "object", + }, "type": "array", }, "other_titles": { @@ -52,7 +55,7 @@ "title": {"type": "string", "minLength": 1}, "note": {"type": "string"}, }, - "type": "object" + "type": "object", }, "type": "array", }, @@ -63,7 +66,7 @@ "note": {"type": "string"}, "scheme": {"type": "string"}, }, - "type": "object" + "type": "object", }, "type": "array", }, @@ -73,10 +76,13 @@ "organization": {"type": ["string", "null"]}, "date": fuzzy_datetime, "description": {"type": "string", "minLength": 1}, - "classification": {"items": {"type": "string", - "enum": common.BILL_ACTION_CLASSIFICATIONS}, - "type": "array", - }, + "classification": { + "items": { + "type": "string", + "enum": common.BILL_ACTION_CLASSIFICATIONS, + }, + "type": "array", + }, "related_entities": { "items": { "properties": { @@ -88,16 +94,15 @@ "person_id": {"type": ["string", "null"]}, "organization_id": {"type": ["string", "null"]}, }, - "type": "object" + "type": "object", }, "type": "array", }, }, - "type": "object" + "type": "object", }, "type": "array", }, - "sponsorships": { "items": { "properties": { @@ -111,26 +116,27 @@ "person_id": {"type": ["string", "null"]}, "organization_id": {"type": ["string", "null"]}, }, - "type": "object" + "type": "object", }, "type": "array", }, - "related_bills": { "items": { "properties": { "identifier": {"type": "string", "minLength": 1}, "legislative_session": {"type": "string", "minLength": 1}, - "relation_type": {"enum": common.BILL_RELATION_TYPES, "type": "string"}, + "relation_type": { + "enum": common.BILL_RELATION_TYPES, + "type": "string", + }, }, - "type": "object" + "type": "object", }, "type": "array", }, - "versions": versions_or_documents, "documents": versions_or_documents, "sources": sources, "extras": extras, - } + }, } diff --git a/pupa/scrape/schemas/common.py b/pupa/scrape/schemas/common.py index d7fa7ba4..69c67f0f 100644 --- a/pupa/scrape/schemas/common.py +++ b/pupa/scrape/schemas/common.py @@ -9,8 +9,8 @@ "value": {"type": "string", "minLength": 1}, "note": {"type": "string"}, "label": {"type": "string"}, - } - } + }, + }, } identifiers = { @@ -23,16 +23,19 @@ "type": "array", } -fuzzy_date_string = {"type": "string", - "pattern": "^[0-9]{4}(-[0-9]{2}){0,2}$"} -fuzzy_date_string_blank = {"type": "string", - "pattern": "^([0-9]{4})?(-[0-9]{2}){0,2}$", - } -fuzzy_datetime_string_blank = {"type": "string", - "pattern": ("^([0-9]{4}((-[0-9]{2}){0,2}|(-[0-9]{2}){2}T" - "[0-9]{2}(:[0-9]{2}){0,2}" - "(Z|[+-][0-9]{2}(:[0-9]{2})?))?)?$"), - } +fuzzy_date_string = {"type": "string", "pattern": "^[0-9]{4}(-[0-9]{2}){0,2}$"} +fuzzy_date_string_blank = { + "type": "string", + "pattern": "^([0-9]{4})?(-[0-9]{2}){0,2}$", +} +fuzzy_datetime_string_blank = { + "type": "string", + "pattern": ( + "^([0-9]{4}((-[0-9]{2}){0,2}|(-[0-9]{2}){2}T" + "[0-9]{2}(:[0-9]{2}){0,2}" + "(Z|[+-][0-9]{2}(:[0-9]{2})?))?)?$" + ), +} fuzzy_date = {"type": [fuzzy_date_string, "date"]} fuzzy_date_blank = {"type": [fuzzy_date_string_blank, "date"]} fuzzy_datetime = {"type": [fuzzy_datetime_string_blank, "datetime"]} @@ -44,11 +47,11 @@ "name": {"type": "string", "minLength": 1}, "start_date": fuzzy_date_blank, "end_date": fuzzy_date_blank, - "note": {"type": "string"} - }, - "type": "object" + "note": {"type": "string"}, + }, + "type": "object", }, - "type": "array" + "type": "array", } @@ -56,11 +59,11 @@ "items": { "properties": { "note": {"type": "string"}, - "url": {"format": "uri", "type": "string"} + "url": {"format": "uri", "type": "string"}, }, - "type": "object" + "type": "object", }, - "type": "array" + "type": "array", } @@ -70,10 +73,10 @@ "url": {"type": "string", "format": "uri"}, "note": {"type": "string"}, }, - "type": "object" + "type": "object", }, "minItems": 1, - "type": "array" + "type": "array", } extras = { diff --git a/pupa/scrape/schemas/event.py b/pupa/scrape/schemas/event.py index 4bc22630..ff59cdce 100644 --- a/pupa/scrape/schemas/event.py +++ b/pupa/scrape/schemas/event.py @@ -2,7 +2,13 @@ Schema for event objects. """ -from .common import sources, extras, fuzzy_date_blank, fuzzy_datetime, fuzzy_datetime_blank +from .common import ( + sources, + extras, + fuzzy_date_blank, + fuzzy_datetime, + fuzzy_datetime_blank, +) media_schema = { "items": { @@ -17,44 +23,39 @@ "media_type": {"type": "string"}, "url": {"type": "string", "format": "uri"}, }, - "type": "object" + "type": "object", }, - "type": "array" + "type": "array", }, }, - "type": "object" + "type": "object", }, - "type": "array" + "type": "array", } schema = { "properties": { "name": {"type": "string", "minLength": 1}, "all_day": {"type": "boolean"}, - 'start_date': fuzzy_datetime, - 'end_date': fuzzy_datetime_blank, + "start_date": fuzzy_datetime, + "end_date": fuzzy_datetime_blank, "status": { "type": "string", "enum": ["cancelled", "tentative", "confirmed", "passed"], }, - "classification": {"type": "string", "minLength": 1}, # TODO: enum + "classification": {"type": "string", "minLength": 1}, # TODO: enum "description": {"type": "string"}, - "location": { "type": ["object", "null"], "properties": { - "name": {"type": "string", "minLength": 1}, - "note": { "type": "string", }, - "url": { "type": ["string", "null"], "format": "uri", }, - "coordinates": { "type": ["object", "null"], "properties": { @@ -65,14 +66,12 @@ "longitude": { "type": "string", "minLength": 1, - } - } + }, + }, }, }, }, - "media": media_schema, - "documents": { "items": { "properties": { @@ -81,73 +80,58 @@ "media_type": {"type": "string", "minLength": 1}, "date": fuzzy_date_blank, }, - "type": "object" + "type": "object", }, - "type": "array" + "type": "array", }, - "links": { "items": { "properties": { "note": { "type": "string", }, - "url": { - "format": "uri", - "type": "string" - } + "url": {"format": "uri", "type": "string"}, }, - "type": "object" + "type": "object", }, - "type": "array" + "type": "array", }, - "participants": { "items": { "properties": { - "name": { "type": "string", "minLength": 1, }, - "type": { "enum": ["organization", "person"], "type": "string", }, - "note": { "type": "string", "minLength": 1, }, - }, - "type": "object" + "type": "object", }, - "type": "array" + "type": "array", }, - "agenda": { "items": { "properties": { "description": {"type": "string", "minLength": 1}, - "classification": { "items": {"type": "string", "minLength": 1}, "type": "array", }, - "order": { "type": ["string", "null"], }, - "subjects": { "items": {"type": "string", "minLength": 1}, - "type": "array" + "type": "array", }, - "media": media_schema, - "notes": { "items": { "type": "string", @@ -155,7 +139,6 @@ }, "type": "array", }, - "related_entities": { "items": { "properties": { @@ -163,14 +146,15 @@ "type": "string", "minLength": 1, }, - "name": { "type": "string", "minLength": 1, }, - "note": { - "type": ["string", "null", ], + "type": [ + "string", + "null", + ], "minLength": 1, }, }, @@ -180,16 +164,17 @@ "type": "array", }, }, - "type": "object" + "type": "object", }, "minItems": 0, - "type": "array" + "type": "array", }, "sources": sources, "extras": extras, - 'pupa_id': {"type": ["string", "null"], - "minLength": 1, - }, + "pupa_id": { + "type": ["string", "null"], + "minLength": 1, + }, }, - "type": "object" + "type": "object", } diff --git a/pupa/scrape/schemas/jurisdiction.py b/pupa/scrape/schemas/jurisdiction.py index 9ff401ea..485ca076 100644 --- a/pupa/scrape/schemas/jurisdiction.py +++ b/pupa/scrape/schemas/jurisdiction.py @@ -5,17 +5,21 @@ "properties": { "name": {"type": "string", "minLength": 1}, "url": {"type": "string", "minLength": 1}, - "classification": {"type": "string", "minLength": 1}, # TODO: enum + "classification": {"type": "string", "minLength": 1}, # TODO: enum "division_id": {"type": "string", "minLength": 1}, "legislative_sessions": { - "type": "array", "items": {"type": "object", "properties": { - "name": {"type": "string", "minLength": 1}, - "type": {"type": "string", "enum": ["primary", "special"]}, - "start_date": fuzzy_date_blank, - "end_date": fuzzy_date_blank, - }}, + "type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string", "minLength": 1}, + "type": {"type": "string", "enum": ["primary", "special"]}, + "start_date": fuzzy_date_blank, + "end_date": fuzzy_date_blank, + }, + }, }, "feature_flags": {"type": "array", "items": {"type": "string", "minLength": 1}}, "extras": extras, - } + }, } diff --git a/pupa/scrape/schemas/membership.py b/pupa/scrape/schemas/membership.py index 0e9fc889..b493c9ac 100644 --- a/pupa/scrape/schemas/membership.py +++ b/pupa/scrape/schemas/membership.py @@ -14,10 +14,9 @@ "contact_details": contact_details, "links": links, "extras": extras, - # division & jurisdiction are additions to popolo "division_id": {"type": ["string", "null"]}, "jurisdiction_id": {"type": "string", "minLength": 1}, }, - "type": "object" + "type": "object", } diff --git a/pupa/scrape/schemas/organization.py b/pupa/scrape/schemas/organization.py index 0c592a6f..bb32fd69 100644 --- a/pupa/scrape/schemas/organization.py +++ b/pupa/scrape/schemas/organization.py @@ -1,5 +1,12 @@ -from .common import (links, contact_details, identifiers, other_names, sources, extras, - fuzzy_date_blank) +from .common import ( + links, + contact_details, + identifiers, + other_names, + sources, + extras, + fuzzy_date_blank, +) from opencivicdata import common schema = { @@ -11,15 +18,15 @@ "type": ["string", "null"], "enum": common.ORGANIZATION_CLASSIFICATIONS, }, - "parent_id": {"type": ["string", "null"], - }, + "parent_id": { + "type": ["string", "null"], + }, "founding_date": fuzzy_date_blank, "dissolution_date": fuzzy_date_blank, "image": {"type": "string", "format": "uri-blank"}, "contact_details": contact_details, "links": links, "sources": sources, - # added to popolo "jurisdiction_id": {"type": "string", "minLength": 1}, "division_id": {"type": ["string", "null"], "minLength": 1}, diff --git a/pupa/scrape/schemas/person.py b/pupa/scrape/schemas/person.py index 0ae41401..c3eec22d 100644 --- a/pupa/scrape/schemas/person.py +++ b/pupa/scrape/schemas/person.py @@ -1,5 +1,12 @@ -from .common import (links, contact_details, identifiers, other_names, sources, extras, - fuzzy_date_blank) +from .common import ( + links, + contact_details, + identifiers, + other_names, + sources, + extras, + fuzzy_date_blank, +) schema = { "properties": { @@ -21,5 +28,5 @@ "sources": sources, "extras": extras, }, - "type": "object" + "type": "object", } diff --git a/pupa/scrape/schemas/post.py b/pupa/scrape/schemas/post.py index 8e73bbe6..55939435 100644 --- a/pupa/scrape/schemas/post.py +++ b/pupa/scrape/schemas/post.py @@ -13,5 +13,5 @@ "links": links, "extras": extras, }, - "type": "object" + "type": "object", } diff --git a/pupa/scrape/schemas/vote_event.py b/pupa/scrape/schemas/vote_event.py index b9939f89..8715feac 100644 --- a/pupa/scrape/schemas/vote_event.py +++ b/pupa/scrape/schemas/vote_event.py @@ -5,18 +5,20 @@ schema = { "type": "object", "properties": { - 'identifier': {"type": "string"}, - 'motion_text': {"type": "string", "minLength": 1}, - 'motion_classification': {"items": {"type": "string", "minLength": 1}, - "type": "array"}, - 'start_date': fuzzy_datetime_blank, - 'end_date': fuzzy_datetime_blank, - 'result': {"type": "string", "enum": common.VOTE_RESULTS}, - 'organization': {"type": ["string", "null"], "minLength": 1}, - 'legislative_session': {"type": "string", "minLength": 1}, - 'bill': {"type": ["string", "null"], "minLength": 1}, - 'bill_action': {"type": ["string", "null"], "minLength": 1}, - 'votes': { + "identifier": {"type": "string"}, + "motion_text": {"type": "string", "minLength": 1}, + "motion_classification": { + "items": {"type": "string", "minLength": 1}, + "type": "array", + }, + "start_date": fuzzy_datetime_blank, + "end_date": fuzzy_datetime_blank, + "result": {"type": "string", "enum": common.VOTE_RESULTS}, + "organization": {"type": ["string", "null"], "minLength": 1}, + "legislative_session": {"type": "string", "minLength": 1}, + "bill": {"type": ["string", "null"], "minLength": 1}, + "bill_action": {"type": ["string", "null"], "minLength": 1}, + "votes": { "items": { "type": "object", "properties": { @@ -27,18 +29,17 @@ }, }, }, - 'counts': { + "counts": { "items": { "properties": { "option": {"type": "string", "enum": common.VOTE_OPTIONS}, "value": {"type": "integer", "minimum": 0}, }, - "type": "object" + "type": "object", }, }, - - 'sources': sources, - 'extras': extras, - 'pupa_id': {"type": ["string", "null"], "minLength": 1}, - } + "sources": sources, + "extras": extras, + "pupa_id": {"type": ["string", "null"], "minLength": 1}, + }, } diff --git a/pupa/scrape/vote_event.py b/pupa/scrape/vote_event.py index 6776496c..1069fca6 100644 --- a/pupa/scrape/vote_event.py +++ b/pupa/scrape/vote_event.py @@ -8,14 +8,24 @@ class VoteEvent(BaseModel, SourceMixin): - _type = 'vote_event' + _type = "vote_event" _schema = schema - def __init__(self, *, motion_text, start_date, classification, result, - legislative_session=None, identifier='', - bill=None, bill_chamber=None, bill_action=None, - organization=None, chamber=None - ): + def __init__( + self, + *, + motion_text, + start_date, + classification, + result, + legislative_session=None, + identifier="", + bill=None, + bill_chamber=None, + bill_action=None, + organization=None, + chamber=None + ): super(VoteEvent, self).__init__() self.legislative_session = legislative_session @@ -32,84 +42,95 @@ def __init__(self, *, motion_text, start_date, classification, result, self.legislative_session = bill.legislative_session if not self.legislative_session: - raise ScrapeValueError('must set legislative_session or bill') + raise ScrapeValueError("must set legislative_session or bill") - self.organization = pseudo_organization(organization, chamber, 'legislature') + self.organization = pseudo_organization(organization, chamber, "legislature") self.votes = [] self.counts = [] def __str__(self): - return '{0} - {1} - {2}'.format(self.legislative_session, self.start_date, - self.motion_text) + return "{0} - {1} - {2}".format( + self.legislative_session, self.start_date, self.motion_text + ) def set_bill(self, bill_or_identifier, *, chamber=None): if not bill_or_identifier: self.bill = None elif isinstance(bill_or_identifier, Bill): if chamber: - raise ScrapeValueError("set_bill takes no arguments when using a `Bill` object") + raise ScrapeValueError( + "set_bill takes no arguments when using a `Bill` object" + ) self.bill = bill_or_identifier._id else: if chamber is None: - chamber = 'legislature' - kwargs = {'identifier': bill_or_identifier, - 'from_organization__classification': chamber, - 'legislative_session__identifier': self.legislative_session - } + chamber = "legislature" + kwargs = { + "identifier": bill_or_identifier, + "from_organization__classification": chamber, + "legislative_session__identifier": self.legislative_session, + } self.bill = _make_pseudo_id(**kwargs) - def vote(self, option, voter, *, note=''): - self.votes.append({"option": option, "voter_name": voter, - "voter_id": _make_pseudo_id(name=voter), 'note': note}) + def vote(self, option, voter, *, note=""): + self.votes.append( + { + "option": option, + "voter_name": voter, + "voter_id": _make_pseudo_id(name=voter), + "note": note, + } + ) - def yes(self, name, *, id=None, note=''): - return self.vote('yes', name, note=note) + def yes(self, name, *, id=None, note=""): + return self.vote("yes", name, note=note) - def no(self, name, *, id=None, note=''): - return self.vote('no', name, note=note) + def no(self, name, *, id=None, note=""): + return self.vote("no", name, note=note) def set_count(self, option, value): for co in self.counts: - if co['option'] == option: - co['value'] = value + if co["option"] == option: + co["value"] = value break else: - self.counts.append({'option': option, 'value': value}) + self.counts.append({"option": option, "value": value}) class OrderVoteEvent: - """ A functor for applying order to voteEvents. - A single OrderVoteEvent instance should be used for all bills in a scrape. - The vote events of each bill must be processed in chronological order, - but the processing of bills may be interleaved (needed in e.g. NH). - Currently, it only fudges midnight dates (start_date and end_date) - by adding the event sequence number in seconds - to the start_date and end_date (if they are well-formed string dates) - In the future, when there is an 'order' field on voteEvents, - it should fill that as well. - This fails softly and silently; - if a valid string date is not found in start_date or end_date, the date is not touched. - This assumes that times are reported as local time, not UTC. - A UTC time that is local midnight will not be touched. - Sometimes one chamber reports the time of a vote, - but the other chamber reports only the date. This is handled. - See the unit tests for examples and more behavior. + """A functor for applying order to voteEvents. + A single OrderVoteEvent instance should be used for all bills in a scrape. + The vote events of each bill must be processed in chronological order, + but the processing of bills may be interleaved (needed in e.g. NH). + Currently, it only fudges midnight dates (start_date and end_date) + by adding the event sequence number in seconds + to the start_date and end_date (if they are well-formed string dates) + In the future, when there is an 'order' field on voteEvents, + it should fill that as well. + This fails softly and silently; + if a valid string date is not found in start_date or end_date, + the date is not touched. + This assumes that times are reported as local time, not UTC. + A UTC time that is local midnight will not be touched. + Sometimes one chamber reports the time of a vote, + but the other chamber reports only the date. This is handled. + See the unit tests for examples and more behavior. """ - _midnight = r'\d\d\d\d-\d\d-\d\dT00:00:00.*' - _timeless = r'\d\d\d\d-\d\d-\d\d' + + _midnight = r"\d\d\d\d-\d\d-\d\dT00:00:00.*" + _timeless = r"\d\d\d\d-\d\d-\d\d" class OrderBillVoteEvent: - """ Order VoteEvents for a single bill - """ + """Order VoteEvents for a single bill""" def __init__(self): - self.order = 0 # voteEvent sequence number. 1st voteEvent is 1. + self.order = 0 # voteEvent sequence number. 1st voteEvent is 1. def __call__(self, voteEvent): self.order += 1 voteEvent.start_date = self._adjust_date(voteEvent.start_date) - if hasattr(voteEvent, 'end_date'): + if hasattr(voteEvent, "end_date"): voteEvent.end_date = self._adjust_date(voteEvent.end_date) def _adjust_date(self, date): @@ -118,19 +139,19 @@ def _adjust_date(self, date): return date if re.fullmatch(OrderVoteEvent._timeless, date): - d2 = date + 'T00:00:00' + d2 = date + "T00:00:00" elif re.fullmatch(OrderVoteEvent._midnight, date): d2 = date else: return date - assert self.order <= 60*60 - mins = '{:02d}'.format(self.order // 60) - secs = '{:02d}'.format(self.order % 60) + assert self.order <= 60 * 60 + mins = "{:02d}".format(self.order // 60) + secs = "{:02d}".format(self.order % 60) # yyyy-mm-ddThh:mm:dd+05:00 # 0123456789012345678 - return d2[:14] + mins + ':' + secs + d2[19:] + return d2[:14] + mins + ":" + secs + d2[19:] def __init__(self): self.orderers = {} @@ -139,7 +160,8 @@ def __call__(self, session_id, bill_id, voteEvent): """ Record order of voteEvent within bill. - The "order" field is not yet implemented; this fudges voteEvent start_date and end_date. + The "order" field is not yet implemented; this fudges voteEvent + start_date and end_date. See OrderVoteEvent docstring for details. :param session_id: session id diff --git a/pupa/settings.py b/pupa/settings.py index 26483bbc..7e8ceccb 100644 --- a/pupa/settings.py +++ b/pupa/settings.py @@ -4,12 +4,19 @@ import dj_database_url -DATABASE_URL = os.environ.get('DATABASE_URL', 'postgis://pupa:pupa@localhost/opencivicdata') -SECRET_KEY = 'non-secret' -INSTALLED_APPS = ('django.contrib.contenttypes', - 'opencivicdata.core.apps.BaseConfig', - 'opencivicdata.legislative.apps.BaseConfig', - 'pupa') +DATABASE_URL = os.environ.get( + "DATABASE_URL", "postgis://pupa:pupa@localhost/opencivicdata" +) +SECRET_KEY = "non-secret" +INSTALLED_APPS = ( + "django.contrib.contenttypes", + "opencivicdata.core.apps.BaseConfig", + "opencivicdata.legislative.apps.BaseConfig", + "pupa", +) + +ALLOWED_HOSTS = ["localhost"] +SILENCED_SYSTEM_CHECKS = ["fields.E904"] # scrape settings @@ -19,8 +26,8 @@ SCRAPELIB_RETRY_WAIT_SECONDS = 10 SCRAPELIB_VERIFY = True -CACHE_DIR = os.path.join(os.getcwd(), '_cache') -SCRAPED_DATA_DIR = os.path.join(os.getcwd(), '_data') +CACHE_DIR = os.path.join(os.getcwd(), "_cache") +SCRAPED_DATA_DIR = os.path.join(os.getcwd(), "_data") # import settings @@ -29,9 +36,7 @@ ENABLE_VOTES = True ENABLE_EVENTS = True -IMPORT_TRANSFORMERS = { - 'bill': [] -} +IMPORT_TRANSFORMERS = {"bill": []} # Django settings DEBUG = False @@ -39,43 +44,37 @@ MIDDLEWARE_CLASSES = () LOGGING = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'standard': { - 'format': "%(asctime)s %(levelname)s %(name)s: %(message)s", - 'datefmt': '%H:%M:%S' + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "standard": { + "format": "%(asctime)s %(levelname)s %(name)s: %(message)s", + "datefmt": "%H:%M:%S", } }, - 'handlers': { - 'default': {'level': 'DEBUG', - 'class': 'pupa.ext.ansistrm.ColorizingStreamHandler', - 'formatter': 'standard'}, - }, - 'loggers': { - '': { - 'handlers': ['default'], 'level': 'DEBUG', 'propagate': True - }, - 'scrapelib': { - 'handlers': ['default'], 'level': 'INFO', 'propagate': False - }, - 'requests': { - 'handlers': ['default'], 'level': 'WARN', 'propagate': False - }, - 'boto': { - 'handlers': ['default'], 'level': 'WARN', 'propagate': False + "handlers": { + "default": { + "level": "DEBUG", + "class": "pupa.ext.ansistrm.ColorizingStreamHandler", + "formatter": "standard", }, }, + "loggers": { + "": {"handlers": ["default"], "level": "DEBUG", "propagate": True}, + "scrapelib": {"handlers": ["default"], "level": "INFO", "propagate": False}, + "requests": {"handlers": ["default"], "level": "WARN", "propagate": False}, + "boto": {"handlers": ["default"], "level": "WARN", "propagate": False}, + }, } sys.path.insert(1, os.getcwd()) -loader = importlib.util.find_spec('pupa_settings') +loader = importlib.util.find_spec("pupa_settings") if loader is None: - print('no pupa_settings on path, using defaults') + print("no pupa_settings on path, using defaults") else: - from pupa_settings import * # NOQA + from pupa_settings import * # NOQA -DATABASES = {'default': dj_database_url.parse(DATABASE_URL)} -DATABASES['default']['ENGINE'] = 'django.contrib.gis.db.backends.postgis' +DATABASES = {"default": dj_database_url.parse(DATABASE_URL)} +DATABASES["default"]["ENGINE"] = "django.contrib.gis.db.backends.postgis" diff --git a/pupa/tests/importers/test_base_importer.py b/pupa/tests/importers/test_base_importer.py index a321b7d8..f9b115ad 100644 --- a/pupa/tests/importers/test_base_importer.py +++ b/pupa/tests/importers/test_base_importer.py @@ -13,37 +13,40 @@ def create_jurisdiction(): - Division.objects.create(id='ocd-division/country:us', name='USA') - Jurisdiction.objects.create(id='jid', division_id='ocd-division/country:us') + Division.objects.create(id="ocd-division/country:us", name="USA") + Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us") class FakeImporter(BaseImporter): - _type = 'test' + _type = "test" def test_omnihash_python_types(): # string - assert omnihash('test') == omnihash('test') + assert omnihash("test") == omnihash("test") # list - assert omnihash(['this', 'is', 'a', 'list']) == omnihash(['this', 'is', 'a', 'list']) + assert omnihash(["this", "is", "a", "list"]) == omnihash( + ["this", "is", "a", "list"] + ) # set - assert omnihash({'and', 'a', 'set'}) == omnihash({'set', 'set', 'and', 'a'}) + assert omnihash({"and", "a", "set"}) == omnihash({"set", "set", "and", "a"}) # dict w/ set and tuple as well - assert (omnihash({'a': {('fancy', 'nested'): {'dict'}}}) == - omnihash({'a': {('fancy', 'nested'): {'dict'}}})) + assert omnihash({"a": {("fancy", "nested"): {"dict"}}}) == omnihash( + {"a": {("fancy", "nested"): {"dict"}}} + ) def test_import_directory(): # write out some temp data to filesystem datadir = tempfile.mkdtemp() - dicta = {'test': 'A'} - dictb = {'test': 'B'} - open(os.path.join(datadir, 'test_a.json'), 'w').write(json.dumps(dicta)) - open(os.path.join(datadir, 'test_b.json'), 'w').write(json.dumps(dictb)) + dicta = {"test": "A"} + dictb = {"test": "B"} + open(os.path.join(datadir, "test_a.json"), "w").write(json.dumps(dicta)) + open(os.path.join(datadir, "test_b.json"), "w").write(json.dumps(dictb)) # simply ensure that import directory calls import_data with all dicts - ti = FakeImporter('jurisdiction-id') - with mock.patch.object(ti, attribute='import_data') as mockobj: + ti = FakeImporter("jurisdiction-id") + with mock.patch.object(ti, attribute="import_data") as mockobj: ti.import_directory(datadir) # import_data should be called once @@ -62,42 +65,43 @@ def test_import_directory(): def test_apply_transformers(): transformers = { - 'capitalize': lambda x: x.upper(), - 'cap_and_reverse': [lambda x: x.upper(), lambda y: y[::-1]], - 'never_used': lambda x: 1/0, - 'nested': {'replace': lambda x: 'replaced'}, + "capitalize": lambda x: x.upper(), + "cap_and_reverse": [lambda x: x.upper(), lambda y: y[::-1]], + "never_used": lambda x: 1 / 0, + "nested": {"replace": lambda x: "replaced"}, } data = { - 'capitalize': 'words', - 'cap_and_reverse': 'simple', - 'nested': {'replace': None}, + "capitalize": "words", + "cap_and_reverse": "simple", + "nested": {"replace": None}, } - ti = FakeImporter('jid') + ti = FakeImporter("jid") ti.cached_transformers = transformers output = ti.apply_transformers(data) - assert output['capitalize'] == 'WORDS' - assert output['cap_and_reverse'] == 'ELPMIS' - assert output['nested']['replace'] == 'replaced' + assert output["capitalize"] == "WORDS" + assert output["cap_and_reverse"] == "ELPMIS" + assert output["nested"]["replace"] == "replaced" -# doing these next few tests just on a Person because it is the same code that handles it -# but for completeness maybe it is better to do these on each type? +# doing these next few tests just on a Person because it is the same +# code that handles it but for completeness maybe it is better to do +# these on each type? @pytest.mark.django_db def test_last_seen_updates_on_scrape(): create_jurisdiction() - o = Organization.objects.create(name='WWE', jurisdiction_id='jid') + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") - p = Person.objects.create(name='George Washington', family_name='Washington') + p = Person.objects.create(name="George Washington", family_name="Washington") p.memberships.create(organization=o) expected_updated_at = p.updated_at last_seen_before_scrape = p.last_seen # Simulate no-op scrape - scraped_p = ScrapePerson('George Washington').as_dict() - PersonImporter('jid').import_data([scraped_p]) + scraped_p = ScrapePerson("George Washington").as_dict() + PersonImporter("jid").import_data([scraped_p]) p.refresh_from_db() @@ -113,9 +117,9 @@ def test_last_seen_updates_on_scrape(): @pytest.mark.django_db def test_deduplication_identical_object(): - p1 = ScrapePerson('Dwayne').as_dict() - p2 = ScrapePerson('Dwayne').as_dict() - PersonImporter('jid').import_data([p1, p2]) + p1 = ScrapePerson("Dwayne").as_dict() + p2 = ScrapePerson("Dwayne").as_dict() + PersonImporter("jid").import_data([p1, p2]) assert Person.objects.count() == 1 @@ -124,23 +128,26 @@ def test_deduplication_identical_object(): def test_exception_on_identical_objects_in_import_stream(): create_jurisdiction() # these two objects aren't identical, but refer to the same thing - # at the moment we consider this an error (but there may be a better way to handle this?) - o1 = ScrapeOrganization('X-Men', classification='unknown').as_dict() - o2 = ScrapeOrganization('X-Men', founding_date='1970', classification='unknown').as_dict() + # at the moment we consider this an error (but there may be a better + # way to handle this?) + o1 = ScrapeOrganization("X-Men", classification="unknown").as_dict() + o2 = ScrapeOrganization( + "X-Men", founding_date="1970", classification="unknown" + ).as_dict() with pytest.raises(Exception): - OrganizationImporter('jid').import_data([o1, o2]) + OrganizationImporter("jid").import_data([o1, o2]) @pytest.mark.django_db def test_resolve_json_id(): - p1 = ScrapePerson('Dwayne').as_dict() - p2 = ScrapePerson('Dwayne').as_dict() - pi = PersonImporter('jid') + p1 = ScrapePerson("Dwayne").as_dict() + p2 = ScrapePerson("Dwayne").as_dict() + pi = PersonImporter("jid") # do import and get database id - p1_id = p1['_id'] - p2_id = p2['_id'] + p1_id = p1["_id"] + p2_id = p2["_id"] pi.import_data([p1, p2]) db_id = Person.objects.get().id @@ -152,78 +159,78 @@ def test_resolve_json_id(): assert pi.resolve_json_id(None) is None # no such id with pytest.raises(UnresolvedIdError): - pi.resolve_json_id('this-is-invalid') + pi.resolve_json_id("this-is-invalid") @pytest.mark.django_db def test_invalid_fields(): - p1 = ScrapePerson('Dwayne').as_dict() - p1['newfield'] = "shouldn't happen" + p1 = ScrapePerson("Dwayne").as_dict() + p1["newfield"] = "shouldn't happen" with pytest.raises(DataImportError): - PersonImporter('jid').import_data([p1]) + PersonImporter("jid").import_data([p1]) @pytest.mark.django_db def test_invalid_fields_related_item(): - p1 = ScrapePerson('Dwayne') - p1.add_link('http://example.com') + p1 = ScrapePerson("Dwayne") + p1.add_link("http://example.com") p1 = p1.as_dict() - p1['links'][0]['test'] = 3 + p1["links"][0]["test"] = 3 with pytest.raises(DataImportError): - PersonImporter('jid').import_data([p1]) + PersonImporter("jid").import_data([p1]) @pytest.mark.django_db def test_locked_field(): create_jurisdiction() - org = ScrapeOrganization('SHIELD').as_dict() - oi = OrganizationImporter('jid') + org = ScrapeOrganization("SHIELD").as_dict() + oi = OrganizationImporter("jid") oi.import_data([org]) # set date and lock field o = Organization.objects.get() - o.dissolution_date = '2015' - o.locked_fields = ['dissolution_date'] + o.dissolution_date = "2015" + o.locked_fields = ["dissolution_date"] o.save() # reimport - org = ScrapeOrganization('SHIELD').as_dict() - oi = OrganizationImporter('jid') + org = ScrapeOrganization("SHIELD").as_dict() + oi = OrganizationImporter("jid") oi.import_data([org]) o = Organization.objects.get() - assert o.dissolution_date == '2015' - assert o.locked_fields == ['dissolution_date'] + assert o.dissolution_date == "2015" + assert o.locked_fields == ["dissolution_date"] # do it a third time to check for the locked_fields reversion issue - org = ScrapeOrganization('SHIELD').as_dict() - oi = OrganizationImporter('jid') + org = ScrapeOrganization("SHIELD").as_dict() + oi = OrganizationImporter("jid") oi.import_data([org]) o = Organization.objects.get() - assert o.dissolution_date == '2015' - assert o.locked_fields == ['dissolution_date'] + assert o.dissolution_date == "2015" + assert o.locked_fields == ["dissolution_date"] @pytest.mark.django_db def test_locked_field_subitem(): create_jurisdiction() - org = ScrapeOrganization('SHIELD') - org.add_name('S.H.I.E.L.D.') - oi = OrganizationImporter('jid') + org = ScrapeOrganization("SHIELD") + org.add_name("S.H.I.E.L.D.") + oi = OrganizationImporter("jid") oi.import_data([org.as_dict()]) # lock the field o = Organization.objects.get() - o.locked_fields = ['other_names'] + o.locked_fields = ["other_names"] o.save() # reimport - org = ScrapeOrganization('SHIELD').as_dict() - oi = OrganizationImporter('jid') + org = ScrapeOrganization("SHIELD").as_dict() + oi = OrganizationImporter("jid") oi.import_data([org]) o = Organization.objects.get() - assert o.other_names.get().name == 'S.H.I.E.L.D.' + assert o.other_names.get().name == "S.H.I.E.L.D." diff --git a/pupa/tests/importers/test_bill_importer.py b/pupa/tests/importers/test_bill_importer.py index 680e44a9..809ab516 100644 --- a/pupa/tests/importers/test_bill_importer.py +++ b/pupa/tests/importers/test_bill_importer.py @@ -4,98 +4,133 @@ from pupa.scrape import Person as ScrapePerson from pupa.scrape import Organization as ScrapeOrganization from pupa.importers import BillImporter, OrganizationImporter, PersonImporter -from opencivicdata.core.models import Jurisdiction, Person, Organization, Membership, Division +from opencivicdata.core.models import ( + Jurisdiction, + Person, + Organization, + Membership, + Division, +) from opencivicdata.legislative.models import Bill def create_jurisdiction(): - Division.objects.create(id='ocd-division/country:us', name='USA') - j = Jurisdiction.objects.create(id='jid', division_id='ocd-division/country:us') - j.legislative_sessions.create(identifier='1899', name='1899') - j.legislative_sessions.create(identifier='1900', name='1900') + Division.objects.create(id="ocd-division/country:us", name="USA") + j = Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us") + j.legislative_sessions.create(identifier="1899", name="1899") + j.legislative_sessions.create(identifier="1900", name="1900") def create_org(): - return Organization.objects.create(id='org-id', name='House', classification='lower', - jurisdiction_id='jid') + return Organization.objects.create( + id="org-id", name="House", classification="lower", jurisdiction_id="jid" + ) @pytest.mark.django_db def test_full_bill(): create_jurisdiction() - sp = ScrapePerson('Adam Smith') - org = ScrapeOrganization(name='House', classification='lower') - com = ScrapeOrganization(name='Arbitrary Committee', classification='committee', - parent_id=org._id) - - oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act', - classification='tax bill', from_organization=org._id) - - bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', - classification='tax bill', from_organization=org._id) - bill.subject = ['taxes', 'axes'] - bill.add_identifier('SB 9') - bill.add_title('Tack & Axe Tax Act') - bill.add_action('introduced in house', '1900-04-01', chamber='lower') - act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower') - act.add_related_entity('arbitrary committee', 'organization', com._id) - bill.add_related_bill("HB 99", legislative_session="1899", relation_type="prior-session") - bill.add_sponsorship('Adam Smith', classification='extra sponsor', entity_type='person', - primary=False, entity_id=sp._id) - bill.add_sponsorship('Jane Smith', classification='lead sponsor', entity_type='person', - primary=True) - bill.add_abstract('This is an act about axes and taxes and tacks.', note="official", - date='1969-10-20') - bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf', - media_type='application/pdf') - bill.add_document_link('Fiscal Note', 'http://example.com/fn.html', media_type='text/html') - bill.add_version_link('Fiscal Note', 'http://example.com/v/1', media_type='text/html') - bill.add_source('http://example.com/source') + sp = ScrapePerson("Adam Smith") + org = ScrapeOrganization(name="House", classification="lower") + com = ScrapeOrganization( + name="Arbitrary Committee", classification="committee", parent_id=org._id + ) + + oldbill = ScrapeBill( + "HB 99", + "1899", + "Axe & Tack Tax Act", + classification="tax bill", + from_organization=org._id, + ) + + bill = ScrapeBill( + "HB 1", + "1900", + "Axe & Tack Tax Act", + classification="tax bill", + from_organization=org._id, + ) + bill.subject = ["taxes", "axes"] + bill.add_identifier("SB 9") + bill.add_title("Tack & Axe Tax Act") + bill.add_action("introduced in house", "1900-04-01", chamber="lower") + act = bill.add_action("sent to arbitrary committee", "1900-04-04", chamber="lower") + act.add_related_entity("arbitrary committee", "organization", com._id) + bill.add_related_bill( + "HB 99", legislative_session="1899", relation_type="prior-session" + ) + bill.add_sponsorship( + "Adam Smith", + classification="extra sponsor", + entity_type="person", + primary=False, + entity_id=sp._id, + ) + bill.add_sponsorship( + "Jane Smith", classification="lead sponsor", entity_type="person", primary=True + ) + bill.add_abstract( + "This is an act about axes and taxes and tacks.", + note="official", + date="1969-10-20", + ) + bill.add_document_link( + "Fiscal Note", "http://example.com/fn.pdf", media_type="application/pdf" + ) + bill.add_document_link( + "Fiscal Note", "http://example.com/fn.html", media_type="text/html" + ) + bill.add_version_link( + "Fiscal Note", "http://example.com/v/1", media_type="text/html" + ) + bill.add_source("http://example.com/source") # import bill - oi = OrganizationImporter('jid') + oi = OrganizationImporter("jid") oi.import_data([org.as_dict(), com.as_dict()]) - pi = PersonImporter('jid') + pi = PersonImporter("jid") pi.import_data([sp.as_dict()]) - BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()]) + BillImporter("jid", oi, pi).import_data([oldbill.as_dict(), bill.as_dict()]) # get bill from db and assert it imported correctly - b = Bill.objects.get(identifier='HB 1') - assert b.from_organization.classification == 'lower' + b = Bill.objects.get(identifier="HB 1") + assert b.from_organization.classification == "lower" assert b.identifier == bill.identifier assert b.title == bill.title assert b.classification == bill.classification - assert b.subject == ['taxes', 'axes'] - assert b.abstracts.get().note == 'official' - assert b.abstracts.get().date == '1969-10-20' + assert b.subject == ["taxes", "axes"] + assert b.abstracts.get().note == "official" + assert b.abstracts.get().date == "1969-10-20" # other_title, other_identifier added - assert b.other_titles.get().title == 'Tack & Axe Tax Act' - assert b.other_identifiers.get().identifier == 'SB 9' + assert b.other_titles.get().title == "Tack & Axe Tax Act" + assert b.other_identifiers.get().identifier == "SB 9" # actions actions = list(b.actions.all()) assert len(actions) == 2 # ensure order was preserved (if this breaks it'll be intermittent) - assert actions[0].organization == Organization.objects.get(classification='lower') + assert actions[0].organization == Organization.objects.get(classification="lower") assert actions[0].description == "introduced in house" assert actions[1].description == "sent to arbitrary committee" - assert (actions[1].related_entities.get().organization == - Organization.objects.get(classification='committee')) + assert actions[1].related_entities.get().organization == Organization.objects.get( + classification="committee" + ) # related_bills were added rb = b.related_bills.get() - assert rb.identifier == 'HB 99' + assert rb.identifier == "HB 99" # and bill got resolved - assert rb.related_bill.identifier == 'HB 99' + assert rb.related_bill.identifier == "HB 99" # sponsors added, linked & unlinked sponsorships = b.sponsorships.all() assert len(sponsorships) == 2 - person = Person.objects.get(name='Adam Smith') + person = Person.objects.get(name="Adam Smith") for ss in sponsorships: if ss.primary: assert ss.person is None @@ -120,12 +155,13 @@ def test_bill_chamber_param(): create_jurisdiction() org = create_org() - bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', - classification='tax bill', chamber='lower') + bill = ScrapeBill( + "HB 1", "1900", "Axe & Tack Tax Act", classification="tax bill", chamber="lower" + ) - oi = OrganizationImporter('jid') - pi = PersonImporter('jid') - BillImporter('jid', oi, pi).import_data([bill.as_dict()]) + oi = OrganizationImporter("jid") + pi = PersonImporter("jid") + BillImporter("jid", oi, pi).import_data([bill.as_dict()]) assert Bill.objects.get().from_organization_id == org.id @@ -135,98 +171,100 @@ def test_bill_update(): create_jurisdiction() create_org() - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") - oi = OrganizationImporter('jid') - pi = PersonImporter('jid') - _, what = BillImporter('jid', oi, pi).import_item(bill.as_dict()) - assert what == 'insert' - _, what = BillImporter('jid', oi, pi).import_item(bill.as_dict()) - assert what == 'noop' + oi = OrganizationImporter("jid") + pi = PersonImporter("jid") + _, what = BillImporter("jid", oi, pi).import_item(bill.as_dict()) + assert what == "insert" + _, what = BillImporter("jid", oi, pi).import_item(bill.as_dict()) + assert what == "noop" # ensure no new object was created assert Bill.objects.count() == 1 # test basic update - bill = ScrapeBill('HB 1', '1900', '1st Bill', chamber='lower') - _, what = BillImporter('jid', oi, pi).import_item(bill.as_dict()) - assert what == 'update' - assert Bill.objects.get().title == '1st Bill' + bill = ScrapeBill("HB 1", "1900", "1st Bill", chamber="lower") + _, what = BillImporter("jid", oi, pi).import_item(bill.as_dict()) + assert what == "update" + assert Bill.objects.get().title == "1st Bill" @pytest.mark.django_db def test_bill_update_because_of_subitem(): create_jurisdiction() create_org() - oi = OrganizationImporter('jid') - pi = PersonImporter('jid') + oi = OrganizationImporter("jid") + pi = PersonImporter("jid") # initial bill - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_action('this is an action', chamber='lower', date='1900-01-01') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['insert'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_action("this is an action", chamber="lower", date="1900-01-01") + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["insert"] == 1 obj = Bill.objects.get() assert obj.actions.count() == 1 last_updated = obj.updated_at # now let's make sure we get updated when there are second actions - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_action('this is an action', chamber='lower', date='1900-01-01') - bill.add_action('this is a second action', chamber='lower', date='1900-01-02') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['update'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_action("this is an action", chamber="lower", date="1900-01-01") + bill.add_action("this is a second action", chamber="lower", date="1900-01-02") + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["update"] == 1 obj = Bill.objects.get() assert obj.actions.count() == 2 assert obj.updated_at > last_updated # same 2 actions, noop - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_action('this is an action', chamber='lower', date='1900-01-01') - bill.add_action('this is a second action', chamber='lower', date='1900-01-02') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['noop'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_action("this is an action", chamber="lower", date="1900-01-01") + bill.add_action("this is a second action", chamber="lower", date="1900-01-02") + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["noop"] == 1 obj = Bill.objects.get() assert obj.actions.count() == 2 # same 2 actions, different order, update - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_action('this is a second action', chamber='lower', date='1900-01-02') - bill.add_action('this is an action', chamber='lower', date='1900-01-01') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['update'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_action("this is a second action", chamber="lower", date="1900-01-02") + bill.add_action("this is an action", chamber="lower", date="1900-01-01") + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["update"] == 1 obj = Bill.objects.get() assert obj.actions.count() == 2 # different 2 actions, update - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_action('this is an action', chamber='lower', date='1900-01-01') - bill.add_action('this is a different second action', chamber='lower', date='1900-01-02') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['update'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_action("this is an action", chamber="lower", date="1900-01-01") + bill.add_action( + "this is a different second action", chamber="lower", date="1900-01-02" + ) + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["update"] == 1 obj = Bill.objects.get() assert obj.actions.count() == 2 # delete an action, update - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_action('this is a second action', chamber='lower', date='1900-01-02') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['update'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_action("this is a second action", chamber="lower", date="1900-01-02") + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["update"] == 1 obj = Bill.objects.get() assert obj.actions.count() == 1 # delete all actions, update - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['update'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["update"] == 1 obj = Bill.objects.get() assert obj.actions.count() == 0 # and back to initial status, update - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_action('this is an action', chamber='lower', date='1900-01-01') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['update'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_action("this is an action", chamber="lower", date="1900-01-01") + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["update"] == 1 obj = Bill.objects.get() assert obj.actions.count() == 1 @@ -235,53 +273,69 @@ def test_bill_update_because_of_subitem(): def test_bill_update_subsubitem(): create_jurisdiction() create_org() - oi = OrganizationImporter('jid') - pi = PersonImporter('jid') + oi = OrganizationImporter("jid") + pi = PersonImporter("jid") # initial sub-subitem - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_version_link('printing', 'http://example.com/test.pdf', media_type='application/pdf') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['insert'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_version_link( + "printing", "http://example.com/test.pdf", media_type="application/pdf" + ) + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["insert"] == 1 obj = Bill.objects.get() assert obj.versions.count() == 1 assert obj.versions.get().links.count() == 1 # a second subsubitem, update - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_version_link('printing', 'http://example.com/test.pdf', media_type='application/pdf') - bill.add_version_link('printing', 'http://example.com/test.text', media_type='text/plain') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['update'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_version_link( + "printing", "http://example.com/test.pdf", media_type="application/pdf" + ) + bill.add_version_link( + "printing", "http://example.com/test.text", media_type="text/plain" + ) + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["update"] == 1 obj = Bill.objects.get() assert obj.versions.count() == 1 assert obj.versions.get().links.count() == 2 # same thing, noop - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_version_link('printing', 'http://example.com/test.pdf', media_type='application/pdf') - bill.add_version_link('printing', 'http://example.com/test.text', media_type='text/plain') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['noop'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_version_link( + "printing", "http://example.com/test.pdf", media_type="application/pdf" + ) + bill.add_version_link( + "printing", "http://example.com/test.text", media_type="text/plain" + ) + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["noop"] == 1 obj = Bill.objects.get() assert obj.versions.count() == 1 assert obj.versions.get().links.count() == 2 # different link for second one, update - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_version_link('printing', 'http://example.com/test.pdf', media_type='application/pdf') - bill.add_version_link('printing', 'http://example.com/diff-link.txt', media_type='text/plain') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['update'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_version_link( + "printing", "http://example.com/test.pdf", media_type="application/pdf" + ) + bill.add_version_link( + "printing", "http://example.com/diff-link.txt", media_type="text/plain" + ) + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["update"] == 1 obj = Bill.objects.get() assert obj.versions.count() == 1 assert obj.versions.get().links.count() == 2 # delete one, update - bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower') - bill.add_version_link('printing', 'http://example.com/test.pdf', media_type='application/pdf') - result = BillImporter('jid', oi, pi).import_data([bill.as_dict()]) - assert result['bill']['update'] == 1 + bill = ScrapeBill("HB 1", "1900", "First Bill", chamber="lower") + bill.add_version_link( + "printing", "http://example.com/test.pdf", media_type="application/pdf" + ) + result = BillImporter("jid", oi, pi).import_data([bill.as_dict()]) + assert result["bill"]["update"] == 1 obj = Bill.objects.get() assert obj.versions.count() == 1 assert obj.versions.get().links.count() == 1 @@ -292,27 +346,28 @@ def test_bill_sponsor_by_identifier(): create_jurisdiction() org = create_org() - bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', - classification='tax bill', chamber='lower') - bill.add_sponsorship_by_identifier(name="SNODGRASS", - classification='sponsor', - entity_type='person', - primary=True, - identifier="TOTALLY_REAL_ID", - scheme="TOTALLY_REAL_SCHEME") - - oi = OrganizationImporter('jid') - pi = PersonImporter('jid') - - zs = ScrapePerson(name='Zadock Snodgrass') - zs.add_identifier(identifier='TOTALLY_REAL_ID', - scheme='TOTALLY_REAL_SCHEME') + bill = ScrapeBill( + "HB 1", "1900", "Axe & Tack Tax Act", classification="tax bill", chamber="lower" + ) + bill.add_sponsorship_by_identifier( + name="SNODGRASS", + classification="sponsor", + entity_type="person", + primary=True, + identifier="TOTALLY_REAL_ID", + scheme="TOTALLY_REAL_SCHEME", + ) + + oi = OrganizationImporter("jid") + pi = PersonImporter("jid") + + zs = ScrapePerson(name="Zadock Snodgrass") + zs.add_identifier(identifier="TOTALLY_REAL_ID", scheme="TOTALLY_REAL_SCHEME") pi.import_data([zs.as_dict()]) za_db = Person.objects.get() - Membership.objects.create(person_id=za_db.id, - organization_id=org.id) + Membership.objects.create(person_id=za_db.id, organization_id=org.id) - BillImporter('jid', oi, pi).import_data([bill.as_dict()]) + BillImporter("jid", oi, pi).import_data([bill.as_dict()]) obj = Bill.objects.get() (entry,) = obj.sponsorships.all() @@ -324,36 +379,36 @@ def test_bill_sponsor_limit_lookup(): create_jurisdiction() org = create_org() - bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', - classification='tax bill', chamber='lower') - bill.add_sponsorship_by_identifier(name="SNODGRASS", - classification='sponsor', - entity_type='person', - primary=True, - identifier="TOTALLY_REAL_ID", - scheme="TOTALLY_REAL_SCHEME") - - oi = OrganizationImporter('jid') - pi = PersonImporter('jid') - - zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01") - zs.add_identifier(identifier='TOTALLY_REAL_ID', - scheme='TOTALLY_REAL_SCHEME') + bill = ScrapeBill( + "HB 1", "1900", "Axe & Tack Tax Act", classification="tax bill", chamber="lower" + ) + bill.add_sponsorship_by_identifier( + name="SNODGRASS", + classification="sponsor", + entity_type="person", + primary=True, + identifier="TOTALLY_REAL_ID", + scheme="TOTALLY_REAL_SCHEME", + ) + + oi = OrganizationImporter("jid") + pi = PersonImporter("jid") + + zs = ScrapePerson(name="Zadock Snodgrass", birth_date="1800-01-01") + zs.add_identifier(identifier="TOTALLY_REAL_ID", scheme="TOTALLY_REAL_SCHEME") pi.import_data([zs.as_dict()]) za_db = Person.objects.get() - Membership.objects.create(person_id=za_db.id, - organization_id=org.id) + Membership.objects.create(person_id=za_db.id, organization_id=org.id) - zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01") - zs2.add_identifier(identifier='TOTALLY_REAL_ID', - scheme='TOTALLY_REAL_SCHEME') + zs2 = ScrapePerson(name="Zadock Snodgrass", birth_date="1900-01-01") + zs2.add_identifier(identifier="TOTALLY_REAL_ID", scheme="TOTALLY_REAL_SCHEME") # This is contrived and perhaps broken, but we're going to check this. # We *really* don't want to *ever* cross jurisdiction bounds. - PersonImporter('another-jurisdiction').import_data([zs.as_dict()]) + PersonImporter("another-jurisdiction").import_data([zs.as_dict()]) - BillImporter('jid', oi, pi).import_data([bill.as_dict()]) + BillImporter("jid", oi, pi).import_data([bill.as_dict()]) obj = Bill.objects.get() (entry,) = obj.sponsorships.all() @@ -366,17 +421,18 @@ def test_bill_action_extras(): create_jurisdiction() create_org() - bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', - classification='tax bill', chamber='lower') - bill.add_action('sample', '1900-01-01', chamber='lower', extras={'test': 3}) + bill = ScrapeBill( + "HB 1", "1900", "Axe & Tack Tax Act", classification="tax bill", chamber="lower" + ) + bill.add_action("sample", "1900-01-01", chamber="lower", extras={"test": 3}) - oi = OrganizationImporter('jid') - pi = PersonImporter('jid') + oi = OrganizationImporter("jid") + pi = PersonImporter("jid") - BillImporter('jid', oi, pi).import_data([bill.as_dict()]) + BillImporter("jid", oi, pi).import_data([bill.as_dict()]) b = Bill.objects.get() - assert b.actions.all()[0].extras == {'test': 3} + assert b.actions.all()[0].extras == {"test": 3} @pytest.mark.django_db @@ -384,19 +440,21 @@ def test_fix_bill_id(): create_jurisdiction() create_org() - bill = ScrapeBill('HB1', '1900', 'Test Bill ID', - classification='bill', chamber='lower') + bill = ScrapeBill( + "HB1", "1900", "Test Bill ID", classification="bill", chamber="lower" + ) - oi = OrganizationImporter('jid') - pi = PersonImporter('jid') + oi = OrganizationImporter("jid") + pi = PersonImporter("jid") from pupa.settings import IMPORT_TRANSFORMERS - IMPORT_TRANSFORMERS['bill'] = { - 'identifier': lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, 1) + + IMPORT_TRANSFORMERS["bill"] = { + "identifier": lambda x: re.sub(r"([A-Z]*)\s*0*([-\d]+)", r"\1 \2", x, 1) } - bi = BillImporter('jid', oi, pi) + bi = BillImporter("jid", oi, pi) bi.import_data([bill.as_dict()]) - IMPORT_TRANSFORMERS['bill'] = {} + IMPORT_TRANSFORMERS["bill"] = {} b = Bill.objects.get() - assert b.identifier == 'HB 1' + assert b.identifier == "HB 1" diff --git a/pupa/tests/importers/test_event_importer.py b/pupa/tests/importers/test_event_importer.py index fd4d18ff..28e3a22e 100644 --- a/pupa/tests/importers/test_event_importer.py +++ b/pupa/tests/importers/test_event_importer.py @@ -1,20 +1,31 @@ import pytest from pupa.scrape import Event as ScrapeEvent -from pupa.importers import (EventImporter, OrganizationImporter, PersonImporter, BillImporter, - VoteEventImporter) +from pupa.importers import ( + EventImporter, + OrganizationImporter, + PersonImporter, + BillImporter, + VoteEventImporter, +) from opencivicdata.legislative.models import VoteEvent, Bill, Event -from opencivicdata.core.models import (Person, Membership, Organization, Jurisdiction, Division) +from opencivicdata.core.models import ( + Person, + Membership, + Organization, + Jurisdiction, + Division, +) def create_jurisdiction(): - Division.objects.create(id='ocd-division/country:us', name='USA') - j = Jurisdiction.objects.create(id='jid', division_id='ocd-division/country:us') + Division.objects.create(id="ocd-division/country:us", name="USA") + j = Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us") return j def create_other_jurisdiction(): - Division.objects.create(id='ocd-division/country:ca', name='USA') - j = Jurisdiction.objects.create(id='ojid', division_id='ocd-division/country:ca') + Division.objects.create(id="ocd-division/country:ca", name="USA") + j = Jurisdiction.objects.create(id="ojid", division_id="ocd-division/country:ca") return j @@ -23,22 +34,23 @@ def ge(): name="America's Birthday", start_date="2014-07-04T05:00Z", location_name="America", - all_day=True) + all_day=True, + ) return event -oi = OrganizationImporter('jid') -pi = PersonImporter('jid') -bi = BillImporter('jid', oi, pi) -vei = VoteEventImporter('jid', pi, oi, bi) +oi = OrganizationImporter("jid") +pi = PersonImporter("jid") +bi = BillImporter("jid", oi, pi) +vei = VoteEventImporter("jid", pi, oi, bi) @pytest.mark.django_db def test_related_people_event(): create_jurisdiction() - george = Person.objects.create(id='gw', name='George Washington') - john = Person.objects.create(id='jqp', name='John Q. Public') - o = Organization.objects.create(name='Merica', jurisdiction_id='jid') + george = Person.objects.create(id="gw", name="George Washington") + john = Person.objects.create(id="jqp", name="John Q. Public") + o = Organization.objects.create(name="Merica", jurisdiction_id="jid") Membership.objects.create(person=george, organization=o) Membership.objects.create(person=john, organization=o) @@ -51,30 +63,41 @@ def test_related_people_event(): item.add_person(person="John Q. Public") event.add_person("George Washington") - result = EventImporter('jid', oi, pi, bi, vei).import_data([event1.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event1.as_dict()]) + assert result["event"]["insert"] == 1 - result = EventImporter('jid', oi, pi, bi, vei).import_data([event2.as_dict()]) - assert result['event']['noop'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event2.as_dict()]) + assert result["event"]["noop"] == 1 - assert Event.objects.get(name="America's Birthday").participants.first().person_id == 'gw' + assert ( + Event.objects.get(name="America's Birthday").participants.first().person_id + == "gw" + ) - assert Event.objects.get(name="America's Birthday" - ).agenda.first().related_entities.first().person_id == 'jqp' + assert ( + Event.objects.get(name="America's Birthday") + .agenda.first() + .related_entities.first() + .person_id + == "jqp" + ) @pytest.mark.django_db def test_related_vote_event(): j = create_jurisdiction() - session = j.legislative_sessions.create(name='1900', identifier='1900') - org = Organization.objects.create(id='org-id', name='House', classification='lower') - bill = Bill.objects.create(id='bill-1', identifier='HB 1', - legislative_session=session) - VoteEvent.objects.create(id='vote-1', - identifier="Roll no. 12", - bill=bill, - legislative_session=session, - organization=org) + session = j.legislative_sessions.create(name="1900", identifier="1900") + org = Organization.objects.create(id="org-id", name="House", classification="lower") + bill = Bill.objects.create( + id="bill-1", identifier="HB 1", legislative_session=session + ) + VoteEvent.objects.create( + id="vote-1", + identifier="Roll no. 12", + bill=bill, + legislative_session=session, + organization=org, + ) event1 = ge() event2 = ge() @@ -83,23 +106,27 @@ def test_related_vote_event(): item = event.add_agenda_item("Cookies will be served") item.add_vote_event(vote_event="Roll no. 12") - result = EventImporter('jid', oi, pi, bi, vei).import_data([event1.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event1.as_dict()]) + assert result["event"]["insert"] == 1 - result = EventImporter('jid', oi, pi, bi, vei).import_data([event2.as_dict()]) - assert result['event']['noop'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event2.as_dict()]) + assert result["event"]["noop"] == 1 - assert Event.objects.get(name="America's Birthday" - ).agenda.first().related_entities.first().vote_event_id == 'vote-1' + assert ( + Event.objects.get(name="America's Birthday") + .agenda.first() + .related_entities.first() + .vote_event_id + == "vote-1" + ) @pytest.mark.django_db def test_related_bill_event(): j = create_jurisdiction() - session = j.legislative_sessions.create(name='1900', identifier='1900') - Organization.objects.create(id='org-id', name='House', classification='lower') - Bill.objects.create(id='bill-1', identifier='HB 101', - legislative_session=session) + session = j.legislative_sessions.create(name="1900", identifier="1900") + Organization.objects.create(id="org-id", name="House", classification="lower") + Bill.objects.create(id="bill-1", identifier="HB 101", legislative_session=session) event1 = ge() event2 = ge() @@ -107,27 +134,35 @@ def test_related_bill_event(): item = event.add_agenda_item("Cookies will be served") item.add_bill(bill="HB 101") - result = EventImporter('jid', oi, pi, bi, vei).import_data([event1.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event1.as_dict()]) + assert result["event"]["insert"] == 1 - result = EventImporter('jid', oi, pi, bi, vei).import_data([event2.as_dict()]) - assert result['event']['noop'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event2.as_dict()]) + assert result["event"]["noop"] == 1 - assert Event.objects.get(name="America's Birthday" - ).agenda.first().related_entities.first().bill_id == 'bill-1' + assert ( + Event.objects.get(name="America's Birthday") + .agenda.first() + .related_entities.first() + .bill_id + == "bill-1" + ) @pytest.mark.django_db def test_related_committee_event(): j = create_jurisdiction() - j.legislative_sessions.create(name='1900', identifier='1900') - org = Organization.objects.create(id='org-id', name='House', - classification='lower', - jurisdiction=j) - Organization.objects.create(id='fiscal', name="Fiscal Committee", - classification='committee', - parent=org, - jurisdiction=j) + j.legislative_sessions.create(name="1900", identifier="1900") + org = Organization.objects.create( + id="org-id", name="House", classification="lower", jurisdiction=j + ) + Organization.objects.create( + id="fiscal", + name="Fiscal Committee", + classification="committee", + parent=org, + jurisdiction=j, + ) event1 = ge() event2 = ge() @@ -136,14 +171,19 @@ def test_related_committee_event(): item = event.add_agenda_item("Cookies will be served") item.add_committee(committee="Fiscal Committee") - result = EventImporter('jid', oi, pi, bi, vei).import_data([event1.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event1.as_dict()]) + assert result["event"]["insert"] == 1 - result = EventImporter('jid', oi, pi, bi, vei).import_data([event2.as_dict()]) - assert result['event']['noop'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event2.as_dict()]) + assert result["event"]["noop"] == 1 - assert Event.objects.get(name="America's Birthday" - ).agenda.first().related_entities.first().organization_id == 'fiscal' + assert ( + Event.objects.get(name="America's Birthday") + .agenda.first() + .related_entities.first() + .organization_id + == "fiscal" + ) @pytest.mark.django_db @@ -156,15 +196,15 @@ def test_media_event(): item = event.add_agenda_item("Cookies will be served") item.add_media_link( note="Hello, World", - media_type='application/octet-stream', - url="http://hello.world/foo" + media_type="application/octet-stream", + url="http://hello.world/foo", ) - result = EventImporter('jid', oi, pi, bi, vei).import_data([event1.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event1.as_dict()]) + assert result["event"]["insert"] == 1 - result = EventImporter('jid', oi, pi, bi, vei).import_data([event2.as_dict()]) - assert result['event']['noop'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event2.as_dict()]) + assert result["event"]["noop"] == 1 @pytest.mark.django_db @@ -174,71 +214,72 @@ def test_media_document(): event2 = ge() for event in [event1, event2]: - event.add_document(note="Presentation", - url="http://example.com/presentation.pdf") + event.add_document( + note="Presentation", url="http://example.com/presentation.pdf" + ) - result = EventImporter('jid', oi, pi, bi, vei).import_data([event1.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event1.as_dict()]) + assert result["event"]["insert"] == 1 - result = EventImporter('jid', oi, pi, bi, vei).import_data([event2.as_dict()]) - assert result['event']['noop'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event2.as_dict()]) + assert result["event"]["noop"] == 1 @pytest.mark.django_db def test_full_event(): create_jurisdiction() - george = Person.objects.create(id='gw', name='George Washington') - o = Organization.objects.create(name='Merica', jurisdiction_id='jid') + george = Person.objects.create(id="gw", name="George Washington") + o = Organization.objects.create(name="Merica", jurisdiction_id="jid") Membership.objects.create(person=george, organization=o) event = ge() - result = EventImporter('jid', oi, pi, bi, vei).import_data([event.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event.as_dict()]) + assert result["event"]["insert"] == 1 event = ge() - result = EventImporter('jid', oi, pi, bi, vei).import_data([event.as_dict()]) - assert result['event']['noop'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event.as_dict()]) + assert result["event"]["noop"] == 1 event = ge() - event.location['name'] = "United States of America" - result = EventImporter('jid', oi, pi, bi, vei).import_data([event.as_dict()]) - assert result['event']['update'] == 1 + event.location["name"] = "United States of America" + result = EventImporter("jid", oi, pi, bi, vei).import_data([event.as_dict()]) + assert result["event"]["update"] == 1 event.location = None - result = EventImporter('jid', oi, pi, bi, vei).import_data([event.as_dict()]) - assert result['event']['update'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event.as_dict()]) + assert result["event"]["update"] == 1 @pytest.mark.django_db def test_pupa_identifier_event(): create_jurisdiction() create_other_jurisdiction() - george = Person.objects.create(id='gw', name='George Washington') - o = Organization.objects.create(name='Merica', jurisdiction_id='jid') + george = Person.objects.create(id="gw", name="George Washington") + o = Organization.objects.create(name="Merica", jurisdiction_id="jid") Membership.objects.create(person=george, organization=o) event = ge() - event.pupa_id = 'foo' + event.pupa_id = "foo" - result = EventImporter('jid', oi, pi, bi, vei).import_data([event.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event.as_dict()]) + assert result["event"]["insert"] == 1 - result = EventImporter('jid', oi, pi, bi, vei).import_data([event.as_dict()]) - assert result['event']['noop'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event.as_dict()]) + assert result["event"]["noop"] == 1 - event.name = "America's Anniversary", - event.location['name'] = "United States of America" - result = EventImporter('jid', oi, pi, bi, vei).import_data([event.as_dict()]) - assert result['event']['update'] == 1 + event.name = ("America's Anniversary",) + event.location["name"] = "United States of America" + result = EventImporter("jid", oi, pi, bi, vei).import_data([event.as_dict()]) + assert result["event"]["update"] == 1 - event.pupa_id = 'bar' - result = EventImporter('jid', oi, pi, bi, vei).import_data([event.as_dict()]) - assert result['event']['insert'] == 1 + event.pupa_id = "bar" + result = EventImporter("jid", oi, pi, bi, vei).import_data([event.as_dict()]) + assert result["event"]["insert"] == 1 - result = EventImporter('ojid', oi, pi, bi, vei).import_data([event.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("ojid", oi, pi, bi, vei).import_data([event.as_dict()]) + assert result["event"]["insert"] == 1 # @pytest.mark.django_db @@ -258,16 +299,22 @@ def test_top_level_media_event(): create_jurisdiction() event1, event2 = ge(), ge() - event1.add_media_link("fireworks", "http://example.com/fireworks.mov", - media_type='application/octet-stream') - event2.add_media_link("fireworks", "http://example.com/fireworks.mov", - media_type='application/octet-stream') + event1.add_media_link( + "fireworks", + "http://example.com/fireworks.mov", + media_type="application/octet-stream", + ) + event2.add_media_link( + "fireworks", + "http://example.com/fireworks.mov", + media_type="application/octet-stream", + ) - result = EventImporter('jid', oi, pi, bi, vei).import_data([event1.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event1.as_dict()]) + assert result["event"]["insert"] == 1 - result = EventImporter('jid', oi, pi, bi, vei).import_data([event2.as_dict()]) - assert result['event']['noop'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event2.as_dict()]) + assert result["event"]["noop"] == 1 @pytest.mark.django_db @@ -276,11 +323,11 @@ def test_event_agenda_item(): event1 = ge() agenda = event1.add_agenda_item("first item") - agenda['extras'] = {'one': 1, 'two': [2]} + agenda["extras"] = {"one": 1, "two": [2]} - result = EventImporter('jid', oi, pi, bi, vei).import_data([event1.as_dict()]) - assert result['event']['insert'] == 1 + result = EventImporter("jid", oi, pi, bi, vei).import_data([event1.as_dict()]) + assert result["event"]["insert"] == 1 e = Event.objects.get() a = e.agenda.all()[0] - assert a.extras == {'one': 1, 'two': [2]} + assert a.extras == {"one": 1, "two": [2]} diff --git a/pupa/tests/importers/test_jurisdiction_importer.py b/pupa/tests/importers/test_jurisdiction_importer.py index 50bef3a7..67b30de7 100644 --- a/pupa/tests/importers/test_jurisdiction_importer.py +++ b/pupa/tests/importers/test_jurisdiction_importer.py @@ -6,23 +6,23 @@ class FakeJurisdiction(JurisdictionBase): - division_id = 'ocd-division/country:us' - name = 'test' - url = 'http://example.com' - classification = 'government' + division_id = "ocd-division/country:us" + name = "test" + url = "http://example.com" + classification = "government" legislative_sessions = [ - {'identifier': '2015', 'name': '2015 Regular Session'}, - {'identifier': '2016', 'name': '2016 Regular Session'}, + {"identifier": "2015", "name": "2015 Regular Session"}, + {"identifier": "2016", "name": "2016 Regular Session"}, ] @pytest.mark.django_db def test_jurisdiction_import(): - Division.objects.create(id='ocd-division/country:us', name='USA') + Division.objects.create(id="ocd-division/country:us", name="USA") tj = FakeJurisdiction() juris_dict = tj.as_dict() - JurisdictionImporter('jurisdiction-id').import_data([juris_dict]) + JurisdictionImporter("jurisdiction-id").import_data([juris_dict]) dbj = Jurisdiction.objects.get() assert dbj.id == tj.jurisdiction_id @@ -33,28 +33,28 @@ def test_jurisdiction_import(): @pytest.mark.django_db def test_jurisdiction_update(): - Division.objects.create(id='ocd-division/country:us', name='USA') + Division.objects.create(id="ocd-division/country:us", name="USA") tj = FakeJurisdiction() - ji = JurisdictionImporter('jurisdiction-id') + ji = JurisdictionImporter("jurisdiction-id") _, what = ji.import_item(tj.as_dict()) - assert what == 'insert' + assert what == "insert" _, what = ji.import_item(tj.as_dict()) - assert what == 'noop' + assert what == "noop" assert Jurisdiction.objects.count() == 1 - tj.name = 'different name' + tj.name = "different name" obj, what = ji.import_item(tj.as_dict()) - assert what == 'update' + assert what == "update" assert Jurisdiction.objects.count() == 1 - assert Jurisdiction.objects.get().name == 'different name' + assert Jurisdiction.objects.get().name == "different name" @pytest.mark.django_db def test_jurisdiction_merge_related(): - Division.objects.create(id='ocd-division/country:us', name='USA') + Division.objects.create(id="ocd-division/country:us", name="USA") # need to ensure legislative_sessions don't get deleted - ji = JurisdictionImporter('jurisdiction-id') + ji = JurisdictionImporter("jurisdiction-id") tj = FakeJurisdiction() ji.import_item(tj.as_dict()) @@ -68,12 +68,12 @@ def test_jurisdiction_merge_related(): assert LegislativeSession.objects.count() == 2 # now will have three - tj.legislative_sessions.append({'identifier': '2017', 'name': '2017 Session'}) + tj.legislative_sessions.append({"identifier": "2017", "name": "2017 Session"}) ji.import_item(tj.as_dict()) assert LegislativeSession.objects.count() == 3 # and test that the non-identifier fields actually update - tj.legislative_sessions.append({'identifier': '2016', 'name': 'updated'}) + tj.legislative_sessions.append({"identifier": "2016", "name": "updated"}) ji.import_item(tj.as_dict()) assert LegislativeSession.objects.count() == 3 - assert LegislativeSession.objects.get(identifier='2016').name == 'updated' + assert LegislativeSession.objects.get(identifier="2016").name == "updated" diff --git a/pupa/tests/importers/test_membership_importer.py b/pupa/tests/importers/test_membership_importer.py index 774e906c..b2332f0f 100644 --- a/pupa/tests/importers/test_membership_importer.py +++ b/pupa/tests/importers/test_membership_importer.py @@ -7,7 +7,9 @@ class DumbMockImporter(object): - """ this is a mock importer that implements a resolve_json_id that is just a pass-through """ + """this is a mock importer that implements a resolve_json_id that is + just a pass-through""" + json_to_db_id = {} def resolve_json_id(self, json_id, allow_no_match=False): @@ -15,31 +17,47 @@ def resolve_json_id(self, json_id, allow_no_match=False): def create_jurisdiction(): - Division.objects.create(id='ocd-division/country:us', name='USA') - Jurisdiction.objects.create(id='fnd-jid', division_id='ocd-division/country:us') + Division.objects.create(id="ocd-division/country:us", name="USA") + Jurisdiction.objects.create(id="fnd-jid", division_id="ocd-division/country:us") @pytest.mark.django_db def test_full_membership(): create_jurisdiction() - org = Organization.objects.create(id="fnd", name="Foundation", classification="foundation", - jurisdiction_id="fnd-jid") + org = Organization.objects.create( + id="fnd", + name="Foundation", + classification="foundation", + jurisdiction_id="fnd-jid", + ) hari = Person.objects.create(id="hs", name="Hari Seldon") robot = Person.objects.create(id="robot", name="R. Daneel Olivaw") - post = Post.objects.create(id='f', label="founder", role="Founder", organization=org) + post = Post.objects.create( + id="f", label="founder", role="Founder", organization=org + ) # add a membership through a post, with a start date - m1 = ScrapeMembership(person_id=hari.id, organization_id=org.id, - post_id=post.id, start_date='2020-03-10', end_date='2021-06-30') - m1.add_contact_detail(type='phone', value='555-555-1234', note='this is fake') - m1.add_link('http://example.com/link') + m1 = ScrapeMembership( + person_id=hari.id, + organization_id=org.id, + post_id=post.id, + start_date="2020-03-10", + end_date="2021-06-30", + ) + m1.add_contact_detail(type="phone", value="555-555-1234", note="this is fake") + m1.add_link("http://example.com/link") # add a membership direct to an organization, with an end date - m2 = ScrapeMembership(person_id=robot.id, organization_id=org.id, label='member', - role='member', end_date='2019-11-09') + m2 = ScrapeMembership( + person_id=robot.id, + organization_id=org.id, + label="member", + role="member", + end_date="2019-11-09", + ) dumb_imp = DumbMockImporter() - memimp = MembershipImporter('fnd-jid', dumb_imp, dumb_imp, dumb_imp) + memimp = MembershipImporter("fnd-jid", dumb_imp, dumb_imp, dumb_imp) memimp.import_data([m1.as_dict(), m2.as_dict()]) # ensure that the memberships attached in the right places @@ -51,46 +69,50 @@ def test_full_membership(): # ensure that the first membership has contact details and links m = hari.memberships.get() cd = m.contact_details.get() - assert cd.type == 'phone' - assert cd.value == '555-555-1234' - assert cd.note == 'this is fake' - assert m.links.all()[0].url == 'http://example.com/link' + assert cd.type == "phone" + assert cd.value == "555-555-1234" + assert cd.note == "this is fake" + assert m.links.all()[0].url == "http://example.com/link" # update the imported memberships (i.e., change attributes that are not # in the spec) and confirm they resolve correctly - memimp2 = MembershipImporter('fnd-jid', dumb_imp, dumb_imp, dumb_imp) + memimp2 = MembershipImporter("fnd-jid", dumb_imp, dumb_imp, dumb_imp) - m1.end_date = '2022-03-10' - m2.extras = {'note': 'bleep blorp'} + m1.end_date = "2022-03-10" + m2.extras = {"note": "bleep blorp"} import_log = memimp2.import_data([m1.as_dict(), m2.as_dict()]) - assert import_log['membership']['insert'] == 0 - assert import_log['membership']['update'] == 2 + assert import_log["membership"]["insert"] == 0 + assert import_log["membership"]["update"] == 2 # confirm the membership resolved based on start date and its end date was updated assert hari.memberships.count() == 1 - assert hari.memberships.get().end_date == '2022-03-10' + assert hari.memberships.get().end_date == "2022-03-10" # confirm the membership resolved based on end date and its extras were updated assert robot.memberships.count() == 1 - assert robot.memberships.get().extras == {'note': 'bleep blorp'} + assert robot.memberships.get().extras == {"note": "bleep blorp"} @pytest.mark.django_db def test_no_membership_for_person(): create_jurisdiction() - Organization.objects.create(id="fnd", name="Foundation", classification="foundation", - jurisdiction_id="fnd-jid") + Organization.objects.create( + id="fnd", + name="Foundation", + classification="foundation", + jurisdiction_id="fnd-jid", + ) # import a person with no memberships - p = ScrapePerson('a man without a country') - person_imp = PersonImporter('fnd-jid') + p = ScrapePerson("a man without a country") + person_imp = PersonImporter("fnd-jid") person_imp.import_data([p.as_dict()]) # try to import a membership dumb_imp = DumbMockImporter() - memimp = MembershipImporter('fnd-jid', person_imp, dumb_imp, dumb_imp) + memimp = MembershipImporter("fnd-jid", person_imp, dumb_imp, dumb_imp) with pytest.raises(NoMembershipsError): memimp.import_data([]) @@ -99,23 +121,27 @@ def test_no_membership_for_person(): @pytest.mark.django_db def test_no_membership_for_person_including_party(): """ - even though party is specified we should still get a no memberships error because it doesn't - bind the person to a jurisdiction, thus causing duplication + even though party is specified we should still get a no memberships error + because it doesn't bind the person to a jurisdiction, thus causing duplication """ create_jurisdiction() - Organization.objects.create(id="fnd", name="Foundation", classification="foundation", - jurisdiction_id="fnd-jid") + Organization.objects.create( + id="fnd", + name="Foundation", + classification="foundation", + jurisdiction_id="fnd-jid", + ) Organization.objects.create(id="dem", name="Democratic", classification="party") # import a person with no memberships - p = ScrapePerson('a man without a country', party='Democratic') - person_imp = PersonImporter('fnd-jid') - org_imp = OrganizationImporter('fnd-jid') + p = ScrapePerson("a man without a country", party="Democratic") + person_imp = PersonImporter("fnd-jid") + org_imp = OrganizationImporter("fnd-jid") person_imp.import_data([p.as_dict()]) # try to import a membership dumb_imp = DumbMockImporter() - memimp = MembershipImporter('fnd-jid', person_imp, org_imp, dumb_imp) + memimp = MembershipImporter("fnd-jid", person_imp, org_imp, dumb_imp) with pytest.raises(NoMembershipsError): memimp.import_data([p._related[0].as_dict()]) @@ -128,53 +154,75 @@ def test_multiple_orgs_of_same_class(): same classification within the same jurisdictions """ create_jurisdiction() - Organization.objects.create(id="fnd", name="Foundation", classification="foundation", - jurisdiction_id="fnd-jid") - Organization.objects.create(id="fdr", name="Federation", classification="foundation", - jurisdiction_id="fnd-jid") - - hari = ScrapePerson('Hari Seldon', - primary_org='foundation', - role='founder', - primary_org_name='Foundation') - - picard = ScrapePerson('Jean Luc Picard', - primary_org='foundation', - role='founder', - primary_org_name='Federation') - - person_imp = PersonImporter('fnd-jid') + Organization.objects.create( + id="fnd", + name="Foundation", + classification="foundation", + jurisdiction_id="fnd-jid", + ) + Organization.objects.create( + id="fdr", + name="Federation", + classification="foundation", + jurisdiction_id="fnd-jid", + ) + + hari = ScrapePerson( + "Hari Seldon", + primary_org="foundation", + role="founder", + primary_org_name="Foundation", + ) + + picard = ScrapePerson( + "Jean Luc Picard", + primary_org="foundation", + role="founder", + primary_org_name="Federation", + ) + + person_imp = PersonImporter("fnd-jid") person_imp.import_data([hari.as_dict()]) person_imp.import_data([picard.as_dict()]) # try to import a membership - org_imp = OrganizationImporter('fnd-jid') + org_imp = OrganizationImporter("fnd-jid") dumb_imp = DumbMockImporter() - memimp = MembershipImporter('fnd-jid', person_imp, org_imp, dumb_imp) + memimp = MembershipImporter("fnd-jid", person_imp, org_imp, dumb_imp) memimp.import_data([hari._related[0].as_dict(), picard._related[0].as_dict()]) - assert Person.objects.get(name='Hari Seldon' - ).memberships.get().organization.name == 'Foundation' - assert Person.objects.get(name='Jean Luc Picard' - ).memberships.get().organization.name == 'Federation' + assert ( + Person.objects.get(name="Hari Seldon").memberships.get().organization.name + == "Foundation" + ) + assert ( + Person.objects.get(name="Jean Luc Picard").memberships.get().organization.name + == "Federation" + ) @pytest.mark.django_db def test_multiple_posts_class(): create_jurisdiction() - org = Organization.objects.create(id="fnd", name="Foundation", classification="foundation", - jurisdiction_id="fnd-jid") + org = Organization.objects.create( + id="fnd", + name="Foundation", + classification="foundation", + jurisdiction_id="fnd-jid", + ) hari = Person.objects.create(id="hs", name="Hari Seldon") - founder = Post.objects.create(id='f', label="founder", role="Founder", organization=org) - chair = Post.objects.create(id='c', label="chair", role="Chair", organization=org) + founder = Post.objects.create( + id="f", label="founder", role="Founder", organization=org + ) + chair = Post.objects.create(id="c", label="chair", role="Chair", organization=org) m1 = ScrapeMembership(person_id=hari.id, organization_id=org.id, post_id=founder.id) m2 = ScrapeMembership(person_id=hari.id, organization_id=org.id, post_id=chair.id) dumb_imp = DumbMockImporter() - memimp = MembershipImporter('fnd-jid', dumb_imp, dumb_imp, dumb_imp) + memimp = MembershipImporter("fnd-jid", dumb_imp, dumb_imp, dumb_imp) memimp.import_data([m1.as_dict(), m2.as_dict()]) # ensure that the memberships attached in the right places @@ -188,15 +236,19 @@ def test_multiple_posts_class(): def test_unmatched_person(): create_jurisdiction() - org = Organization.objects.create(id="fnd", name="Foundation", classification="foundation", - jurisdiction_id="fnd-jid") + org = Organization.objects.create( + id="fnd", + name="Foundation", + classification="foundation", + jurisdiction_id="fnd-jid", + ) # not a real person, won't have a person_id after import - m1 = ScrapeMembership(person_name='Harry Seldom', organization_id=org.id, - person_id=None - ) + m1 = ScrapeMembership( + person_name="Harry Seldom", organization_id=org.id, person_id=None + ) dumb_imp = DumbMockImporter() - memimp = MembershipImporter('fnd-jid', dumb_imp, dumb_imp, dumb_imp) + memimp = MembershipImporter("fnd-jid", dumb_imp, dumb_imp, dumb_imp) memimp.import_data([m1.as_dict()]) # ensure that the memberships attached in the right places @@ -204,4 +256,4 @@ def test_unmatched_person(): membership = org.memberships.get() assert membership.person_id is None - assert membership.person_name == 'Harry Seldom' + assert membership.person_name == "Harry Seldom" diff --git a/pupa/tests/importers/test_organization_importer.py b/pupa/tests/importers/test_organization_importer.py index 95f2d836..688fe7a6 100644 --- a/pupa/tests/importers/test_organization_importer.py +++ b/pupa/tests/importers/test_organization_importer.py @@ -6,70 +6,72 @@ def create_jurisdictions(): - Division.objects.create(id='ocd-division/country:us', name='USA') - Jurisdiction.objects.create(id='jid1', division_id='ocd-division/country:us') - Jurisdiction.objects.create(id='jid2', division_id='ocd-division/country:us') + Division.objects.create(id="ocd-division/country:us", name="USA") + Jurisdiction.objects.create(id="jid1", division_id="ocd-division/country:us") + Jurisdiction.objects.create(id="jid2", division_id="ocd-division/country:us") def create_org(): - o = Organization.objects.create(name='United Nations', - jurisdiction_id='jid1', - classification='international') - o.other_names.create(name='UN') + o = Organization.objects.create( + name="United Nations", jurisdiction_id="jid1", classification="international" + ) + o.other_names.create(name="UN") @pytest.mark.django_db def test_full_organization(): create_jurisdictions() - org = ScrapeOrganization('United Nations', classification='international') - org.add_identifier('un') - org.add_name('UN', start_date='1945') - org.add_contact_detail(type='phone', value='555-555-1234', note='this is fake') - org.add_link('http://example.com/link') - org.add_source('http://example.com/source') + org = ScrapeOrganization("United Nations", classification="international") + org.add_identifier("un") + org.add_name("UN", start_date="1945") + org.add_contact_detail(type="phone", value="555-555-1234", note="this is fake") + org.add_link("http://example.com/link") + org.add_source("http://example.com/source") # import org od = org.as_dict() - OrganizationImporter('jid1').import_data([od]) + OrganizationImporter("jid1").import_data([od]) # get person from db and assert it imported correctly o = Organization.objects.get() - assert 'ocd-organization' in o.id + assert "ocd-organization" in o.id assert o.name == org.name - assert o.identifiers.all()[0].identifier == 'un' - assert o.identifiers.all()[0].scheme == '' + assert o.identifiers.all()[0].identifier == "un" + assert o.identifiers.all()[0].scheme == "" - assert o.other_names.all()[0].name == 'UN' - assert o.other_names.all()[0].start_date == '1945' + assert o.other_names.all()[0].name == "UN" + assert o.other_names.all()[0].start_date == "1945" - assert o.contact_details.all()[0].type == 'phone' - assert o.contact_details.all()[0].value == '555-555-1234' - assert o.contact_details.all()[0].note == 'this is fake' + assert o.contact_details.all()[0].type == "phone" + assert o.contact_details.all()[0].value == "555-555-1234" + assert o.contact_details.all()[0].note == "this is fake" - assert o.links.all()[0].url == 'http://example.com/link' - assert o.sources.all()[0].url == 'http://example.com/source' + assert o.links.all()[0].url == "http://example.com/link" + assert o.sources.all()[0].url == "http://example.com/source" @pytest.mark.django_db def test_deduplication_similar_but_different(): create_jurisdictions() - o1 = ScrapeOrganization('United Nations', classification='international') + o1 = ScrapeOrganization("United Nations", classification="international") # different classification - o2 = ScrapeOrganization('United Nations', classification='global') + o2 = ScrapeOrganization("United Nations", classification="global") # different name - o3 = ScrapeOrganization('United Nations of Earth', classification='international') + o3 = ScrapeOrganization("United Nations of Earth", classification="international") # has a parent - o4 = ScrapeOrganization('United Nations', classification='international', parent_id=o1._id) + o4 = ScrapeOrganization( + "United Nations", classification="international", parent_id=o1._id + ) # similar, but no duplicates orgs = [o1.as_dict(), o2.as_dict(), o3.as_dict(), o4.as_dict()] - OrganizationImporter('jid1').import_data(orgs) + OrganizationImporter("jid1").import_data(orgs) assert Organization.objects.count() == 4 # should get a new one when jurisdiction_id changes - o5 = ScrapeOrganization('United Nations', classification='international') - OrganizationImporter('jid2').import_data([o5.as_dict()]) + o5 = ScrapeOrganization("United Nations", classification="international") + OrganizationImporter("jid2").import_data([o5.as_dict()]) assert Organization.objects.count() == 5 @@ -77,9 +79,9 @@ def test_deduplication_similar_but_different(): def test_deduplication_other_name_exists(): create_jurisdictions() create_org() - org = ScrapeOrganization('UN', classification='international') + org = ScrapeOrganization("UN", classification="international") od = org.as_dict() - OrganizationImporter('jid1').import_data([od]) + OrganizationImporter("jid1").import_data([od]) assert Organization.objects.all().count() == 1 @@ -87,10 +89,10 @@ def test_deduplication_other_name_exists(): def test_deduplication_other_name_overlaps(): create_jurisdictions() create_org() - org = ScrapeOrganization('The United Nations', classification='international') - org.add_name('United Nations') + org = ScrapeOrganization("The United Nations", classification="international") + org.add_name("United Nations") od = org.as_dict() - OrganizationImporter('jid1').import_data([od]) + OrganizationImporter("jid1").import_data([od]) assert Organization.objects.all().count() == 1 @@ -98,38 +100,46 @@ def test_deduplication_other_name_overlaps(): def test_deduplication_error_overlaps(): create_jurisdictions() - Organization.objects.create(name='World Wrestling Federation', - classification='international', - jurisdiction_id='jid1') - wildlife = Organization.objects.create(name='World Wildlife Fund', - classification='international', - jurisdiction_id='jid1') - wildlife.other_names.create(name='WWF') - - org = ScrapeOrganization('World Wrestling Federation', classification='international') - org.add_name('WWF') + Organization.objects.create( + name="World Wrestling Federation", + classification="international", + jurisdiction_id="jid1", + ) + wildlife = Organization.objects.create( + name="World Wildlife Fund", + classification="international", + jurisdiction_id="jid1", + ) + wildlife.other_names.create(name="WWF") + + org = ScrapeOrganization( + "World Wrestling Federation", classification="international" + ) + org.add_name("WWF") od = org.as_dict() with pytest.raises(SameOrgNameError): - OrganizationImporter('jid1').import_data([od]) + OrganizationImporter("jid1").import_data([od]) @pytest.mark.django_db def test_deduplication_overlap_name_distinct_juris(): create_jurisdictions() - org_jid_1 = Organization.objects.create(name='World Wrestling Federation', - classification='international', - jurisdiction_id='jid1') - org_jid_1.other_names.create(name='WWF') + org_jid_1 = Organization.objects.create( + name="World Wrestling Federation", + classification="international", + jurisdiction_id="jid1", + ) + org_jid_1.other_names.create(name="WWF") org = ScrapeOrganization(name="WWF", classification="international") - org.add_name('WWF') + org.add_name("WWF") - oi1 = OrganizationImporter('jid1') + oi1 = OrganizationImporter("jid1") oi1.import_item(org.as_dict()) assert Organization.objects.count() == 1 - oi2 = OrganizationImporter('jid2') + oi2 = OrganizationImporter("jid2") oi2.import_item(org.as_dict()) assert Organization.objects.count() == 2 @@ -137,83 +147,98 @@ def test_deduplication_overlap_name_distinct_juris(): @pytest.mark.django_db def test_deduplication_parties(): create_jurisdictions() - party = ScrapeOrganization('Wild', classification='party') - OrganizationImporter('jid1').import_data([party.as_dict()]) + party = ScrapeOrganization("Wild", classification="party") + OrganizationImporter("jid1").import_data([party.as_dict()]) assert Organization.objects.count() == 1 # parties shouldn't get jurisdiction id attached, so don't differ on import - party = ScrapeOrganization('Wild', classification='party') - OrganizationImporter('jid2').import_data([party.as_dict()]) + party = ScrapeOrganization("Wild", classification="party") + OrganizationImporter("jid2").import_data([party.as_dict()]) assert Organization.objects.count() == 1 @pytest.mark.django_db def test_deduplication_prevents_identical(): create_jurisdictions() - org1 = ScrapeOrganization('United Nations', classification='international') - org2 = ScrapeOrganization('United Nations', classification='international', - founding_date='1945') - OrganizationImporter('jid1').import_data([org1.as_dict()]) + org1 = ScrapeOrganization("United Nations", classification="international") + org2 = ScrapeOrganization( + "United Nations", classification="international", founding_date="1945" + ) + OrganizationImporter("jid1").import_data([org1.as_dict()]) assert Organization.objects.count() == 1 - OrganizationImporter('jid1').import_data([org2.as_dict()]) + OrganizationImporter("jid1").import_data([org2.as_dict()]) assert Organization.objects.count() == 1 @pytest.mark.django_db def test_pseudo_ids(): create_jurisdictions() - wild = Organization.objects.create(id='1', name='Wild', classification='party') - senate = Organization.objects.create(id='2', name='Senate', classification='upper', - jurisdiction_id='jid1') - house = Organization.objects.create(id='3', name='House', classification='lower', - jurisdiction_id='jid1') - un = Organization.objects.create(id='4', name='United Nations', classification='international', - jurisdiction_id='jid2') - - oi1 = OrganizationImporter('jid1') + wild = Organization.objects.create(id="1", name="Wild", classification="party") + senate = Organization.objects.create( + id="2", name="Senate", classification="upper", jurisdiction_id="jid1" + ) + house = Organization.objects.create( + id="3", name="House", classification="lower", jurisdiction_id="jid1" + ) + un = Organization.objects.create( + id="4", + name="United Nations", + classification="international", + jurisdiction_id="jid2", + ) + + oi1 = OrganizationImporter("jid1") assert oi1.resolve_json_id('~{"classification":"upper"}') == senate.id assert oi1.resolve_json_id('~{"classification":"lower"}') == house.id assert oi1.resolve_json_id('~{"classification":"party", "name":"Wild"}') == wild.id with pytest.raises(UnresolvedIdError): - oi1.resolve_json_id('~{"classification":"international", "name":"United Nations"}') + oi1.resolve_json_id( + '~{"classification":"international", "name":"United Nations"}' + ) - oi2 = OrganizationImporter('jid2') - assert (oi2.resolve_json_id('~{"classification":"international", "name":"United Nations"}') == - un.id) + oi2 = OrganizationImporter("jid2") + assert ( + oi2.resolve_json_id( + '~{"classification":"international", "name":"United Nations"}' + ) + == un.id + ) @pytest.mark.django_db def test_parent_id_resolution(): create_jurisdictions() - parent = ScrapeOrganization('UN', classification='international') - child = ScrapeOrganization('UNESCO', classification='unknown', parent_id=parent._id) - OrganizationImporter('jid1').import_data([parent.as_dict(), child.as_dict()]) + parent = ScrapeOrganization("UN", classification="international") + child = ScrapeOrganization("UNESCO", classification="unknown", parent_id=parent._id) + OrganizationImporter("jid1").import_data([parent.as_dict(), child.as_dict()]) assert Organization.objects.count() == 2 - assert Organization.objects.get(name='UN').children.count() == 1 - assert Organization.objects.get(name='UNESCO').parent.name == 'UN' + assert Organization.objects.get(name="UN").children.count() == 1 + assert Organization.objects.get(name="UNESCO").parent.name == "UN" @pytest.mark.django_db def test_pseudo_parent_id_resolution(): create_jurisdictions() - parent = ScrapeOrganization('UN', classification='international') - child = ScrapeOrganization('UNESCO', classification='unknown', - parent_id='~{"classification": "international"}') - OrganizationImporter('jid1').import_data([parent.as_dict(), child.as_dict()]) + parent = ScrapeOrganization("UN", classification="international") + child = ScrapeOrganization( + "UNESCO", + classification="unknown", + parent_id='~{"classification": "international"}', + ) + OrganizationImporter("jid1").import_data([parent.as_dict(), child.as_dict()]) assert Organization.objects.count() == 2 - assert Organization.objects.get(name='UN').children.count() == 1 - assert Organization.objects.get(name='UNESCO').parent.name == 'UN' + assert Organization.objects.get(name="UN").children.count() == 1 + assert Organization.objects.get(name="UNESCO").parent.name == "UN" @pytest.mark.django_db def test_extras_organization(): create_jurisdictions() - org = ScrapeOrganization('United Nations', classification='international') - org.extras = {"hello": "world", - "foo": {"bar": "baz"}} + org = ScrapeOrganization("United Nations", classification="international") + org.extras = {"hello": "world", "foo": {"bar": "baz"}} od = org.as_dict() - OrganizationImporter('jid1').import_data([od]) + OrganizationImporter("jid1").import_data([od]) o = Organization.objects.get() - assert o.extras['foo']['bar'] == 'baz' + assert o.extras["foo"]["bar"] == "baz" diff --git a/pupa/tests/importers/test_people_importer.py b/pupa/tests/importers/test_people_importer.py index 302f174d..10c93a19 100644 --- a/pupa/tests/importers/test_people_importer.py +++ b/pupa/tests/importers/test_people_importer.py @@ -1,54 +1,60 @@ import pytest from pupa.scrape import Person as ScrapePerson from pupa.importers import PersonImporter -from opencivicdata.core.models import Person, Organization, Membership, Division, Jurisdiction +from opencivicdata.core.models import ( + Person, + Organization, + Membership, + Division, + Jurisdiction, +) from pupa.exceptions import UnresolvedIdError, SameNameError def create_jurisdiction(): - Division.objects.create(id='ocd-division/country:us', name='USA') - Jurisdiction.objects.create(id='jid', division_id='ocd-division/country:us') + Division.objects.create(id="ocd-division/country:us", name="USA") + Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us") @pytest.mark.django_db def test_full_person(): - person = ScrapePerson('Tom Sawyer') - person.add_identifier('1') - person.add_name('Tommy', start_date='1880') - person.add_contact_detail(type='phone', value='555-555-1234', note='this is fake') - person.add_link('http://example.com/link') - person.add_source('http://example.com/source') + person = ScrapePerson("Tom Sawyer") + person.add_identifier("1") + person.add_name("Tommy", start_date="1880") + person.add_contact_detail(type="phone", value="555-555-1234", note="this is fake") + person.add_link("http://example.com/link") + person.add_source("http://example.com/source") # import person pd = person.as_dict() - PersonImporter('jid').import_data([pd]) + PersonImporter("jid").import_data([pd]) # get person from db and assert it imported correctly p = Person.objects.get() - assert 'ocd-person' in p.id + assert "ocd-person" in p.id assert p.name == person.name - assert p.identifiers.all()[0].identifier == '1' - assert p.identifiers.all()[0].scheme == '' + assert p.identifiers.all()[0].identifier == "1" + assert p.identifiers.all()[0].scheme == "" - assert p.other_names.all()[0].name == 'Tommy' - assert p.other_names.all()[0].start_date == '1880' + assert p.other_names.all()[0].name == "Tommy" + assert p.other_names.all()[0].start_date == "1880" - assert p.contact_details.all()[0].type == 'phone' - assert p.contact_details.all()[0].value == '555-555-1234' - assert p.contact_details.all()[0].note == 'this is fake' + assert p.contact_details.all()[0].type == "phone" + assert p.contact_details.all()[0].value == "555-555-1234" + assert p.contact_details.all()[0].note == "this is fake" - assert p.links.all()[0].url == 'http://example.com/link' - assert p.sources.all()[0].url == 'http://example.com/source' + assert p.links.all()[0].url == "http://example.com/link" + assert p.sources.all()[0].url == "http://example.com/source" def create_person(): - # deduplication for people is fairly complicated, it requires a person to have a membership - # in the jurisdiction's organization and have a matching name. let's set that up first for - # the deduplication tests - p = Person.objects.create(name='Dwayne Johnson') - p.other_names.create(name='Rocky') - o = Organization.objects.create(name='WWE', jurisdiction_id='jid') + # deduplication for people is fairly complicated, it requires a person + # to have a membership in the jurisdiction's organization and have a + # matching name. let's set that up first for the deduplication tests + p = Person.objects.create(name="Dwayne Johnson") + p.other_names.create(name="Rocky") + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") Membership.objects.create(person=p, organization=o) @@ -57,9 +63,9 @@ def test_deduplication_same_name(): create_jurisdiction() create_person() # simplest case- just the same name - person = ScrapePerson('Dwayne Johnson') + person = ScrapePerson("Dwayne Johnson") pd = person.as_dict() - PersonImporter('jid').import_data([pd]) + PersonImporter("jid").import_data([pd]) assert Person.objects.all().count() == 1 @@ -68,9 +74,9 @@ def test_deduplication_other_name_exists(): create_jurisdiction() create_person() # Rocky is already saved in other_names - person = ScrapePerson('Rocky') + person = ScrapePerson("Rocky") pd = person.as_dict() - PersonImporter('jid').import_data([pd]) + PersonImporter("jid").import_data([pd]) assert Person.objects.all().count() == 1 @@ -79,10 +85,10 @@ def test_deduplication_other_name_overlaps(): create_jurisdiction() create_person() # Person has other_name that overlaps w/ existing name - person = ScrapePerson('The Rock') - person.add_name('Dwayne Johnson') + person = ScrapePerson("The Rock") + person.add_name("Dwayne Johnson") pd = person.as_dict() - PersonImporter('jid').import_data([pd]) + PersonImporter("jid").import_data([pd]) assert Person.objects.all().count() == 1 @@ -90,10 +96,11 @@ def test_deduplication_other_name_overlaps(): def test_deduplication_no_name_overlap(): create_jurisdiction() create_person() - # make sure we're not just being ridiculous and avoiding importing anything in the same org - person = ScrapePerson('CM Punk') + # make sure we're not just being ridiculous and avoiding importing + # anything in the same org + person = ScrapePerson("CM Punk") pd = person.as_dict() - PersonImporter('jid').import_data([pd]) + PersonImporter("jid").import_data([pd]) assert Person.objects.all().count() == 2 @@ -102,9 +109,9 @@ def test_deduplication_no_jurisdiction_overlap(): create_jurisdiction() create_person() # make sure we get a new person if we're in a different org - person = ScrapePerson('Dwayne Johnson') + person = ScrapePerson("Dwayne Johnson") pd = person.as_dict() - PersonImporter('new-jurisdiction-id').import_data([pd]) + PersonImporter("new-jurisdiction-id").import_data([pd]) assert Person.objects.all().count() == 2 @@ -113,15 +120,15 @@ def test_multiple_memberships(): create_jurisdiction() # there was a bug where two or more memberships to the same jurisdiction # would cause an ORM error, this test ensures that it is fixed - p = Person.objects.create(name='Dwayne Johnson') - o = Organization.objects.create(name='WWE', jurisdiction_id='jid') + p = Person.objects.create(name="Dwayne Johnson") + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") Membership.objects.create(person=p, organization=o) - o = Organization.objects.create(name='WWF', jurisdiction_id='jid') + o = Organization.objects.create(name="WWF", jurisdiction_id="jid") Membership.objects.create(person=p, organization=o) - person = ScrapePerson('Dwayne Johnson') + person = ScrapePerson("Dwayne Johnson") pd = person.as_dict() - PersonImporter('jid').import_data([pd]) + PersonImporter("jid").import_data([pd]) # deduplication should still work assert Person.objects.all().count() == 1 @@ -130,98 +137,98 @@ def test_multiple_memberships(): @pytest.mark.django_db def test_same_name_people(): create_jurisdiction() - o = Organization.objects.create(name='WWE', jurisdiction_id='jid') + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") # importing two people with the same name to a pristine database should error - p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1') - p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2') + p1 = ScrapePerson("Dwayne Johnson", image="http://example.com/1") + p2 = ScrapePerson("Dwayne Johnson", image="http://example.com/2") with pytest.raises(SameNameError): - PersonImporter('jid').import_data([p1.as_dict(), p2.as_dict()]) + PersonImporter("jid").import_data([p1.as_dict(), p2.as_dict()]) # importing one person should pass - PersonImporter('jid').import_data([p1.as_dict()]) + PersonImporter("jid").import_data([p1.as_dict()]) # create fake memberships so that future lookups work on the imported people for p in Person.objects.all(): Membership.objects.create(person=p, organization=o) # importing another person with the same name should fail with pytest.raises(SameNameError): - PersonImporter('jid').import_data([p1.as_dict(), p2.as_dict()]) + PersonImporter("jid").import_data([p1.as_dict(), p2.as_dict()]) # adding birth dates should pass - p1.birth_date = '1970' - p2.birth_date = '1930' - resp = PersonImporter('jid').import_data([p1.as_dict(), p2.as_dict()]) - assert resp['person']['insert'] == 1 - assert resp['person']['noop'] == 0 - assert resp['person']['update'] == 1 + p1.birth_date = "1970" + p2.birth_date = "1930" + resp = PersonImporter("jid").import_data([p1.as_dict(), p2.as_dict()]) + assert resp["person"]["insert"] == 1 + assert resp["person"]["noop"] == 0 + assert resp["person"]["update"] == 1 assert Person.objects.count() == 2 # create fake memberships so that future lookups work on the imported people for p in Person.objects.all(): Membership.objects.create(person=p, organization=o) # adding a third person with the same name but without a birthday should error - p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3') + p3 = ScrapePerson("Dwayne Johnson", image="http://example.com/3") with pytest.raises(SameNameError): - PersonImporter('jid').import_data([p3.as_dict()]) + PersonImporter("jid").import_data([p3.as_dict()]) # and now test that an update works and we can insert a new one with the same name - p1.image = 'http://example.com/1.jpg' - p2.birth_date = '1931' # change birth_date, means a new insert - resp = PersonImporter('jid').import_data([p1.as_dict(), p2.as_dict()]) + p1.image = "http://example.com/1.jpg" + p2.birth_date = "1931" # change birth_date, means a new insert + resp = PersonImporter("jid").import_data([p1.as_dict(), p2.as_dict()]) assert Person.objects.count() == 3 - assert resp['person']['insert'] == 1 - assert resp['person']['noop'] == 0 - assert resp['person']['update'] == 1 + assert resp["person"]["insert"] == 1 + assert resp["person"]["noop"] == 0 + assert resp["person"]["update"] == 1 @pytest.mark.django_db def test_same_name_people_other_name(): create_jurisdiction() # ensure we're taking other_names into account for the name collision code - Organization.objects.create(name='WWE', jurisdiction_id='jid') - p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1') - p2 = ScrapePerson('Rock', image='http://example.com/2') - p2.add_name('Dwayne Johnson') + Organization.objects.create(name="WWE", jurisdiction_id="jid") + p1 = ScrapePerson("Dwayne Johnson", image="http://example.com/1") + p2 = ScrapePerson("Rock", image="http://example.com/2") + p2.add_name("Dwayne Johnson") # the people have the same name but are apparently different with pytest.raises(SameNameError): - PersonImporter('jid').import_data([p1.as_dict(), p2.as_dict()]) + PersonImporter("jid").import_data([p1.as_dict(), p2.as_dict()]) @pytest.mark.django_db def test_same_name_second_import(): create_jurisdiction() # ensure two people with the same name don't import without birthdays - o = Organization.objects.create(name='WWE', jurisdiction_id='jid') - p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1') - p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2') - p1.birth_date = '1970' - p2.birth_date = '1930' + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") + p1 = ScrapePerson("Dwayne Johnson", image="http://example.com/1") + p2 = ScrapePerson("Dwayne Johnson", image="http://example.com/2") + p1.birth_date = "1970" + p2.birth_date = "1930" # when we give them birth dates all is well though - PersonImporter('jid').import_data([p1.as_dict(), p2.as_dict()]) + PersonImporter("jid").import_data([p1.as_dict(), p2.as_dict()]) # fake some memberships so future lookups work on these people for p in Person.objects.all(): Membership.objects.create(person=p, organization=o) - p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3') + p3 = ScrapePerson("Dwayne Johnson", image="http://example.com/3") with pytest.raises(SameNameError): - PersonImporter('jid').import_data([p3.as_dict()]) + PersonImporter("jid").import_data([p3.as_dict()]) @pytest.mark.django_db def test_resolve_json_id(): create_jurisdiction() - o = Organization.objects.create(name='WWE', jurisdiction_id='jid') - p = Person.objects.create(name='Dwayne Johnson', family_name='Johnson') - p.other_names.create(name='Rock') + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") + p = Person.objects.create(name="Dwayne Johnson", family_name="Johnson") + p.other_names.create(name="Rock") p.memberships.create(organization=o) - pi = PersonImporter('jid') + pi = PersonImporter("jid") assert pi.resolve_json_id('~{"name": "Dwayne Johnson"}') == p.id assert pi.resolve_json_id('~{"name": "Rock"}') == p.id assert pi.resolve_json_id('~{"name": "Johnson"}') == p.id @@ -230,16 +237,16 @@ def test_resolve_json_id(): @pytest.mark.django_db def test_resolve_json_id_multiple_family_name(): create_jurisdiction() - o = Organization.objects.create(name='WWE', jurisdiction_id='jid') - p1 = Person.objects.create(name='Dwayne Johnson', family_name='Johnson') - p1.other_names.create(name='Rock') - p2 = Person.objects.create(name='Adam Johnson', family_name='Johnson') + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") + p1 = Person.objects.create(name="Dwayne Johnson", family_name="Johnson") + p1.other_names.create(name="Rock") + p2 = Person.objects.create(name="Adam Johnson", family_name="Johnson") for p in Person.objects.all(): Membership.objects.create(person=p, organization=o) # If there are multiple people with a family name, full name/other name # lookups should work but family name lookups should fail. - pi = PersonImporter('jid') + pi = PersonImporter("jid") assert pi.resolve_json_id('~{"name": "Dwayne Johnson"}') == p1.id assert pi.resolve_json_id('~{"name": "Adam Johnson"}') == p2.id with pytest.raises(UnresolvedIdError): diff --git a/pupa/tests/importers/test_post_importer.py b/pupa/tests/importers/test_post_importer.py index f24d1fe6..11c74a6f 100644 --- a/pupa/tests/importers/test_post_importer.py +++ b/pupa/tests/importers/test_post_importer.py @@ -6,87 +6,124 @@ def create_jurisdictions(): - Division.objects.create(id='ocd-division/country:us', name='USA') - Division.objects.create(id='ocd-division/country:us/state:nc', name='NC') - Jurisdiction.objects.create(id='us', division_id='ocd-division/country:us') - Jurisdiction.objects.create(id='nc', division_id='ocd-division/country:us/state:nc') + Division.objects.create(id="ocd-division/country:us", name="USA") + Division.objects.create(id="ocd-division/country:us/state:nc", name="NC") + Jurisdiction.objects.create(id="us", division_id="ocd-division/country:us") + Jurisdiction.objects.create(id="nc", division_id="ocd-division/country:us/state:nc") @pytest.mark.django_db def test_full_post(): create_jurisdictions() - org = Organization.objects.create(name="United States Executive Branch", - classification="executive", - jurisdiction_id="us") - post = ScrapePost(label='executive', role='President', - organization_id='~{"classification": "executive"}', - start_date=datetime.date(2015, 5, 18), - end_date='2015-05-19', - maximum_memberships=2 - ) - post.add_contact_detail(type='phone', value='555-555-1234', note='this is fake') - post.add_link('http://example.com/link') + org = Organization.objects.create( + name="United States Executive Branch", + classification="executive", + jurisdiction_id="us", + ) + post = ScrapePost( + label="executive", + role="President", + organization_id='~{"classification": "executive"}', + start_date=datetime.date(2015, 5, 18), + end_date="2015-05-19", + maximum_memberships=2, + ) + post.add_contact_detail(type="phone", value="555-555-1234", note="this is fake") + post.add_link("http://example.com/link") # import post - oi = OrganizationImporter('us') - PostImporter('jurisdiction-id', oi).import_data([post.as_dict()]) + oi = OrganizationImporter("us") + PostImporter("jurisdiction-id", oi).import_data([post.as_dict()]) print(post.as_dict()) # get person from db and assert it imported correctly p = Post.objects.get() - assert 'ocd-post' in p.id + assert "ocd-post" in p.id assert p.label == post.label assert p.role == post.role assert p.organization_id == org.id assert p.maximum_memberships == 2 - assert p.contact_details.all()[0].type == 'phone' - assert p.contact_details.all()[0].value == '555-555-1234' - assert p.contact_details.all()[0].note == 'this is fake' + assert p.contact_details.all()[0].type == "phone" + assert p.contact_details.all()[0].value == "555-555-1234" + assert p.contact_details.all()[0].note == "this is fake" - assert p.links.all()[0].url == 'http://example.com/link' + assert p.links.all()[0].url == "http://example.com/link" - assert p.start_date == '2015-05-18' - assert p.end_date == '2015-05-19' + assert p.start_date == "2015-05-18" + assert p.end_date == "2015-05-19" @pytest.mark.django_db def test_deduplication(): create_jurisdictions() - Organization.objects.create(id='us', name="United States Executive Branch", - classification="executive", jurisdiction_id="us") - Organization.objects.create(id='nc', name="North Carolina Executive Branch", - classification="executive", jurisdiction_id="nc") - pres = ScrapePost(label='executive', role='President', - organization_id='~{"classification": "executive"}') - vp = ScrapePost(label='vice-executive', role='Vice President', - organization_id='~{"classification": "executive"}') - gov = ScrapePost(label='executive', role='Governor', - organization_id='~{"classification": "executive"}') + Organization.objects.create( + id="us", + name="United States Executive Branch", + classification="executive", + jurisdiction_id="us", + ) + Organization.objects.create( + id="nc", + name="North Carolina Executive Branch", + classification="executive", + jurisdiction_id="nc", + ) + pres = ScrapePost( + label="executive", + role="President", + organization_id='~{"classification": "executive"}', + ) + vp = ScrapePost( + label="vice-executive", + role="Vice President", + organization_id='~{"classification": "executive"}', + ) + gov = ScrapePost( + label="executive", + role="Governor", + organization_id='~{"classification": "executive"}', + ) # ensure pres, vp and gov are all imported # pres & gov - same label, different jurisdiction # vp & pres - same jurisdiction, different label - us_oi = OrganizationImporter('us') - nc_oi = OrganizationImporter('nc') - PostImporter('us', us_oi).import_data([pres.as_dict(), vp.as_dict()]) - PostImporter('nc', nc_oi).import_data([gov.as_dict()]) + us_oi = OrganizationImporter("us") + nc_oi = OrganizationImporter("nc") + PostImporter("us", us_oi).import_data([pres.as_dict(), vp.as_dict()]) + PostImporter("nc", nc_oi).import_data([gov.as_dict()]) assert Post.objects.count() == 3 @pytest.mark.django_db def test_resolve_special_json_id(): create_jurisdictions() - Organization.objects.create(id='us', name="United States Executive Branch", - classification="executive", jurisdiction_id="us") - Organization.objects.create(id='nc', name="North Carolina Executive Branch", - classification="executive", jurisdiction_id="nc") - Post.objects.create(id='pres', label='executive', role='President', organization_id='us') - Post.objects.create(id='vpres', label='vice-executive', role='Vice President', - organization_id='us') - Post.objects.create(id='gov', label='executive', role='Governor', organization_id='nc') + Organization.objects.create( + id="us", + name="United States Executive Branch", + classification="executive", + jurisdiction_id="us", + ) + Organization.objects.create( + id="nc", + name="North Carolina Executive Branch", + classification="executive", + jurisdiction_id="nc", + ) + Post.objects.create( + id="pres", label="executive", role="President", organization_id="us" + ) + Post.objects.create( + id="vpres", label="vice-executive", role="Vice President", organization_id="us" + ) + Post.objects.create( + id="gov", label="executive", role="Governor", organization_id="nc" + ) - oi = OrganizationImporter('') - assert PostImporter('us', oi).resolve_json_id('~{"label": "executive"}') == 'pres' - assert PostImporter('us', oi).resolve_json_id('~{"label": "vice-executive"}') == 'vpres' - assert PostImporter('nc', oi).resolve_json_id('~{"label": "executive"}') == 'gov' + oi = OrganizationImporter("") + assert PostImporter("us", oi).resolve_json_id('~{"label": "executive"}') == "pres" + assert ( + PostImporter("us", oi).resolve_json_id('~{"label": "vice-executive"}') + == "vpres" + ) + assert PostImporter("nc", oi).resolve_json_id('~{"label": "executive"}') == "gov" diff --git a/pupa/tests/importers/test_topsort.py b/pupa/tests/importers/test_topsort.py index 7fbd00a0..897524bb 100644 --- a/pupa/tests/importers/test_topsort.py +++ b/pupa/tests/importers/test_topsort.py @@ -214,8 +214,6 @@ def test_cycles_complex(): # with open("/home/tag/debug.dot", 'w') as fd: # fd.write(network.dot()) - assert chash(network.cycles()) == chash([ - ('B', 'C', 'B'), - ('C', 'D', 'C'), - ('A', 'B', 'D', 'A') - ]) + assert chash(network.cycles()) == chash( + [("B", "C", "B"), ("C", "D", "C"), ("A", "B", "D", "A")] + ) diff --git a/pupa/tests/importers/test_vote_event_importer.py b/pupa/tests/importers/test_vote_event_importer.py index 2cd08f44..bdb28fc6 100644 --- a/pupa/tests/importers/test_vote_event_importer.py +++ b/pupa/tests/importers/test_vote_event_importer.py @@ -1,202 +1,245 @@ import re import pytest -from pupa.scrape import (VoteEvent as ScrapeVoteEvent, Bill as ScrapeBill, Organization as - ScrapeOrganization, Person as ScrapePerson) -from pupa.importers import (VoteEventImporter, BillImporter, MembershipImporter, - OrganizationImporter, PersonImporter) -from opencivicdata.core.models import (Jurisdiction, Person, Organization, Division) -from opencivicdata.legislative.models import (VoteEvent, LegislativeSession, Bill) +from pupa.scrape import ( + VoteEvent as ScrapeVoteEvent, + Bill as ScrapeBill, + Organization as ScrapeOrganization, + Person as ScrapePerson, +) +from pupa.importers import ( + VoteEventImporter, + BillImporter, + MembershipImporter, + OrganizationImporter, + PersonImporter, +) +from opencivicdata.core.models import Jurisdiction, Person, Organization, Division +from opencivicdata.legislative.models import VoteEvent, LegislativeSession, Bill class DumbMockImporter(object): - """ this is a mock importer that implements a resolve_json_id that is just a pass-through """ + """this is a mock importer that implements a resolve_json_id that is + just a pass-through""" def resolve_json_id(self, json_id, allow_no_match=False): return json_id def create_jurisdiction(): - Division.objects.create(id='ocd-division/country:us', name='USA') - j = Jurisdiction.objects.create(id='jid', division_id='ocd-division/country:us') + Division.objects.create(id="ocd-division/country:us", name="USA") + j = Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us") return j @pytest.mark.django_db def test_full_vote_event(): j = create_jurisdiction() - j.legislative_sessions.create(name='1900', identifier='1900') - sp1 = ScrapePerson('John Smith', primary_org='lower') - sp2 = ScrapePerson('Adam Smith', primary_org='lower') - org = ScrapeOrganization(name='House', classification='lower') - bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org._id) - vote_event = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-01', classification='passage:bill', - result='pass', bill_chamber='lower', bill='HB 1', - organization=org._id) - vote_event.set_count('yes', 20) - vote_event.yes('John Smith') - vote_event.no('Adam Smith') - - oi = OrganizationImporter('jid') + j.legislative_sessions.create(name="1900", identifier="1900") + sp1 = ScrapePerson("John Smith", primary_org="lower") + sp2 = ScrapePerson("Adam Smith", primary_org="lower") + org = ScrapeOrganization(name="House", classification="lower") + bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", from_organization=org._id) + vote_event = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-01", + classification="passage:bill", + result="pass", + bill_chamber="lower", + bill="HB 1", + organization=org._id, + ) + vote_event.set_count("yes", 20) + vote_event.yes("John Smith") + vote_event.no("Adam Smith") + + oi = OrganizationImporter("jid") oi.import_data([org.as_dict()]) - pi = PersonImporter('jid') + pi = PersonImporter("jid") pi.import_data([sp1.as_dict(), sp2.as_dict()]) - mi = MembershipImporter('jid', pi, oi, DumbMockImporter()) + mi = MembershipImporter("jid", pi, oi, DumbMockImporter()) mi.import_data([sp1._related[0].as_dict(), sp2._related[0].as_dict()]) - bi = BillImporter('jid', oi, pi) + bi = BillImporter("jid", oi, pi) bi.import_data([bill.as_dict()]) - VoteEventImporter('jid', pi, oi, bi).import_data([vote_event.as_dict()]) + VoteEventImporter("jid", pi, oi, bi).import_data([vote_event.as_dict()]) assert VoteEvent.objects.count() == 1 ve = VoteEvent.objects.get() assert ve.legislative_session == LegislativeSession.objects.get() - assert ve.motion_classification == ['passage:bill'] + assert ve.motion_classification == ["passage:bill"] assert ve.bill == Bill.objects.get() count = ve.counts.get() - assert count.option == 'yes' + assert count.option == "yes" assert count.value == 20 votes = list(ve.votes.all()) assert len(votes) == 2 for v in ve.votes.all(): - if v.voter_name == 'John Smith': - assert v.option == 'yes' - assert v.voter == Person.objects.get(name='John Smith') + if v.voter_name == "John Smith": + assert v.option == "yes" + assert v.voter == Person.objects.get(name="John Smith") else: - assert v.option == 'no' - assert v.voter == Person.objects.get(name='Adam Smith') + assert v.option == "no" + assert v.voter == Person.objects.get(name="Adam Smith") @pytest.mark.django_db def test_vote_event_identifier_dedupe(): j = create_jurisdiction() - j.legislative_sessions.create(name='1900', identifier='1900') - Organization.objects.create(id='org-id', name='Legislature', - classification='legislature', - jurisdiction=j) - - vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013', - classification='anything', result='passed', - motion_text='a vote on something', - identifier='Roll Call No. 1') + j.legislative_sessions.create(name="1900", identifier="1900") + Organization.objects.create( + id="org-id", name="Legislature", classification="legislature", jurisdiction=j + ) + + vote_event = ScrapeVoteEvent( + legislative_session="1900", + start_date="2013", + classification="anything", + result="passed", + motion_text="a vote on something", + identifier="Roll Call No. 1", + ) dmi = DumbMockImporter() - oi = OrganizationImporter('jid') - bi = BillImporter('jid', dmi, oi) + oi = OrganizationImporter("jid") + bi = BillImporter("jid", dmi, oi) - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'insert' + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "insert" assert VoteEvent.objects.count() == 1 # same exact vote event, no changes - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'noop' + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "noop" assert VoteEvent.objects.count() == 1 # new info, update - vote_event.result = 'failed' - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'update' + vote_event.result = "failed" + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "update" assert VoteEvent.objects.count() == 1 # new bill, insert - vote_event.identifier = 'Roll Call 2' - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'insert' + vote_event.identifier = "Roll Call 2" + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "insert" assert VoteEvent.objects.count() == 2 @pytest.mark.django_db def test_vote_event_pupa_identifier_dedupe(): j = create_jurisdiction() - j.legislative_sessions.create(name='1900', identifier='1900') - Organization.objects.create(id='org-id', name='Legislature', - classification='legislature', - jurisdiction=j) - - vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013', - classification='anything', result='passed', - motion_text='a vote on something', - identifier='Roll Call No. 1') - vote_event.pupa_id = 'foo' + j.legislative_sessions.create(name="1900", identifier="1900") + Organization.objects.create( + id="org-id", name="Legislature", classification="legislature", jurisdiction=j + ) + + vote_event = ScrapeVoteEvent( + legislative_session="1900", + start_date="2013", + classification="anything", + result="passed", + motion_text="a vote on something", + identifier="Roll Call No. 1", + ) + vote_event.pupa_id = "foo" dmi = DumbMockImporter() - oi = OrganizationImporter('jid') - bi = BillImporter('jid', dmi, oi) + oi = OrganizationImporter("jid") + bi = BillImporter("jid", dmi, oi) - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'insert' + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "insert" assert VoteEvent.objects.count() == 1 # same exact vote event, no changes - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'noop' + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "noop" assert VoteEvent.objects.count() == 1 # new info, update - vote_event.result = 'failed' - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'update' + vote_event.result = "failed" + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "update" assert VoteEvent.objects.count() == 1 # new bill identifier, update - vote_event.identifier = 'First Roll Call' - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'update' + vote_event.identifier = "First Roll Call" + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "update" assert VoteEvent.objects.count() == 1 # new pupa identifier, insert - vote_event.pupa_id = 'bar' - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'insert' + vote_event.pupa_id = "bar" + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "insert" assert VoteEvent.objects.count() == 2 @pytest.mark.django_db def test_vote_event_bill_id_dedupe(): j = create_jurisdiction() - session = j.legislative_sessions.create(name='1900', identifier='1900') - org = Organization.objects.create(id='org-id', name='House', classification='lower', - jurisdiction=j) - bill = Bill.objects.create(id='bill-1', identifier='HB 1', legislative_session=session, - from_organization=org) - bill2 = Bill.objects.create(id='bill-2', identifier='HB 2', legislative_session=session, - from_organization=org) - - vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013', - classification='anything', result='passed', - motion_text='a vote on something', - bill=bill.identifier, bill_chamber='lower', - chamber='lower') + session = j.legislative_sessions.create(name="1900", identifier="1900") + org = Organization.objects.create( + id="org-id", name="House", classification="lower", jurisdiction=j + ) + bill = Bill.objects.create( + id="bill-1", + identifier="HB 1", + legislative_session=session, + from_organization=org, + ) + bill2 = Bill.objects.create( + id="bill-2", + identifier="HB 2", + legislative_session=session, + from_organization=org, + ) + + vote_event = ScrapeVoteEvent( + legislative_session="1900", + start_date="2013", + classification="anything", + result="passed", + motion_text="a vote on something", + bill=bill.identifier, + bill_chamber="lower", + chamber="lower", + ) dmi = DumbMockImporter() - oi = OrganizationImporter('jid') - bi = BillImporter('jid', dmi, oi) + oi = OrganizationImporter("jid") + bi = BillImporter("jid", dmi, oi) - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'insert' + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "insert" assert VoteEvent.objects.count() == 1 # same exact vote event, no changes - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'noop' + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "noop" assert VoteEvent.objects.count() == 1 # new info, update - vote_event.result = 'failed' - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'update' + vote_event.result = "failed" + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "update" assert VoteEvent.objects.count() == 1 # new vote event, insert - vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013', - classification='anything', result='passed', - motion_text='a vote on something', - bill=bill2.identifier, bill_chamber='lower', - chamber='lower') - _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict()) - assert what == 'insert' + vote_event = ScrapeVoteEvent( + legislative_session="1900", + start_date="2013", + classification="anything", + result="passed", + motion_text="a vote on something", + bill=bill2.identifier, + bill_chamber="lower", + chamber="lower", + ) + _, what = VoteEventImporter("jid", dmi, oi, bi).import_item(vote_event.as_dict()) + assert what == "insert" assert VoteEvent.objects.count() == 2 @@ -205,95 +248,136 @@ def test_vote_event_bill_clearing(): # ensure that we don't wind up with vote events sitting around forever on bills as # changes make it look like there are multiple vote events j = create_jurisdiction() - session = j.legislative_sessions.create(name='1900', identifier='1900') - org = Organization.objects.create(id='org-id', name='House', classification='lower', - jurisdiction=j) - bill = Bill.objects.create(id='bill-1', identifier='HB 1', legislative_session=session, - from_organization=org) - Bill.objects.create(id='bill-2', identifier='HB 2', legislative_session=session, - from_organization=org) - oi = OrganizationImporter('jid') + session = j.legislative_sessions.create(name="1900", identifier="1900") + org = Organization.objects.create( + id="org-id", name="House", classification="lower", jurisdiction=j + ) + bill = Bill.objects.create( + id="bill-1", + identifier="HB 1", + legislative_session=session, + from_organization=org, + ) + Bill.objects.create( + id="bill-2", + identifier="HB 2", + legislative_session=session, + from_organization=org, + ) + oi = OrganizationImporter("jid") dmi = DumbMockImporter() - bi = BillImporter('jid', dmi, oi) - - vote_event1 = ScrapeVoteEvent(legislative_session='1900', start_date='2013', - classification='anything', result='passed', - motion_text='a vote on somthing', # typo intentional - bill=bill.identifier, bill_chamber='lower', - chamber='lower' - ) - vote_event2 = ScrapeVoteEvent(legislative_session='1900', start_date='2013', - classification='anything', result='passed', - motion_text='a vote on something else', - bill=bill.identifier, bill_chamber='lower', - chamber='lower' - ) + bi = BillImporter("jid", dmi, oi) + + vote_event1 = ScrapeVoteEvent( + legislative_session="1900", + start_date="2013", + classification="anything", + result="passed", + motion_text="a vote on somthing", # typo intentional + bill=bill.identifier, + bill_chamber="lower", + chamber="lower", + ) + vote_event2 = ScrapeVoteEvent( + legislative_session="1900", + start_date="2013", + classification="anything", + result="passed", + motion_text="a vote on something else", + bill=bill.identifier, + bill_chamber="lower", + chamber="lower", + ) # have to use import_data so postimport is called - VoteEventImporter('jid', dmi, oi, bi).import_data([ - vote_event1.as_dict(), - vote_event2.as_dict() - ]) + VoteEventImporter("jid", dmi, oi, bi).import_data( + [vote_event1.as_dict(), vote_event2.as_dict()] + ) assert VoteEvent.objects.count() == 2 # a typo is fixed, we don't want 3 vote events now - vote_event1.motion_text = 'a vote on something' - VoteEventImporter('jid', dmi, oi, bi).import_data([ - vote_event1.as_dict(), - vote_event2.as_dict() - ]) + vote_event1.motion_text = "a vote on something" + VoteEventImporter("jid", dmi, oi, bi).import_data( + [vote_event1.as_dict(), vote_event2.as_dict()] + ) assert VoteEvent.objects.count() == 2 @pytest.mark.django_db def test_vote_event_bill_actions(): j = create_jurisdiction() - j.legislative_sessions.create(name='1900', identifier='1900') - org1 = ScrapeOrganization(name='House', classification='lower') - org2 = ScrapeOrganization(name='Senate', classification='upper') - bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org1._id) + j.legislative_sessions.create(name="1900", identifier="1900") + org1 = ScrapeOrganization(name="House", classification="lower") + org2 = ScrapeOrganization(name="Senate", classification="upper") + bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", from_organization=org1._id) # add actions, passage of upper & lower on same day, something else, # then passage in upper again on a different day - bill.add_action(description='passage', date='1900-04-01', chamber='upper') - bill.add_action(description='passage', date='1900-04-01', chamber='lower') - bill.add_action(description='other event', date='1900-04-01', chamber='lower') - bill.add_action(description='passage', date='1900-04-02', chamber='upper') + bill.add_action(description="passage", date="1900-04-01", chamber="upper") + bill.add_action(description="passage", date="1900-04-01", chamber="lower") + bill.add_action(description="other event", date="1900-04-01", chamber="lower") + bill.add_action(description="passage", date="1900-04-02", chamber="upper") # four passage votes, one per chamber, one on 04-01, and one on 04-02 - ve1 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-01', classification='passage:bill', - result='pass', bill_chamber='lower', bill='HB 1', - bill_action='passage', - organization=org1._id) - ve2 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-01', classification='passage:bill', - result='pass', bill_chamber='lower', bill='HB 1', - bill_action='passage', - organization=org2._id) - ve3 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-02', classification='passage:bill', - result='pass', bill_chamber='lower', bill='HB 1', - bill_action='passage', - organization=org1._id) - ve4 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-02', classification='passage:bill', - result='pass', bill_chamber='lower', bill='HB 1', - bill_action='passage', - organization=org2._id) - - oi = OrganizationImporter('jid') + ve1 = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-01", + classification="passage:bill", + result="pass", + bill_chamber="lower", + bill="HB 1", + bill_action="passage", + organization=org1._id, + ) + ve2 = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-01", + classification="passage:bill", + result="pass", + bill_chamber="lower", + bill="HB 1", + bill_action="passage", + organization=org2._id, + ) + ve3 = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-02", + classification="passage:bill", + result="pass", + bill_chamber="lower", + bill="HB 1", + bill_action="passage", + organization=org1._id, + ) + ve4 = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-02", + classification="passage:bill", + result="pass", + bill_chamber="lower", + bill="HB 1", + bill_action="passage", + organization=org2._id, + ) + + oi = OrganizationImporter("jid") oi.import_data([org1.as_dict(), org2.as_dict()]) - bi = BillImporter('jid', oi, DumbMockImporter()) + bi = BillImporter("jid", oi, DumbMockImporter()) bi.import_data([bill.as_dict()]) - VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([ - ve1.as_dict(), - ve2.as_dict(), - ve3.as_dict(), - ve4.as_dict(), - ]) + VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data( + [ + ve1.as_dict(), + ve2.as_dict(), + ve3.as_dict(), + ve4.as_dict(), + ] + ) bill = Bill.objects.get() votes = list(VoteEvent.objects.all()) @@ -301,63 +385,80 @@ def test_vote_event_bill_actions(): assert len(actions) == 4 assert len(votes) == 4 - votes = {(v.organization.classification, v.start_date): v.bill_action - for v in votes} + votes = { + (v.organization.classification, v.start_date): v.bill_action for v in votes + } # ensure that votes are matched using action, chamber, and date - assert votes[('upper', '1900-04-01')] == actions[0] - assert votes[('lower', '1900-04-01')] == actions[1] - assert votes[('upper', '1900-04-02')] == actions[3] - assert votes[('lower', '1900-04-02')] is None + assert votes[("upper", "1900-04-01")] == actions[0] + assert votes[("lower", "1900-04-01")] == actions[1] + assert votes[("upper", "1900-04-02")] == actions[3] + assert votes[("lower", "1900-04-02")] is None @pytest.mark.django_db def test_vote_event_bill_actions_two_stage(): # this test is very similar to what we're testing in test_vote_event_bill_actions w/ # ve3 and ve4, that two bills that reference the same action won't conflict w/ the - # OneToOneField, but in this case we do it in two stages so that the conflict is found - # even if the votes weren't in the same scrape + # OneToOneField, but in this case we do it in two stages so that the conflict is + # found even if the votes weren't in the same scrape j = create_jurisdiction() - j.legislative_sessions.create(name='1900', identifier='1900') - org1 = ScrapeOrganization(name='House', classification='lower') - bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org1._id) - - bill.add_action(description='passage', date='1900-04-02', chamber='lower') - - ve1 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-02', classification='passage:bill', - result='pass', bill_chamber='lower', bill='HB 1', - bill_action='passage', - organization=org1._id) - ve2 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-02', classification='passage:bill', - result='pass', bill_chamber='lower', bill='HB 1', - bill_action='passage', - organization=org1._id) + j.legislative_sessions.create(name="1900", identifier="1900") + org1 = ScrapeOrganization(name="House", classification="lower") + bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", from_organization=org1._id) + + bill.add_action(description="passage", date="1900-04-02", chamber="lower") + + ve1 = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-02", + classification="passage:bill", + result="pass", + bill_chamber="lower", + bill="HB 1", + bill_action="passage", + organization=org1._id, + ) + ve2 = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-02", + classification="passage:bill", + result="pass", + bill_chamber="lower", + bill="HB 1", + bill_action="passage", + organization=org1._id, + ) # disambiguate them - ve1.pupa_id = 'one' - ve2.pupa_id = 'two' + ve1.pupa_id = "one" + ve2.pupa_id = "two" - oi = OrganizationImporter('jid') + oi = OrganizationImporter("jid") oi.import_data([org1.as_dict()]) - bi = BillImporter('jid', oi, DumbMockImporter()) + bi = BillImporter("jid", oi, DumbMockImporter()) bi.import_data([bill.as_dict()]) # first imports just fine - VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([ - ve1.as_dict(), - ]) + VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data( + [ + ve1.as_dict(), + ] + ) votes = list(VoteEvent.objects.all()) assert len(votes) == 1 assert votes[0].bill_action is not None # when second is imported, ensure that action stays pinned to first just as it would # have if they were both in same import - VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([ - ve1.as_dict(), - ve2.as_dict(), - ]) + VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data( + [ + ve1.as_dict(), + ve2.as_dict(), + ] + ) votes = list(VoteEvent.objects.all()) assert len(votes) == 2 assert votes[0].bill_action is not None @@ -367,56 +468,82 @@ def test_vote_event_bill_actions_two_stage(): @pytest.mark.django_db def test_vote_event_bill_actions_errors(): j = create_jurisdiction() - j.legislative_sessions.create(name='1900', identifier='1900') - org1 = ScrapeOrganization(name='House', classification='lower') - org2 = ScrapeOrganization(name='Senate', classification='upper') - bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org1._id) + j.legislative_sessions.create(name="1900", identifier="1900") + org1 = ScrapeOrganization(name="House", classification="lower") + org2 = ScrapeOrganization(name="Senate", classification="upper") + bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", from_organization=org1._id) # for this bill, two identical actions, so vote matching will fail - bill.add_action(description='passage', date='1900-04-01', chamber='lower') - bill.add_action(description='passage', date='1900-04-01', chamber='lower') + bill.add_action(description="passage", date="1900-04-01", chamber="lower") + bill.add_action(description="passage", date="1900-04-01", chamber="lower") # this action is good, but two votes will try to match it - bill.add_action(description='passage', date='1900-04-02', chamber='lower') + bill.add_action(description="passage", date="1900-04-02", chamber="lower") # will match two actions - ve1 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-01', classification='passage:bill', - result='pass', bill_chamber='lower', bill='HB 1', - identifier='1', - bill_action='passage', - organization=org1._id) + ve1 = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-01", + classification="passage:bill", + result="pass", + bill_chamber="lower", + bill="HB 1", + identifier="1", + bill_action="passage", + organization=org1._id, + ) # will match no actions - ve2 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-01', classification='passage:bill', - result='pass', bill_chamber='lower', bill='HB 1', - identifier='2', - bill_action='committee result', - organization=org1._id) + ve2 = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-01", + classification="passage:bill", + result="pass", + bill_chamber="lower", + bill="HB 1", + identifier="2", + bill_action="committee result", + organization=org1._id, + ) # these two votes will both match the same action - ve3 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-02', classification='passage:bill', - result='pass', bill_chamber='lower', bill='HB 1', - identifier='3', - bill_action='passage', - organization=org1._id) - ve4 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage-syz', - start_date='1900-04-02', classification='passage:bill', - result='fail', bill_chamber='lower', bill='HB 1', - identifier='4', - bill_action='passage', - organization=org1._id) - - oi = OrganizationImporter('jid') + ve3 = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-02", + classification="passage:bill", + result="pass", + bill_chamber="lower", + bill="HB 1", + identifier="3", + bill_action="passage", + organization=org1._id, + ) + ve4 = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage-syz", + start_date="1900-04-02", + classification="passage:bill", + result="fail", + bill_chamber="lower", + bill="HB 1", + identifier="4", + bill_action="passage", + organization=org1._id, + ) + + oi = OrganizationImporter("jid") oi.import_data([org1.as_dict(), org2.as_dict()]) - bi = BillImporter('jid', oi, DumbMockImporter()) + bi = BillImporter("jid", oi, DumbMockImporter()) bi.import_data([bill.as_dict()]) - VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([ - ve1.as_dict(), - ve2.as_dict(), - ve3.as_dict(), - ve4.as_dict(), - ]) + VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data( + [ + ve1.as_dict(), + ve2.as_dict(), + ve3.as_dict(), + ve4.as_dict(), + ] + ) bill = Bill.objects.get() votes = list(VoteEvent.objects.all()) @@ -434,36 +561,46 @@ def test_vote_event_bill_actions_errors(): @pytest.mark.django_db def test_fix_bill_id(): j = create_jurisdiction() - j.legislative_sessions.create(name='1900', identifier='1900') + j.legislative_sessions.create(name="1900", identifier="1900") - org1 = ScrapeOrganization(name='House', classification='lower') - bill = ScrapeBill('HB 1', '1900', 'Test Bill ID', - classification='bill', chamber='lower') + org1 = ScrapeOrganization(name="House", classification="lower") + bill = ScrapeBill( + "HB 1", "1900", "Test Bill ID", classification="bill", chamber="lower" + ) - oi = OrganizationImporter('jid') + oi = OrganizationImporter("jid") oi.import_data([org1.as_dict()]) from pupa.settings import IMPORT_TRANSFORMERS - IMPORT_TRANSFORMERS['bill'] = { - 'identifier': lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, 1) + + IMPORT_TRANSFORMERS["bill"] = { + "identifier": lambda x: re.sub(r"([A-Z]*)\s*0*([-\d]+)", r"\1 \2", x, 1) } - bi = BillImporter('jid', oi, DumbMockImporter()) + bi = BillImporter("jid", oi, DumbMockImporter()) bi.import_data([bill.as_dict()]) - ve = ScrapeVoteEvent(legislative_session='1900', motion_text='passage', - start_date='1900-04-02', classification='passage:bill', - result='fail', bill_chamber='lower', bill='HB1', - identifier='4', - bill_action='passage', - organization=org1._id) - - VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([ - ve.as_dict(), - ]) - - IMPORT_TRANSFORMERS['bill'] = {} + ve = ScrapeVoteEvent( + legislative_session="1900", + motion_text="passage", + start_date="1900-04-02", + classification="passage:bill", + result="fail", + bill_chamber="lower", + bill="HB1", + identifier="4", + bill_action="passage", + organization=org1._id, + ) + + VoteEventImporter("jid", DumbMockImporter(), oi, bi).import_data( + [ + ve.as_dict(), + ] + ) + + IMPORT_TRANSFORMERS["bill"] = {} ve = VoteEvent.objects.get() - ve.bill.identifier == 'HB 1' + ve.bill.identifier == "HB 1" diff --git a/pupa/tests/reports/test_session_report.py b/pupa/tests/reports/test_session_report.py index 0e42108d..f7833cb5 100644 --- a/pupa/tests/reports/test_session_report.py +++ b/pupa/tests/reports/test_session_report.py @@ -6,25 +6,29 @@ def create_data(): - Division.objects.create(id='ocd-division/country:us', name='USA') - j = Jurisdiction.objects.create(id='jid', division_id='ocd-division/country:us') - org = Organization.objects.create(jurisdiction=j, name='House', classification='lower') - person = Person.objects.create(name='Roy') - j.legislative_sessions.create(identifier='1899', name='1899') - session = j.legislative_sessions.create(identifier='1900', name='1900').id + Division.objects.create(id="ocd-division/country:us", name="USA") + j = Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us") + org = Organization.objects.create( + jurisdiction=j, name="House", classification="lower" + ) + person = Person.objects.create(name="Roy") + j.legislative_sessions.create(identifier="1899", name="1899") + session = j.legislative_sessions.create(identifier="1900", name="1900").id return session, org, person @pytest.mark.django_db def test_bills_missing_actions(): session, org, person = create_data() - Bill.objects.create(identifier='HB1', title='One', legislative_session_id=session) - b = Bill.objects.create(identifier='HB2', title='Two', legislative_session_id=session) + Bill.objects.create(identifier="HB1", title="One", legislative_session_id=session) + b = Bill.objects.create( + identifier="HB2", title="Two", legislative_session_id=session + ) report = generate_session_report(session) assert report.bills_missing_actions == 2 - b.actions.create(description='Introduced', order=1, organization=org) + b.actions.create(description="Introduced", order=1, organization=org) report = generate_session_report(session) assert report.bills_missing_actions == 1 @@ -32,13 +36,15 @@ def test_bills_missing_actions(): @pytest.mark.django_db def test_bills_missing_sponsors(): session, org, person = create_data() - Bill.objects.create(identifier='HB1', title='One', legislative_session_id=session) - b = Bill.objects.create(identifier='HB2', title='Two', legislative_session_id=session) + Bill.objects.create(identifier="HB1", title="One", legislative_session_id=session) + b = Bill.objects.create( + identifier="HB2", title="Two", legislative_session_id=session + ) report = generate_session_report(session) assert report.bills_missing_sponsors == 2 - b.sponsorships.create(name='Roy', entity_type='person') + b.sponsorships.create(name="Roy", entity_type="person") report = generate_session_report(session) assert report.bills_missing_sponsors == 1 @@ -46,13 +52,15 @@ def test_bills_missing_sponsors(): @pytest.mark.django_db def test_bills_missing_versions(): session, org, person = create_data() - Bill.objects.create(identifier='HB1', title='One', legislative_session_id=session) - b = Bill.objects.create(identifier='HB2', title='Two', legislative_session_id=session) + Bill.objects.create(identifier="HB1", title="One", legislative_session_id=session) + b = Bill.objects.create( + identifier="HB2", title="Two", legislative_session_id=session + ) report = generate_session_report(session) assert report.bills_missing_versions == 2 - b.versions.create(note='Final Copy') + b.versions.create(note="Final Copy") report = generate_session_report(session) assert report.bills_missing_versions == 1 @@ -60,11 +68,15 @@ def test_bills_missing_versions(): @pytest.mark.django_db def test_votes_missing_bill(): session, org, person = create_data() - b = Bill.objects.create(identifier='HB2', title='Two', legislative_session_id=session) - v = VoteEvent.objects.create(legislative_session_id=session, motion_text='Passage', - organization=org) - VoteEvent.objects.create(legislative_session_id=session, motion_text='Amendment', - organization=org) + b = Bill.objects.create( + identifier="HB2", title="Two", legislative_session_id=session + ) + v = VoteEvent.objects.create( + legislative_session_id=session, motion_text="Passage", organization=org + ) + VoteEvent.objects.create( + legislative_session_id=session, motion_text="Amendment", organization=org + ) report = generate_session_report(session) assert report.votes_missing_bill == 2 @@ -78,16 +90,23 @@ def test_votes_missing_bill(): @pytest.mark.django_db def test_votes_missing_voters(): session, org, person = create_data() - b = Bill.objects.create(identifier='HB2', title='Two', legislative_session_id=session) - v = VoteEvent.objects.create(legislative_session_id=session, motion_text='Passage', bill=b, - organization=org) - VoteEvent.objects.create(legislative_session_id=session, motion_text='Amendment', bill=b, - organization=org) + b = Bill.objects.create( + identifier="HB2", title="Two", legislative_session_id=session + ) + v = VoteEvent.objects.create( + legislative_session_id=session, motion_text="Passage", bill=b, organization=org + ) + VoteEvent.objects.create( + legislative_session_id=session, + motion_text="Amendment", + bill=b, + organization=org, + ) report = generate_session_report(session) assert report.votes_missing_voters == 2 - v.votes.create(option='yes', voter_name='Speaker') + v.votes.create(option="yes", voter_name="Speaker") report = generate_session_report(session) assert report.votes_missing_voters == 1 @@ -95,22 +114,29 @@ def test_votes_missing_voters(): @pytest.mark.django_db def test_missing_yes_no_counts(): session, org, person = create_data() - b = Bill.objects.create(identifier='HB2', title='Two', legislative_session_id=session) - v = VoteEvent.objects.create(legislative_session_id=session, motion_text='Passage', bill=b, - organization=org) - VoteEvent.objects.create(legislative_session_id=session, motion_text='Amendment', bill=b, - organization=org) + b = Bill.objects.create( + identifier="HB2", title="Two", legislative_session_id=session + ) + v = VoteEvent.objects.create( + legislative_session_id=session, motion_text="Passage", bill=b, organization=org + ) + VoteEvent.objects.create( + legislative_session_id=session, + motion_text="Amendment", + bill=b, + organization=org, + ) report = generate_session_report(session) assert report.votes_missing_yes_count == 2 assert report.votes_missing_no_count == 2 - v.counts.create(option='yes', value=1) + v.counts.create(option="yes", value=1) report = generate_session_report(session) assert report.votes_missing_yes_count == 1 assert report.votes_missing_no_count == 2 - v.counts.create(option='no', value=0) + v.counts.create(option="no", value=0) report = generate_session_report(session) assert report.votes_missing_yes_count == 1 assert report.votes_missing_no_count == 1 @@ -120,35 +146,38 @@ def test_missing_yes_no_counts(): @pytest.mark.django_db def test_votes_with_bad_counts(): session, org, person = create_data() - b = Bill.objects.create(identifier='HB2', title='Two', legislative_session_id=session) - v = VoteEvent.objects.create(legislative_session_id=session, motion_text='Passage', bill=b, - organization=org) + b = Bill.objects.create( + identifier="HB2", title="Two", legislative_session_id=session + ) + v = VoteEvent.objects.create( + legislative_session_id=session, motion_text="Passage", bill=b, organization=org + ) report = generate_session_report(session) assert report.votes_with_bad_counts == 0 # add count, breaking - v.counts.create(option='yes', value=1) + v.counts.create(option="yes", value=1) report = generate_session_report(session) assert report.votes_with_bad_counts == 1 # add voter, fixing - v.votes.create(option='yes', voter_name='One') + v.votes.create(option="yes", voter_name="One") report = generate_session_report(session) assert report.votes_with_bad_counts == 0 # add voter, breaking - v.votes.create(option='no', voter_name='Two') + v.votes.create(option="no", voter_name="Two") report = generate_session_report(session) assert report.votes_with_bad_counts == 1 # add count, still not equal - v.counts.create(option='no', value=2) + v.counts.create(option="no", value=2) report = generate_session_report(session) assert report.votes_with_bad_counts == 1 # add voter, fixing - v.votes.create(option='no', voter_name='Three') + v.votes.create(option="no", voter_name="Three") report = generate_session_report(session) assert report.votes_with_bad_counts == 0 @@ -156,53 +185,61 @@ def test_votes_with_bad_counts(): @pytest.mark.django_db def test_unmatched_sponsors(): session, org, person = create_data() - b1 = Bill.objects.create(identifier='HB1', title='One', legislative_session_id=session) - b2 = Bill.objects.create(identifier='HB2', title='Two', legislative_session_id=session) + b1 = Bill.objects.create( + identifier="HB1", title="One", legislative_session_id=session + ) + b2 = Bill.objects.create( + identifier="HB2", title="Two", legislative_session_id=session + ) - b1.sponsorships.create(name='Roy', entity_type='person') - b1.sponsorships.create(name='Wendy', entity_type='person') - b1.sponsorships.create(name='Committee On Legislation', entity_type='organization') + b1.sponsorships.create(name="Roy", entity_type="person") + b1.sponsorships.create(name="Wendy", entity_type="person") + b1.sponsorships.create(name="Committee On Legislation", entity_type="organization") - b2.sponsorships.create(name='Wendy', entity_type='person') + b2.sponsorships.create(name="Wendy", entity_type="person") report = generate_session_report(session) assert len(report.unmatched_sponsor_people) == 2 - assert report.unmatched_sponsor_people['Roy'] == 1 - assert report.unmatched_sponsor_people['Wendy'] == 2 - assert report.unmatched_sponsor_organizations == {'Committee On Legislation': 1} + assert report.unmatched_sponsor_people["Roy"] == 1 + assert report.unmatched_sponsor_people["Wendy"] == 2 + assert report.unmatched_sponsor_organizations == {"Committee On Legislation": 1} # ensure that Roy goes away when linked - sp = b1.sponsorships.get(name='Roy') + sp = b1.sponsorships.get(name="Roy") sp.person_id = person.id sp.save() report = generate_session_report(session) - assert report.unmatched_sponsor_people == {'Wendy': 2} + assert report.unmatched_sponsor_people == {"Wendy": 2} @pytest.mark.django_db def test_unmatched_voters(): session, org, person = create_data() - b = Bill.objects.create(identifier='HB2', title='Two', legislative_session_id=session) - v1 = VoteEvent.objects.create(legislative_session_id=session, motion_text='Passage', bill=b, - organization=org) - v2 = VoteEvent.objects.create(legislative_session_id=session, motion_text='Override', bill=b, - organization=org) + b = Bill.objects.create( + identifier="HB2", title="Two", legislative_session_id=session + ) + v1 = VoteEvent.objects.create( + legislative_session_id=session, motion_text="Passage", bill=b, organization=org + ) + v2 = VoteEvent.objects.create( + legislative_session_id=session, motion_text="Override", bill=b, organization=org + ) report = generate_session_report(session) assert report.unmatched_voters == {} # add voters - v1.votes.create(option='yes', voter_name='Roy') - v1.votes.create(option='yes', voter_name='Wendy') - v2.votes.create(option='yes', voter_name='Wendy') + v1.votes.create(option="yes", voter_name="Roy") + v1.votes.create(option="yes", voter_name="Wendy") + v2.votes.create(option="yes", voter_name="Wendy") report = generate_session_report(session) assert len(report.unmatched_voters) == 2 - assert report.unmatched_voters['Roy'] == 1 - assert report.unmatched_voters['Wendy'] == 2 + assert report.unmatched_voters["Roy"] == 1 + assert report.unmatched_voters["Wendy"] == 2 # ensure that Roy goes away when linked - voter = v1.votes.get(voter_name='Roy') + voter = v1.votes.get(voter_name="Roy") voter.voter_id = person.id voter.save() report = generate_session_report(session) - assert report.unmatched_voters == {'Wendy': 2} + assert report.unmatched_voters == {"Wendy": 2} diff --git a/pupa/tests/scrape/test_bill_scrape.py b/pupa/tests/scrape/test_bill_scrape.py index f01b2269..e7728c9c 100644 --- a/pupa/tests/scrape/test_bill_scrape.py +++ b/pupa/tests/scrape/test_bill_scrape.py @@ -5,9 +5,13 @@ def toy_bill(): - b = Bill(identifier="HB 2017", legislative_session="2012A", - title="A bill for an act to raise the cookie budget by 200%", - from_organization="Foo Senate", classification="bill") + b = Bill( + identifier="HB 2017", + legislative_session="2012A", + title="A bill for an act to raise the cookie budget by 200%", + from_organization="Foo Senate", + classification="bill", + ) b.add_source("http://uri.example.com/", note="foo") return b @@ -15,7 +19,7 @@ def toy_bill(): def test_basic_valid_bill(): b = toy_bill() b.validate() - assert 'we got here' + assert "we got here" def test_bill_type_setting(): @@ -24,23 +28,35 @@ def test_bill_type_setting(): assert b.classification == ["bill"] # string -> list - b = Bill(identifier="some bill", legislative_session="session", title="the title", - classification="string") + b = Bill( + identifier="some bill", + legislative_session="session", + title="the title", + classification="string", + ) assert b.classification == ["string"] # list unmodified - b = Bill(identifier="some bill", legislative_session="session", title="the title", - classification=["two", "items"]) + b = Bill( + identifier="some bill", + legislative_session="session", + title="the title", + classification=["two", "items"], + ) assert b.classification == ["two", "items"] # tuple -> list - b = Bill(identifier="some bill", legislative_session="session", title="the title", - classification=("two", "items")) + b = Bill( + identifier="some bill", + legislative_session="session", + title="the title", + classification=("two", "items"), + ) assert b.classification == ["two", "items"] def test_basic_invalid_bill(): - """ Test that we can create an invalid bill, and validation will fail """ + """Test that we can create an invalid bill, and validation will fail""" b = toy_bill() b.identifier = None with pytest.raises(ValueError): @@ -49,57 +65,79 @@ def test_basic_invalid_bill(): def test_from_organization(): # none set - assert ((get_pseudo_id(Bill('HB 1', '2014', 'Some Bill').from_organization) == - {'classification': 'legislature'})) + assert get_pseudo_id(Bill("HB 1", "2014", "Some Bill").from_organization) == { + "classification": "legislature" + } # chamber set - assert (get_pseudo_id(Bill('SB 1', '2014', 'Some Bill', chamber='upper').from_organization) == - {'classification': 'upper'}) + assert get_pseudo_id( + Bill("SB 1", "2014", "Some Bill", chamber="upper").from_organization + ) == {"classification": "upper"} # org direct set - assert Bill('HB 1', '2014', 'Some Bill', from_organization='test').from_organization == 'test' + assert ( + Bill("HB 1", "2014", "Some Bill", from_organization="test").from_organization + == "test" + ) # can't set both with pytest.raises(ValueError): - Bill('HB 1', '2014', 'Some Bill', from_organization='upper', chamber='upper') + Bill("HB 1", "2014", "Some Bill", from_organization="upper", chamber="upper") def test_add_action(): - """ Make sure actions work """ + """Make sure actions work""" b = toy_bill() - b.add_action("Some dude liked it.", "2013-04-29T20:00Z", chamber='lower') + b.add_action("Some dude liked it.", "2013-04-29T20:00Z", chamber="lower") assert len(b.actions) == 1 - assert b.actions[0]['description'] == 'Some dude liked it.' - assert get_pseudo_id(b.actions[0]['organization_id']) == {'classification': 'lower'} - assert b.actions[0]['date'] == '2013-04-29T20:00Z' + assert b.actions[0]["description"] == "Some dude liked it." + assert get_pseudo_id(b.actions[0]["organization_id"]) == {"classification": "lower"} + assert b.actions[0]["date"] == "2013-04-29T20:00Z" b.validate() def test_action_extra(): b = toy_bill() - b.add_action("an action with some extra information", '2017-01-01', - extras=dict(sitting_chair='Adams')) - assert b.actions[0]['extras'] == {'sitting_chair': 'Adams'} + b.add_action( + "an action with some extra information", + "2017-01-01", + extras=dict(sitting_chair="Adams"), + ) + assert b.actions[0]["extras"] == {"sitting_chair": "Adams"} def test_add_related_bill(): - """ Make sure related bills work """ + """Make sure related bills work""" b = toy_bill() - b.add_related_bill(identifier="HB 2020", legislative_session="2011A", - relation_type="companion") + b.add_related_bill( + identifier="HB 2020", legislative_session="2011A", relation_type="companion" + ) assert len(b.related_bills) == 1 - assert b.related_bills[0] == {'identifier': 'HB 2020', 'legislative_session': '2011A', - 'relation_type': 'companion'} + assert b.related_bills[0] == { + "identifier": "HB 2020", + "legislative_session": "2011A", + "relation_type": "companion", + } b.validate() def test_add_sponsor(): b = toy_bill() - b.add_sponsorship(name="Joe Bleu", classification="Author", entity_type="person", - primary=True, chamber="upper") + b.add_sponsorship( + name="Joe Bleu", + classification="Author", + entity_type="person", + primary=True, + chamber="upper", + ) assert len(b.sponsorships) == 1 - assert b.sponsorships[0] == {'person_id': '~{"name": "Joe Bleu"}', 'name': 'Joe Bleu', - 'classification': 'Author', 'entity_type': 'person', - 'primary': True, 'organization_id': None} + assert b.sponsorships[0] == { + "person_id": '~{"name": "Joe Bleu"}', + "name": "Joe Bleu", + "classification": "Author", + "entity_type": "person", + "primary": True, + "organization_id": None, + } b.validate() @@ -107,38 +145,50 @@ def test_subjects(): b = toy_bill() b.add_subject("Foo") b.add_subject("Bar") - assert b.subject == ['Foo', 'Bar'] + assert b.subject == ["Foo", "Bar"] b.validate() def test_abstract(): b = toy_bill() - b.add_abstract('this bill is stupid', 'K-5', '1969-10-20') - b.add_abstract('this legislative document is ignorant', '6-12', '2010-10-10') - assert b.abstracts == [{'note': 'K-5', 'abstract': 'this bill is stupid', - 'date': '1969-10-20'}, - {'note': '6-12', 'abstract': 'this legislative document is ignorant', - 'date': '2010-10-10'}] + b.add_abstract("this bill is stupid", "K-5", "1969-10-20") + b.add_abstract("this legislative document is ignorant", "6-12", "2010-10-10") + assert b.abstracts == [ + {"note": "K-5", "abstract": "this bill is stupid", "date": "1969-10-20"}, + { + "note": "6-12", + "abstract": "this legislative document is ignorant", + "date": "2010-10-10", + }, + ] def test_add_documents(): b = toy_bill() # should only add one document since they all have same note - b.add_document_link(note="Fiscal Impact", date="2013-04", url="http://hi.example.com/foo#bar", - media_type="text/html") - b.add_document_link(note="Fiscal Impact", date="2013-04", url='http://foobar.baz') + b.add_document_link( + note="Fiscal Impact", + date="2013-04", + url="http://hi.example.com/foo#bar", + media_type="text/html", + ) + b.add_document_link(note="Fiscal Impact", date="2013-04", url="http://foobar.baz") assert len(b.documents) == 1 # should now be two documents - b.add_document_link(note="Other Document", date="2013-04", url='http://foobar.baz/other') + b.add_document_link( + note="Other Document", date="2013-04", url="http://foobar.baz/other" + ) assert len(b.documents) == 2 # valid documents so far b.validate() # an invalid document - b.add_document_link(note="Fiscal Impact", date="2013-04", url=None, media_type='foo') + b.add_document_link( + note="Fiscal Impact", date="2013-04", url=None, media_type="foo" + ) with pytest.raises(ScrapeValueError): b.validate() @@ -151,28 +201,38 @@ def test_versions(): b.add_version_link(url="http://pault.ag/foo", note="Final Version", date="2013-04") b.validate() assert len(b.versions) == 1 - assert len(b.versions[0]['links']) == 2 + assert len(b.versions[0]["links"]) == 2 # duplicate! with pytest.raises(ValueError): - b.add_version_link(url="http://pault.ag/foo", note="Final Version", date="2013-04") + b.add_version_link( + url="http://pault.ag/foo", note="Final Version", date="2013-04" + ) # ignore duplicate - nothing should change - b.add_version_link(url="http://pault.ag/foo", note="Final Version", date="2013-04", - on_duplicate='ignore') + b.add_version_link( + url="http://pault.ag/foo", + note="Final Version", + date="2013-04", + on_duplicate="ignore", + ) assert len(b.versions) == 1 - assert len(b.versions[0]['links']) == 2 + assert len(b.versions[0]["links"]) == 2 # duplicate URL with pytest.raises(ValueError): - b.add_version_link(url="http://pault.ag/foo", note="Finals Versions", date="2013-04") + b.add_version_link( + url="http://pault.ag/foo", note="Finals Versions", date="2013-04" + ) assert len(b.versions) == 1 - assert len(b.versions[0]['links']) == 2 + assert len(b.versions[0]["links"]) == 2 # a new doc, numbers go up - b.add_version_link(url="http://pault.ag/foovbar", note="Finals Versions", date="2013-04") + b.add_version_link( + url="http://pault.ag/foovbar", note="Finals Versions", date="2013-04" + ) assert len(b.versions) == 2 - assert len(b.versions[1]['links']) == 1 + assert len(b.versions[1]["links"]) == 1 # still validates b.validate() @@ -184,9 +244,13 @@ def test_str(): def test_no_whitespace_in_uri(): - b = Bill(identifier="HB 2017", legislative_session="2012A", - title="A bill for an act to raise the cookie budget by 200%", - from_organization="Foo Senate", classification="bill") + b = Bill( + identifier="HB 2017", + legislative_session="2012A", + title="A bill for an act to raise the cookie budget by 200%", + from_organization="Foo Senate", + classification="bill", + ) b.add_source("http://uri.example.com/fail here", note="foo") with pytest.raises(ScrapeValueError): b.validate() diff --git a/pupa/tests/scrape/test_event_scrape.py b/pupa/tests/scrape/test_event_scrape.py index 468632cf..16a107a6 100644 --- a/pupa/tests/scrape/test_event_scrape.py +++ b/pupa/tests/scrape/test_event_scrape.py @@ -6,10 +6,10 @@ def event_obj(): e = Event( name="get-together", - start_date=datetime.datetime.utcnow().isoformat().split('.')[0] + 'Z', + start_date=datetime.datetime.utcnow().isoformat().split(".")[0] + "Z", location_name="Joe's Place", ) - e.add_source(url='http://example.com/foobar') + e.add_source(url="http://example.com/foobar") return e @@ -21,9 +21,9 @@ def test_basic_event(): def test_no_location(): e = Event( name="get-together", - start_date=datetime.datetime.utcnow().isoformat().split('.')[0] + 'Z', + start_date=datetime.datetime.utcnow().isoformat().split(".")[0] + "Z", ) - e.add_source(url='http://example.com/foobar') + e.add_source(url="http://example.com/foobar") e.validate() @@ -43,7 +43,7 @@ def test_bad_event(): def test_basic_agenda(): e = event_obj() agenda = e.add_agenda_item("foo bar") - assert agenda['description'] == 'foo bar' + assert agenda["description"] == "foo bar" assert e.agenda[0] == agenda e.validate() @@ -51,20 +51,20 @@ def test_basic_agenda(): def test_agenda_add_person(): e = event_obj() agenda = e.add_agenda_item("foo bar") - assert agenda['related_entities'] == [] + assert agenda["related_entities"] == [] - agenda.add_person(person='John Q. Hacker', note='chair') - assert len(e.agenda[0]['related_entities']) == 1 + agenda.add_person(person="John Q. Hacker", note="chair") + assert len(e.agenda[0]["related_entities"]) == 1 e.validate() def test_agenda_add_vote_event(): e = event_obj() agenda = e.add_agenda_item("foo bar") - assert agenda['related_entities'] == [] + assert agenda["related_entities"] == [] - agenda.add_vote_event(vote_event='Roll no. 12') - assert len(e.agenda[0]['related_entities']) == 1 + agenda.add_vote_event(vote_event="Roll no. 12") + assert len(e.agenda[0]["related_entities"]) == 1 e.validate() @@ -72,10 +72,10 @@ def test_agenda_add_subject(): e = event_obj() agenda = e.add_agenda_item("foo bar") - agenda.add_subject('test') - assert e.agenda[0]['subjects'] == ['test'] - agenda.add_subject('test2') - assert e.agenda[0]['subjects'] == ['test', 'test2'] + agenda.add_subject("test") + assert e.agenda[0]["subjects"] == ["test"] + agenda.add_subject("test2") + assert e.agenda[0]["subjects"] == ["test", "test2"] e.validate() @@ -84,76 +84,82 @@ def test_agenda_add_classification(): e = event_obj() agenda = e.add_agenda_item("foo bar") - agenda.add_classification('test') - assert e.agenda[0]['classification'] == ['test'] - agenda.add_classification('test2') - assert e.agenda[0]['classification'] == ['test', 'test2'] + agenda.add_classification("test") + assert e.agenda[0]["classification"] == ["test"] + agenda.add_classification("test2") + assert e.agenda[0]["classification"] == ["test", "test2"] e.validate() def test_agenda_add_extra(): e = event_obj() - a = e.add_agenda_item('foo bar') - a['extras'] = dict(foo=1, bar=['baz']) + a = e.add_agenda_item("foo bar") + a["extras"] = dict(foo=1, bar=["baz"]) - assert e.agenda[0]['extras'] == {'foo': 1, 'bar': ['baz']} + assert e.agenda[0]["extras"] == {"foo": 1, "bar": ["baz"]} def test_add_committee(): e = event_obj() agenda = e.add_agenda_item("foo bar") - assert agenda['related_entities'] == [] + assert agenda["related_entities"] == [] - agenda.add_committee(committee='Hello, World', note='host') + agenda.add_committee(committee="Hello, World", note="host") e.validate() def test_add_bill(): e = event_obj() agenda = e.add_agenda_item("foo bar") - assert agenda['related_entities'] == [] - agenda.add_bill(bill='HB 101', note='consideration') + assert agenda["related_entities"] == [] + agenda.add_bill(bill="HB 101", note="consideration") e.validate() def test_add_document(): e = event_obj() assert e.documents == [] - e.add_document(note='hello', url='http://example.com', media_type="text/html") + e.add_document(note="hello", url="http://example.com", media_type="text/html") assert len(e.documents) == 1 o = e.documents[0] - assert o['note'] == 'hello' - assert o['links'] == [{'url': 'http://example.com', 'media_type': 'text/html', 'text': ''}] + assert o["note"] == "hello" + assert o["links"] == [ + {"url": "http://example.com", "media_type": "text/html", "text": ""} + ] e.validate() def test_participants(): e = event_obj() - e.add_participant('Committee of the Whole', type='committee', note='everyone') + e.add_participant("Committee of the Whole", type="committee", note="everyone") assert len(e.participants) == 1 - assert e.participants[0]['name'] == 'Committee of the Whole' - assert e.participants[0]['entity_type'] == 'committee' - assert e.participants[0]['note'] == 'everyone' + assert e.participants[0]["name"] == "Committee of the Whole" + assert e.participants[0]["entity_type"] == "committee" + assert e.participants[0]["note"] == "everyone" # and add_person, which is a shortcut - e.add_person('Bill Stevenson') + e.add_person("Bill Stevenson") assert len(e.participants) == 2 - assert e.participants[1]['name'] == 'Bill Stevenson' - assert e.participants[1]['entity_type'] == 'person' - assert e.participants[1]['note'] == 'participant' + assert e.participants[1]["name"] == "Bill Stevenson" + assert e.participants[1]["entity_type"] == "person" + assert e.participants[1]["note"] == "participant" def test_set_location(): e = event_obj() - e.set_location('North Pole', note='it is cold here', url='https://www.northpole.com', - coordinates={'latitude': '90.0000', 'longitude': '0.0000'}) + e.set_location( + "North Pole", + note="it is cold here", + url="https://www.northpole.com", + coordinates={"latitude": "90.0000", "longitude": "0.0000"}, + ) - assert e.location.get('name') == 'North Pole' - assert e.location.get('note') == 'it is cold here' - assert e.location.get('url') == 'https://www.northpole.com' - assert e.location.get('coordinates').get('latitude') == '90.0000' - assert e.location.get('coordinates').get('longitude') == '0.0000' + assert e.location.get("name") == "North Pole" + assert e.location.get("note") == "it is cold here" + assert e.location.get("url") == "https://www.northpole.com" + assert e.location.get("coordinates").get("latitude") == "90.0000" + assert e.location.get("coordinates").get("longitude") == "0.0000" e.validate() @@ -161,15 +167,15 @@ def test_set_location(): def test_add_media(): e = event_obj() name = "Hello, World" - a = e.add_agenda_item(description='foo') + a = e.add_agenda_item(description="foo") a.add_media_link(note=name, url="http://pault.ag", media_type="text/html") a.add_media_link(note=name, url="ftp://pault.ag", media_type="text/plain") e.validate() - assert len(e.agenda[0]['media']) == 1 - assert len(e.agenda[0]['media'][0]['links']) == 2 + assert len(e.agenda[0]["media"]) == 1 + assert len(e.agenda[0]["media"][0]["links"]) == 2 e.add_media_link(note=name, url="http://pault.ag", media_type="text/html") e.add_media_link(note=name, url="ftp://pault.ag", media_type="text/plain") e.validate() assert len(e.media) == 1 - assert len(e.media[0]['links']) == 2 + assert len(e.media[0]["links"]) == 2 diff --git a/pupa/tests/scrape/test_jurisdiction_scrape.py b/pupa/tests/scrape/test_jurisdiction_scrape.py index cc2c4352..f47eefa8 100644 --- a/pupa/tests/scrape/test_jurisdiction_scrape.py +++ b/pupa/tests/scrape/test_jurisdiction_scrape.py @@ -3,22 +3,22 @@ class FakeJurisdiction(Jurisdiction): - division_id = 'ocd-division/test' - classification = 'government' - name = 'Test' - url = 'http://example.com' + division_id = "ocd-division/test" + classification = "government" + name = "Test" + url = "http://example.com" def get_organizations(self): - parent = Organization('Congress', classification='legislature') + parent = Organization("Congress", classification="legislature") yield parent - yield Organization('House', classification='lower', parent_id=parent) - yield Organization('Senate', classification='upper', parent_id=parent) + yield Organization("House", classification="lower", parent_id=parent) + yield Organization("Senate", classification="upper", parent_id=parent) def test_basics(): # id property and string j = FakeJurisdiction() - assert j.jurisdiction_id == 'ocd-jurisdiction/test/government' + assert j.jurisdiction_id == "ocd-jurisdiction/test/government" assert j.name in str(j) @@ -26,24 +26,24 @@ def test_as_dict(): j = FakeJurisdiction() d = j.as_dict() - assert d['_id'] == j.jurisdiction_id - assert d['name'] == j.name - assert d['url'] == j.url - assert d['legislative_sessions'] == [] - assert d['feature_flags'] == [] + assert d["_id"] == j.jurisdiction_id + assert d["name"] == j.name + assert d["url"] == j.url + assert d["legislative_sessions"] == [] + assert d["feature_flags"] == [] def test_jurisdiction_unicam_scrape(): class UnicameralJurisdiction(Jurisdiction): - jurisdiction_id = 'unicam' - name = 'Unicameral' - url = 'http://example.com' + jurisdiction_id = "unicam" + name = "Unicameral" + url = "http://example.com" def get_organizations(self): - yield Organization('Unicameral Legislature', classification='legislature') + yield Organization("Unicameral Legislature", classification="legislature") j = UnicameralJurisdiction() - js = JurisdictionScraper(j, '/tmp/') + js = JurisdictionScraper(j, "/tmp/") objects = list(js.scrape()) # two objects, first is the Jurisdiction @@ -52,12 +52,12 @@ def get_organizations(self): # ensure we made a single legislature org assert isinstance(objects[1], Organization) - assert objects[1].classification == 'legislature' + assert objects[1].classification == "legislature" def test_jurisdiction_bicameral_scrape(): j = FakeJurisdiction() - js = JurisdictionScraper(j, '/tmp/') + js = JurisdictionScraper(j, "/tmp/") objects = list(js.scrape()) obj_names = set() obj_types = defaultdict(int) @@ -67,6 +67,6 @@ def test_jurisdiction_bicameral_scrape(): obj_types[type(o)] += 1 # ensure Jurisdiction and 5 organizations were found - assert obj_names == {'Test', 'Congress', 'House', 'Senate'} + assert obj_names == {"Test", "Congress", "House", "Senate"} assert obj_types[FakeJurisdiction] == 1 assert obj_types[Organization] == 3 diff --git a/pupa/tests/scrape/test_model_basics.py b/pupa/tests/scrape/test_model_basics.py index 3d226c5d..2bd8c153 100644 --- a/pupa/tests/scrape/test_model_basics.py +++ b/pupa/tests/scrape/test_model_basics.py @@ -1,12 +1,26 @@ import pytest from pupa.scrape.schemas.person import schema -from pupa.scrape.base import (BaseModel, SourceMixin, ContactDetailMixin, LinkMixin, - AssociatedLinkMixin, OtherNameMixin, IdentifierMixin) - - -class GenericModel(BaseModel, SourceMixin, ContactDetailMixin, LinkMixin, AssociatedLinkMixin, - OtherNameMixin, IdentifierMixin): - """ a generic model used for testing the base and mixins """ +from pupa.scrape.base import ( + BaseModel, + SourceMixin, + ContactDetailMixin, + LinkMixin, + AssociatedLinkMixin, + OtherNameMixin, + IdentifierMixin, +) + + +class GenericModel( + BaseModel, + SourceMixin, + ContactDetailMixin, + LinkMixin, + AssociatedLinkMixin, + OtherNameMixin, + IdentifierMixin, +): + """a generic model used for testing the base and mixins""" _type = "generic" _schema = schema @@ -23,7 +37,7 @@ def test_init_id(): def test_as_dict(): m = GenericModel() - assert m.as_dict()['_id'] == m._id + assert m.as_dict()["_id"] == m._id def test_setattr(): @@ -33,70 +47,118 @@ def test_setattr(): m.some_random_key = 3 # and no error raised since this is a valid key - m._id = 'new id' + m._id = "new id" def test_add_source(): m = GenericModel() - m.add_source('http://example.com/1') - m.add_source('http://example.com/2', note='xyz') - assert m.sources == [{'url': 'http://example.com/1', 'note': ''}, - {'url': 'http://example.com/2', 'note': 'xyz'}] + m.add_source("http://example.com/1") + m.add_source("http://example.com/2", note="xyz") + assert m.sources == [ + {"url": "http://example.com/1", "note": ""}, + {"url": "http://example.com/2", "note": "xyz"}, + ] def test_add_contact_detail(): m = GenericModel() - m.add_contact_detail(type='fax', value='111-222-3333', note='office') - assert m.contact_details == [{'type': 'fax', 'value': '111-222-3333', 'note': 'office'}] + m.add_contact_detail(type="fax", value="111-222-3333", note="office") + assert m.contact_details == [ + {"type": "fax", "value": "111-222-3333", "note": "office"} + ] def test_add_link(): m = GenericModel() - m.add_link('http://example.com/1') - m.add_link('http://example.com/2', note='xyz') - assert m.links == [{'url': 'http://example.com/1', 'note': ''}, - {'url': 'http://example.com/2', 'note': 'xyz'}] + m.add_link("http://example.com/1") + m.add_link("http://example.com/2", note="xyz") + assert m.links == [ + {"url": "http://example.com/1", "note": ""}, + {"url": "http://example.com/2", "note": "xyz"}, + ] def test_add_associated_link_match(): m = GenericModel() - m._add_associated_link('_associated', 'something', 'http://example.com/1.txt', - text='', media_type='text/plain', on_duplicate='error') - m._add_associated_link('_associated', 'something', 'http://example.com/1.pdf', - text='', media_type='application/pdf', on_duplicate='error') + m._add_associated_link( + "_associated", + "something", + "http://example.com/1.txt", + text="", + media_type="text/plain", + on_duplicate="error", + ) + m._add_associated_link( + "_associated", + "something", + "http://example.com/1.pdf", + text="", + media_type="application/pdf", + on_duplicate="error", + ) # one 'document' added, multiple links for it assert len(m._associated) == 1 - assert len(m._associated[0]['links']) == 2 + assert len(m._associated[0]["links"]) == 2 def test_add_associated_link_on_duplicate_bad(): m = GenericModel() with pytest.raises(ValueError): - m._add_associated_link('_associated', 'something', 'http://example.com', - text='', media_type='text/html', on_duplicate='idk') + m._add_associated_link( + "_associated", + "something", + "http://example.com", + text="", + media_type="text/html", + on_duplicate="idk", + ) def test_add_associated_link_on_duplicate_error(): m = GenericModel() - m._add_associated_link('_associated', 'something', 'http://example.com', - text='', media_type='text/html', on_duplicate='error') + m._add_associated_link( + "_associated", + "something", + "http://example.com", + text="", + media_type="text/html", + on_duplicate="error", + ) with pytest.raises(ValueError): - m._add_associated_link('_associated', 'something else', 'http://example.com', - text='', media_type='text/html', on_duplicate='error') + m._add_associated_link( + "_associated", + "something else", + "http://example.com", + text="", + media_type="text/html", + on_duplicate="error", + ) def test_add_associated_link_on_duplicate_ignore(): m = GenericModel() - m._add_associated_link('_associated', 'something', 'http://example.com', - text='', media_type='text/html', on_duplicate='ignore') - m._add_associated_link('_associated', 'something else', 'http://example.com', - text='', media_type='text/html', on_duplicate='ignore') + m._add_associated_link( + "_associated", + "something", + "http://example.com", + text="", + media_type="text/html", + on_duplicate="ignore", + ) + m._add_associated_link( + "_associated", + "something else", + "http://example.com", + text="", + media_type="text/html", + on_duplicate="ignore", + ) # one 'document' added, single link for it, keeps first name assert len(m._associated) == 1 - assert len(m._associated[0]['links']) == 1 - assert m._associated[0]['note'] == 'something' + assert len(m._associated[0]["links"]) == 1 + assert m._associated[0]["note"] == "something" def test_add_name(): @@ -106,13 +168,21 @@ def test_add_name(): assert m.other_names == [{"name": "Thiston", "note": "What my friends call me"}] - m.add_name("Johnseph Q. Publico", note="Birth name", start_date="1920-01", - end_date="1949-12-31") + m.add_name( + "Johnseph Q. Publico", + note="Birth name", + start_date="1920-01", + end_date="1949-12-31", + ) assert m.other_names == [ {"name": "Thiston", "note": "What my friends call me"}, - {"name": "Johnseph Q. Publico", "note": "Birth name", "start_date": "1920-01", - "end_date": "1949-12-31"} + { + "name": "Johnseph Q. Publico", + "note": "Birth name", + "start_date": "1920-01", + "end_date": "1949-12-31", + }, ] @@ -125,5 +195,5 @@ def test_add_identifier(): g.add_identifier("id10t") g.add_identifier("l0l", scheme="kruft") - assert g.identifiers[-1]['scheme'] == "kruft" - assert g.identifiers[0]['identifier'] == "id10t" + assert g.identifiers[-1]["scheme"] == "kruft" + assert g.identifiers[0]["identifier"] == "id10t" diff --git a/pupa/tests/scrape/test_people_org_scrape.py b/pupa/tests/scrape/test_people_org_scrape.py index a775dc96..fea12a13 100644 --- a/pupa/tests/scrape/test_people_org_scrape.py +++ b/pupa/tests/scrape/test_people_org_scrape.py @@ -6,20 +6,20 @@ def test_basic_post(): - post = Post(label='1', role='Representative', organization_id='fake_org') - assert '1' in str(post) + post = Post(label="1", role="Representative", organization_id="fake_org") + assert "1" in str(post) post.validate() def test_basic_invalid_post(): - post = Post(label=1, role='Representative', organization_id='fake_org') + post = Post(label=1, role="Representative", organization_id="fake_org") with pytest.raises(ValueError): post.validate() def test_basic_membership(): - m = Membership(person_id='person', organization_id='org') - assert 'person' in str(m) and 'org' in str(m) + m = Membership(person_id="person", organization_id="org") + assert "person" in str(m) and "org" in str(m) def test_basic_invalid_membership(): @@ -30,7 +30,7 @@ def test_basic_invalid_membership(): def test_basic_invalid_person(): bob = Person("Bob B. Johnson") - bob.add_source(url='http://example.com') + bob.add_source(url="http://example.com") bob.validate() bob.name = None @@ -40,34 +40,36 @@ def test_basic_invalid_person(): def test_basic_person(): - p = Person('Bob B. Bear') - p.add_source('http://example.com') + p = Person("Bob B. Bear") + p.add_source("http://example.com") assert p.name in str(p) p.validate() def test_person_add_membership_org(): - p = Person('Bob B. Bear') - p.add_source('http://example.com') - o = Organization('test org', classification='unknown') - p.add_membership(o, role='member', start_date='2007', end_date=datetime.date(2015, 5, 8)) + p = Person("Bob B. Bear") + p.add_source("http://example.com") + o = Organization("test org", classification="unknown") + p.add_membership( + o, role="member", start_date="2007", end_date=datetime.date(2015, 5, 8) + ) assert len(p._related) == 1 p._related[0].validate() assert p._related[0].person_id == p._id assert p._related[0].organization_id == o._id - assert p._related[0].start_date == '2007' + assert p._related[0].start_date == "2007" assert p._related[0].end_date == datetime.date(2015, 5, 8) def test_basic_organization(): - org = Organization('some org', classification='committee') - org.add_source('http://example.com') + org = Organization("some org", classification="committee") + org.add_source("http://example.com") assert org.name in str(org) org.validate() def test_no_source_on_party_org(): - org = Organization('Hat', classification='party') + org = Organization("Hat", classification="party") # no source? no problem because classification = party org.validate() @@ -81,9 +83,9 @@ def test_basic_invalid_organization(): def test_org_add_post(): - """ Test that we can hack posts in on the fly'""" + """Test that we can hack posts in on the fly'""" orga = Organization("name", classification="committee") - orga.add_source(url='http://example.com') + orga.add_source(url="http://example.com") orga.validate() orga.add_post("Human Readable Name", "Chef") @@ -93,95 +95,109 @@ def test_org_add_post(): def test_legislator_related_district(): - leg = Person('John Adams', district='1', primary_org='legislature') - leg.pre_save('jurisdiction-id') + leg = Person("John Adams", district="1", primary_org="legislature") + leg.pre_save("jurisdiction-id") assert len(leg._related) == 1 assert leg._related[0].person_id == leg._id - assert get_pseudo_id(leg._related[0].organization_id) == {'classification': 'legislature'} - assert get_pseudo_id(leg._related[0].post_id) ==\ - {"organization__classification": "legislature", - "label": "1"} + assert get_pseudo_id(leg._related[0].organization_id) == { + "classification": "legislature" + } + assert get_pseudo_id(leg._related[0].post_id) == { + "organization__classification": "legislature", + "label": "1", + } def test_legislator_related_chamber_district(): - leg = Person('John Adams', district='1', primary_org='upper') - leg.pre_save('jurisdiction-id') + leg = Person("John Adams", district="1", primary_org="upper") + leg.pre_save("jurisdiction-id") assert len(leg._related) == 1 assert leg._related[0].person_id == leg._id - assert get_pseudo_id(leg._related[0].organization_id) == {'classification': 'upper'} - assert get_pseudo_id(leg._related[0].post_id) == {"organization__classification": "upper", - "label": "1"} + assert get_pseudo_id(leg._related[0].organization_id) == {"classification": "upper"} + assert get_pseudo_id(leg._related[0].post_id) == { + "organization__classification": "upper", + "label": "1", + } def test_legislator_related_chamber_district_role(): - leg = Person('John Adams', district='1', primary_org='lower', role='Speaker') - leg.pre_save('jurisdiction-id') + leg = Person("John Adams", district="1", primary_org="lower", role="Speaker") + leg.pre_save("jurisdiction-id") assert len(leg._related) == 1 assert leg._related[0].person_id == leg._id - assert get_pseudo_id(leg._related[0].organization_id) == {'classification': 'lower'} - assert get_pseudo_id(leg._related[0].post_id) == {"organization__classification": "lower", - "label": "1", "role": "Speaker"} - assert leg._related[0].role == 'Speaker' + assert get_pseudo_id(leg._related[0].organization_id) == {"classification": "lower"} + assert get_pseudo_id(leg._related[0].post_id) == { + "organization__classification": "lower", + "label": "1", + "role": "Speaker", + } + assert leg._related[0].role == "Speaker" def test_legislator_related_party(): - leg = Person('John Adams', party='Democratic-Republican') - leg.pre_save('jurisdiction-id') + leg = Person("John Adams", party="Democratic-Republican") + leg.pre_save("jurisdiction-id") # a party membership assert len(leg._related) == 1 assert leg._related[0].person_id == leg._id - assert get_pseudo_id(leg._related[0].organization_id) == {'classification': 'party', - 'name': 'Democratic-Republican'} - assert leg._related[0].role == 'member' + assert get_pseudo_id(leg._related[0].organization_id) == { + "classification": "party", + "name": "Democratic-Republican", + } + assert leg._related[0].role == "member" def test_committee_add_member_person(): - c = Organization('Defense', classification='committee') - p = Person('John Adams') - c.add_member(p, role='chairman') + c = Organization("Defense", classification="committee") + p = Person("John Adams") + c.add_member(p, role="chairman") assert c._related[0].person_id == p._id assert c._related[0].organization_id == c._id - assert c._related[0].role == 'chairman' + assert c._related[0].role == "chairman" def test_committee_add_member_name(): - c = Organization('Defense', classification='committee') - c.add_member('John Adams') - assert get_pseudo_id(c._related[0].person_id) == {'name': 'John Adams'} + c = Organization("Defense", classification="committee") + c.add_member("John Adams") + assert get_pseudo_id(c._related[0].person_id) == {"name": "John Adams"} assert c._related[0].organization_id == c._id - assert c._related[0].role == 'member' + assert c._related[0].role == "member" def test_person_add_membership_name(): - p = Person('Leonardo DiCaprio') - p.add_membership('Academy of Motion Picture Arts and Sciences', - role='winner', start_date='2016') + p = Person("Leonardo DiCaprio") + p.add_membership( + "Academy of Motion Picture Arts and Sciences", role="winner", start_date="2016" + ) p._related[0].validate() assert get_pseudo_id(p._related[0].organization_id) == { - 'name': 'Academy of Motion Picture Arts and Sciences'} + "name": "Academy of Motion Picture Arts and Sciences" + } assert p._related[0].person_id == p._id - assert p._related[0].role == 'winner' - assert p._related[0].start_date == '2016' + assert p._related[0].role == "winner" + assert p._related[0].start_date == "2016" def test_person_add_party(): - p = Person('Groot') - p.add_party('Green') + p = Person("Groot") + p.add_party("Green") p._related[0].validate() assert get_pseudo_id(p._related[0].organization_id) == { - 'name': 'Green', 'classification': 'party'} + "name": "Green", + "classification": "party", + } def test_person_add_term(): - p = Person('Eternal') - p.add_term('eternal', 'council', start_date='0001', end_date='9999') + p = Person("Eternal") + p.add_term("eternal", "council", start_date="0001", end_date="9999") p._related[0].validate() assert get_pseudo_id(p._related[0].organization_id) == { - 'classification': 'council', + "classification": "council", } - assert p._related[0].start_date == '0001' - assert p._related[0].end_date == '9999' + assert p._related[0].start_date == "0001" + assert p._related[0].end_date == "9999" diff --git a/pupa/tests/scrape/test_scraper.py b/pupa/tests/scrape/test_scraper.py index a73ed4da..d522874e 100644 --- a/pupa/tests/scrape/test_scraper.py +++ b/pupa/tests/scrape/test_scraper.py @@ -5,7 +5,7 @@ class FakeJurisdiction(Jurisdiction): - jurisdiction_id = 'jurisdiction' + jurisdiction_id = "jurisdiction" juris = FakeJurisdiction() @@ -13,22 +13,22 @@ class FakeJurisdiction(Jurisdiction): def test_save_object_basics(): # ensure that save object dumps a file - s = Scraper(juris, '/tmp/') - p = Person('Michael Jordan') - p.add_source('http://example.com') + s = Scraper(juris, "/tmp/") + p = Person("Michael Jordan") + p.add_source("http://example.com") - with mock.patch('json.dump') as json_dump: + with mock.patch("json.dump") as json_dump: s.save_object(p) # ensure object is saved in right place - filename = 'person_' + p._id + '.json' - assert filename in s.output_names['person'] + filename = "person_" + p._id + ".json" + assert filename in s.output_names["person"] json_dump.assert_called_once_with(p.as_dict(), mock.ANY, cls=mock.ANY) def test_save_object_invalid(): - s = Scraper(juris, '/tmp/') - p = Person('Michael Jordan') + s = Scraper(juris, "/tmp/") + p = Person("Michael Jordan") # no source, won't validate with pytest.raises(ValueError): @@ -36,52 +36,55 @@ def test_save_object_invalid(): def test_save_related(): - s = Scraper(juris, '/tmp/') - p = Person('Michael Jordan') - p.add_source('http://example.com') - o = Organization('Chicago Bulls', classification='committee') - o.add_source('http://example.com') + s = Scraper(juris, "/tmp/") + p = Person("Michael Jordan") + p.add_source("http://example.com") + o = Organization("Chicago Bulls", classification="committee") + o.add_source("http://example.com") p._related.append(o) - with mock.patch('json.dump') as json_dump: + with mock.patch("json.dump") as json_dump: s.save_object(p) - assert json_dump.mock_calls == [mock.call(p.as_dict(), mock.ANY, cls=mock.ANY), - mock.call(o.as_dict(), mock.ANY, cls=mock.ANY)] + assert json_dump.mock_calls == [ + mock.call(p.as_dict(), mock.ANY, cls=mock.ANY), + mock.call(o.as_dict(), mock.ANY, cls=mock.ANY), + ] def test_simple_scrape(): class FakeScraper(Scraper): def scrape(self): - p = Person('Michael Jordan') - p.add_source('http://example.com') + p = Person("Michael Jordan") + p.add_source("http://example.com") yield p - with mock.patch('json.dump') as json_dump: - record = FakeScraper(juris, '/tmp/').do_scrape() + with mock.patch("json.dump") as json_dump: + record = FakeScraper(juris, "/tmp/").do_scrape() assert len(json_dump.mock_calls) == 1 - assert record['objects']['person'] == 1 - assert record['end'] > record['start'] - assert record['skipped'] == 0 + assert record["objects"]["person"] == 1 + assert record["end"] > record["start"] + assert record["skipped"] == 0 def test_double_iter(): - """ tests that scrapers that yield iterables work OK """ + """tests that scrapers that yield iterables work OK""" + class IterScraper(Scraper): def scrape(self): yield self.scrape_people() def scrape_people(self): - p = Person('Michael Jordan') - p.add_source('http://example.com') + p = Person("Michael Jordan") + p.add_source("http://example.com") yield p - with mock.patch('json.dump') as json_dump: - record = IterScraper(juris, '/tmp/').do_scrape() + with mock.patch("json.dump") as json_dump: + record = IterScraper(juris, "/tmp/").do_scrape() assert len(json_dump.mock_calls) == 1 - assert record['objects']['person'] == 1 + assert record["objects"]["person"] == 1 def test_no_objects(): @@ -90,7 +93,7 @@ def scrape(self): pass with pytest.raises(ScrapeError): - NullScraper(juris, '/tmp/', fastmode=True).do_scrape() + NullScraper(juris, "/tmp/", fastmode=True).do_scrape() def test_no_scrape(): @@ -98,30 +101,30 @@ class NonScraper(Scraper): pass with pytest.raises(NotImplementedError): - NonScraper(juris, '/tmp/').do_scrape() + NonScraper(juris, "/tmp/").do_scrape() def test_bill_scraper(): class BillScraper(BaseBillScraper): def get_bill_ids(self): - yield '1', {'extra': 'param'} - yield '2', {} + yield "1", {"extra": "param"} + yield "2", {} def get_bill(self, bill_id, **kwargs): - if bill_id == '1': - assert kwargs == {'extra': 'param'} + if bill_id == "1": + assert kwargs == {"extra": "param"} raise self.ContinueScraping else: - assert bill_id == '2' + assert bill_id == "2" assert kwargs == {} - b = Bill('1', self.legislative_session, 'title') - b.add_source('http://example.com') + b = Bill("1", self.legislative_session, "title") + b.add_source("http://example.com") return b - bs = BillScraper(juris, '/tmp/') - with mock.patch('json.dump') as json_dump: - record = bs.do_scrape(legislative_session='2020') + bs = BillScraper(juris, "/tmp/") + with mock.patch("json.dump") as json_dump: + record = bs.do_scrape(legislative_session="2020") assert len(json_dump.mock_calls) == 1 - assert record['objects']['bill'] == 1 - assert record['skipped'] == 1 + assert record["objects"]["bill"] == 1 + assert record["skipped"] == 1 diff --git a/pupa/tests/scrape/test_utils.py b/pupa/tests/scrape/test_utils.py index 7c19376b..475713ed 100644 --- a/pupa/tests/scrape/test_utils.py +++ b/pupa/tests/scrape/test_utils.py @@ -10,20 +10,20 @@ class _Settings: @pytest.fixture def settings(): ret = _Settings() - ret.foo = 'bar' - ret.baz = 'bob' + ret.foo = "bar" + ret.baz = "bob" return ret def test_override_settings(settings): - with override_settings(settings, {'baz': 'fez'}): - assert settings.foo == 'bar' - assert settings.baz == 'fez' - assert settings.foo == 'bar' - assert settings.baz == 'bob' + with override_settings(settings, {"baz": "fez"}): + assert settings.foo == "bar" + assert settings.baz == "fez" + assert settings.foo == "bar" + assert settings.baz == "bob" def test_override_settings_unset(settings): - with override_settings(settings, {'qux': 'fez'}): - assert settings.qux == 'fez' - assert not hasattr(settings, 'qux') + with override_settings(settings, {"qux": "fez"}): + assert settings.qux == "fez" + assert not hasattr(settings, "qux") diff --git a/pupa/tests/scrape/test_vote_event_scrape.py b/pupa/tests/scrape/test_vote_event_scrape.py index 79276818..d4912d4d 100644 --- a/pupa/tests/scrape/test_vote_event_scrape.py +++ b/pupa/tests/scrape/test_vote_event_scrape.py @@ -4,90 +4,117 @@ def toy_vote_event(): - ve = VoteEvent(legislative_session="2009", motion_text="passage of the bill", - start_date="2009-01-07", result='pass', classification='bill-passage') + ve = VoteEvent( + legislative_session="2009", + motion_text="passage of the bill", + start_date="2009-01-07", + result="pass", + classification="bill-passage", + ) ve.add_source("http://uri.example.com/", note="foo") return ve def test_simple_vote_event(): ve = toy_vote_event() - ve.set_count('yes', 2) - ve.yes('James') - ve.no('Paul') - ve.vote('abstain', 'Thom') + ve.set_count("yes", 2) + ve.yes("James") + ve.no("Paul") + ve.vote("abstain", "Thom") assert len(ve.votes) == 3 assert len(ve.counts) == 1 - assert get_pseudo_id(ve.organization) == {'classification': 'legislature'} - assert get_pseudo_id(ve.votes[0]['voter_id']) == {'name': 'James'} - assert get_pseudo_id(ve.votes[1]['voter_id']) == {'name': 'Paul'} - assert get_pseudo_id(ve.votes[2]['voter_id']) == {'name': 'Thom'} + assert get_pseudo_id(ve.organization) == {"classification": "legislature"} + assert get_pseudo_id(ve.votes[0]["voter_id"]) == {"name": "James"} + assert get_pseudo_id(ve.votes[1]["voter_id"]) == {"name": "Paul"} + assert get_pseudo_id(ve.votes[2]["voter_id"]) == {"name": "Thom"} assert ve.bill is None ve.validate() - assert 'we get here' + assert "we get here" def test_vote_event_org_obj(): - o = Organization('something', classification='committee') - ve = VoteEvent(legislative_session="2009", motion_text="passage of the bill", - start_date="2009-01-07", result='pass', classification='bill-passage', - organization=o) + o = Organization("something", classification="committee") + ve = VoteEvent( + legislative_session="2009", + motion_text="passage of the bill", + start_date="2009-01-07", + result="pass", + classification="bill-passage", + organization=o, + ) assert ve.organization == o._id def test_vote_event_org_dict(): - odict = {'name': 'Random Committee', 'classification': 'committee'} - ve = VoteEvent(legislative_session="2009", motion_text="passage of the bill", - start_date="2009-01-07", result='pass', classification='bill-passage', - organization=odict) + odict = {"name": "Random Committee", "classification": "committee"} + ve = VoteEvent( + legislative_session="2009", + motion_text="passage of the bill", + start_date="2009-01-07", + result="pass", + classification="bill-passage", + organization=odict, + ) assert get_pseudo_id(ve.organization) == odict def test_vote_event_org_chamber(): - ve = VoteEvent(legislative_session="2009", motion_text="passage of the bill", - start_date="2009-01-07", result='pass', classification='bill-passage', - chamber='upper') - assert get_pseudo_id(ve.organization) == {'classification': 'upper'} + ve = VoteEvent( + legislative_session="2009", + motion_text="passage of the bill", + start_date="2009-01-07", + result="pass", + classification="bill-passage", + chamber="upper", + ) + assert get_pseudo_id(ve.organization) == {"classification": "upper"} def test_org_and_chamber_conflict(): with pytest.raises(ValueError): - VoteEvent(legislative_session="2009", motion_text="passage of the bill", - start_date="2009-01-07", result='pass', classification='passage', - organization='test', chamber='lower') + VoteEvent( + legislative_session="2009", + motion_text="passage of the bill", + start_date="2009-01-07", + result="pass", + classification="passage", + organization="test", + chamber="lower", + ) def test_set_count(): ve = toy_vote_event() - ve.set_count('yes', 2) - ve.set_count('no', 100) - ve.set_count('yes', 0) - assert ve.counts == [{'option': 'yes', 'value': 0}, {'option': 'no', 'value': 100}] + ve.set_count("yes", 2) + ve.set_count("no", 100) + ve.set_count("yes", 0) + assert ve.counts == [{"option": "yes", "value": 0}, {"option": "no", "value": 100}] def test_set_bill_obj(): ve = toy_vote_event() - b = Bill('HB 1', legislative_session='2009', title='fake bill') + b = Bill("HB 1", legislative_session="2009", title="fake bill") ve.set_bill(b) assert ve.bill == b._id def test_set_bill_obj_no_extra_args(): ve = toy_vote_event() - b = Bill('HB 1', legislative_session='2009', title='fake bill') + b = Bill("HB 1", legislative_session="2009", title="fake bill") with pytest.raises(ValueError): - ve.set_bill(b, chamber='lower') + ve.set_bill(b, chamber="lower") def test_set_bill_pseudo_id(): ve = toy_vote_event() - ve.set_bill('HB 1', chamber='lower') - assert get_pseudo_id(ve.bill) == {'identifier': 'HB 1', - 'from_organization__classification': 'lower', - 'legislative_session__identifier': '2009', - } + ve.set_bill("HB 1", chamber="lower") + assert get_pseudo_id(ve.bill) == { + "identifier": "HB 1", + "from_organization__classification": "lower", + "legislative_session__identifier": "2009", + } def test_str(): @@ -102,40 +129,41 @@ def test_order_vote_event(): order_vote_event = OrderVoteEvent() # add order as seconds to date with no time - ve.start_date = '2019-01-01' + ve.start_date = "2019-01-01" ve.end_date = None - order_vote_event('2019', '1', ve) - assert ve.start_date == '2019-01-01T00:00:01' + order_vote_event("2019", "1", ve) + assert ve.start_date == "2019-01-01T00:00:01" assert ve.end_date is None - # add order as seconds to time with explicit midnight time and zone, preserving timezone - ve.start_date = '2019-01-01T00:00:00+05:00' - ve.end_date = '' - order_vote_event('2019', '1', ve) - assert ve.start_date == '2019-01-01T00:00:02+05:00' - assert ve.end_date == '' + # add order as seconds to time with explicit midnight time and + # zone, preserving timezone + ve.start_date = "2019-01-01T00:00:00+05:00" + ve.end_date = "" + order_vote_event("2019", "1", ve) + assert ve.start_date == "2019-01-01T00:00:02+05:00" + assert ve.end_date == "" # a second bill should start with '00:00:01' again - ve.start_date = '2019-01-01' + ve.start_date = "2019-01-01" ve.end_date = None - order_vote_event('2019', '2', ve) - assert ve.start_date == '2019-01-01T00:00:01' + order_vote_event("2019", "2", ve) + assert ve.start_date == "2019-01-01T00:00:01" assert ve.end_date is None # the same bill id in a different session should start with '00:00:01' again - ve.start_date = '2019-01-01' + ve.start_date = "2019-01-01" ve.end_date = None - order_vote_event('2020', '1', ve) - assert ve.start_date == '2019-01-01T00:00:01' + order_vote_event("2020", "1", ve) + assert ve.start_date == "2019-01-01T00:00:01" assert ve.end_date is None # add order as seconds to time with explicit midnight time and no timezone - ve.start_date = ve.end_date = '2019-01-01T00:00:00' - order_vote_event('2019', '1', ve) - assert ve.start_date == '2019-01-01T00:00:03' - assert ve.end_date == '2019-01-01T00:00:03' + ve.start_date = ve.end_date = "2019-01-01T00:00:00" + order_vote_event("2019", "1", ve) + assert ve.start_date == "2019-01-01T00:00:03" + assert ve.end_date == "2019-01-01T00:00:03" # don't change a date with a non-midnight time - ve.start_date = '2019-01-01T00:00:55+05:00' - order_vote_event('2019', '1', ve) - assert ve.start_date == '2019-01-01T00:00:55+05:00' + ve.start_date = "2019-01-01T00:00:55+05:00" + order_vote_event("2019", "1", ve) + assert ve.start_date == "2019-01-01T00:00:55+05:00" diff --git a/pupa/utils/__init__.py b/pupa/utils/__init__.py index a2e4546b..3d6093bb 100644 --- a/pupa/utils/__init__.py +++ b/pupa/utils/__init__.py @@ -1,3 +1,10 @@ # flake8: noqa -from .generic import (_make_pseudo_id, get_pseudo_id, makedirs, - JSONEncoderPlus, convert_pdf, utcnow, format_datetime) +from .generic import ( + _make_pseudo_id, + get_pseudo_id, + makedirs, + JSONEncoderPlus, + convert_pdf, + utcnow, + format_datetime, +) diff --git a/pupa/utils/generic.py b/pupa/utils/generic.py index d6028b05..0d99c9f2 100644 --- a/pupa/utils/generic.py +++ b/pupa/utils/generic.py @@ -10,13 +10,13 @@ def utcnow(): def _make_pseudo_id(**kwargs): - """ pseudo ids are just JSON """ + """pseudo ids are just JSON""" # ensure keys are sorted so that these are deterministic - return '~' + json.dumps(kwargs, sort_keys=True) + return "~" + json.dumps(kwargs, sort_keys=True) def get_pseudo_id(pid): - if pid[0] != '~': + if pid[0] != "~": raise ValueError("pseudo id doesn't start with ~") return json.loads(pid[1:]) @@ -30,11 +30,11 @@ class JSONEncoderPlus(json.JSONEncoder): """ JSONEncoder that encodes datetime objects as Unix timestamps. """ + def default(self, obj, **kwargs): if isinstance(obj, datetime.datetime): if obj.tzinfo is None: - raise TypeError( - "date '%s' is not fully timezone qualified." % (obj)) + raise TypeError("date '%s' is not fully timezone qualified." % (obj)) obj = obj.astimezone(pytz.UTC) return "{}".format(obj.isoformat()) elif isinstance(obj, datetime.date): @@ -42,16 +42,21 @@ def default(self, obj, **kwargs): return super(JSONEncoderPlus, self).default(obj, **kwargs) -def convert_pdf(filename, type='xml'): - commands = {'text': ['pdftotext', '-layout', filename, '-'], - 'text-nolayout': ['pdftotext', filename, '-'], - 'xml': ['pdftohtml', '-xml', '-stdout', filename], - 'html': ['pdftohtml', '-stdout', filename]} +def convert_pdf(filename, type="xml"): + commands = { + "text": ["pdftotext", "-layout", filename, "-"], + "text-nolayout": ["pdftotext", filename, "-"], + "xml": ["pdftohtml", "-xml", "-stdout", filename], + "html": ["pdftohtml", "-stdout", filename], + } try: - pipe = subprocess.Popen(commands[type], stdout=subprocess.PIPE, close_fds=True).stdout + pipe = subprocess.Popen( + commands[type], stdout=subprocess.PIPE, close_fds=True + ).stdout except OSError as e: - raise EnvironmentError("error running %s, missing executable? [%s]" % - ' '.join(commands[type]), e) + raise EnvironmentError( + "error running %s, missing executable? [%s]" % " ".join(commands[type]), e + ) data = pipe.read() pipe.close() return data diff --git a/pupa/utils/topsort.py b/pupa/utils/topsort.py index 10b83f55..850253c0 100644 --- a/pupa/utils/topsort.py +++ b/pupa/utils/topsort.py @@ -8,6 +8,7 @@ class CyclicGraphError(ValueError): sorting algorithm *knows* that the graph is Cyclic by hitting a snag in the top-sort) """ + pass @@ -25,7 +26,7 @@ def __init__(self): self.edges = defaultdict(set) def add_node(self, node): - """ Add a node to the graph (with no edges) """ + """Add a node to the graph (with no edges)""" self.nodes.add(node) def add_edge(self, fro, to): @@ -60,9 +61,11 @@ def prune_node(self, node, remove_backrefs=False): if not remove_backrefs: for fro, connections in self.edges.items(): if node in self.edges[fro]: - raise ValueError("""Attempting to remove a node with + raise ValueError( + """Attempting to remove a node with backrefs. You may consider setting - `remove_backrefs` to true.""") + `remove_backrefs` to true.""" + ) # OK. Otherwise, let's do our removal. @@ -128,8 +131,7 @@ def walk_node(node, seen): yield (node,) + cycle # First, let's get a iterable of all known cycles. - cycles = chain.from_iterable( - (walk_node(node, set()) for node in self.nodes)) + cycles = chain.from_iterable((walk_node(node, set()) for node in self.nodes)) shortest = set() # Now, let's go through and sift through the cycles, finding diff --git a/setup.py b/setup.py index 2fc710bc..793877dd 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ 'opencivicdata>=3.0.0', 'dj_database_url>=0.3.0', 'scrapelib>=1.0', - 'jsonschema>=3.0.0', # TODO: Drop alpha release once stable release is available + 'jsonschema>=3.0.0', # TODO: Drop alpha release once stable release available 'psycopg2-binary', 'pytz', ], From 809adb6b7376a85bd90d4a6e295a3942c9316331 Mon Sep 17 00:00:00 2001 From: msj Date: Thu, 5 Jan 2023 16:26:56 -0500 Subject: [PATCH 03/29] Fix typo --- pupa/importers/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pupa/importers/base.py b/pupa/importers/base.py index 4da6f45c..666e620e 100644 --- a/pupa/importers/base.py +++ b/pupa/importers/base.py @@ -295,7 +295,7 @@ def import_item(self, data): what = "update" if what == "update": - obj.last_updated = utcnow() + obj.updated_at = utcnow() # Refresh the object's last_seen field whether or not we updated obj.save() From 8b6355e8cc4353526e50316feb7429c55eb5833e Mon Sep 17 00:00:00 2001 From: msj Date: Thu, 5 Jan 2023 16:27:42 -0500 Subject: [PATCH 04/29] Order test queryset properly --- pupa/tests/importers/test_vote_event_importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pupa/tests/importers/test_vote_event_importer.py b/pupa/tests/importers/test_vote_event_importer.py index bdb28fc6..aa7f0434 100644 --- a/pupa/tests/importers/test_vote_event_importer.py +++ b/pupa/tests/importers/test_vote_event_importer.py @@ -546,7 +546,7 @@ def test_vote_event_bill_actions_errors(): ) bill = Bill.objects.get() - votes = list(VoteEvent.objects.all()) + votes = list(VoteEvent.objects.all().order_by("identifier")) # isn't matched, was ambiguous across two actions assert votes[0].bill_action is None From 9320e4f5173b76312b3824098a40512612419578 Mon Sep 17 00:00:00 2001 From: msj Date: Fri, 6 Jan 2023 15:25:28 -0500 Subject: [PATCH 05/29] Use opencivicdata branch --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 793877dd..07d44523 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ pupa = pupa.cli.__main__:main''', install_requires=[ 'Django>=2.2', - 'opencivicdata>=3.0.0', + 'opencivicdata @ git+https://github.com/opencivicdata/python-opencivicdata@last-seen', 'dj_database_url>=0.3.0', 'scrapelib>=1.0', 'jsonschema>=3.0.0', # TODO: Drop alpha release once stable release available From b5eb723f9b3dac134fa10a7f760a949a20b6752b Mon Sep 17 00:00:00 2001 From: msj Date: Mon, 9 Jan 2023 12:07:31 -0500 Subject: [PATCH 06/29] Test python 3.10 in gh actions --- .github/workflows/package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index 0ea3345f..7c6ed720 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -27,7 +27,7 @@ jobs: - 5432:5432 strategy: matrix: - python-version: ['3.6', '3.7', '3.8'] + python-version: ['3.6', '3.7', '3.8', '3.10'] django-series: ['2.2', '3.0'] steps: - uses: actions/checkout@v2 From a590f6c88f30bc232f4ab3b5bbe55a7ee84cedeb Mon Sep 17 00:00:00 2001 From: hannah cushman garland Date: Mon, 9 Jan 2023 13:48:51 -0600 Subject: [PATCH 07/29] Stop overwriting OCD install --- .github/workflows/package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index 7c6ed720..c6d5426b 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -40,7 +40,7 @@ jobs: sudo apt update sudo apt install -y gdal-bin pip install .[dev] --pre Django==${{ matrix.django-series }} - pip install -e git+https://github.com/opencivicdata/python-opencivicdata.git#egg=opencivicdata + # pip install -e git+https://github.com/opencivicdata/python-opencivicdata.git#egg=opencivicdata - name: Lint with flake8 run: | flake8 pupa From 7e3ac0290ac60944a094388eb19e5176058616d5 Mon Sep 17 00:00:00 2001 From: hannah cushman garland Date: Mon, 9 Jan 2023 13:52:01 -0600 Subject: [PATCH 08/29] Revert 3.10 change --- .github/workflows/package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index c6d5426b..f376f213 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -27,7 +27,7 @@ jobs: - 5432:5432 strategy: matrix: - python-version: ['3.6', '3.7', '3.8', '3.10'] + python-version: ['3.6', '3.7', '3.8'] django-series: ['2.2', '3.0'] steps: - uses: actions/checkout@v2 From 02858a26f10f44355b150387e1c100ec1bf1a854 Mon Sep 17 00:00:00 2001 From: hannah cushman garland Date: Mon, 9 Jan 2023 14:13:25 -0600 Subject: [PATCH 09/29] Upgrade coverage --- .github/workflows/package.yml | 1 - setup.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index f376f213..02923991 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -40,7 +40,6 @@ jobs: sudo apt update sudo apt install -y gdal-bin pip install .[dev] --pre Django==${{ matrix.django-series }} - # pip install -e git+https://github.com/opencivicdata/python-opencivicdata.git#egg=opencivicdata - name: Lint with flake8 run: | flake8 pupa diff --git a/setup.py b/setup.py index 07d44523..5ffc0586 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ 'pytest-cov', 'pytest-django', 'coveralls', + 'coverage>7', 'flake8', ], }, From 4d347900187e55e9b0bfae40c688cfccc1bae19a Mon Sep 17 00:00:00 2001 From: hannah cushman garland Date: Mon, 9 Jan 2023 14:17:35 -0600 Subject: [PATCH 10/29] Upgrade coverage in CI --- .github/workflows/package.yml | 1 + pupa/tests/django_settings.py | 6 +++--- run-tests.sh | 2 +- setup.py | 1 - 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index 02923991..f73a1ed1 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -40,6 +40,7 @@ jobs: sudo apt update sudo apt install -y gdal-bin pip install .[dev] --pre Django==${{ matrix.django-series }} + pip install coverage>7 - name: Lint with flake8 run: | flake8 pupa diff --git a/pupa/tests/django_settings.py b/pupa/tests/django_settings.py index 7ab0e20a..070e61dd 100644 --- a/pupa/tests/django_settings.py +++ b/pupa/tests/django_settings.py @@ -8,10 +8,10 @@ 'default': { 'ENGINE': 'django.contrib.gis.db.backends.postgis', 'NAME': 'test', - 'USER': 'test', - 'PASSWORD': 'test', + 'USER': 'postgres', + 'PASSWORD': 'postgres', 'HOST': 'localhost', - 'PORT': 5432, + 'PORT': 32001, } } MIDDLEWARE_CLASSES = () diff --git a/run-tests.sh b/run-tests.sh index 2b543bfb..5ca196bf 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -1,3 +1,3 @@ #!/bin/sh export PYTHONPATH=. -pytest --cov pupa --cov-report html --ds=pupa.tests.django_settings pupa/tests +pytest -vvv --cov pupa --cov-report html --ds=pupa.tests.django_settings pupa/tests diff --git a/setup.py b/setup.py index 5ffc0586..07d44523 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,6 @@ 'pytest-cov', 'pytest-django', 'coveralls', - 'coverage>7', 'flake8', ], }, From 5776aec8f2d2cb35377b4b1c47eaea511438b3db Mon Sep 17 00:00:00 2001 From: hannah cushman garland Date: Mon, 9 Jan 2023 14:24:32 -0600 Subject: [PATCH 11/29] What if I downgrade coverage --- .github/workflows/package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index f73a1ed1..e2cde999 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -40,7 +40,7 @@ jobs: sudo apt update sudo apt install -y gdal-bin pip install .[dev] --pre Django==${{ matrix.django-series }} - pip install coverage>7 + pip install coverage<5 - name: Lint with flake8 run: | flake8 pupa From 0dec7dda3e3617748491e5cbc0a22dd70e16480d Mon Sep 17 00:00:00 2001 From: hannah cushman garland Date: Mon, 9 Jan 2023 14:25:36 -0600 Subject: [PATCH 12/29] Revert local testing changes --- pupa/tests/django_settings.py | 6 +++--- run-tests.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pupa/tests/django_settings.py b/pupa/tests/django_settings.py index 070e61dd..7ab0e20a 100644 --- a/pupa/tests/django_settings.py +++ b/pupa/tests/django_settings.py @@ -8,10 +8,10 @@ 'default': { 'ENGINE': 'django.contrib.gis.db.backends.postgis', 'NAME': 'test', - 'USER': 'postgres', - 'PASSWORD': 'postgres', + 'USER': 'test', + 'PASSWORD': 'test', 'HOST': 'localhost', - 'PORT': 32001, + 'PORT': 5432, } } MIDDLEWARE_CLASSES = () diff --git a/run-tests.sh b/run-tests.sh index 5ca196bf..2b543bfb 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -1,3 +1,3 @@ #!/bin/sh export PYTHONPATH=. -pytest -vvv --cov pupa --cov-report html --ds=pupa.tests.django_settings pupa/tests +pytest --cov pupa --cov-report html --ds=pupa.tests.django_settings pupa/tests From 063d4209c683e5561d21cd006f595b245a4c1266 Mon Sep 17 00:00:00 2001 From: hannah cushman garland Date: Mon, 9 Jan 2023 14:30:17 -0600 Subject: [PATCH 13/29] Nevermind --- .github/workflows/package.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index e2cde999..02923991 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -40,7 +40,6 @@ jobs: sudo apt update sudo apt install -y gdal-bin pip install .[dev] --pre Django==${{ matrix.django-series }} - pip install coverage<5 - name: Lint with flake8 run: | flake8 pupa From c237d87c0f6f18a0dd43c5ffebeabc44102d08e0 Mon Sep 17 00:00:00 2001 From: hannah cushman garland Date: Mon, 9 Jan 2023 14:34:45 -0600 Subject: [PATCH 14/29] Pin coverage once again --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 07d44523..f1b4dd76 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ 'pytest-cov', 'pytest-django', 'coveralls', + 'coverage<=6.5.0', 'flake8', ], }, From 7d43b7a73f1c60a3ff79376d175b34e3882fc27f Mon Sep 17 00:00:00 2001 From: msj Date: Mon, 20 Feb 2023 12:05:15 -0500 Subject: [PATCH 15/29] Add pupa clean CLI command --- pupa/cli/__main__.py | 1 + pupa/cli/commands/clean.py | 103 +++++++++++++++++++++++++++++++++ pupa/tests/clean/test_clean.py | 51 ++++++++++++++++ setup.py | 1 + 4 files changed, 156 insertions(+) create mode 100644 pupa/cli/commands/clean.py create mode 100644 pupa/tests/clean/test_clean.py diff --git a/pupa/cli/__main__.py b/pupa/cli/__main__.py index 09cd450d..baf38686 100644 --- a/pupa/cli/__main__.py +++ b/pupa/cli/__main__.py @@ -14,6 +14,7 @@ "pupa.cli.commands.dbinit", "pupa.cli.commands.update", "pupa.cli.commands.party", + "pupa.cli.commands.clean", ) diff --git a/pupa/cli/commands/clean.py b/pupa/cli/commands/clean.py new file mode 100644 index 00000000..d29b9b91 --- /dev/null +++ b/pupa/cli/commands/clean.py @@ -0,0 +1,103 @@ +import itertools +from datetime import datetime, timezone, timedelta + +import django +from django.apps import apps +from .base import BaseCommand + + +def get_subclasses(app_list, abstract_class): + """ + Finds and returns all subclasses of an abstract class. + """ + result = [] + for app in app_list: + for model in apps.get_app_config(app).get_models(): + if issubclass(model, abstract_class) and model is not abstract_class: + result.append(model) + return result + + +def get_stale_objects(window): + """ + Find all database objects that haven't seen been in {window} days. + """ + + from opencivicdata.core.models.base import OCDBase + + ocd_apps = ["core", "legislative"] + # Check all subclasses of OCDBase + models = get_subclasses(ocd_apps, OCDBase) + + results = [] + for model in models: + # Jurisdictions are protected from deletion + if "Jurisdiction" not in model.__name__: + cutoff_date = datetime.now(tz=timezone.utc) - timedelta(days=window) + results.append(model.objects.filter(last_seen__lte=cutoff_date)) + + return itertools.chain(*results) + + +def remove_stale_objects(window): + """ + Remove all database objects that haven't seen been in {window} days. + """ + for obj in get_stale_objects(window): + print(f"Deleting {obj}...") + obj.delete() + + +class Command(BaseCommand): + name = "clean" + help = "Removes database objects that haven't been seen in recent scrapes" + + def add_args(self): + self.add_argument( + "--window", + type=int, + default=7, + help=( + "Objects not seen in this many days will be deleted from the database" + ), + ) + self.add_argument( + "--report", + action="store_true", + help=( + "Will only generate a report of what objects this command" + "would delete without making any changes to the database" + ), + ) + self.add_argument( + "--noinput", + action="store_true", + help="Will delete objects without getting user confirmation", + ) + + def handle(self, args, other): + django.setup() + + if args.report: + print( + "These objects have not been seen in a scrape within the last" + f" {args.window} days:" + ) + for obj in get_stale_objects(args.window): + print(obj) + else: + if not args.noinput: + print( + "This will permanently delete all objects from your database" + f"that have not been scraped within the last {args.window}" + " days. Are you sure? (Y/N)" + ) + resp = input() + if resp != "Y": + return + + print( + "Removing objects that haven't been seen in a scrape within" + f" the last {args.window} days..." + ) + remove_stale_objects(args.window) diff --git a/pupa/tests/clean/test_clean.py b/pupa/tests/clean/test_clean.py new file mode 100644 index 00000000..3f859a51 --- /dev/null +++ b/pupa/tests/clean/test_clean.py @@ -0,0 +1,51 @@ +import pytest +from datetime import datetime, timezone, timedelta +from freezegun import freeze_time + +from opencivicdata.core.models import Person, Organization, Jurisdiction, Division + +from pupa.cli.commands.clean import get_stale_objects, remove_stale_objects + + +def create_jurisdiction(): + Division.objects.create(id="ocd-division/country:us", name="USA") + return Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us") + + +@pytest.mark.django_db +def test_get_stale_objects(): + j = create_jurisdiction() + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") + p = Person.objects.create(name="George Washington", family_name="Washington") + m = p.memberships.create(organization=o) + + expected_stale_objects = {p, o, m} + + a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) + with freeze_time(a_week_from_now): + p = Person.objects.create(name="Thomas Jefferson", family_name="Jefferson") + j.save() + p.memberships.create(organization=o) + assert set(get_stale_objects(7)) == expected_stale_objects + + +@pytest.mark.django_db +def test_remove_stale_objects(): + j = create_jurisdiction() + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") + p = Person.objects.create(name="George Washington", family_name="Washington") + m = p.memberships.create(organization=o) + + expected_stale_objects = {p, o, m} + + a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) + with freeze_time(a_week_from_now): + p = Person.objects.create(name="Thomas Jefferson", family_name="Jefferson") + p.memberships.create(organization=o) + + j.save() + + remove_stale_objects(7) + for obj in expected_stale_objects: + was_deleted = not type(obj).objects.filter(id=obj.id).exists() + assert was_deleted diff --git a/setup.py b/setup.py index f1b4dd76..230df217 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ 'pytest>=3.6', 'pytest-cov', 'pytest-django', + 'freezegun', 'coveralls', 'coverage<=6.5.0', 'flake8', From 6b3784353706c58317455ba142437bc653d4e942 Mon Sep 17 00:00:00 2001 From: msj Date: Mon, 20 Feb 2023 12:17:43 -0500 Subject: [PATCH 16/29] Fix help tags --- pupa/cli/commands/clean.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pupa/cli/commands/clean.py b/pupa/cli/commands/clean.py index d29b9b91..6913a8ef 100644 --- a/pupa/cli/commands/clean.py +++ b/pupa/cli/commands/clean.py @@ -58,21 +58,21 @@ def add_args(self): type=int, default=7, help=( - "Objects not seen in this many days will be deleted from the database" + "objects not seen in this many days will be deleted from the database" ), ) self.add_argument( "--report", action="store_true", help=( - "Will only generate a report of what objects this command" - "would delete without making any changes to the database" + "generate a report of what objects this command" + " would delete without making any changes to the database" ), ) self.add_argument( "--noinput", action="store_true", - help="Will delete objects without getting user confirmation", + help="delete objects without getting user confirmation", ) def handle(self, args, other): @@ -89,7 +89,7 @@ def handle(self, args, other): if not args.noinput: print( "This will permanently delete all objects from your database" - f"that have not been scraped within the last {args.window}" + f" that have not been scraped within the last {args.window}" " days. Are you sure? (Y/N)" ) resp = input() From 11ffc831da045f710a67c2735d9c0c686ac3bd3c Mon Sep 17 00:00:00 2001 From: msj Date: Tue, 14 Mar 2023 08:38:53 -0400 Subject: [PATCH 17/29] Use generator --- pupa/cli/commands/clean.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pupa/cli/commands/clean.py b/pupa/cli/commands/clean.py index 6913a8ef..879b79ab 100644 --- a/pupa/cli/commands/clean.py +++ b/pupa/cli/commands/clean.py @@ -1,4 +1,3 @@ -import itertools from datetime import datetime, timezone, timedelta import django @@ -29,20 +28,18 @@ def get_stale_objects(window): # Check all subclasses of OCDBase models = get_subclasses(ocd_apps, OCDBase) - results = [] for model in models: # Jurisdictions are protected from deletion if "Jurisdiction" not in model.__name__: cutoff_date = datetime.now(tz=timezone.utc) - timedelta(days=window) - results.append(model.objects.filter(last_seen__lte=cutoff_date)) - - return itertools.chain(*results) + yield from model.objects.filter(last_seen__lte=cutoff_date).iterator() def remove_stale_objects(window): """ Remove all database objects that haven't seen been in {window} days. """ + for obj in get_stale_objects(window): print(f"Deleting {obj}...") obj.delete() From 8ad4c19ccc4791e67f2325a9e9f8c61c0a577b55 Mon Sep 17 00:00:00 2001 From: msj Date: Tue, 14 Mar 2023 08:42:43 -0400 Subject: [PATCH 18/29] Refactor clean command --- pupa/cli/commands/clean.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/pupa/cli/commands/clean.py b/pupa/cli/commands/clean.py index 879b79ab..8664d20f 100644 --- a/pupa/cli/commands/clean.py +++ b/pupa/cli/commands/clean.py @@ -35,16 +35,6 @@ def get_stale_objects(window): yield from model.objects.filter(last_seen__lte=cutoff_date).iterator() -def remove_stale_objects(window): - """ - Remove all database objects that haven't seen been in {window} days. - """ - - for obj in get_stale_objects(window): - print(f"Deleting {obj}...") - obj.delete() - - class Command(BaseCommand): name = "clean" help = "Removes database objects that haven't been seen in recent scrapes" @@ -72,6 +62,22 @@ def add_args(self): help="delete objects without getting user confirmation", ) + def remove_stale_objects(window): + """ + Remove all database objects that haven't seen been in {window} days. + """ + + for obj in get_stale_objects(window): + print(f"Deleting {obj}...") + obj.delete() + + def report_stale_objects(window): + """ + Print all database objects that haven't seen been in {window} days. + """ + for obj in get_stale_objects(window): + print(obj) + def handle(self, args, other): django.setup() @@ -80,8 +86,7 @@ def handle(self, args, other): "These objects have not been seen in a scrape within the last" f" {args.window} days:" ) - for obj in get_stale_objects(args.window): - print(obj) + self.report_stale_objects() else: if not args.noinput: print( @@ -97,4 +102,4 @@ def handle(self, args, other): "Removing objects that haven't been seen in a scrape within" f" the last {args.window} days..." ) - remove_stale_objects(args.window) + self.remove_stale_objects(args.window) From 1ef9a90062c5ca469b08a44e4e82aeaab1c97e6b Mon Sep 17 00:00:00 2001 From: msj Date: Tue, 14 Mar 2023 14:36:17 -0400 Subject: [PATCH 19/29] Refactor clean tests --- pupa/tests/clean/test_clean.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/pupa/tests/clean/test_clean.py b/pupa/tests/clean/test_clean.py index 3f859a51..0e75c282 100644 --- a/pupa/tests/clean/test_clean.py +++ b/pupa/tests/clean/test_clean.py @@ -1,10 +1,12 @@ import pytest +import argparse + from datetime import datetime, timezone, timedelta from freezegun import freeze_time from opencivicdata.core.models import Person, Organization, Jurisdiction, Division -from pupa.cli.commands.clean import get_stale_objects, remove_stale_objects +from pupa.cli.commands.clean import Command def create_jurisdiction(): @@ -12,8 +14,24 @@ def create_jurisdiction(): return Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us") +@pytest.fixture +def subparsers(): + parser = argparse.ArgumentParser("pupa", description="pupa CLI") + parser.add_argument("--debug", action="store_true", help="open debugger on error") + parser.add_argument( + "--loglevel", + default="INFO", + help=( + "set log level. options are: " + "DEBUG|INFO|WARNING|ERROR|CRITICAL " + "(default is INFO)" + ), + ) + return parser.add_subparsers(dest="subcommand") + + @pytest.mark.django_db -def test_get_stale_objects(): +def test_get_stale_objects(subparsers): j = create_jurisdiction() o = Organization.objects.create(name="WWE", jurisdiction_id="jid") p = Person.objects.create(name="George Washington", family_name="Washington") @@ -26,11 +44,11 @@ def test_get_stale_objects(): p = Person.objects.create(name="Thomas Jefferson", family_name="Jefferson") j.save() p.memberships.create(organization=o) - assert set(get_stale_objects(7)) == expected_stale_objects + assert set(Command(subparsers).get_stale_objects(7)) == expected_stale_objects @pytest.mark.django_db -def test_remove_stale_objects(): +def test_remove_stale_objects(subparsers): j = create_jurisdiction() o = Organization.objects.create(name="WWE", jurisdiction_id="jid") p = Person.objects.create(name="George Washington", family_name="Washington") @@ -45,7 +63,7 @@ def test_remove_stale_objects(): j.save() - remove_stale_objects(7) + Command(subparsers).remove_stale_objects(7) for obj in expected_stale_objects: was_deleted = not type(obj).objects.filter(id=obj.id).exists() assert was_deleted From f7519596c115fda9773a6efb4e8db4da6b5d3d64 Mon Sep 17 00:00:00 2001 From: msj Date: Tue, 14 Mar 2023 16:33:13 -0400 Subject: [PATCH 20/29] Refactor clean command --- pupa/cli/commands/clean.py | 46 +++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/pupa/cli/commands/clean.py b/pupa/cli/commands/clean.py index 8664d20f..3b58a80e 100644 --- a/pupa/cli/commands/clean.py +++ b/pupa/cli/commands/clean.py @@ -1,3 +1,4 @@ +import sys from datetime import datetime, timezone, timedelta import django @@ -17,24 +18,6 @@ def get_subclasses(app_list, abstract_class): return result -def get_stale_objects(window): - """ - Find all database objects that haven't seen been in {window} days. - """ - - from opencivicdata.core.models.base import OCDBase - - ocd_apps = ["core", "legislative"] - # Check all subclasses of OCDBase - models = get_subclasses(ocd_apps, OCDBase) - - for model in models: - # Jurisdictions are protected from deletion - if "Jurisdiction" not in model.__name__: - cutoff_date = datetime.now(tz=timezone.utc) - timedelta(days=window) - yield from model.objects.filter(last_seen__lte=cutoff_date).iterator() - - class Command(BaseCommand): name = "clean" help = "Removes database objects that haven't been seen in recent scrapes" @@ -62,20 +45,37 @@ def add_args(self): help="delete objects without getting user confirmation", ) - def remove_stale_objects(window): + def get_stale_objects(self, window): + """ + Find all database objects that haven't seen been in {window} days. + """ + + from opencivicdata.core.models.base import OCDBase + + ocd_apps = ["core", "legislative"] + # Check all subclasses of OCDBase + models = get_subclasses(ocd_apps, OCDBase) + + for model in models: + # Jurisdictions are protected from deletion + if "Jurisdiction" not in model.__name__: + cutoff_date = datetime.now(tz=timezone.utc) - timedelta(days=window) + yield from model.objects.filter(last_seen__lte=cutoff_date).iterator() + + def remove_stale_objects(self, window): """ Remove all database objects that haven't seen been in {window} days. """ - for obj in get_stale_objects(window): + for obj in self.get_stale_objects(window): print(f"Deleting {obj}...") obj.delete() - def report_stale_objects(window): + def report_stale_objects(self, window): """ Print all database objects that haven't seen been in {window} days. """ - for obj in get_stale_objects(window): + for obj in self.get_stale_objects(window): print(obj) def handle(self, args, other): @@ -96,7 +96,7 @@ def handle(self, args, other): ) resp = input() if resp != "Y": - return + sys.exit() print( "Removing objects that haven't been seen in a scrape within" From c3efa0d58b32f5ccc2dc52ae8e208cafd7cd3efa Mon Sep 17 00:00:00 2001 From: msj Date: Tue, 14 Mar 2023 16:33:41 -0400 Subject: [PATCH 21/29] Add integration test --- pupa/tests/clean/test_clean.py | 52 +++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/pupa/tests/clean/test_clean.py b/pupa/tests/clean/test_clean.py index 0e75c282..c845cb2a 100644 --- a/pupa/tests/clean/test_clean.py +++ b/pupa/tests/clean/test_clean.py @@ -1,6 +1,5 @@ import pytest import argparse - from datetime import datetime, timezone, timedelta from freezegun import freeze_time @@ -9,11 +8,6 @@ from pupa.cli.commands.clean import Command -def create_jurisdiction(): - Division.objects.create(id="ocd-division/country:us", name="USA") - return Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us") - - @pytest.fixture def subparsers(): parser = argparse.ArgumentParser("pupa", description="pupa CLI") @@ -30,9 +24,14 @@ def subparsers(): return parser.add_subparsers(dest="subcommand") +def create_jurisdiction(): + Division.objects.create(id="ocd-division/country:us", name="USA") + return Jurisdiction.objects.create(id="jid", division_id="ocd-division/country:us") + + @pytest.mark.django_db def test_get_stale_objects(subparsers): - j = create_jurisdiction() + _ = create_jurisdiction() o = Organization.objects.create(name="WWE", jurisdiction_id="jid") p = Person.objects.create(name="George Washington", family_name="Washington") m = p.memberships.create(organization=o) @@ -42,14 +41,13 @@ def test_get_stale_objects(subparsers): a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) with freeze_time(a_week_from_now): p = Person.objects.create(name="Thomas Jefferson", family_name="Jefferson") - j.save() p.memberships.create(organization=o) assert set(Command(subparsers).get_stale_objects(7)) == expected_stale_objects @pytest.mark.django_db def test_remove_stale_objects(subparsers): - j = create_jurisdiction() + _ = create_jurisdiction() o = Organization.objects.create(name="WWE", jurisdiction_id="jid") p = Person.objects.create(name="George Washington", family_name="Washington") m = p.memberships.create(organization=o) @@ -61,9 +59,41 @@ def test_remove_stale_objects(subparsers): p = Person.objects.create(name="Thomas Jefferson", family_name="Jefferson") p.memberships.create(organization=o) - j.save() - Command(subparsers).remove_stale_objects(7) for obj in expected_stale_objects: was_deleted = not type(obj).objects.filter(id=obj.id).exists() assert was_deleted + + +@pytest.mark.django_db +def test_clean_command(subparsers): + _ = create_jurisdiction() + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") + + stale_person = Person.objects.create( + name="George Washington", family_name="Washington" + ) + stale_membership = stale_person.memberships.create(organization=o) + + a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) + with freeze_time(a_week_from_now): + not_stale_person = Person.objects.create( + name="Thomas Jefferson", family_name="Jefferson" + ) + not_stale_membership = not_stale_person.memberships.create(organization=o) + o.save() # Update org's last_seen field + + # Call clean command + Command(subparsers).handle( + argparse.Namespace(noinput=True, report=False, window=7), [] + ) + + expected_stale_objects = {stale_person, stale_membership} + for obj in expected_stale_objects: + was_deleted = not type(obj).objects.filter(id=obj.id).exists() + assert was_deleted + + expected_not_stale_objects = {o, not_stale_person, not_stale_membership} + for obj in expected_not_stale_objects: + was_not_deleted = type(obj).objects.filter(id=obj.id).exists() + assert was_not_deleted From 3d9717e788ac917b634b48b0ef3a1487d41f9cf6 Mon Sep 17 00:00:00 2001 From: msj Date: Tue, 14 Mar 2023 16:36:19 -0400 Subject: [PATCH 22/29] Add count to clean command prompt --- pupa/cli/commands/clean.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pupa/cli/commands/clean.py b/pupa/cli/commands/clean.py index 3b58a80e..a321ef49 100644 --- a/pupa/cli/commands/clean.py +++ b/pupa/cli/commands/clean.py @@ -90,7 +90,9 @@ def handle(self, args, other): else: if not args.noinput: print( - "This will permanently delete all objects from your database" + f"This will permanently delete" + f" {len(self.get_stale_objects(args.window))}" + " objects from your database" f" that have not been scraped within the last {args.window}" " days. Are you sure? (Y/N)" ) From 9f51bbf3f9c5ca8fd3ba048f61c02221ee31c135 Mon Sep 17 00:00:00 2001 From: M J <36973363+antidipyramid@users.noreply.github.com> Date: Thu, 16 Mar 2023 20:32:59 +0000 Subject: [PATCH 23/29] Update pupa/cli/commands/clean.py Fix clean warning prompt Co-authored-by: hannah cushman garland --- pupa/cli/commands/clean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pupa/cli/commands/clean.py b/pupa/cli/commands/clean.py index a321ef49..6a4cb1c6 100644 --- a/pupa/cli/commands/clean.py +++ b/pupa/cli/commands/clean.py @@ -91,7 +91,7 @@ def handle(self, args, other): if not args.noinput: print( f"This will permanently delete" - f" {len(self.get_stale_objects(args.window))}" + f" {len(list(self.get_stale_objects(args.window)))}" " objects from your database" f" that have not been scraped within the last {args.window}" " days. Are you sure? (Y/N)" From fcd8ca684c821ffcd794769aa9543d20e50d4216 Mon Sep 17 00:00:00 2001 From: msj Date: Mon, 3 Apr 2023 14:25:23 -0400 Subject: [PATCH 24/29] Prepare for 0.10.3 release --- CHANGELOG.md | 6 ++++++ pupa/__init__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ff6da09..9bf034df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # pupa changelog +## 0.10.3 - April 3 2023 + +Improvements: + +* Add `pupa clean` command to delete database objects that haven't been seen in recent scrapes + ## 0.10.2 - March 18 2021 Improvements: diff --git a/pupa/__init__.py b/pupa/__init__.py index 563e2bd6..c6adbb08 100644 --- a/pupa/__init__.py +++ b/pupa/__init__.py @@ -1 +1 @@ -__version__ = "0.10.2" # pragma: no cover +__version__ = "0.10.3" # pragma: no cover From 6f2fdda19cd55a434f19f1b769728658f9daf8c1 Mon Sep 17 00:00:00 2001 From: msj Date: Mon, 3 Apr 2023 14:41:59 -0400 Subject: [PATCH 25/29] Update gh actions ubuntu --- .github/workflows/package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml index 02923991..dd3c9731 100644 --- a/.github/workflows/package.yml +++ b/.github/workflows/package.yml @@ -10,7 +10,7 @@ on: jobs: test: - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 services: postgres: image: postgis/postgis:10-2.5 @@ -55,7 +55,7 @@ jobs: build: needs: test name: Build package and upload to PyPI - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Build and publish From 810c57706d47273048564fccdaa7eaa6fd6cf001 Mon Sep 17 00:00:00 2001 From: msj Date: Mon, 8 May 2023 08:50:10 -0400 Subject: [PATCH 26/29] Bump up to 0.11.0 --- CHANGELOG.md | 2 +- pupa/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bf034df..fb50b017 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # pupa changelog -## 0.10.3 - April 3 2023 +## 0.11.0 - April 3 2023 Improvements: diff --git a/pupa/__init__.py b/pupa/__init__.py index c6adbb08..3ec08441 100644 --- a/pupa/__init__.py +++ b/pupa/__init__.py @@ -1 +1 @@ -__version__ = "0.10.3" # pragma: no cover +__version__ = "0.11.0" # pragma: no cover From afb031ca37a9725bbcaac0c8d7bc7dcbc4a11ba9 Mon Sep 17 00:00:00 2001 From: msj Date: Mon, 8 May 2023 11:16:10 -0400 Subject: [PATCH 27/29] Update python-opencivicdata dependency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 230df217..3307975d 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ pupa = pupa.cli.__main__:main''', install_requires=[ 'Django>=2.2', - 'opencivicdata @ git+https://github.com/opencivicdata/python-opencivicdata@last-seen', + 'opencivicdata>=3.3.0', 'dj_database_url>=0.3.0', 'scrapelib>=1.0', 'jsonschema>=3.0.0', # TODO: Drop alpha release once stable release available From 4246e6bec1163f5ba1cfe156cc45af486a437714 Mon Sep 17 00:00:00 2001 From: msj Date: Thu, 8 Jun 2023 12:34:17 -0400 Subject: [PATCH 28/29] Add clean cli failsafe --- pupa/cli/commands/clean.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pupa/cli/commands/clean.py b/pupa/cli/commands/clean.py index 6a4cb1c6..bc4dd831 100644 --- a/pupa/cli/commands/clean.py +++ b/pupa/cli/commands/clean.py @@ -88,11 +88,21 @@ def handle(self, args, other): ) self.report_stale_objects() else: - if not args.noinput: + num_stale_objects = len(list(self.get_stale_objects(args.window))) + + if args.noinput: + # Fail-safe to avoid deleting a large amount of objects + # without explicit confimation + if num_stale_objects > 10: + print( + "This command would delete more than 10 objects." + "If you're sure, re-run without --noinput to provide confirmation." + ) + sys.exit(1) + else: print( f"This will permanently delete" - f" {len(list(self.get_stale_objects(args.window)))}" - " objects from your database" + f" {num_stale_objects} objects from your database" f" that have not been scraped within the last {args.window}" " days. Are you sure? (Y/N)" ) From 1b9a31308bfe7cfb8533ca91ffc2da6107f8c47c Mon Sep 17 00:00:00 2001 From: msj Date: Thu, 8 Jun 2023 12:56:15 -0400 Subject: [PATCH 29/29] Add failsafe test --- pupa/tests/clean/test_clean.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pupa/tests/clean/test_clean.py b/pupa/tests/clean/test_clean.py index c845cb2a..d4494fc1 100644 --- a/pupa/tests/clean/test_clean.py +++ b/pupa/tests/clean/test_clean.py @@ -97,3 +97,25 @@ def test_clean_command(subparsers): for obj in expected_not_stale_objects: was_not_deleted = type(obj).objects.filter(id=obj.id).exists() assert was_not_deleted + + +@pytest.mark.django_db +def test_clean_command_failsafe(subparsers): + _ = create_jurisdiction() + o = Organization.objects.create(name="WWE", jurisdiction_id="jid") + + stale_people = [ + Person.objects.create(name="George Washington", family_name="Washington") + for i in range(20) + ] + stale_memberships = [ # noqa + p.memberships.create(organization=o) for p in stale_people + ] + + a_week_from_now = datetime.now(tz=timezone.utc) + timedelta(days=7) + with freeze_time(a_week_from_now): + with pytest.raises(SystemExit): + # Should trigger failsafe exist when deleting more than 10 objects + Command(subparsers).handle( + argparse.Namespace(noinput=True, report=False, window=7), [] + )