Skip to content

Commit 4493830

Browse files
authored
Merge pull request #333 from parthsharma2/dev
Update PR worker
2 parents b302a98 + e6d5ea6 commit 4493830

File tree

1 file changed

+125
-2
lines changed

1 file changed

+125
-2
lines changed

workers/gh_pr_worker/gh_pr_worker/worker.py

+125-2
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,24 @@ def __init__(self, config, task=None):
149149
rs = pd.read_sql(max_msg_id_SQL, self.db)
150150
msg_start = int(rs.iloc[0]["msg_id"]) if rs.iloc[0]["msg_id"] is not None else 25150
151151

152+
max_pr_labels_id_SQL = s.sql.text("""
153+
SELECT max(pr_label_id) AS label_id FROM pull_request_labels
154+
""")
155+
rs = pd.read_sql(max_pr_labels_id_SQL, self.db)
156+
label_start = int(rs.iloc[0]['label_id']) if rs.iloc[0]['label_id'] else 25150
157+
158+
max_pr_event_id_SQL = s.sql.text("""
159+
SELECT MAX(pr_event_id) AS event_id FROM pull_request_events
160+
""")
161+
rs = pd.read_sql(max_pr_event_id_SQL, self.db)
162+
event_start = int(rs.iloc[0]['event_id']) if rs.iloc[0]['event_id'] else 25150
163+
152164
# Increment so we are ready to insert the 'next one' of each of these most recent ids
153165
self.pr_id_inc = (pr_start + 1)
154166
self.cntrb_id_inc = (cntrb_start + 1)
155167
self.msg_id_inc = (msg_start + 1)
168+
self.label_id_inc = (label_start + 1)
169+
self.event_id_inc = (event_start + 1)
156170

157171
# self.run()
158172

@@ -384,12 +398,121 @@ def query_pr(self, entry_info):
384398

385399
result = self.db.execute(self.pull_requests_table.insert().values(pr))
386400
logging.info(f"Primary Key inserted pull_requests table: {result.inserted_primary_key}")
387-
logging.info(f"Inserted PR data for {owner}/{repo}")
388401

402+
self.query_labels(pr_dict['labels'], self.pr_id_inc)
403+
self.query_pr_events(owner, repo, pr_dict['number'], self.pr_id_inc)
404+
405+
logging.info(f"Inserted PR data for {owner}/{repo}")
389406
self.results_counter += 1
390407
self.pr_id_inc += 1
391408

392-
self.register_task_completion(entry_info, 'pull_requests')
409+
self.register_task_completion(entry_info, 'pull_requests')
410+
411+
def query_labels(self, labels, pr_id):
    """Insert any not-yet-recorded GitHub labels for one pull request.

    Compares the supplied label payloads against the existing rows of the
    ``pull_request_labels`` table (matched on the GitHub label ``id`` via
    the ``pr_src_id`` column) and inserts only the new ones, advancing
    ``self.label_id_inc`` and ``self.results_counter`` once per row written.

    :param labels: list of label dicts as returned by the GitHub API, each
        with 'id', 'node_id', 'url', 'name', 'color' and 'default' keys
    :param pr_id: Augur-side pull_request_id these labels belong to
    """
    logging.info('Querying PR Labels')

    pseudo_key_gh = 'id'
    pseudo_key_augur = 'pr_src_id'  # fixed local-name typo: was 'psuedo_key_augur'
    table = 'pull_request_labels'
    pr_labels_table_values = self.get_table_values({pseudo_key_augur: pseudo_key_gh}, [table])

    new_labels = self.check_duplicates(labels, pr_labels_table_values, pseudo_key_gh)

    if len(new_labels) == 0:
        logging.info('No new labels to add')
        return

    logging.info(f'Found {len(new_labels)} labels')

    for label_dict in new_labels:

        label = {
            'pr_label_id': self.label_id_inc,
            'pull_request_id': pr_id,
            'pr_src_id': label_dict['id'],
            'pr_src_node_id': label_dict['node_id'],
            'pr_src_url': label_dict['url'],
            'pr_src_description': label_dict['name'],
            'pr_src_color': label_dict['color'],
            'pr_src_default_bool': label_dict['default'],
            'tool_source': self.tool_source,
            'tool_version': self.tool_version,
            'data_source': self.data_source,
            # NOTE(review): local time is stamped with a 'Z' (UTC) suffix;
            # presumably this should be UTC — confirm project-wide convention.
            'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        }

        result = self.db.execute(self.pull_request_labels_table.insert().values(label))
        logging.info(f"Primary Key inserted in pull_request_labels table: {result.inserted_primary_key}")
        logging.info(f"Inserted PR Labels data for PR with id {pr_id}")

        self.results_counter += 1
        self.label_id_inc += 1
449+
450+
def query_pr_events(self, owner, repo, gh_pr_no, pr_id):
    """Fetch and insert GitHub issue events for one pull request.

    Pages through the GitHub issue-events endpoint for the PR (following
    the API's ``Link: rel="next"`` headers), drops events already present
    in ``pull_request_events`` (matched on the GitHub event ``id``), and
    inserts the remainder, advancing ``self.event_id_inc`` and
    ``self.results_counter`` once per row. On any request/parsing error
    the PR's events are skipped entirely: the error is logged and nothing
    fetched so far is inserted.

    :param owner: repository owner login
    :param repo: repository name
    :param gh_pr_no: GitHub-side pull request (issue) number
    :param pr_id: Augur-side pull_request_id these events belong to
    """
    logging.info('Querying PR Events')

    url = (f'https://api.github.com/repos/{owner}/{repo}/issues/'
        + f'{gh_pr_no}/events?per_page=100')

    pseudo_key_gh = 'id'
    pseudo_key_augur = 'pr_event_id'  # fixed local-name typo: was 'psuedo_key_augur'
    table = 'pull_request_events'
    pr_events_table_values = self.get_table_values({pseudo_key_augur: pseudo_key_gh}, [table])

    pr_events = []
    try:
        while True:
            r = requests.get(url, headers=self.headers)
            self.update_rate_limit(r)

            j = r.json()

            new_pr_events = self.check_duplicates(j, pr_events_table_values, pseudo_key_gh)

            if len(new_pr_events) == 0:
                logging.info('No new PR Events to add... Exiting Pagination')
                break
            else:
                pr_events += new_pr_events

            # Follow the pagination links the GitHub API supplies in the
            # Link response header; no 'next' means this was the last page.
            if 'next' not in r.links:
                break
            else:
                url = r.links['next']['url']
    except Exception as e:
        logging.error(f'Caught Exception on url {url}')
        logging.error(str(e))
        logging.info(f'Not adding PR events for PR {pr_id}')
        return

    for pr_event_dict in pr_events:

        # GitHub can emit "actor": null for some automated events, so guard
        # on truthiness rather than key presence ('actor' in dict would pass
        # and then crash with a TypeError on ['login']).
        if pr_event_dict.get('actor'):
            cntrb_id = self.find_id_from_login(pr_event_dict['actor']['login'])
        else:
            cntrb_id = None

        pr_event = {
            'pr_event_id': self.event_id_inc,
            'pull_request_id': pr_id,
            'cntrb_id': cntrb_id,
            'action': pr_event_dict['event'],
            'action_commit_hash': None,
            'created_at': pr_event_dict['created_at'],
            'issue_event_src_id': pr_event_dict['id'],
            'node_id': pr_event_dict['node_id'],
            'node_url': pr_event_dict['url'],
            'tool_source': self.tool_source,
            'tool_version': self.tool_version,
            'data_source': self.data_source,
            # NOTE(review): local time is stamped with a 'Z' (UTC) suffix;
            # presumably this should be UTC — confirm project-wide convention.
            'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        }

        result = self.db.execute(self.pull_request_events_table.insert().values(pr_event))
        logging.info(f"Primary Key inserted in pull_request_events table: {result.inserted_primary_key}")
        logging.info(f"Inserted PR Events data for PR with id {pr_id}")

        self.results_counter += 1
        self.event_id_inc += 1
393516

394517
def query_contributors(self, entry_info):
395518

0 commit comments

Comments
 (0)