Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update PR worker #333

Merged
merged 4 commits into from
Jul 25, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 125 additions & 2 deletions workers/gh_pr_worker/gh_pr_worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,24 @@ def __init__(self, config, task=None):
rs = pd.read_sql(max_msg_id_SQL, self.db)
msg_start = int(rs.iloc[0]["msg_id"]) if rs.iloc[0]["msg_id"] is not None else 25150

max_pr_labels_id_SQL = s.sql.text("""
SELECT max(pr_label_id) AS label_id FROM pull_request_labels
""")
rs = pd.read_sql(max_pr_labels_id_SQL, self.db)
label_start = int(rs.iloc[0]['label_id']) if rs.iloc[0]['label_id'] else 25150

max_pr_event_id_SQL = s.sql.text("""
SELECT MAX(pr_event_id) AS event_id FROM pull_request_events
""")
rs = pd.read_sql(max_pr_event_id_SQL, self.db)
event_start = int(rs.iloc[0]['event_id']) if rs.iloc[0]['event_id'] else 25150

# Increment so we are ready to insert the 'next one' of each of these most recent ids
self.pr_id_inc = (pr_start + 1)
self.cntrb_id_inc = (cntrb_start + 1)
self.msg_id_inc = (msg_start + 1)
self.label_id_inc = (label_start + 1)
self.event_id_inc = (event_start + 1)

# self.run()

Expand Down Expand Up @@ -384,12 +398,121 @@ def query_pr(self, entry_info):

result = self.db.execute(self.pull_requests_table.insert().values(pr))
logging.info(f"Primary Key inserted pull_requests table: {result.inserted_primary_key}")
logging.info(f"Inserted PR data for {owner}/{repo}")

self.query_labels(pr_dict['labels'], self.pr_id_inc)
self.query_pr_events(owner, repo, pr_dict['number'], self.pr_id_inc)

logging.info(f"Inserted PR data for {owner}/{repo}")
self.results_counter += 1
self.pr_id_inc += 1

self.register_task_completion(entry_info, 'pull_requests')
self.register_task_completion(entry_info, 'pull_requests')

def query_labels(self, labels, pr_id):
    """Insert any not-yet-recorded GitHub labels for a pull request.

    Compares the supplied label dicts against rows already present in the
    pull_request_labels table (matched on the GitHub label id) and inserts
    only the new ones, bumping self.label_id_inc for each row written.

    :param labels: list of label dicts as returned by the GitHub API
    :param pr_id: augur pull_request_id the labels belong to
    """
    logging.info('Querying PR Labels')

    # GitHub's 'id' field maps onto augur's 'pr_src_id' for duplicate checks.
    source_key = 'id'
    augur_key = 'pr_src_id'
    known_labels = self.get_table_values({augur_key: source_key}, ['pull_request_labels'])

    fresh_labels = self.check_duplicates(labels, known_labels, source_key)
    if not fresh_labels:
        logging.info('No new labels to add')
        return

    logging.info(f'Found {len(fresh_labels)} labels')

    for entry in fresh_labels:
        row = {
            'pr_label_id': self.label_id_inc,
            'pull_request_id': pr_id,
            'pr_src_id': entry['id'],
            'pr_src_node_id': entry['node_id'],
            'pr_src_url': entry['url'],
            'pr_src_description': entry['name'],
            'pr_src_color': entry['color'],
            'pr_src_default_bool': entry['default'],
            'tool_source': self.tool_source,
            'tool_version': self.tool_version,
            'data_source': self.data_source,
            'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        }

        insert_result = self.db.execute(self.pull_request_labels_table.insert().values(row))
        logging.info(f"Primary Key inserted in pull_request_labels table: {insert_result.inserted_primary_key}")
        logging.info(f"Inserted PR Labels data for PR with id {pr_id}")

        self.results_counter += 1
        self.label_id_inc += 1

def query_pr_events(self, owner, repo, gh_pr_no, pr_id):
    """Collect and insert GitHub issue events for a single pull request.

    Pages through the GitHub issue-events endpoint for the PR, filters out
    events already present in the pull_request_events table (matched on the
    GitHub event id), and inserts the remainder, bumping self.event_id_inc
    for each row written. Network or JSON failures abort event collection
    for this PR only.

    :param owner: repository owner login
    :param repo: repository name
    :param gh_pr_no: the PR's number on GitHub (issue number)
    :param pr_id: augur pull_request_id the events belong to
    """
    logging.info('Querying PR Events')

    url = (f'https://api.github.com/repos/{owner}/{repo}/issues/'
           + f'{gh_pr_no}/events?per_page=100')

    # GitHub's 'id' field maps onto augur's 'pr_event_id' for duplicate checks.
    pseudo_key_gh = 'id'
    pseudo_key_augur = 'pr_event_id'
    table = 'pull_request_events'
    pr_events_table_values = self.get_table_values({pseudo_key_augur: pseudo_key_gh}, [table])

    pr_events = []
    try:
        while True:
            r = requests.get(url, headers=self.headers)
            self.update_rate_limit(r)

            j = r.json()

            new_pr_events = self.check_duplicates(j, pr_events_table_values, pseudo_key_gh)

            # A page with no new events means we have caught up with what is
            # already stored, so stop paginating early.
            if len(new_pr_events) == 0:
                logging.info('No new PR Events to add... Exiting Pagination')
                break
            else:
                pr_events += new_pr_events

            if 'next' not in r.links:
                break
            else:
                url = r.links['next']['url']
    except Exception as e:
        # Best-effort: log and skip this PR's events rather than crash the worker.
        logging.error(f'Caught Exception on url {url}')
        logging.error(str(e))
        logging.info(f'Not adding PR events for PR {pr_id}')
        return

    for pr_event_dict in pr_events:

        # BUG FIX: GitHub can return "actor": null (e.g. events attributed to
        # deleted users). The old check `'actor' in pr_event_dict` was True in
        # that case and pr_event_dict['actor']['login'] raised TypeError.
        # Treat a null or missing actor uniformly as an unknown contributor.
        if pr_event_dict.get('actor'):
            cntrb_id = self.find_id_from_login(pr_event_dict['actor']['login'])
        else:
            cntrb_id = None

        pr_event = {
            'pr_event_id': self.event_id_inc,
            'pull_request_id': pr_id,
            'cntrb_id': cntrb_id,
            'action': pr_event_dict['event'],
            'action_commit_hash': None,
            'created_at': pr_event_dict['created_at'],
            'issue_event_src_id': pr_event_dict['id'],
            'node_id': pr_event_dict['node_id'],
            'node_url': pr_event_dict['url'],
            'tool_source': self.tool_source,
            'tool_version': self.tool_version,
            'data_source': self.data_source,
            'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        }

        result = self.db.execute(self.pull_request_events_table.insert().values(pr_event))
        logging.info(f"Primary Key inserted in pull_request_events table: {result.inserted_primary_key}")
        logging.info(f"Inserted PR Events data for PR with id {pr_id}")

        self.results_counter += 1
        self.event_id_inc += 1

def query_contributors(self, entry_info):

Expand Down