From 615e4fa7950a4c09f62aed6357d505e101cc475b Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 13 Aug 2021 11:57:23 -0500 Subject: [PATCH 1/5] Annoying python tabs Signed-off-by: Isaac Milarsky --- workers/worker_git_integration.py | 3666 +++++++++++++++-------------- 1 file changed, 1845 insertions(+), 1821 deletions(-) diff --git a/workers/worker_git_integration.py b/workers/worker_git_integration.py index 0d66415dea..10d2cde616 100644 --- a/workers/worker_git_integration.py +++ b/workers/worker_git_integration.py @@ -5,1832 +5,1856 @@ #This is a worker base subclass that adds the ability to query github/gitlab with the api key class WorkerGitInterfaceable(Worker): - def __init__(self, worker_type, config={}, given=[], models=[], data_tables=[], operations_tables=[], platform="github"): - super().__init__(worker_type, config, given, models, data_tables, operations_tables) - - self.config.update({ - 'gh_api_key': self.augur_config.get_value('Database', 'key'), - 'gitlab_api_key': self.augur_config.get_value('Database', 'gitlab_api_key') - }) - - #Fix loose attribute definition - self.headers = None - self.platform = platform - self.given = given - self.models = models - - self.specs = { - 'id': self.config['id'], # what the broker knows this worker as - 'location': self.config['location'], # host + port worker is running on (so broker can send tasks here) - 'qualifications': [ - { - 'given': self.given, # type of repo this worker can be given as a task - 'models': self.models # models this worker can fill for a repo as a task - } - ], - 'config': self.config - } - - # Send broker hello message - if self.config['offline_mode'] is False: - self.connect_to_broker() + def __init__(self, worker_type, config={}, given=[], models=[], data_tables=[], operations_tables=[], platform="github"): + super().__init__(worker_type, config, given, models, data_tables, operations_tables) + + self.config.update({ + 'gh_api_key': self.augur_config.get_value('Database', 'key'), + 'gitlab_api_key': self.augur_config.get_value('Database', 'gitlab_api_key') + }) + + #Fix loose attribute definition + self.headers = None + self.platform = platform + self.given = given + self.models = models + + self.specs = { + 'id': self.config['id'], # what the broker knows this worker as + 'location': self.config['location'], # host + port worker is running on (so broker can send tasks here) + 'qualifications': [ + { + 'given': self.given, # type of repo this worker can be given as a task + 'models': self.models # models this worker can fill for a repo as a task + } + ], + 'config': self.config + } + + # Send broker hello message + if self.config['offline_mode'] is False: + self.connect_to_broker() # Attempts to determine if these attributes exist # If not, it creates them with default values - try: - self.tool_source - self.tool_version - self.data_source - except AttributeError: - self.tool_source = 'Augur Worker Testing' - self.tool_version = '0.0.0' - self.data_source = 'Augur Worker Testing' - - #database interface, additional functionality with github interface. 
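Side note on the __init__ signature above (an observation for a follow-up, not part of this whitespace patch): config={}, given=[] and models=[] are mutable default arguments, and Python evaluates defaults once at definition time, so workers constructed without explicit arguments share the same objects. A minimal, self-contained sketch of the behavior and of the usual None-sentinel fix, using a hypothetical add_model helper rather than the real constructor:

def add_model(models=[]):          # shared default, same shape as the signature above
    models.append('contributors')
    return models

print(add_model())                 # ['contributors']
print(add_model())                 # ['contributors', 'contributors']  <- state leaks across calls

def add_model_safe(models=None):   # common fix: None sentinel, fresh list per call
    models = [] if models is None else models
    models.append('contributors')
    return models

print(add_model_safe())            # ['contributors']
print(add_model_safe())            # ['contributors']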
- def initialize_database_connections(self): - super().initialize_database_connections() - # Organize different api keys/oauths available - self.logger.info("Initializing API key.") - if 'gh_api_key' in self.config or 'gitlab_api_key' in self.config: - try: - self.init_oauths(self.platform) - except AttributeError: - self.logger.error("Worker not configured to use API key!") - else: - self.oauths = [{'oauth_id': 0}] - - def find_id_from_login(self, login, platform='github'): - """ Retrieves our contributor table primary key value for the contributor with - the given GitHub login credentials, if this contributor is not there, then - they get inserted. - - :param login: String, the GitHub login username to find the primary key id for - :return: Integer, the id of the row in our database with the matching GitHub login - """ - idSQL = s.sql.text(""" - SELECT cntrb_id FROM contributors WHERE cntrb_login = '{}' \ - AND LOWER(data_source) = '{} api' - """.format(login, platform)) - - rs = pd.read_sql(idSQL, self.db, params={}) - data_list = [list(row) for row in rs.itertuples(index=False)] - try: - return data_list[0][0] - except: - self.logger.info('contributor needs to be added...') - - if platform == 'github': - cntrb_url = ("https://api.github.com/users/" + login) - elif platform == 'gitlab': - cntrb_url = ("https://gitlab.com/api/v4/users?username=" + login ) - self.logger.info("Hitting endpoint: {} ...\n".format(cntrb_url)) + try: + self.tool_source + self.tool_version + self.data_source + except AttributeError: + self.tool_source = 'Augur Worker Testing' + self.tool_version = '0.0.0' + self.data_source = 'Augur Worker Testing' + + #database interface, additional functionality with github interface. + def initialize_database_connections(self): + super().initialize_database_connections() + # Organize different api keys/oauths available + self.logger.info("Initializing API key.") + if 'gh_api_key' in self.config or 'gitlab_api_key' in self.config: + try: + self.init_oauths(self.platform) + except AttributeError: + self.logger.error("Worker not configured to use API key!") + else: + self.oauths = [{'oauth_id': 0}] + + def find_id_from_login(self, login, platform='github'): + """ Retrieves our contributor table primary key value for the contributor with + the given GitHub login credentials, if this contributor is not there, then + they get inserted. + + :param login: String, the GitHub login username to find the primary key id for + :return: Integer, the id of the row in our database with the matching GitHub login + """ + idSQL = s.sql.text(""" + SELECT cntrb_id FROM contributors WHERE cntrb_login = '{}' \ + AND LOWER(data_source) = '{} api' + """.format(login, platform)) + + rs = pd.read_sql(idSQL, self.db, params={}) + data_list = [list(row) for row in rs.itertuples(index=False)] + try: + return data_list[0][0] + except: + self.logger.info('contributor needs to be added...') + + if platform == 'github': + cntrb_url = ("https://api.github.com/users/" + login) + elif platform == 'gitlab': + cntrb_url = ("https://gitlab.com/api/v4/users?username=" + login ) + self.logger.info("Hitting endpoint: {} ...\n".format(cntrb_url)) # Possible infinite loop if this request never succeeds? - while True: - try: - r = requests.get(url=cntrb_url, headers=self.headers) - break - except TimeoutError as e: - self.logger.info("Request timed out. 
Sleeping 10 seconds and trying again...\n") - time.sleep(30) + while True: + try: + r = requests.get(url=cntrb_url, headers=self.headers) + break + except TimeoutError as e: + self.logger.info("Request timed out. Sleeping 10 seconds and trying again...\n") + time.sleep(30) - self.update_rate_limit(r) - contributor = r.json() + self.update_rate_limit(r) + contributor = r.json() # Used primarily for the Gitlab block below - company = None - location = None - email = None - if 'company' in contributor: - company = contributor['company'] - if 'location' in contributor: - location = contributor['location'] - if 'email' in contributor: - email = contributor['email'] - - - if platform == 'github': - cntrb = { - 'cntrb_login': contributor['login'] if 'login' in contributor else None, - 'cntrb_email': contributor['email'] if 'email' in contributor else None, - 'cntrb_company': contributor['company'] if 'company' in contributor else None, - 'cntrb_location': contributor['location'] if 'location' in contributor else None, - 'cntrb_created_at': contributor['created_at'] if 'created_at' in contributor else None, - 'cntrb_canonical': None, - 'gh_user_id': contributor['id'] if 'id' in contributor else None, - 'gh_login': contributor['login'] if 'login' in contributor else None, - 'gh_url': contributor['url'] if 'url' in contributor else None, - 'gh_html_url': contributor['html_url'] if 'html_url' in contributor else None, - 'gh_node_id': contributor['node_id'] if 'node_id' in contributor else None, - 'gh_avatar_url': contributor['avatar_url'] if 'avatar_url' in contributor else None, - 'gh_gravatar_id': contributor['gravatar_id'] if 'gravatar_id' in contributor else None, - 'gh_followers_url': contributor['followers_url'] if 'followers_url' in contributor else None, - 'gh_following_url': contributor['following_url'] if 'following_url' in contributor else None, - 'gh_gists_url': contributor['gists_url'] if 'gists_url' in contributor else None, - 'gh_starred_url': contributor['starred_url'] if 'starred_url' in contributor else None, - 'gh_subscriptions_url': contributor['subscriptions_url'] if 'subscriptions_url' in contributor else None, - 'gh_organizations_url': contributor['organizations_url'] if 'organizations_url' in contributor else None, - 'gh_repos_url': contributor['repos_url'] if 'repos_url' in contributor else None, - 'gh_events_url': contributor['events_url'] if 'events_url' in contributor else None, - 'gh_received_events_url': contributor['received_events_url'] if 'received_events_url' in contributor else None, - 'gh_type': contributor['type'] if 'type' in contributor else None, - 'gh_site_admin': contributor['site_admin'] if 'site_admin' in contributor else None, - 'tool_source': self.tool_source, - 'tool_version': self.tool_version, - 'data_source': self.data_source - } - - elif platform == 'gitlab': - cntrb = { - 'cntrb_login': contributor[0]['username'] if 'username' in contributor[0] else None, - 'cntrb_email': email, - 'cntrb_company': company, - 'cntrb_location': location, - 'cntrb_created_at': contributor[0]['created_at'] if 'created_at' in contributor[0] else None, - 'cntrb_canonical': None, - 'gh_user_id': contributor[0]['id'], - 'gh_login': contributor[0]['username'], - 'gh_url': contributor[0]['web_url'], - 'gh_html_url': None, - 'gh_node_id': None, - 'gh_avatar_url': contributor[0]['avatar_url'], - 'gh_gravatar_id': None, - 'gh_followers_url': None, - 'gh_following_url': None, - 'gh_gists_url': None, - 'gh_starred_url': None, - 'gh_subscriptions_url': None, - 
'gh_organizations_url': None, - 'gh_repos_url': None, - 'gh_events_url': None, - 'gh_received_events_url': None, - 'gh_type': None, - 'gh_site_admin': None, - 'tool_source': self.tool_source, - 'tool_version': self.tool_version, - 'data_source': self.data_source - } - result = self.db.execute(self.contributors_table.insert().values(cntrb)) - self.logger.info("Primary key inserted into the contributors table: " + str(result.inserted_primary_key)) - self.results_counter += 1 - self.cntrb_id_inc = int(result.inserted_primary_key[0]) - self.logger.info(f"Inserted contributor: {cntrb['cntrb_login']}\n") - - return self.find_id_from_login(login, platform) - - #Blatently only for api key usage - def init_oauths(self, platform='github'): - - self.oauths = [] - self.headers = None - self.logger.info("Trying initialization.") - # Make a list of api key in the config combined w keys stored in the database - # Select endpoint to hit solely to retrieve rate limit - # information from headers of the response - # Adjust header keys needed to fetch rate limit information from the API responses - if platform == 'github': - url = "https://api.github.com/users/gabe-heim" - oauthSQL = s.sql.text(""" - SELECT * FROM worker_oauth WHERE access_token <> '{}' and platform = 'github' - """.format(self.config['gh_api_key'])) - key_name = 'gh_api_key' - rate_limit_header_key = "X-RateLimit-Remaining" - rate_limit_reset_header_key = "X-RateLimit-Reset" - elif platform == 'gitlab': - url = "https://gitlab.com/api/v4/version" - oauthSQL = s.sql.text(""" - SELECT * FROM worker_oauth WHERE access_token <> '{}' and platform = 'gitlab' - """.format(self.config['gitlab_api_key'])) - key_name = 'gitlab_api_key' - rate_limit_header_key = 'ratelimit-remaining' - rate_limit_reset_header_key = 'ratelimit-reset' - - for oauth in [{'oauth_id': 0, 'access_token': self.config[key_name]}] + json.loads( - pd.read_sql(oauthSQL, self.helper_db, params={}).to_json(orient="records") - ): - if platform == 'github': - self.headers = {'Authorization': 'token %s' % oauth['access_token']} - elif platform == 'gitlab': - self.headers = {'Authorization': 'Bearer %s' % oauth['access_token']} - response = requests.get(url=url, headers=self.headers) - self.oauths.append({ - 'oauth_id': oauth['oauth_id'], - 'access_token': oauth['access_token'], - 'rate_limit': int(response.headers[rate_limit_header_key]), - 'seconds_to_reset': ( - datetime.datetime.fromtimestamp( - int(response.headers[rate_limit_reset_header_key]) - ) - datetime.datetime.now() - ).total_seconds() - }) - self.logger.debug("Found OAuth available for use: {}".format(self.oauths[-1])) - - if len(self.oauths) == 0: - self.logger.info( - "No API keys detected, please include one in your config or in the " - "worker_oauths table in the augur_operations schema of your database." 
- ) - - # First key to be used will be the one specified in the config (first element in - # self.oauths array will always be the key in use) - if platform == 'github': - self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} - elif platform == 'gitlab': - self.headers = {'Authorization': 'Bearer %s' % self.oauths[0]['access_token']} - - self.logger.info("OAuth initialized\n") - - def enrich_cntrb_id( - self, data, key, action_map_additions={'insert': {'source': [], 'augur': []}}, - platform='github', prefix='' - ): - - if not len(data): - self.logger.info(f"Enrich contrib data is empty for {len(data)}, for the key {key}.") - - raise ValueError - - self.logger.info(f"Enriching contributor ids for {len(data)} data points...") - - source_df = pd.DataFrame(data) - expanded_source_df = self._add_nested_columns( - source_df.copy(), [key] + action_map_additions['insert']['source'] - ) - - # Insert cntrbs that are not in db - - cntrb_action_map = { - 'insert': { - 'source': [key] + action_map_additions['insert']['source'] + [f'{prefix}id'], - 'augur': ['cntrb_login'] + action_map_additions['insert']['augur'] + ['gh_user_id'] - } - } - - table_values_cntrb = self.db.execute( - s.sql.select(self.get_relevant_columns(self.contributors_table,cntrb_action_map)) - ).fetchall() - - source_data = expanded_source_df.to_dict(orient='records') - - #Filter out bad data where we can't even hit the api. - source_data = [data for data in source_data if f'{prefix}login' in data and data[f'{prefix}login'] != None] - - self.logger.info(f"table_values_cntrb keys: {table_values_cntrb[0].keys()}") - self.logger.info(f"source_data keys: {source_data[0].keys()}") - - #We can't use this because of worker collisions - #TODO: seperate this method into it's own worker. - #cntrb_id_offset = self.get_max_id(self.contributors_table, 'cntrb_id') - 1 - - # loop through data to test if it is already in the database - for index, data in enumerate(source_data): - - self.logger.info(f"Enriching {index} of {len(source_data)}") - - - user_unique_ids = [] - - #Allow for alt identifiers to be checked if user.id is not present in source_data - try: - #This will trigger a KeyError if data has alt identifier. - data[f'{prefix}id'] - for row in table_values_cntrb: - try: - user_unique_ids.append(row['gh_user_id']) - except Exception as e: - self.logger.info(f"Error adding gh_user_id: {e}. Row: {row}") - except KeyError: - self.logger.info("Source data doesn't have user.id. Using node_id instead.") - for row in table_values_cntrb: - try: - user_unique_ids.append(row['gh_node_id']) - except Exception as e: - self.logger.info(f"Error adding gh_node_id: {e}. 
Row: {row}") - - - #self.logger.info(f"gh_user_ids: {gh_user_ids}") - - # self.logger.info(f"Users gh_user_id: {data['user.id']}") - # in_user_ids = False - # if data['user.id'] in gh_user_ids: - # in_user_ids = True - # self.logger.info(f"{data['user.id']} is in gh_user_ids") - - # self.logger.info(f"table_values_cntrb len: {len(table_values_cntrb)}") - - #Deal with if data - #See if we can check using the user.id - source_data_id = None - try: - source_data_id = data[f'{prefix}id'] - except KeyError: - source_data_id = data[f'{prefix}node_id'] - - - #if user.id is in the database then there is no need to add the contributor - if source_data_id in user_unique_ids: - - self.logger.info("{} found in database".format(source_data_id)) - - user_id_row = [] - try: - data[f'{prefix}id'] - #gets the dict from the table_values_cntrb that contains data['user.id'] - user_id_row = list(filter(lambda x: x['gh_user_id'] == source_data_id, table_values_cntrb))[0] - except KeyError: - user_id_row = list(filter(lambda x: x['gh_node_id'] == source_data_id, table_values_cntrb))[0] - - - #assigns the cntrb_id to the source data to be returned to the workers - data['cntrb_id'] = user_id_row['cntrb_id'] - self.logger.info(f"cntrb_id {data['cntrb_id']} found in database and assigned to enriched data") - - #contributor is not in the database - else: - - self.logger.info("{} not in database, making api call".format(source_data_id)) - - self.logger.info("login: {}".format(data[f'{prefix}login'])) - - try: - url = ("https://api.github.com/users/" + data[f'{prefix}login']) - except Exception as e: - self.logger.info(f"Error when creating url: {e}. Data: {data}") - - attempts = 0 - - try: - while attempts < 10: - try: - self.logger.info("Hitting endpoint: " + url + " ...\n") - response = requests.get(url=url , headers=self.headers) - break - except TimeoutError: - self.logger.info(f"User data request for enriching contributor data failed with {attempts} attempts! Trying again...") - time.sleep(10) - - attempts += 1 - except Exception as e: - raise e - - try: - contributor = response.json() - except: - contributor = json.loads(json.dumps(response.text)) - - self.logger.info(f"Contributor data: {contributor}") - - cntrb = { - "cntrb_login": contributor['login'], - "cntrb_created_at": contributor['created_at'], - "cntrb_email": contributor['email'] if 'email' in contributor else None, - "cntrb_company": contributor['company'] if 'company' in contributor else None, - "cntrb_location": contributor['location'] if 'location' in contributor else None, - # "cntrb_type": , dont have a use for this as of now ... 
let it default to null - "cntrb_canonical": contributor['email'] if 'email' in contributor else None, - "gh_user_id": contributor['id'], - "gh_login": contributor['login'], - "gh_url": contributor['url'], - "gh_html_url": contributor['html_url'], - "gh_node_id": contributor['node_id'], - "gh_avatar_url": contributor['avatar_url'], - "gh_gravatar_id": contributor['gravatar_id'], - "gh_followers_url": contributor['followers_url'], - "gh_following_url": contributor['following_url'], - "gh_gists_url": contributor['gists_url'], - "gh_starred_url": contributor['starred_url'], - "gh_subscriptions_url": contributor['subscriptions_url'], - "gh_organizations_url": contributor['organizations_url'], - "gh_repos_url": contributor['repos_url'], - "gh_events_url": contributor['events_url'], - "gh_received_events_url": contributor['received_events_url'], - "gh_type": contributor['type'], - "gh_site_admin": contributor['site_admin'], - "tool_source": self.tool_source, - "tool_version": self.tool_version, - "data_source": self.data_source - } - - #insert new contributor into database - # TODO: make this method it's own worker. This errors because of collisions between github_worker and pull_request_worker. - #We can solve this by making another worker with a queue. It wouldn't have to be too complicated. - try: - self.db.execute(self.contributors_table.insert().values(cntrb)) - # except s.exc.IntegrityError: - except Exception as e: - self.logger.info(f"Contributor was unable to be added to table! Attempting to get cntrb_id from table anyway because of possible collision. Error: {e}") - - - #Get the contributor id from the newly inserted contributor. - cntrb_id_row = self.db.execute( - s.sql.select(self.get_relevant_columns(self.contributors_table,cntrb_action_map)).where( - self.contributors_table.c.gh_user_id==cntrb["gh_user_id"] - ) - ).fetchall() - - #Handle and log rare failure cases. If this part errors something is very wrong. - if len(cntrb_id_row) == 1: - data['cntrb_id'] = cntrb_id_row[0]['cntrb_id'] - self.logger.info(f"cntrb_id {data['cntrb_id']} found in database and assigned to enriched data") - elif len(cntrb_id_row) == 0: - self.logger.error("Couldn't find contributor in database. Something has gone very wrong. Augur ran into a contributor that is unable to be inserted into the contributors table but is also not present in that table.") - else: - self.logger.info(f"There are more than one contributors in the table with gh_user_id={cntrb['gh_user_id']}") - - - cntrb_data = { - 'cntrb_id': data['cntrb_id'], - 'gh_node_id': cntrb['gh_node_id'], - 'cntrb_login': cntrb['cntrb_login'], - 'gh_user_id': cntrb['gh_user_id'] - } - #This updates our list of who is already in the database as we iterate to avoid duplicates. - #People who make changes tend to make more than one in a row. 
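The two comments above describe the in-memory cache that the append just below maintains. That cache is only ever read through membership checks (the source_data_id in user_unique_ids test earlier in this method, which currently rebuilds a list for every record), so if enrich_cntrb_id ever becomes slow on large repos, a set built once before the loop keeps each lookup O(1). Rough sketch, not part of this patch, with hypothetical stand-in rows in place of the real query result:

table_values_cntrb = [{'gh_user_id': 1, 'gh_node_id': 'MDQ6'},   # hypothetical stand-in rows
                      {'gh_user_id': 2, 'gh_node_id': 'MDQ7'}]
known_ids = {row['gh_user_id'] for row in table_values_cntrb}    # build once, before the loop

source_data_id = 3
print(source_data_id in known_ids)   # False -> would fall through to the API-call branch
known_ids.add(source_data_id)        # set equivalent of the append below, after a successful insert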
- table_values_cntrb.append(cntrb_data) - - self.logger.info( - "Contributor id enrichment successful, result has " - f"{len(source_data)} data points.\n" - ) - return source_data - - - - - #old method - """ - # source_cntrb_insert, _ = self.organize_needed_data( - # expanded_source_df.to_dict(orient='records'), table_values=table_values_cntrb, - # action_map=cntrb_action_map - # ) - - # cntrb_insert = [ - # { - # 'cntrb_login': contributor[f'{prefix}login'], - # 'cntrb_created_at': None if ( - # f'{prefix}created_at' not in contributor - # ) else contributor[f'{prefix}created_at'], - # 'cntrb_email': None if f'{prefix}email' not in contributor else contributor[f'{prefix}email'], - # 'cntrb_company': None if f'{prefix}company' not in contributor else contributor[f'{prefix}company'], - # 'cntrb_location': None if ( - # f'{prefix}location' not in contributor - # ) else contributor[f'{prefix}location'], - # 'gh_user_id': None if ( - # not contributor[f'{prefix}id'] - # ) else int(float(contributor[f'{prefix}id'])), - # 'gh_login': contributor[f'{prefix}login'], - # 'gh_url': contributor[f'{prefix}url'], - # 'gh_html_url': contributor[f'{prefix}html_url'], - # 'gh_node_id': contributor[f'{prefix}node_id'], #valid for dup check - # 'gh_avatar_url': contributor[f'{prefix}avatar_url'], - # 'gh_gravatar_id': contributor[f'{prefix}gravatar_id'], - # 'gh_followers_url': contributor[f'{prefix}followers_url'], - # 'gh_following_url': contributor[f'{prefix}following_url'], - # 'gh_gists_url': contributor[f'{prefix}gists_url'], - # 'gh_starred_url': contributor[f'{prefix}starred_url'], - # 'gh_subscriptions_url': contributor[f'{prefix}subscriptions_url'], - # 'gh_organizations_url': contributor[f'{prefix}organizations_url'], - # 'gh_repos_url': contributor[f'{prefix}repos_url'], - # 'gh_events_url': contributor[f'{prefix}events_url'], - # 'gh_received_events_url': contributor[f'{prefix}received_events_url'], - # 'gh_type': contributor[f'{prefix}type'], - # 'gh_site_admin': contributor[f'{prefix}site_admin'], - # 'tool_source': self.tool_source, - # 'tool_version': self.tool_version, - # 'data_source': self.data_source - # } for contributor in source_cntrb_insert if contributor[f'{prefix}login'] - # ] - # - # try: - # self.bulk_insert(self.contributors_table, cntrb_insert) - # except s.exc.IntegrityError: - # self.logger.info("Unique Violation in contributors table! 
") - # - # # Query db for inserted cntrb pkeys and add to shallow level of data - # - # # Query - # cntrb_pk_name = list(self.contributors_table.primary_key)[0].name - # session = s.orm.Session(self.db) - # inserted_pks = pd.DataFrame( - # session.query( - # self.contributors_table.c[cntrb_pk_name], self.contributors_table.c.cntrb_login, - # self.contributors_table.c.gh_node_id - # ).distinct(self.contributors_table.c.cntrb_login).order_by( - # self.contributors_table.c.cntrb_login, self.contributors_table.c[cntrb_pk_name] - # ).all(), columns=[cntrb_pk_name, 'cntrb_login', 'gh_node_id'] - # ).to_dict(orient='records') - # session.close() - # - # # Prepare for merge - # source_columns = sorted(list(source_df.columns)) - # necessary_columns = sorted(list(set(source_columns + cntrb_action_map['insert']['source']))) - # (source_table, inserted_pks_table), metadata, session = self._setup_postgres_merge( - # [ - # expanded_source_df[necessary_columns].to_dict(orient='records'), - # inserted_pks - # ], sort=True - # ) - # final_columns = [cntrb_pk_name] + sorted(list(set(necessary_columns))) - # - # # Merge - # source_pk = pd.DataFrame( - # session.query( - # inserted_pks_table.c.cntrb_id, source_table - # ).join( - # source_table, - # eval( - # ' and '.join( - # [ - # ( - # f"inserted_pks_table.c['{table_column}'] " - # f"== source_table.c['{source_column}']" - # ) for table_column, source_column in zip( - # cntrb_action_map['insert']['augur'], - # cntrb_action_map['insert']['source'] - # ) - # ] - # ) - # ) - # ).all(), columns=final_columns - # ) - # - # # Cleanup merge - # source_pk = self._eval_json_columns(source_pk) - # self._close_postgres_merge(metadata, session) - - #self.logger.info( - # "Contributor id enrichment successful, result has " - # f"{len(source_pk)} data points.\n" - #) - - #return source_pk.to_dict(orient='records')""" - - def query_github_contributors(self, entry_info, repo_id): - - """ Data collection function - Query the GitHub API for contributors - """ - self.logger.info(f"Querying contributors with given entry info: {entry_info}\n") - - ## It absolutely doesn't matter if the contributor has already contributoed to a repo. it only matters that they exist in our table, and - ## if the DO, then we DO NOT want to insert them again in any GitHub Method. - github_url = entry_info['given']['github_url'] if 'github_url' in entry_info['given'] else entry_info['given']['git_url'] - - # Extract owner/repo from the url for the endpoint - owner, name = self.get_owner_repo(github_url) - - # Set the base of the url and place to hold contributors to insert - contributors_url = ( - f"https://api.github.com/repos/{owner}/{name}/" + - "contributors?per_page=100&page={}" - ) - - # Get contributors that we already have stored - # Set our duplicate and update column map keys (something other than PK) to - # check dupicates/needed column updates with - table = 'contributors' - table_pkey = 'cntrb_id' - update_col_map = {'cntrb_email': 'email'} - duplicate_col_map = {'cntrb_login': 'login'} - - #list to hold contributors needing insertion or update - contributors = self.paginate(contributors_url, duplicate_col_map, update_col_map, table, table_pkey) - - self.logger.info("Count of contributors needing insertion: " + str(len(contributors)) + "\n") - - for repo_contributor in contributors: - try: - # Need to hit this single contributor endpoint to get extra data including... 
- # `created at` - # i think that's it - cntrb_url = ("https://api.github.com/users/" + repo_contributor['login']) - self.logger.info("Hitting endpoint: " + cntrb_url + " ...\n") - r = requests.get(url=cntrb_url, headers=self.headers) - self.update_gh_rate_limit(r) - contributor = r.json() - - company = None - location = None - email = None - if 'company' in contributor: - company = contributor['company'] - if 'location' in contributor: - location = contributor['location'] - if 'email' in contributor: - email = contributor['email'] - canonical_email = contributor['email'] - - cntrb = { - "cntrb_login": contributor['login'], - "cntrb_created_at": contributor['created_at'], - "cntrb_email": email, - "cntrb_company": company, - "cntrb_location": location, - # "cntrb_type": , dont have a use for this as of now ... let it default to null - "cntrb_canonical": canonical_email, - "gh_user_id": contributor['id'], - "gh_login": contributor['login'], - "gh_url": contributor['url'], - "gh_html_url": contributor['html_url'], - "gh_node_id": contributor['node_id'], #This is what we are dup checking - "gh_avatar_url": contributor['avatar_url'], - "gh_gravatar_id": contributor['gravatar_id'], - "gh_followers_url": contributor['followers_url'], - "gh_following_url": contributor['following_url'], - "gh_gists_url": contributor['gists_url'], - "gh_starred_url": contributor['starred_url'], - "gh_subscriptions_url": contributor['subscriptions_url'], - "gh_organizations_url": contributor['organizations_url'], - "gh_repos_url": contributor['repos_url'], - "gh_events_url": contributor['events_url'], - "gh_received_events_url": contributor['received_events_url'], - "gh_type": contributor['type'], - "gh_site_admin": contributor['site_admin'], - "tool_source": self.tool_source, - "tool_version": self.tool_version, - "data_source": self.data_source - } - #dup check - #TODO: add additional fields to check if needed. 
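The duplicate check that follows this TODO looks like it would raise at runtime and be swallowed by the broad except at the bottom of this loop: self.sql.select does not appear to be defined on this class (other methods in this file use s.sql.select), and .fetchall() is chained onto the select construct instead of onto the result of db.execute. If that reading is correct, a corrected sketch reusing the names from the surrounding method would be:

existingMatchingContributors = self.db.execute(
    s.sql.select(
        [self.contributors_table.c.gh_node_id]
    ).where(
        self.contributors_table.c.gh_node_id == cntrb["gh_node_id"]
    )
).fetchall()

if len(existingMatchingContributors) > 0:
    break   # contributor already exists in the table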
- existingMatchingContributors = self.db.execute( - self.sql.select( - [self.contributors_table.c.gh_node_id] - ).where( - self.contributors_table.c.gh_node_id==cntrb["gh_node_id"] - ).fetchall() - ) - - if len(existingMatchingContributors) > 0: - break #if contributor already exists in table - - - # Commit insertion to table - if repo_contributor['flag'] == 'need_update': - result = self.db.execute(self.contributors_table.update().where( - self.worker_history_table.c.cntrb_email==email).values(cntrb)) - self.logger.info("Updated tuple in the contributors table with existing email: {}".format(email)) - self.cntrb_id_inc = repo_contributor['pkey'] - elif repo_contributor['flag'] == 'need_insertion': - result = self.db.execute(self.contributors_table.insert().values(cntrb)) - self.logger.info("Primary key inserted into the contributors table: {}".format(result.inserted_primary_key)) - self.results_counter += 1 - - self.logger.info("Inserted contributor: " + contributor['login'] + "\n") - - # Increment our global track of the cntrb id for the possibility of it being used as a FK - self.cntrb_id_inc = int(result.inserted_primary_key[0]) - - except Exception as e: - self.logger.error("Caught exception: {}".format(e)) - self.logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) - continue - - - def query_github_contributors_bulk(self, entry_info, repo_id): - - """ Data collection function - Query the GitHub API for contributors - """ - self.logger.info(f"Querying contributors with given entry info: {entry_info}\n") - - github_url = entry_info['given']['github_url'] if 'github_url' in entry_info['given'] else entry_info['given']['git_url'] - - owner, name = self.get_owner_repo(github_url) - - contributors_url = (f"https://api.github.com/repos/{owner}/{name}/" + - "contributors?per_page=100&page={}") - - action_map = { - 'insert': { - 'source': ['login'], - 'augur': ['cntrb_login'] - }, - 'update': { - 'source': ['email'], - 'augur': ['cntrb_email'] - } - } - - source_contributors = self.paginate_endpoint(contributors_url, action_map=action_map, - table=self.contributors_table) - - contributors_insert = [] - - for repo_contributor in source_contributors['insert']: - # Need to hit this single contributor endpoint to get extra data - cntrb_url = (f"https://api.github.com/users/{repo_contributor['login']}") - self.logger.info(f"Hitting endpoint: {cntrb_url} ...\n") - r = requests.get(url=cntrb_url, headers=self.headers) - self.update_gh_rate_limit(r) - contributor = r.json() - - contributors_insert.append({ - 'cntrb_login': contributor['login'], - 'cntrb_created_at': contributor['created_at'], - 'cntrb_email': contributor['email'] if 'email' in contributor else None, - 'cntrb_company': contributor['company'] if 'company' in contributor else None, - 'cntrb_location': contributor['location'] if 'location' in contributor else None, - 'cntrb_canonical': contributor['email'] if 'email' in contributor else None, - 'gh_user_id': contributor['id'], - 'gh_login': contributor['login'], - 'gh_url': contributor['url'], - 'gh_html_url': contributor['html_url'], - 'gh_node_id': contributor['node_id'], - 'gh_avatar_url': contributor['avatar_url'], - 'gh_gravatar_id': contributor['gravatar_id'], - 'gh_followers_url': contributor['followers_url'], - 'gh_following_url': contributor['following_url'], - 'gh_gists_url': contributor['gists_url'], - 'gh_starred_url': contributor['starred_url'], - 'gh_subscriptions_url': contributor['subscriptions_url'], - 
'gh_organizations_url': contributor['organizations_url'], - 'gh_repos_url': contributor['repos_url'], - 'gh_events_url': contributor['events_url'], - 'gh_received_events_url': contributor['received_events_url'], - 'gh_type': contributor['type'], - 'gh_site_admin': contributor['site_admin'], - 'tool_source': self.tool_source, - 'tool_version': self.tool_version, - 'data_source': self.data_source - }) - - contributors_insert_result, contributors_update_result = self.bulk_insert(self.contributors_table, - update=source_contributors['update'], unique_columns=action_map['insert']['augur'], - insert=contributors_insert, update_columns=action_map['update']['augur']) - - def query_github_contributors_fast(self, entry_info, repo_id): - """ Data collection function - Query the GitHub API for contributors - """ - self.logger.info(f"Querying contributors with given entry info: {entry_info}") - - github_url = ( - entry_info['given']['github_url'] if 'github_url' in entry_info['given'] - else entry_info['given']['git_url'] - ) - - contributors_url = ( - f"https://api.github.com/repos/{self.owner}/{self.name}/" - "contributors?per_page=100&page={}" - ) - - action_map = { - 'insert': { - 'source': ['login'], - 'augur': ['cntrb_login'] - }, - 'update': { - 'source': ['email'], - 'augur': ['cntrb_email'] - } - } - - source_contributors = self.paginate_endpoint( - contributors_url, action_map=action_map, table=self.contributors_table - ) - - contributors_insert = [ - { - 'cntrb_login': contributor['login'], - 'cntrb_created_at': ( - contributor['created_at'] if 'created_at' in contributor else None - ), - 'cntrb_email': contributor['email'] if 'email' in contributor else None, - 'cntrb_company': contributor['company'] if 'company' in contributor else None, - 'cntrb_location': contributor['location'] if 'location' in contributor else None, - 'cntrb_canonical': contributor['email'] if 'email' in contributor else None, - 'gh_user_id': contributor['id'], - 'gh_login': contributor['login'], - 'gh_url': contributor['url'], - 'gh_html_url': contributor['html_url'], - 'gh_node_id': contributor['node_id'], - 'gh_avatar_url': contributor['avatar_url'], - 'gh_gravatar_id': contributor['gravatar_id'], - 'gh_followers_url': contributor['followers_url'], - 'gh_following_url': contributor['following_url'], - 'gh_gists_url': contributor['gists_url'], - 'gh_starred_url': contributor['starred_url'], - 'gh_subscriptions_url': contributor['subscriptions_url'], - 'gh_organizations_url': contributor['organizations_url'], - 'gh_repos_url': contributor['repos_url'], - 'gh_events_url': contributor['events_url'], - 'gh_received_events_url': contributor['received_events_url'], - 'gh_type': contributor['type'], - 'gh_site_admin': contributor['site_admin'], - 'tool_source': self.tool_source, - 'tool_version': self.tool_version, - 'data_source': self.data_source - } for contributor in source_contributors['insert'] - ] - - self.bulk_insert( - self.contributors_table, update=source_contributors['update'], - unique_columns=action_map['insert']['augur'], - insert=contributors_insert, update_columns=action_map['update']['augur'] - ) - - def update_gitlab_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): - # Try to get rate limit from request headers, sometimes it does not work (GH's issue) - # In that case we just decrement from last recieved header count - if bad_credentials and len(self.oauths) > 1: - self.logger.info( - f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" - ) - del 
self.oauths[0] - - if temporarily_disable: - self.logger.info("Gitlab rate limit reached. Temp. disabling...") - self.oauths[0]['rate_limit'] = 0 - else: - try: - self.oauths[0]['rate_limit'] = int(response.headers['RateLimit-Remaining']) - except: - self.oauths[0]['rate_limit'] -= 1 - self.logger.info("Updated rate limit, you have: " + - str(self.oauths[0]['rate_limit']) + " requests remaining.") - if self.oauths[0]['rate_limit'] <= 0: - try: - reset_time = response.headers['RateLimit-Reset'] - except Exception as e: - self.logger.info(f"Could not get reset time from headers because of error: {e}") - reset_time = 3600 - time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() - self.logger.info("Rate limit exceeded, checking for other available keys to use.") - - # We will be finding oauth with the highest rate limit left out of our list of oauths - new_oauth = self.oauths[0] - # Endpoint to hit solely to retrieve rate limit information from headers of the response - url = "https://gitlab.com/api/v4/version" - - other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] - for oauth in other_oauths: - # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) - self.headers = {"PRIVATE-TOKEN" : oauth['access_token']} - response = requests.get(url=url, headers=self.headers) - oauth['rate_limit'] = int(response.headers['RateLimit-Remaining']) - oauth['seconds_to_reset'] = ( - datetime.datetime.fromtimestamp( - int(response.headers['RateLimit-Reset']) - ) - datetime.datetime.now() - ).total_seconds() - - # Update oauth to switch to if a higher limit is found - if oauth['rate_limit'] > new_oauth['rate_limit']: - self.logger.info(f"Higher rate limit found in oauth: {oauth}") - new_oauth = oauth - elif ( - oauth['rate_limit'] == new_oauth['rate_limit'] - and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] - ): - self.logger.info( - f"Lower wait time found in oauth with same rate limit: {oauth}" - ) - new_oauth = oauth - - if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: - self.logger.info( - "No oauths with >0 rate limit were found, waiting for oauth with " - f"smallest wait time: {new_oauth}\n" - ) - time.sleep(new_oauth['seconds_to_reset']) - - # Make new oauth the 0th element in self.oauths so we know which one is in use - index = self.oauths.index(new_oauth) - self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] - self.logger.info("Using oauth: {}\n".format(self.oauths[0])) - - # Change headers to be using the new oauth's key - self.headers = {"PRIVATE-TOKEN" : self.oauths[0]['access_token']} - - def update_gh_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): - # Try to get rate limit from request headers, sometimes it does not work (GH's issue) - # In that case we just decrement from last recieved header count - if bad_credentials and len(self.oauths) > 1: - self.logger.warning( - f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" - ) - del self.oauths[0] - - if temporarily_disable: - self.logger.debug( - "Github thinks we are abusing their api. Preventing use " - "of this key until its rate limit resets..." 
- ) - self.oauths[0]['rate_limit'] = 0 - else: - try: - self.oauths[0]['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) - # self.logger.info("Recieved rate limit from headers\n") - except: - self.oauths[0]['rate_limit'] -= 1 - self.logger.info("Headers did not work, had to decrement") - self.logger.info( - f"Updated rate limit, you have: {self.oauths[0]['rate_limit']} requests remaining." - ) - if self.oauths[0]['rate_limit'] <= 0: - try: - reset_time = response.headers['X-RateLimit-Reset'] - except Exception as e: - self.logger.error(f"Could not get reset time from headers because of error: {e}") - reset_time = 3600 - time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() - self.logger.info("Rate limit exceeded, checking for other available keys to use.") - - # We will be finding oauth with the highest rate limit left out of our list of oauths - new_oauth = self.oauths[0] - # Endpoint to hit solely to retrieve rate limit information from headers of the response - url = "https://api.github.com/users/gabe-heim" - - other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] - for oauth in other_oauths: - # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) - self.headers = {'Authorization': 'token %s' % oauth['access_token']} - - attempts = 3 - success = False - while attempts > 0 and not success: - response = requests.get(url=url, headers=self.headers) - try: - oauth['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) - oauth['seconds_to_reset'] = ( - datetime.datetime.fromtimestamp( - int(response.headers['X-RateLimit-Reset']) - ) - datetime.datetime.now() - ).total_seconds() - success = True - except Exception as e: - self.logger.info( - f"oath method ran into error getting info from headers: {e}\n" - ) - self.logger.info(f"{self.headers}\n{url}\n") - attempts -= 1 - if not success: - continue - - # Update oauth to switch to if a higher limit is found - if oauth['rate_limit'] > new_oauth['rate_limit']: - self.logger.info("Higher rate limit found in oauth: {}\n".format(oauth)) - new_oauth = oauth - elif ( - oauth['rate_limit'] == new_oauth['rate_limit'] - and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] - ): - self.logger.info( - f"Lower wait time found in oauth with same rate limit: {oauth}\n" - ) - new_oauth = oauth - - if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: - self.logger.info( - "No oauths with >0 rate limit were found, waiting for oauth with " - f"smallest wait time: {new_oauth}\n" - ) - time.sleep(new_oauth['seconds_to_reset']) - - # Make new oauth the 0th element in self.oauths so we know which one is in use - index = self.oauths.index(new_oauth) - self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] - self.logger.info("Using oauth: {}\n".format(self.oauths[0])) - - # Change headers to be using the new oauth's key - self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} - - #TODO: figure out if changing this typo breaks anything - def query_gitlab_contributors(self, entry_info, repo_id): - - gitlab_url = ( - entry_info['given']['gitlab_url'] if 'gitlab_url' in entry_info['given'] - else entry_info['given']['git_url'] - ) - - self.logger.info("Querying contributors with given entry info: " + str(entry_info) + "\n") - - path = urlparse(gitlab_url) - split = path[2].split('/') - - owner = split[1] - name = split[2] - - # Handles git url case by removing the extension - if ".git" in name: - name = name[:-4] - - 
url_encoded_format = quote(owner + '/' + name, safe='') - - table = 'contributors' - table_pkey = 'cntrb_id' - ### Here we are adding gitlab user information from the API - ### Following Gabe's rework of the contributor worker. - - ### The GitLab API will NEVER give you an email. It will let you - ### Query an email, but never give you one. - ### ## Gitlab email api: https://gitlab.com/api/v4/users?search=s@goggins.com - ### We don't need to update right now, so commenting out. - ### TODO: SOLVE LOGIC. - # update_col_map = {'cntrb_email': 'email'} - update_col_map = {} - duplicate_col_map = {'gl_username': 'username'} - - # list to hold contributors needing insertion or update - contributors = self.paginate("https://gitlab.com/api/v4/projects/" + url_encoded_format + "/repository/contributors?per_page=100&page={}", duplicate_col_map, update_col_map, table, table_pkey, platform='gitlab') - - for repo_contributor in contributors: - try: - cntrb_compressed_url = ("https://gitlab.com/api/v4/users?search=" + repo_contributor['email']) - self.logger.info("Hitting endpoint: " + cntrb_compressed_url + " ...\n") - r = requests.get(url=cntrb_compressed_url, headers=self.headers) - contributor_compressed = r.json() - - email = repo_contributor['email'] - self.logger.info(contributor_compressed) - if len(contributor_compressed) == 0 or type(contributor_compressed) is dict or "id" not in contributor_compressed[0]: - continue - - self.logger.info("Fetching for user: " + str(contributor_compressed[0]["id"])) - - cntrb_url = ("https://gitlab.com/api/v4/users/" + str(contributor_compressed[0]["id"])) - self.logger.info("Hitting end point to get complete contributor info now: " + cntrb_url + "...\n") - r = requests.get(url=cntrb_url, headers=self.headers) - contributor = r.json() - - cntrb = { - "gl_id": contributor.get('gl_id', None), - "gl_full_name": contributor.get('full_name', None), - "gl_username": contributor.get('username', None), - "gl_state": contributor.get('state', None), - "gl_avatar_url": contributor.get('avatar_url', None), - "gl_web_url": contributor.get('web_url', None), - #"cntrb_login": contributor.get('username', None), - #"cntrb_created_at": contributor.get('created_at', None), - "cntrb_email": ('email', None), - #"cntrb_company": contributor.get('organization', None), - #"cntrb_location": contributor.get('location', None), - # "cntrb_type": , dont have a use for this as of now ... 
let it default to null - #"cntrb_canonical": contributor.get('public_email', None), - #"gh_user_id": contributor.get('id', None), - #"gh_login": contributor.get('username', None), - #"gh_url": contributor.get('web_url', None), - #"gh_html_url": contributor.get('web_url', None), - #"gh_node_id": None, - #"gh_avatar_url": contributor.get('avatar_url', None), - #"gh_gravatar_id": None, - #"gh_followers_url": None, - #"gh_following_url": None, - #"gh_gists_url": None, - #"gh_starred_url": None, - #"gh_subscriptions_url": None, - #"gh_organizations_url": None, - #"gh_repos_url": None, - #"gh_events_url": None, - #"gh_received_events_url": None, - #"gh_type": None, - #"gh_site_admin": None, - "tool_source": self.tool_source, - "tool_version": self.tool_version, - "data_source": self.data_source - } - - # Commit insertion to table - if repo_contributor['flag'] == 'need_update': - result = self.db.execute(self.contributors_table.update().where( - self.worker_history_table.c.cntrb_email == email).values(cntrb)) - self.logger.info("Updated tuple in the contributors table with existing email: {}".format(email)) - self.cntrb_id_inc = repo_contributor['pkey'] - elif repo_contributor['flag'] == 'need_insertion': - result = self.db.execute(self.contributors_table.insert().values(cntrb)) - self.logger.info("Primary key inserted into the contributors table: {}".format(result.inserted_primary_key)) - self.results_counter += 1 - - self.logger.info("Inserted contributor: " + contributor['username'] + "\n") - - # Increment our global track of the cntrb id for the possibility of it being used as a FK - self.cntrb_id_inc = int(result.inserted_primary_key[0]) - - except Exception as e: - self.logger.info("Caught exception: {}".format(e)) - self.logger.info("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) - continue - - - def update_gitlab_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): - # Try to get rate limit from request headers, sometimes it does not work (GH's issue) - # In that case we just decrement from last recieved header count - if bad_credentials and len(self.oauths) > 1: - self.logger.info( - f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" - ) - del self.oauths[0] - - if temporarily_disable: - self.logger.info("Gitlab rate limit reached. Temp. 
disabling...") - self.oauths[0]['rate_limit'] = 0 - else: - try: - self.oauths[0]['rate_limit'] = int(response.headers['RateLimit-Remaining']) - except: - self.oauths[0]['rate_limit'] -= 1 - self.logger.info("Updated rate limit, you have: " + - str(self.oauths[0]['rate_limit']) + " requests remaining.") - if self.oauths[0]['rate_limit'] <= 0: - try: - reset_time = response.headers['RateLimit-Reset'] - except Exception as e: - self.logger.info(f"Could not get reset time from headers because of error: {e}") - reset_time = 3600 - time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() - self.logger.info("Rate limit exceeded, checking for other available keys to use.") - - # We will be finding oauth with the highest rate limit left out of our list of oauths - new_oauth = self.oauths[0] - # Endpoint to hit solely to retrieve rate limit information from headers of the response - url = "https://gitlab.com/api/v4/version" - - other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] - for oauth in other_oauths: - # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) - self.headers = {"PRIVATE-TOKEN" : oauth['access_token']} - response = requests.get(url=url, headers=self.headers) - oauth['rate_limit'] = int(response.headers['RateLimit-Remaining']) - oauth['seconds_to_reset'] = ( - datetime.datetime.fromtimestamp( - int(response.headers['RateLimit-Reset']) - ) - datetime.datetime.now() - ).total_seconds() - - # Update oauth to switch to if a higher limit is found - if oauth['rate_limit'] > new_oauth['rate_limit']: - self.logger.info(f"Higher rate limit found in oauth: {oauth}") - new_oauth = oauth - elif ( - oauth['rate_limit'] == new_oauth['rate_limit'] - and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] - ): - self.logger.info( - f"Lower wait time found in oauth with same rate limit: {oauth}" - ) - new_oauth = oauth - - if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: - self.logger.info( - "No oauths with >0 rate limit were found, waiting for oauth with " - f"smallest wait time: {new_oauth}\n" - ) - time.sleep(new_oauth['seconds_to_reset']) - - # Make new oauth the 0th element in self.oauths so we know which one is in use - index = self.oauths.index(new_oauth) - self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] - self.logger.info("Using oauth: {}\n".format(self.oauths[0])) - - # Change headers to be using the new oauth's key - self.headers = {"PRIVATE-TOKEN" : self.oauths[0]['access_token']} - - - def update_gh_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): - # Try to get rate limit from request headers, sometimes it does not work (GH's issue) - # In that case we just decrement from last recieved header count - if bad_credentials and len(self.oauths) > 1: - self.logger.warning( - f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" - ) - del self.oauths[0] - - if temporarily_disable: - self.logger.debug( - "Github thinks we are abusing their api. Preventing use " - "of this key until its rate limit resets..." - ) - self.oauths[0]['rate_limit'] = 0 - else: - try: - self.oauths[0]['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) - # self.logger.info("Recieved rate limit from headers\n") - except: - self.oauths[0]['rate_limit'] -= 1 - self.logger.info("Headers did not work, had to decrement") - self.logger.info( - f"Updated rate limit, you have: {self.oauths[0]['rate_limit']} requests remaining." 
- ) - if self.oauths[0]['rate_limit'] <= 0: - try: - reset_time = response.headers['X-RateLimit-Reset'] - except Exception as e: - self.logger.error(f"Could not get reset time from headers because of error: {e}") - reset_time = 3600 - time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() - self.logger.info("Rate limit exceeded, checking for other available keys to use.") - - # We will be finding oauth with the highest rate limit left out of our list of oauths - new_oauth = self.oauths[0] - # Endpoint to hit solely to retrieve rate limit information from headers of the response - url = "https://api.github.com/users/gabe-heim" - - other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] - for oauth in other_oauths: - # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) - self.headers = {'Authorization': 'token %s' % oauth['access_token']} - - attempts = 3 - success = False - while attempts > 0 and not success: - response = requests.get(url=url, headers=self.headers) - try: - oauth['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) - oauth['seconds_to_reset'] = ( - datetime.datetime.fromtimestamp( - int(response.headers['X-RateLimit-Reset']) - ) - datetime.datetime.now() - ).total_seconds() - success = True - except Exception as e: - self.logger.info( - f"oath method ran into error getting info from headers: {e}\n" - ) - self.logger.info(f"{self.headers}\n{url}\n") - attempts -= 1 - if not success: - continue - - # Update oauth to switch to if a higher limit is found - if oauth['rate_limit'] > new_oauth['rate_limit']: - self.logger.info("Higher rate limit found in oauth: {}\n".format(oauth)) - new_oauth = oauth - elif ( - oauth['rate_limit'] == new_oauth['rate_limit'] - and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] - ): - self.logger.info( - f"Lower wait time found in oauth with same rate limit: {oauth}\n" - ) - new_oauth = oauth - - if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: - self.logger.info( - "No oauths with >0 rate limit were found, waiting for oauth with " - f"smallest wait time: {new_oauth}\n" - ) - time.sleep(new_oauth['seconds_to_reset']) - - # Make new oauth the 0th element in self.oauths so we know which one is in use - index = self.oauths.index(new_oauth) - self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] - self.logger.info("Using oauth: {}\n".format(self.oauths[0])) - - # Change headers to be using the new oauth's key - self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} - - def update_rate_limit( - self, response, bad_credentials=False, temporarily_disable=False, platform="gitlab" - ): - if platform == 'gitlab': - return self.update_gitlab_rate_limit( - response, bad_credentials=bad_credentials, temporarily_disable=temporarily_disable - ) - elif platform == 'github': - return self.update_gh_rate_limit( - response, bad_credentials=bad_credentials, temporarily_disable=temporarily_disable - ) - - - #Indexerror somewhere - def multi_thread_urls(self, all_urls, max_attempts=5, platform='github'): - """ - :param all_urls: list of tuples - """ - - if not len(all_urls): - self.logger.info("No urls to multithread, returning blank list.\n") - return [] - - def load_url(url, extra_data={}): - try: - html = requests.get(url, stream=True, headers=self.headers) - return html, extra_data - except requests.exceptions.RequestException as e: - self.logger.info(e, url) - - self.logger.info("Beginning to multithread API 
endpoints.") - - start = time.time() - - all_data = [] - valid_url_count = len(all_urls) - - partitions = math.ceil(len(all_urls) / 600) - self.logger.info(f"{len(all_urls)} urls to process. Trying {partitions} partitions. " + - f"Using {max(multiprocessing.cpu_count()//8, 1)} threads.") - for urls in numpy.array_split(all_urls, partitions): - attempts = 0 - self.logger.info(f"Total data points collected so far: {len(all_data)}") - while len(urls) > 0 and attempts < max_attempts: - with concurrent.futures.ThreadPoolExecutor( - max_workers=max(multiprocessing.cpu_count()//8, 1) - ) as executor: - # Start the load operations and mark each future with its URL - future_to_url = {executor.submit(load_url, *url): url for url in urls} - self.logger.info("Multithreaded urls and returned status codes:") - count = 0 - for future in concurrent.futures.as_completed(future_to_url): - - if count % 100 == 0: - self.logger.info( - f"Processed {len(all_data)} / {valid_url_count} urls. " - f"{len(urls)} remaining in this partition." - ) - count += 1 - - url = future_to_url[future] - try: - response, extra_data = future.result() - - if response.status_code != 200: - self.logger.info( - f"Url: {url[0]} ; Status code: {response.status_code}" - ) - - if response.status_code == 403 or response.status_code == 401: # 403 is rate limit, 404 is not found, 401 is bad credentials - self.update_rate_limit(response, platform=platform) - continue - - elif response.status_code == 200: - try: - page_data = response.json() - except: - page_data = json.loads(json.dumps(response.text)) - - page_data = [{**data, **extra_data} for data in page_data] - all_data += page_data - - if 'last' in response.links and "&page=" not in url[0]: - urls += [ - (url[0] + f"&page={page}", extra_data) for page in range( - 2, int(response.links['last']['url'].split('=')[-1]) + 1 - ) - ] - try: - # self.logger.info(f"urls boundry issue? for {urls} where they are equal to {url}.") - - urls = numpy.delete(urls, numpy.where(urls == url), axis=0) - except: - self.logger.info(f"ERROR with axis = 0 - Now attempting without setting axis for numpy.delete for {urls} where they are equal to {url}.") - urls = numpy.delete(urls, numpy.where(urls == url)) - - elif response.status_code == 404: - urls = numpy.delete(urls, numpy.where(urls == url), axis=0) - self.logger.info(f"Not found url: {url}\n") - else: - self.logger.info( - f"Unhandled response code: {response.status_code} {url}\n" - ) - - except Exception as e: - self.logger.info( - f"{url} generated an exception: {traceback.format_exc()}\n" - ) - - attempts += 1 - - self.logger.info( - f"Processed {valid_url_count} urls and got {len(all_data)} data points " - f"in {time.time() - start} seconds thanks to multithreading!\n" - ) - return all_data - - - #insertion_method and stagger are arguments that allow paginate_endpoint to insert at around ~500 pages at a time. 
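The comment above is the clearest statement of the stagger behavior, so here is a small self-contained sketch (not part of this patch) of the same buffer-and-flush idea that paginate_endpoint, defined just below, implements; hypothetical fetch_page/flush callables stand in for the request and insertion logic:

def paged_collect(fetch_page, flush, threshold=500):
    """Accumulate page results and hand them off every `threshold` pages."""
    buffered, page = [], 1
    while True:
        data = fetch_page(page)
        if not data:                 # empty page -> no more results
            break
        buffered.extend(data)
        if page % threshold == 0:    # periodic flush keeps memory bounded
            flush(buffered)
            buffered = []            # reset so nothing is handed off twice
        page += 1
    if buffered:                     # hand off whatever is left at the end
        flush(buffered)

# toy usage: three "pages" of two items each, flushed every 2 pages
pages = {1: ['a', 'b'], 2: ['c', 'd'], 3: ['e', 'f']}
paged_collect(lambda p: pages.get(p, []), print, threshold=2)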
- def paginate_endpoint( - self, url, action_map={}, table=None, where_clause=True, platform='github', in_memory=True, stagger=False, insertion_method=None, insertion_threshold=500 - ): - - #Get augur columns using the action map along with the primary key - table_values = self.db.execute( - s.sql.select(self.get_relevant_columns(table, action_map)).where(where_clause) - ).fetchall() - - page_number = 1 - multiple_pages = False - need_insertion = [] - need_update = [] - - #Stores sum of page data - all_data = [] - forward_pagination = True - backwards_activation = False - last_page_number = -1 - - #Block to handle page queries and retry at least 10 times - while True: - - # Multiple attempts to hit endpoint - num_attempts = 0 - success = False - while num_attempts < 10: - self.logger.info(f"Hitting endpoint: {url.format(page_number)}...\n") - try: - response = requests.get(url=url.format(page_number), headers=self.headers) - except TimeoutError as e: - self.logger.info("Request timed out. Sleeping 10 seconds and trying again...\n") - time.sleep(10) - continue - - self.update_rate_limit(response, platform=platform) - - try: - page_data = response.json() - except: - page_data = json.loads(json.dumps(response.text)) - - if type(page_data) == list: - success = True - break - elif type(page_data) == dict: - self.logger.info("Request returned a dict: {}\n".format(page_data)) - if page_data['message'] == "Not Found": - self.logger.warning( - "Github repo was not found or does not exist for endpoint: " - f"{url.format(page_number)}\n" - ) - break - if "You have triggered an abuse detection mechanism." in page_data['message']: - num_attempts -= 1 - self.update_rate_limit(response, temporarily_disable=True,platform=platform) - if page_data['message'] == "Bad credentials": - self.update_rate_limit(response, bad_credentials=True, platform=platform) - elif type(page_data) == str: - self.logger.info(f"Warning! 
page_data was string: {page_data}\n") - if "" in page_data: - self.logger.info("HTML was returned, trying again...\n") - elif len(page_data) == 0: - self.logger.warning("Empty string, trying again...\n") - else: - try: - page_data = json.loads(page_data) - success = True - break - except: - pass - num_attempts += 1 - if not success: - break - - # Success - - # Determine if continued pagination is needed - - if len(page_data) == 0: - self.logger.info("Response was empty, breaking from pagination.\n") - break - - all_data += page_data - - if not forward_pagination: - - # Checking contents of requests with what we already have in the db - page_insertions, page_updates = self.organize_needed_data( - page_data, table_values, list(table.primary_key)[0].name, - action_map, in_memory=True - ) - - # Reached a page where we already have all tuples - if len(need_insertion) == 0 and len(need_update) == 0 and \ - backwards_activation: - self.logger.info( - "No more pages with unknown tuples, breaking from pagination.\n" - ) - break - - need_insertion += page_insertions - need_update += page_updates - - # Find last page so we can decrement from there - if 'last' in response.links and last_page_number == -1: - if platform == 'github': - last_page_number = int(response.links['last']['url'][-6:].split('=')[1]) - elif platform == 'gitlab': - last_page_number = int(response.links['last']['url'].split('&')[2].split('=')[1]) - - if not forward_pagination and not backwards_activation: - page_number = last_page_number - backwards_activation = True - - self.logger.info("Analyzation of page {} of {} complete\n".format(page_number, - int(last_page_number) if last_page_number != -1 else "*last page not known*")) - - if (page_number <= 1 and not forward_pagination) or \ - (page_number >= last_page_number and forward_pagination): - self.logger.info("No more pages to check, breaking from pagination.\n") - break - - #This is probably where we should insert at around ~500 at a time - #makes sure that stagger is enabled, we have an insertion method, and the insertion happens every 500 pages or so. - if stagger and insertion_method != None and page_number % insertion_threshold == 0: - #call insertion method passed as argument. - staggered_source_prs = { - 'insert' : need_insertion, - 'update' : need_update, - 'all' : all_data - } - - #Use the method the subclass needs in order to insert the data. - insertion_method(staggered_source_prs,action_map) - - #clear the data from memory and avoid duplicate insertions. - need_insertion = [] - need_update = [] - all_data = [] - - page_number = page_number + 1 if forward_pagination else page_number - 1 - - if forward_pagination: - need_insertion, need_update = self.organize_needed_data( - all_data, table_values, list(table.primary_key)[0].name, action_map, - in_memory=in_memory - ) - - return { - 'insert': need_insertion, - 'update': need_update, - 'all': all_data - } - - #TODO: deprecated but still used by the issues worker. - def paginate(self, url, duplicate_col_map, update_col_map, table, table_pkey, where_clause="", value_update_col_map={}, platform="github"): - """ DEPRECATED - Paginate either backwards or forwards (depending on the value of the worker's - finishing_task attribute) through all the GitHub or GitLab api endpoint pages. 
- - :param url: String, the url of the API endpoint we are paginating through, expects - a curly brace string formatter within the string to format the Integer - representing the page number that is wanted to be returned - :param duplicate_col_map: Dictionary, maps the column names of the source data - to the field names in our database for columns that should be checked for - duplicates (if source data value == value in existing database row, then this - element is a duplicate and would not need an insertion). Key is source data - column name, value is database field name. Example: {'id': 'gh_issue_id'} - :param update_col_map: Dictionary, maps the column names of the source data - to the field names in our database for columns that should be checked for - updates (if source data value != value in existing database row, then an - update is needed). Key is source data column name, value is database field name. - Example: {'id': 'gh_issue_id'} - :param table: String, the name of the table that holds the values to check for - duplicates/updates against - :param table_pkey: String, the field name of the primary key of the table in - the database that we are getting the values for to cross-reference to check - for duplicates. - :param where_clause: String, optional where clause to filter the values - that are queried when preparing the values that will be cross-referenced - for duplicates/updates - :param value_update_col_map: Dictionary, sometimes we add a new field to a table, - and we want to trigger an update of that row in the database even if all of the - data values are the same and would not need an update ordinarily. Checking for - a specific existing value in the database field allows us to do this. The key is the - name of the field in the database we are checking for a specific value to trigger - an update, the value is the value we are checking for equality to trigger an update. - Example: {'cntrb_id': None} - :return: List of dictionaries, all data points from the pages of the specified API endpoint - each with a 'flag' key-value pair representing the required action to take with that - data point (i.e. 
'need_insertion', 'need_update', 'none') - """ - - update_keys = list(update_col_map.keys()) if update_col_map else [] - update_keys += list(value_update_col_map.keys()) if value_update_col_map else [] - cols_to_query = list(duplicate_col_map.keys()) + update_keys + [table_pkey] - table_values = self.get_table_values(cols_to_query, [table], where_clause) - - i = 1 - multiple_pages = False - tuples = [] - while True: - num_attempts = 0 - success = False - while num_attempts < 3: - self.logger.info(f'Hitting endpoint: {url.format(i)}...\n') - r = requests.get(url=url.format(i), headers=self.headers) - - self.update_rate_limit(r, platform=platform) - if 'last' not in r.links: - last_page = None - else: - if platform == "github": - last_page = r.links['last']['url'][-6:].split('=')[1] - elif platform == "gitlab": - last_page = r.links['last']['url'].split('&')[2].split("=")[1] - self.logger.info("Analyzing page {} of {}\n".format(i, int(last_page) + 1 if last_page is not None else '*last page not known*')) - - try: - j = r.json() - except: - j = json.loads(json.dumps(r.text)) - - if type(j) != dict and type(j) != str: - success = True - break - elif type(j) == dict: - self.logger.info("Request returned a dict: {}\n".format(j)) - if j['message'] == 'Not Found': - self.logger.warning("Github repo was not found or does not exist for endpoint: {}\n".format(url)) - break - if j['message'] == 'You have triggered an abuse detection mechanism. Please wait a few minutes before you try again.': - num_attempts -= 1 - self.logger.info("rate limit update code goes here") - self.update_rate_limit(r, temporarily_disable=True,platform=platform) - if j['message'] == 'Bad credentials': - self.logger.info("rate limit update code goes here") - self.update_rate_limit(r, bad_credentials=True, platform=platform) - elif type(j) == str: - self.logger.info(f'J was string: {j}\n') - if '' in j: - self.logger.info('HTML was returned, trying again...\n') - elif len(j) == 0: - self.logger.warning('Empty string, trying again...\n') - else: - try: - j = json.loads(j) - success = True - break - except: - pass - num_attempts += 1 - if not success: - break - - # Find last page so we can decrement from there - if 'last' in r.links and not multiple_pages and not self.finishing_task: - if platform == "github": - param = r.links['last']['url'][-6:] - i = int(param.split('=')[1]) + 1 - elif platform == "gitlab": - i = int(r.links['last']['url'].split('&')[2].split("=")[1]) + 1 - self.logger.info("Multiple pages of request, last page is " + str(i - 1) + "\n") - multiple_pages = True - elif not multiple_pages and not self.finishing_task: - self.logger.info("Only 1 page of request\n") - elif self.finishing_task: - self.logger.info("Finishing a previous task, paginating forwards ..." - " excess rate limit requests will be made\n") - - if len(j) == 0: - self.logger.info("Response was empty, breaking from pagination.\n") - break - - # Checking contents of requests with what we already have in the db - j = self.assign_tuple_action(j, table_values, update_col_map, duplicate_col_map, table_pkey, value_update_col_map) - - if not j: - self.logger.error("Assigning tuple action failed, moving to next page.\n") - i = i + 1 if self.finishing_task else i - 1 - continue - try: - to_add = [obj for obj in j if obj not in tuples and (obj['flag'] != 'none')] - except Exception as e: - self.logger.error("Failure accessing data of page: {}. 
Moving to next page.\n".format(e)) - i = i + 1 if self.finishing_task else i - 1 - continue - if len(to_add) == 0 and multiple_pages and 'last' in r.links: - self.logger.info("{}".format(r.links['last'])) - if platform == "github": - page_number = int(r.links['last']['url'][-6:].split('=')[1]) - elif platform == "gitlab": - page_number = int(r.links['last']['url'].split('&')[2].split("=")[1]) - if i - 1 != page_number: - self.logger.info("No more pages with unknown tuples, breaking from pagination.\n") - break - - tuples += to_add - - i = i + 1 if self.finishing_task else i - 1 - - # Since we already wouldve checked the first page... break - if (i == 1 and multiple_pages and not self.finishing_task) or i < 1 or len(j) == 0: - self.logger.info("No more pages to check, breaking from pagination.\n") - break - - return tuples - - def new_paginate_endpoint( - self, url, action_map={}, table=None, where_clause=True, platform='github' - ): - - page_number = 1 - multiple_pages = False - need_insertion = [] - need_update = [] - all_data = [] - forward_pagination = True - backwards_activation = False - last_page_number = -1 - while True: - - # Multiple attempts to hit endpoint - num_attempts = 0 - success = False - while num_attempts < 10: - self.logger.info("hitting an endpiont") - # f"Hitting endpoint: ...\n" - # f"{url.format(page_number)} on page number. \n") - try: - response = requests.get(url=url.format(page_number), headers=self.headers) - except TimeoutError as e: - self.logger.info("Request timed out. Sleeping 10 seconds and trying again...\n") - time.sleep(10) - continue - - self.update_rate_limit(response, platform=platform) - - try: - page_data = response.json() - except: - page_data = json.loads(json.dumps(response.text)) - - if type(page_data) == list: - success = True - break - elif type(page_data) == dict: - self.logger.info("Request returned a dict: {}\n".format(page_data)) - if page_data['message'] == "Not Found": - self.logger.warning( - "Github repo was not found or does not exist for endpoint: " - f"{url.format(page_number)}\n" - ) - break - if "You have triggered an abuse detection mechanism." in page_data['message']: - num_attempts -= 1 - self.update_rate_limit(response, temporarily_disable=True,platform=platform) - if page_data['message'] == "Bad credentials": - self.update_rate_limit(response, bad_credentials=True, platform=platform) - elif type(page_data) == str: - self.logger.info(f"Warning! 
page_data was string: {page_data}\n") - if "" in page_data: - self.logger.info("HTML was returned, trying again...\n") - elif len(page_data) == 0: - self.logger.warning("Empty string, trying again...\n") - else: - try: - page_data = json.loads(page_data) - success = True - break - except: - pass - num_attempts += 1 - if not success: - break - - # Success - - # Determine if continued pagination is needed - - if len(page_data) == 0: - self.logger.info("Response was empty, breaking from pagination.\n") - break - - all_data += page_data - - if not forward_pagination: - - # Checking contents of requests with what we already have in the db - page_insertions, page_updates = self.new_organize_needed_data( - page_data, augur_table=table, action_map=action_map - ) - - # Reached a page where we already have all tuples - if len(need_insertion) == 0 and len(need_update) == 0 and \ - backwards_activation: - self.logger.info( - "No more pages with unknown tuples, breaking from pagination.\n" - ) - break - - need_insertion += page_insertions - need_update += page_updates - - # Find last page so we can decrement from there - if 'last' in response.links and last_page_number == -1: - if platform == 'github': - last_page_number = int(response.links['last']['url'][-6:].split('=')[1]) - elif platform == 'gitlab': - last_page_number = int(response.links['last']['url'].split('&')[2].split('=')[1]) - - if not forward_pagination and not backwards_activation: - page_number = last_page_number - backwards_activation = True - - self.logger.info("Analyzation of page {} of {} complete\n".format(page_number, - int(last_page_number) if last_page_number != -1 else "*last page not known*")) - - if (page_number <= 1 and not forward_pagination) or \ - (page_number >= last_page_number and forward_pagination): - self.logger.info("No more pages to check, breaking from pagination.\n") - break - - page_number = page_number + 1 if forward_pagination else page_number - 1 - - if forward_pagination: - need_insertion, need_update = self.new_organize_needed_data( - all_data, augur_table=table, action_map=action_map - ) - - return { - 'insert': need_insertion, - 'update': need_update, - 'all': all_data - } + company = None + location = None + email = None + if 'company' in contributor: + company = contributor['company'] + if 'location' in contributor: + location = contributor['location'] + if 'email' in contributor: + email = contributor['email'] + + + if platform == 'github': + cntrb = { + 'cntrb_login': contributor['login'] if 'login' in contributor else None, + 'cntrb_email': contributor['email'] if 'email' in contributor else None, + 'cntrb_company': contributor['company'] if 'company' in contributor else None, + 'cntrb_location': contributor['location'] if 'location' in contributor else None, + 'cntrb_created_at': contributor['created_at'] if 'created_at' in contributor else None, + 'cntrb_canonical': None, + 'gh_user_id': contributor['id'] if 'id' in contributor else None, + 'gh_login': contributor['login'] if 'login' in contributor else None, + 'gh_url': contributor['url'] if 'url' in contributor else None, + 'gh_html_url': contributor['html_url'] if 'html_url' in contributor else None, + 'gh_node_id': contributor['node_id'] if 'node_id' in contributor else None, + 'gh_avatar_url': contributor['avatar_url'] if 'avatar_url' in contributor else None, + 'gh_gravatar_id': contributor['gravatar_id'] if 'gravatar_id' in contributor else None, + 'gh_followers_url': contributor['followers_url'] if 'followers_url' in contributor else None, 
+ 'gh_following_url': contributor['following_url'] if 'following_url' in contributor else None, + 'gh_gists_url': contributor['gists_url'] if 'gists_url' in contributor else None, + 'gh_starred_url': contributor['starred_url'] if 'starred_url' in contributor else None, + 'gh_subscriptions_url': contributor['subscriptions_url'] if 'subscriptions_url' in contributor else None, + 'gh_organizations_url': contributor['organizations_url'] if 'organizations_url' in contributor else None, + 'gh_repos_url': contributor['repos_url'] if 'repos_url' in contributor else None, + 'gh_events_url': contributor['events_url'] if 'events_url' in contributor else None, + 'gh_received_events_url': contributor['received_events_url'] if 'received_events_url' in contributor else None, + 'gh_type': contributor['type'] if 'type' in contributor else None, + 'gh_site_admin': contributor['site_admin'] if 'site_admin' in contributor else None, + 'tool_source': self.tool_source, + 'tool_version': self.tool_version, + 'data_source': self.data_source + } + + elif platform == 'gitlab': + cntrb = { + 'cntrb_login': contributor[0]['username'] if 'username' in contributor[0] else None, + 'cntrb_email': email, + 'cntrb_company': company, + 'cntrb_location': location, + 'cntrb_created_at': contributor[0]['created_at'] if 'created_at' in contributor[0] else None, + 'cntrb_canonical': None, + 'gh_user_id': contributor[0]['id'], + 'gh_login': contributor[0]['username'], + 'gh_url': contributor[0]['web_url'], + 'gh_html_url': None, + 'gh_node_id': None, + 'gh_avatar_url': contributor[0]['avatar_url'], + 'gh_gravatar_id': None, + 'gh_followers_url': None, + 'gh_following_url': None, + 'gh_gists_url': None, + 'gh_starred_url': None, + 'gh_subscriptions_url': None, + 'gh_organizations_url': None, + 'gh_repos_url': None, + 'gh_events_url': None, + 'gh_received_events_url': None, + 'gh_type': None, + 'gh_site_admin': None, + 'tool_source': self.tool_source, + 'tool_version': self.tool_version, + 'data_source': self.data_source + } + result = self.db.execute(self.contributors_table.insert().values(cntrb)) + self.logger.info("Primary key inserted into the contributors table: " + str(result.inserted_primary_key)) + self.results_counter += 1 + self.cntrb_id_inc = int(result.inserted_primary_key[0]) + self.logger.info(f"Inserted contributor: {cntrb['cntrb_login']}\n") + + return self.find_id_from_login(login, platform) + + #Blatently only for api key usage + def init_oauths(self, platform='github'): + + self.oauths = [] + self.headers = None + self.logger.info("Trying initialization.") + # Make a list of api key in the config combined w keys stored in the database + # Select endpoint to hit solely to retrieve rate limit + # information from headers of the response + # Adjust header keys needed to fetch rate limit information from the API responses + if platform == 'github': + url = "https://api.github.com/users/gabe-heim" + oauthSQL = s.sql.text(""" + SELECT * FROM worker_oauth WHERE access_token <> '{}' and platform = 'github' + """.format(self.config['gh_api_key'])) + key_name = 'gh_api_key' + rate_limit_header_key = "X-RateLimit-Remaining" + rate_limit_reset_header_key = "X-RateLimit-Reset" + elif platform == 'gitlab': + url = "https://gitlab.com/api/v4/version" + oauthSQL = s.sql.text(""" + SELECT * FROM worker_oauth WHERE access_token <> '{}' and platform = 'gitlab' + """.format(self.config['gitlab_api_key'])) + key_name = 'gitlab_api_key' + rate_limit_header_key = 'ratelimit-remaining' + rate_limit_reset_header_key = 'ratelimit-reset' + 
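# For illustration, a condensed sketch of the probe step the loop below performs for each
# key: one cheap GET against the platform's probe `url`, with the remaining quota and the
# reset time read from the response headers named by rate_limit_header_key /
# rate_limit_reset_header_key above. The function and argument names here are illustrative
# only, not part of the worker's API.
import datetime
import requests

def probe_key_quota(url, headers, remaining_header, reset_header):
    response = requests.get(url=url, headers=headers)
    rate_limit = int(response.headers[remaining_header])
    seconds_to_reset = (
        datetime.datetime.fromtimestamp(int(response.headers[reset_header]))
        - datetime.datetime.now()
    ).total_seconds()
    return rate_limit, seconds_to_reset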
+ for oauth in [{'oauth_id': 0, 'access_token': self.config[key_name]}] + json.loads( + pd.read_sql(oauthSQL, self.helper_db, params={}).to_json(orient="records") + ): + if platform == 'github': + self.headers = {'Authorization': 'token %s' % oauth['access_token']} + elif platform == 'gitlab': + self.headers = {'Authorization': 'Bearer %s' % oauth['access_token']} + response = requests.get(url=url, headers=self.headers) + self.oauths.append({ + 'oauth_id': oauth['oauth_id'], + 'access_token': oauth['access_token'], + 'rate_limit': int(response.headers[rate_limit_header_key]), + 'seconds_to_reset': ( + datetime.datetime.fromtimestamp( + int(response.headers[rate_limit_reset_header_key]) + ) - datetime.datetime.now() + ).total_seconds() + }) + self.logger.debug("Found OAuth available for use: {}".format(self.oauths[-1])) + + if len(self.oauths) == 0: + self.logger.info( + "No API keys detected, please include one in your config or in the " + "worker_oauths table in the augur_operations schema of your database." + ) + + # First key to be used will be the one specified in the config (first element in + # self.oauths array will always be the key in use) + if platform == 'github': + self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} + elif platform == 'gitlab': + self.headers = {'Authorization': 'Bearer %s' % self.oauths[0]['access_token']} + + self.logger.info("OAuth initialized\n") + + def enrich_cntrb_id( + self, data, key, action_map_additions={'insert': {'source': [], 'augur': []}}, + platform='github', prefix='' + ): + + if not len(data): + self.logger.info(f"Enrich contrib data is empty for {len(data)}, for the key {key}.") + + raise ValueError + + self.logger.info(f"Enriching contributor ids for {len(data)} data points...") + + source_df = pd.DataFrame(data) + expanded_source_df = self._add_nested_columns( + source_df.copy(), [key] + action_map_additions['insert']['source'] + ) + + # Insert cntrbs that are not in db + + cntrb_action_map = { + 'insert': { + 'source': [key] + action_map_additions['insert']['source'] + [f'{prefix}id'], + 'augur': ['cntrb_login'] + action_map_additions['insert']['augur'] + ['gh_user_id'] + } + } + + table_values_cntrb = self.db.execute( + s.sql.select(self.get_relevant_columns(self.contributors_table,cntrb_action_map)) + ).fetchall() + + source_data = expanded_source_df.to_dict(orient='records') + + #Filter out bad data where we can't even hit the api. + source_data = [data for data in source_data if f'{prefix}login' in data and data[f'{prefix}login'] != None] + + self.logger.info(f"table_values_cntrb keys: {table_values_cntrb[0].keys()}") + self.logger.info(f"source_data keys: {source_data[0].keys()}") + + #We can't use this because of worker collisions + #TODO: seperate this method into it's own worker. + #cntrb_id_offset = self.get_max_id(self.contributors_table, 'cntrb_id') - 1 + + # loop through data to test if it is already in the database + for index, data in enumerate(source_data): + + self.logger.info(f"Enriching {index} of {len(source_data)}") + + + user_unique_ids = [] + + #Allow for alt identifiers to be checked if user.id is not present in source_data + try: + #This will trigger a KeyError if data has alt identifier. + data[f'{prefix}id'] + for row in table_values_cntrb: + try: + user_unique_ids.append(row['gh_user_id']) + except Exception as e: + self.logger.info(f"Error adding gh_user_id: {e}. Row: {row}") + except KeyError: + self.logger.info("Source data doesn't have user.id. 
Using node_id instead.") + for row in table_values_cntrb: + try: + user_unique_ids.append(row['gh_node_id']) + except Exception as e: + self.logger.info(f"Error adding gh_node_id: {e}. Row: {row}") + + + #self.logger.info(f"gh_user_ids: {gh_user_ids}") + + # self.logger.info(f"Users gh_user_id: {data['user.id']}") + # in_user_ids = False + # if data['user.id'] in gh_user_ids: + # in_user_ids = True + # self.logger.info(f"{data['user.id']} is in gh_user_ids") + + # self.logger.info(f"table_values_cntrb len: {len(table_values_cntrb)}") + + #Deal with if data + #See if we can check using the user.id + source_data_id = None + try: + source_data_id = data[f'{prefix}id'] + except KeyError: + source_data_id = data[f'{prefix}node_id'] + + + #if user.id is in the database then there is no need to add the contributor + if source_data_id in user_unique_ids: + + self.logger.info("{} found in database".format(source_data_id)) + + user_id_row = [] + try: + data[f'{prefix}id'] + #gets the dict from the table_values_cntrb that contains data['user.id'] + user_id_row = list(filter(lambda x: x['gh_user_id'] == source_data_id, table_values_cntrb))[0] + except KeyError: + user_id_row = list(filter(lambda x: x['gh_node_id'] == source_data_id, table_values_cntrb))[0] + + + #assigns the cntrb_id to the source data to be returned to the workers + data['cntrb_id'] = user_id_row['cntrb_id'] + self.logger.info(f"cntrb_id {data['cntrb_id']} found in database and assigned to enriched data") + + #contributor is not in the database + else: + + self.logger.info("{} not in database, making api call".format(source_data_id)) + + self.logger.info("login: {}".format(data[f'{prefix}login'])) + + try: + url = ("https://api.github.com/users/" + data[f'{prefix}login']) + except Exception as e: + self.logger.info(f"Error when creating url: {e}. Data: {data}") + + attempts = 0 + + while attempts < 10: + self.logger.info("Hitting endpoint: " + url + " ...\n") + + try: + response = requests.get(url=url , headers=self.headers) + except TimeoutError: + self.logger.info(f"User data request for enriching contributor data failed with {attempts} attempts! Trying again...") + time.sleep(10) + continue + + self.update_rate_limit(response,platform=platform) + + + try: + contributor = response.json() + except: + contributor = json.loads(json.dumps(response.text)) + + if type(contributor) == dict: + self.logger.info("Request returned a dict!") + self.logger.info(f"Contributor data: {contributor}") + success = True + break + elif type(contributor) == list: + self.logger.warning("Wrong type returned trying again.") + self.logger.info(f"Contributor data: {contributor}") + elif type(contributor) == str: + self.logger.info(f"Warning! page_data was string: {contributor}\n") + if "" in contributor: + self.logger.info("HTML was returned, trying again...\n") + elif len(contributor) == 0: + self.logger.warning("Empty string, trying again...\n") + else: + try: + contributor = json.loads(contributor) + success = True + break + except: + pass + attempts += 1 + if not success: + continue + + self.logger.info(f"Contributor data: {contributor}") + + cntrb = { + "cntrb_login": contributor['login'], + "cntrb_created_at": contributor['created_at'], + "cntrb_email": contributor['email'] if 'email' in contributor else None, + "cntrb_company": contributor['company'] if 'company' in contributor else None, + "cntrb_location": contributor['location'] if 'location' in contributor else None, + # "cntrb_type": , dont have a use for this as of now ... 
let it default to null + "cntrb_canonical": contributor['email'] if 'email' in contributor else None, + "gh_user_id": contributor['id'], + "gh_login": contributor['login'], + "gh_url": contributor['url'], + "gh_html_url": contributor['html_url'], + "gh_node_id": contributor['node_id'], + "gh_avatar_url": contributor['avatar_url'], + "gh_gravatar_id": contributor['gravatar_id'], + "gh_followers_url": contributor['followers_url'], + "gh_following_url": contributor['following_url'], + "gh_gists_url": contributor['gists_url'], + "gh_starred_url": contributor['starred_url'], + "gh_subscriptions_url": contributor['subscriptions_url'], + "gh_organizations_url": contributor['organizations_url'], + "gh_repos_url": contributor['repos_url'], + "gh_events_url": contributor['events_url'], + "gh_received_events_url": contributor['received_events_url'], + "gh_type": contributor['type'], + "gh_site_admin": contributor['site_admin'], + "tool_source": self.tool_source, + "tool_version": self.tool_version, + "data_source": self.data_source + } + + #insert new contributor into database + # TODO: make this method it's own worker. This errors because of collisions between github_worker and pull_request_worker. + #We can solve this by making another worker with a queue. It wouldn't have to be too complicated. + try: + self.db.execute(self.contributors_table.insert().values(cntrb)) + # except s.exc.IntegrityError: + except Exception as e: + self.logger.info(f"Contributor was unable to be added to table! Attempting to get cntrb_id from table anyway because of possible collision. Error: {e}") + + + #Get the contributor id from the newly inserted contributor. + cntrb_id_row = self.db.execute( + s.sql.select(self.get_relevant_columns(self.contributors_table,cntrb_action_map)).where( + self.contributors_table.c.gh_user_id==cntrb["gh_user_id"] + ) + ).fetchall() + + #Handle and log rare failure cases. If this part errors something is very wrong. + if len(cntrb_id_row) == 1: + data['cntrb_id'] = cntrb_id_row[0]['cntrb_id'] + self.logger.info(f"cntrb_id {data['cntrb_id']} found in database and assigned to enriched data") + elif len(cntrb_id_row) == 0: + self.logger.error("Couldn't find contributor in database. Something has gone very wrong. Augur ran into a contributor that is unable to be inserted into the contributors table but is also not present in that table.") + else: + self.logger.info(f"There are more than one contributors in the table with gh_user_id={cntrb['gh_user_id']}") + + + cntrb_data = { + 'cntrb_id': data['cntrb_id'], + 'gh_node_id': cntrb['gh_node_id'], + 'cntrb_login': cntrb['cntrb_login'], + 'gh_user_id': cntrb['gh_user_id'] + } + #This updates our list of who is already in the database as we iterate to avoid duplicates. + #People who make changes tend to make more than one in a row. 
+ table_values_cntrb.append(cntrb_data) + + self.logger.info( + "Contributor id enrichment successful, result has " + f"{len(source_data)} data points.\n" + ) + return source_data + + + + + #old method + """ + # source_cntrb_insert, _ = self.organize_needed_data( + # expanded_source_df.to_dict(orient='records'), table_values=table_values_cntrb, + # action_map=cntrb_action_map + # ) + + # cntrb_insert = [ + # { + # 'cntrb_login': contributor[f'{prefix}login'], + # 'cntrb_created_at': None if ( + # f'{prefix}created_at' not in contributor + # ) else contributor[f'{prefix}created_at'], + # 'cntrb_email': None if f'{prefix}email' not in contributor else contributor[f'{prefix}email'], + # 'cntrb_company': None if f'{prefix}company' not in contributor else contributor[f'{prefix}company'], + # 'cntrb_location': None if ( + # f'{prefix}location' not in contributor + # ) else contributor[f'{prefix}location'], + # 'gh_user_id': None if ( + # not contributor[f'{prefix}id'] + # ) else int(float(contributor[f'{prefix}id'])), + # 'gh_login': contributor[f'{prefix}login'], + # 'gh_url': contributor[f'{prefix}url'], + # 'gh_html_url': contributor[f'{prefix}html_url'], + # 'gh_node_id': contributor[f'{prefix}node_id'], #valid for dup check + # 'gh_avatar_url': contributor[f'{prefix}avatar_url'], + # 'gh_gravatar_id': contributor[f'{prefix}gravatar_id'], + # 'gh_followers_url': contributor[f'{prefix}followers_url'], + # 'gh_following_url': contributor[f'{prefix}following_url'], + # 'gh_gists_url': contributor[f'{prefix}gists_url'], + # 'gh_starred_url': contributor[f'{prefix}starred_url'], + # 'gh_subscriptions_url': contributor[f'{prefix}subscriptions_url'], + # 'gh_organizations_url': contributor[f'{prefix}organizations_url'], + # 'gh_repos_url': contributor[f'{prefix}repos_url'], + # 'gh_events_url': contributor[f'{prefix}events_url'], + # 'gh_received_events_url': contributor[f'{prefix}received_events_url'], + # 'gh_type': contributor[f'{prefix}type'], + # 'gh_site_admin': contributor[f'{prefix}site_admin'], + # 'tool_source': self.tool_source, + # 'tool_version': self.tool_version, + # 'data_source': self.data_source + # } for contributor in source_cntrb_insert if contributor[f'{prefix}login'] + # ] + # + # try: + # self.bulk_insert(self.contributors_table, cntrb_insert) + # except s.exc.IntegrityError: + # self.logger.info("Unique Violation in contributors table! 
") + # + # # Query db for inserted cntrb pkeys and add to shallow level of data + # + # # Query + # cntrb_pk_name = list(self.contributors_table.primary_key)[0].name + # session = s.orm.Session(self.db) + # inserted_pks = pd.DataFrame( + # session.query( + # self.contributors_table.c[cntrb_pk_name], self.contributors_table.c.cntrb_login, + # self.contributors_table.c.gh_node_id + # ).distinct(self.contributors_table.c.cntrb_login).order_by( + # self.contributors_table.c.cntrb_login, self.contributors_table.c[cntrb_pk_name] + # ).all(), columns=[cntrb_pk_name, 'cntrb_login', 'gh_node_id'] + # ).to_dict(orient='records') + # session.close() + # + # # Prepare for merge + # source_columns = sorted(list(source_df.columns)) + # necessary_columns = sorted(list(set(source_columns + cntrb_action_map['insert']['source']))) + # (source_table, inserted_pks_table), metadata, session = self._setup_postgres_merge( + # [ + # expanded_source_df[necessary_columns].to_dict(orient='records'), + # inserted_pks + # ], sort=True + # ) + # final_columns = [cntrb_pk_name] + sorted(list(set(necessary_columns))) + # + # # Merge + # source_pk = pd.DataFrame( + # session.query( + # inserted_pks_table.c.cntrb_id, source_table + # ).join( + # source_table, + # eval( + # ' and '.join( + # [ + # ( + # f"inserted_pks_table.c['{table_column}'] " + # f"== source_table.c['{source_column}']" + # ) for table_column, source_column in zip( + # cntrb_action_map['insert']['augur'], + # cntrb_action_map['insert']['source'] + # ) + # ] + # ) + # ) + # ).all(), columns=final_columns + # ) + # + # # Cleanup merge + # source_pk = self._eval_json_columns(source_pk) + # self._close_postgres_merge(metadata, session) + + #self.logger.info( + # "Contributor id enrichment successful, result has " + # f"{len(source_pk)} data points.\n" + #) + + #return source_pk.to_dict(orient='records')""" + + def query_github_contributors(self, entry_info, repo_id): + + """ Data collection function + Query the GitHub API for contributors + """ + self.logger.info(f"Querying contributors with given entry info: {entry_info}\n") + + ## It absolutely doesn't matter if the contributor has already contributoed to a repo. it only matters that they exist in our table, and + ## if the DO, then we DO NOT want to insert them again in any GitHub Method. + github_url = entry_info['given']['github_url'] if 'github_url' in entry_info['given'] else entry_info['given']['git_url'] + + # Extract owner/repo from the url for the endpoint + owner, name = self.get_owner_repo(github_url) + + # Set the base of the url and place to hold contributors to insert + contributors_url = ( + f"https://api.github.com/repos/{owner}/{name}/" + + "contributors?per_page=100&page={}" + ) + + # Get contributors that we already have stored + # Set our duplicate and update column map keys (something other than PK) to + # check dupicates/needed column updates with + table = 'contributors' + table_pkey = 'cntrb_id' + update_col_map = {'cntrb_email': 'email'} + duplicate_col_map = {'cntrb_login': 'login'} + + #list to hold contributors needing insertion or update + contributors = self.paginate(contributors_url, duplicate_col_map, update_col_map, table, table_pkey) + + self.logger.info("Count of contributors needing insertion: " + str(len(contributors)) + "\n") + + for repo_contributor in contributors: + try: + # Need to hit this single contributor endpoint to get extra data including... 
+ # `created at` + # i think that's it + cntrb_url = ("https://api.github.com/users/" + repo_contributor['login']) + self.logger.info("Hitting endpoint: " + cntrb_url + " ...\n") + r = requests.get(url=cntrb_url, headers=self.headers) + self.update_gh_rate_limit(r) + contributor = r.json() + + company = None + location = None + email = None + if 'company' in contributor: + company = contributor['company'] + if 'location' in contributor: + location = contributor['location'] + if 'email' in contributor: + email = contributor['email'] + canonical_email = contributor['email'] + + cntrb = { + "cntrb_login": contributor['login'], + "cntrb_created_at": contributor['created_at'], + "cntrb_email": email, + "cntrb_company": company, + "cntrb_location": location, + # "cntrb_type": , dont have a use for this as of now ... let it default to null + "cntrb_canonical": canonical_email, + "gh_user_id": contributor['id'], + "gh_login": contributor['login'], + "gh_url": contributor['url'], + "gh_html_url": contributor['html_url'], + "gh_node_id": contributor['node_id'], #This is what we are dup checking + "gh_avatar_url": contributor['avatar_url'], + "gh_gravatar_id": contributor['gravatar_id'], + "gh_followers_url": contributor['followers_url'], + "gh_following_url": contributor['following_url'], + "gh_gists_url": contributor['gists_url'], + "gh_starred_url": contributor['starred_url'], + "gh_subscriptions_url": contributor['subscriptions_url'], + "gh_organizations_url": contributor['organizations_url'], + "gh_repos_url": contributor['repos_url'], + "gh_events_url": contributor['events_url'], + "gh_received_events_url": contributor['received_events_url'], + "gh_type": contributor['type'], + "gh_site_admin": contributor['site_admin'], + "tool_source": self.tool_source, + "tool_version": self.tool_version, + "data_source": self.data_source + } + #dup check + #TODO: add additional fields to check if needed. 
+ existingMatchingContributors = self.db.execute( + self.sql.select( + [self.contributors_table.c.gh_node_id] + ).where( + self.contributors_table.c.gh_node_id==cntrb["gh_node_id"] + ).fetchall() + ) + + if len(existingMatchingContributors) > 0: + break #if contributor already exists in table + + + # Commit insertion to table + if repo_contributor['flag'] == 'need_update': + result = self.db.execute(self.contributors_table.update().where( + self.worker_history_table.c.cntrb_email==email).values(cntrb)) + self.logger.info("Updated tuple in the contributors table with existing email: {}".format(email)) + self.cntrb_id_inc = repo_contributor['pkey'] + elif repo_contributor['flag'] == 'need_insertion': + result = self.db.execute(self.contributors_table.insert().values(cntrb)) + self.logger.info("Primary key inserted into the contributors table: {}".format(result.inserted_primary_key)) + self.results_counter += 1 + + self.logger.info("Inserted contributor: " + contributor['login'] + "\n") + + # Increment our global track of the cntrb id for the possibility of it being used as a FK + self.cntrb_id_inc = int(result.inserted_primary_key[0]) + + except Exception as e: + self.logger.error("Caught exception: {}".format(e)) + self.logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) + continue + + + def query_github_contributors_bulk(self, entry_info, repo_id): + + """ Data collection function + Query the GitHub API for contributors + """ + self.logger.info(f"Querying contributors with given entry info: {entry_info}\n") + + github_url = entry_info['given']['github_url'] if 'github_url' in entry_info['given'] else entry_info['given']['git_url'] + + owner, name = self.get_owner_repo(github_url) + + contributors_url = (f"https://api.github.com/repos/{owner}/{name}/" + + "contributors?per_page=100&page={}") + + action_map = { + 'insert': { + 'source': ['login'], + 'augur': ['cntrb_login'] + }, + 'update': { + 'source': ['email'], + 'augur': ['cntrb_email'] + } + } + + source_contributors = self.paginate_endpoint(contributors_url, action_map=action_map, + table=self.contributors_table) + + contributors_insert = [] + + for repo_contributor in source_contributors['insert']: + # Need to hit this single contributor endpoint to get extra data + cntrb_url = (f"https://api.github.com/users/{repo_contributor['login']}") + self.logger.info(f"Hitting endpoint: {cntrb_url} ...\n") + r = requests.get(url=cntrb_url, headers=self.headers) + self.update_gh_rate_limit(r) + contributor = r.json() + + contributors_insert.append({ + 'cntrb_login': contributor['login'], + 'cntrb_created_at': contributor['created_at'], + 'cntrb_email': contributor['email'] if 'email' in contributor else None, + 'cntrb_company': contributor['company'] if 'company' in contributor else None, + 'cntrb_location': contributor['location'] if 'location' in contributor else None, + 'cntrb_canonical': contributor['email'] if 'email' in contributor else None, + 'gh_user_id': contributor['id'], + 'gh_login': contributor['login'], + 'gh_url': contributor['url'], + 'gh_html_url': contributor['html_url'], + 'gh_node_id': contributor['node_id'], + 'gh_avatar_url': contributor['avatar_url'], + 'gh_gravatar_id': contributor['gravatar_id'], + 'gh_followers_url': contributor['followers_url'], + 'gh_following_url': contributor['following_url'], + 'gh_gists_url': contributor['gists_url'], + 'gh_starred_url': contributor['starred_url'], + 'gh_subscriptions_url': contributor['subscriptions_url'], + 
'gh_organizations_url': contributor['organizations_url'], + 'gh_repos_url': contributor['repos_url'], + 'gh_events_url': contributor['events_url'], + 'gh_received_events_url': contributor['received_events_url'], + 'gh_type': contributor['type'], + 'gh_site_admin': contributor['site_admin'], + 'tool_source': self.tool_source, + 'tool_version': self.tool_version, + 'data_source': self.data_source + }) + + contributors_insert_result, contributors_update_result = self.bulk_insert(self.contributors_table, + update=source_contributors['update'], unique_columns=action_map['insert']['augur'], + insert=contributors_insert, update_columns=action_map['update']['augur']) + + def query_github_contributors_fast(self, entry_info, repo_id): + """ Data collection function + Query the GitHub API for contributors + """ + self.logger.info(f"Querying contributors with given entry info: {entry_info}") + + github_url = ( + entry_info['given']['github_url'] if 'github_url' in entry_info['given'] + else entry_info['given']['git_url'] + ) + + contributors_url = ( + f"https://api.github.com/repos/{self.owner}/{self.name}/" + "contributors?per_page=100&page={}" + ) + + action_map = { + 'insert': { + 'source': ['login'], + 'augur': ['cntrb_login'] + }, + 'update': { + 'source': ['email'], + 'augur': ['cntrb_email'] + } + } + + source_contributors = self.paginate_endpoint( + contributors_url, action_map=action_map, table=self.contributors_table + ) + + contributors_insert = [ + { + 'cntrb_login': contributor['login'], + 'cntrb_created_at': ( + contributor['created_at'] if 'created_at' in contributor else None + ), + 'cntrb_email': contributor['email'] if 'email' in contributor else None, + 'cntrb_company': contributor['company'] if 'company' in contributor else None, + 'cntrb_location': contributor['location'] if 'location' in contributor else None, + 'cntrb_canonical': contributor['email'] if 'email' in contributor else None, + 'gh_user_id': contributor['id'], + 'gh_login': contributor['login'], + 'gh_url': contributor['url'], + 'gh_html_url': contributor['html_url'], + 'gh_node_id': contributor['node_id'], + 'gh_avatar_url': contributor['avatar_url'], + 'gh_gravatar_id': contributor['gravatar_id'], + 'gh_followers_url': contributor['followers_url'], + 'gh_following_url': contributor['following_url'], + 'gh_gists_url': contributor['gists_url'], + 'gh_starred_url': contributor['starred_url'], + 'gh_subscriptions_url': contributor['subscriptions_url'], + 'gh_organizations_url': contributor['organizations_url'], + 'gh_repos_url': contributor['repos_url'], + 'gh_events_url': contributor['events_url'], + 'gh_received_events_url': contributor['received_events_url'], + 'gh_type': contributor['type'], + 'gh_site_admin': contributor['site_admin'], + 'tool_source': self.tool_source, + 'tool_version': self.tool_version, + 'data_source': self.data_source + } for contributor in source_contributors['insert'] + ] + + self.bulk_insert( + self.contributors_table, update=source_contributors['update'], + unique_columns=action_map['insert']['augur'], + insert=contributors_insert, update_columns=action_map['update']['augur'] + ) + + def update_gitlab_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): + # Try to get rate limit from request headers, sometimes it does not work (GH's issue) + # In that case we just decrement from last recieved header count + if bad_credentials and len(self.oauths) > 1: + self.logger.info( + f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" + ) + del 
self.oauths[0] + + if temporarily_disable: + self.logger.info("Gitlab rate limit reached. Temp. disabling...") + self.oauths[0]['rate_limit'] = 0 + else: + try: + self.oauths[0]['rate_limit'] = int(response.headers['RateLimit-Remaining']) + except: + self.oauths[0]['rate_limit'] -= 1 + self.logger.info("Updated rate limit, you have: " + + str(self.oauths[0]['rate_limit']) + " requests remaining.") + if self.oauths[0]['rate_limit'] <= 0: + try: + reset_time = response.headers['RateLimit-Reset'] + except Exception as e: + self.logger.info(f"Could not get reset time from headers because of error: {e}") + reset_time = 3600 + time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() + self.logger.info("Rate limit exceeded, checking for other available keys to use.") + + # We will be finding oauth with the highest rate limit left out of our list of oauths + new_oauth = self.oauths[0] + # Endpoint to hit solely to retrieve rate limit information from headers of the response + url = "https://gitlab.com/api/v4/version" + + other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] + for oauth in other_oauths: + # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) + self.headers = {"PRIVATE-TOKEN" : oauth['access_token']} + response = requests.get(url=url, headers=self.headers) + oauth['rate_limit'] = int(response.headers['RateLimit-Remaining']) + oauth['seconds_to_reset'] = ( + datetime.datetime.fromtimestamp( + int(response.headers['RateLimit-Reset']) + ) - datetime.datetime.now() + ).total_seconds() + + # Update oauth to switch to if a higher limit is found + if oauth['rate_limit'] > new_oauth['rate_limit']: + self.logger.info(f"Higher rate limit found in oauth: {oauth}") + new_oauth = oauth + elif ( + oauth['rate_limit'] == new_oauth['rate_limit'] + and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] + ): + self.logger.info( + f"Lower wait time found in oauth with same rate limit: {oauth}" + ) + new_oauth = oauth + + if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: + self.logger.info( + "No oauths with >0 rate limit were found, waiting for oauth with " + f"smallest wait time: {new_oauth}\n" + ) + time.sleep(new_oauth['seconds_to_reset']) + + # Make new oauth the 0th element in self.oauths so we know which one is in use + index = self.oauths.index(new_oauth) + self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] + self.logger.info("Using oauth: {}\n".format(self.oauths[0])) + + # Change headers to be using the new oauth's key + self.headers = {"PRIVATE-TOKEN" : self.oauths[0]['access_token']} + + def update_gh_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): + # Try to get rate limit from request headers, sometimes it does not work (GH's issue) + # In that case we just decrement from last recieved header count + if bad_credentials and len(self.oauths) > 1: + self.logger.warning( + f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" + ) + del self.oauths[0] + + if temporarily_disable: + self.logger.debug( + "Github thinks we are abusing their api. Preventing use " + "of this key until its rate limit resets..." 
+ ) + self.oauths[0]['rate_limit'] = 0 + else: + try: + self.oauths[0]['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) + # self.logger.info("Recieved rate limit from headers\n") + except: + self.oauths[0]['rate_limit'] -= 1 + self.logger.info("Headers did not work, had to decrement") + self.logger.info( + f"Updated rate limit, you have: {self.oauths[0]['rate_limit']} requests remaining." + ) + if self.oauths[0]['rate_limit'] <= 0: + try: + reset_time = response.headers['X-RateLimit-Reset'] + except Exception as e: + self.logger.error(f"Could not get reset time from headers because of error: {e}") + reset_time = 3600 + time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() + self.logger.info("Rate limit exceeded, checking for other available keys to use.") + + # We will be finding oauth with the highest rate limit left out of our list of oauths + new_oauth = self.oauths[0] + # Endpoint to hit solely to retrieve rate limit information from headers of the response + url = "https://api.github.com/users/gabe-heim" + + other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] + for oauth in other_oauths: + # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) + self.headers = {'Authorization': 'token %s' % oauth['access_token']} + + attempts = 3 + success = False + while attempts > 0 and not success: + response = requests.get(url=url, headers=self.headers) + try: + oauth['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) + oauth['seconds_to_reset'] = ( + datetime.datetime.fromtimestamp( + int(response.headers['X-RateLimit-Reset']) + ) - datetime.datetime.now() + ).total_seconds() + success = True + except Exception as e: + self.logger.info( + f"oath method ran into error getting info from headers: {e}\n" + ) + self.logger.info(f"{self.headers}\n{url}\n") + attempts -= 1 + if not success: + continue + + # Update oauth to switch to if a higher limit is found + if oauth['rate_limit'] > new_oauth['rate_limit']: + self.logger.info("Higher rate limit found in oauth: {}\n".format(oauth)) + new_oauth = oauth + elif ( + oauth['rate_limit'] == new_oauth['rate_limit'] + and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] + ): + self.logger.info( + f"Lower wait time found in oauth with same rate limit: {oauth}\n" + ) + new_oauth = oauth + + if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: + self.logger.info( + "No oauths with >0 rate limit were found, waiting for oauth with " + f"smallest wait time: {new_oauth}\n" + ) + time.sleep(new_oauth['seconds_to_reset']) + + # Make new oauth the 0th element in self.oauths so we know which one is in use + index = self.oauths.index(new_oauth) + self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] + self.logger.info("Using oauth: {}\n".format(self.oauths[0])) + + # Change headers to be using the new oauth's key + self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} + + #TODO: figure out if changing this typo breaks anything + def query_gitlab_contributors(self, entry_info, repo_id): + + gitlab_url = ( + entry_info['given']['gitlab_url'] if 'gitlab_url' in entry_info['given'] + else entry_info['given']['git_url'] + ) + + self.logger.info("Querying contributors with given entry info: " + str(entry_info) + "\n") + + path = urlparse(gitlab_url) + split = path[2].split('/') + + owner = split[1] + name = split[2] + + # Handles git url case by removing the extension + if ".git" in name: + name = name[:-4] + + 
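# For illustration, what the parsing above plus the quote() call right below produce for a
# typical clone URL: GitLab's projects API expects the "namespace/project" path with the
# slash percent-encoded. `gitlab_project_id` is an illustrative name, not part of the worker.
from urllib.parse import quote, urlparse

def gitlab_project_id(gitlab_url):
    path = urlparse(gitlab_url)
    owner, name = path[2].split('/')[1], path[2].split('/')[2]
    # Handle git clone urls by dropping the extension
    if ".git" in name:
        name = name[:-4]
    return quote(owner + '/' + name, safe='')

# gitlab_project_id("https://gitlab.com/my-group/my-project.git") -> "my-group%2Fmy-project"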
url_encoded_format = quote(owner + '/' + name, safe='') + + table = 'contributors' + table_pkey = 'cntrb_id' + ### Here we are adding gitlab user information from the API + ### Following Gabe's rework of the contributor worker. + + ### The GitLab API will NEVER give you an email. It will let you + ### Query an email, but never give you one. + ### ## Gitlab email api: https://gitlab.com/api/v4/users?search=s@goggins.com + ### We don't need to update right now, so commenting out. + ### TODO: SOLVE LOGIC. + # update_col_map = {'cntrb_email': 'email'} + update_col_map = {} + duplicate_col_map = {'gl_username': 'username'} + + # list to hold contributors needing insertion or update + contributors = self.paginate("https://gitlab.com/api/v4/projects/" + url_encoded_format + "/repository/contributors?per_page=100&page={}", duplicate_col_map, update_col_map, table, table_pkey, platform='gitlab') + + for repo_contributor in contributors: + try: + cntrb_compressed_url = ("https://gitlab.com/api/v4/users?search=" + repo_contributor['email']) + self.logger.info("Hitting endpoint: " + cntrb_compressed_url + " ...\n") + r = requests.get(url=cntrb_compressed_url, headers=self.headers) + contributor_compressed = r.json() + + email = repo_contributor['email'] + self.logger.info(contributor_compressed) + if len(contributor_compressed) == 0 or type(contributor_compressed) is dict or "id" not in contributor_compressed[0]: + continue + + self.logger.info("Fetching for user: " + str(contributor_compressed[0]["id"])) + + cntrb_url = ("https://gitlab.com/api/v4/users/" + str(contributor_compressed[0]["id"])) + self.logger.info("Hitting end point to get complete contributor info now: " + cntrb_url + "...\n") + r = requests.get(url=cntrb_url, headers=self.headers) + contributor = r.json() + + cntrb = { + "gl_id": contributor.get('gl_id', None), + "gl_full_name": contributor.get('full_name', None), + "gl_username": contributor.get('username', None), + "gl_state": contributor.get('state', None), + "gl_avatar_url": contributor.get('avatar_url', None), + "gl_web_url": contributor.get('web_url', None), + #"cntrb_login": contributor.get('username', None), + #"cntrb_created_at": contributor.get('created_at', None), + "cntrb_email": ('email', None), + #"cntrb_company": contributor.get('organization', None), + #"cntrb_location": contributor.get('location', None), + # "cntrb_type": , dont have a use for this as of now ... 
let it default to null + #"cntrb_canonical": contributor.get('public_email', None), + #"gh_user_id": contributor.get('id', None), + #"gh_login": contributor.get('username', None), + #"gh_url": contributor.get('web_url', None), + #"gh_html_url": contributor.get('web_url', None), + #"gh_node_id": None, + #"gh_avatar_url": contributor.get('avatar_url', None), + #"gh_gravatar_id": None, + #"gh_followers_url": None, + #"gh_following_url": None, + #"gh_gists_url": None, + #"gh_starred_url": None, + #"gh_subscriptions_url": None, + #"gh_organizations_url": None, + #"gh_repos_url": None, + #"gh_events_url": None, + #"gh_received_events_url": None, + #"gh_type": None, + #"gh_site_admin": None, + "tool_source": self.tool_source, + "tool_version": self.tool_version, + "data_source": self.data_source + } + + # Commit insertion to table + if repo_contributor['flag'] == 'need_update': + result = self.db.execute(self.contributors_table.update().where( + self.worker_history_table.c.cntrb_email == email).values(cntrb)) + self.logger.info("Updated tuple in the contributors table with existing email: {}".format(email)) + self.cntrb_id_inc = repo_contributor['pkey'] + elif repo_contributor['flag'] == 'need_insertion': + result = self.db.execute(self.contributors_table.insert().values(cntrb)) + self.logger.info("Primary key inserted into the contributors table: {}".format(result.inserted_primary_key)) + self.results_counter += 1 + + self.logger.info("Inserted contributor: " + contributor['username'] + "\n") + + # Increment our global track of the cntrb id for the possibility of it being used as a FK + self.cntrb_id_inc = int(result.inserted_primary_key[0]) + + except Exception as e: + self.logger.info("Caught exception: {}".format(e)) + self.logger.info("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) + continue + + + def update_gitlab_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): + # Try to get rate limit from request headers, sometimes it does not work (GH's issue) + # In that case we just decrement from last recieved header count + if bad_credentials and len(self.oauths) > 1: + self.logger.info( + f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" + ) + del self.oauths[0] + + if temporarily_disable: + self.logger.info("Gitlab rate limit reached. Temp. 
disabling...") + self.oauths[0]['rate_limit'] = 0 + else: + try: + self.oauths[0]['rate_limit'] = int(response.headers['RateLimit-Remaining']) + except: + self.oauths[0]['rate_limit'] -= 1 + self.logger.info("Updated rate limit, you have: " + + str(self.oauths[0]['rate_limit']) + " requests remaining.") + if self.oauths[0]['rate_limit'] <= 0: + try: + reset_time = response.headers['RateLimit-Reset'] + except Exception as e: + self.logger.info(f"Could not get reset time from headers because of error: {e}") + reset_time = 3600 + time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() + self.logger.info("Rate limit exceeded, checking for other available keys to use.") + + # We will be finding oauth with the highest rate limit left out of our list of oauths + new_oauth = self.oauths[0] + # Endpoint to hit solely to retrieve rate limit information from headers of the response + url = "https://gitlab.com/api/v4/version" + + other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] + for oauth in other_oauths: + # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) + self.headers = {"PRIVATE-TOKEN" : oauth['access_token']} + response = requests.get(url=url, headers=self.headers) + oauth['rate_limit'] = int(response.headers['RateLimit-Remaining']) + oauth['seconds_to_reset'] = ( + datetime.datetime.fromtimestamp( + int(response.headers['RateLimit-Reset']) + ) - datetime.datetime.now() + ).total_seconds() + + # Update oauth to switch to if a higher limit is found + if oauth['rate_limit'] > new_oauth['rate_limit']: + self.logger.info(f"Higher rate limit found in oauth: {oauth}") + new_oauth = oauth + elif ( + oauth['rate_limit'] == new_oauth['rate_limit'] + and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] + ): + self.logger.info( + f"Lower wait time found in oauth with same rate limit: {oauth}" + ) + new_oauth = oauth + + if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: + self.logger.info( + "No oauths with >0 rate limit were found, waiting for oauth with " + f"smallest wait time: {new_oauth}\n" + ) + time.sleep(new_oauth['seconds_to_reset']) + + # Make new oauth the 0th element in self.oauths so we know which one is in use + index = self.oauths.index(new_oauth) + self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] + self.logger.info("Using oauth: {}\n".format(self.oauths[0])) + + # Change headers to be using the new oauth's key + self.headers = {"PRIVATE-TOKEN" : self.oauths[0]['access_token']} + + + def update_gh_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): + # Try to get rate limit from request headers, sometimes it does not work (GH's issue) + # In that case we just decrement from last recieved header count + if bad_credentials and len(self.oauths) > 1: + self.logger.warning( + f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" + ) + del self.oauths[0] + + if temporarily_disable: + self.logger.debug( + "Github thinks we are abusing their api. Preventing use " + "of this key until its rate limit resets..." + ) + self.oauths[0]['rate_limit'] = 0 + else: + try: + self.oauths[0]['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) + # self.logger.info("Recieved rate limit from headers\n") + except: + self.oauths[0]['rate_limit'] -= 1 + self.logger.info("Headers did not work, had to decrement") + self.logger.info( + f"Updated rate limit, you have: {self.oauths[0]['rate_limit']} requests remaining." 
+ ) + if self.oauths[0]['rate_limit'] <= 0: + try: + reset_time = response.headers['X-RateLimit-Reset'] + except Exception as e: + self.logger.error(f"Could not get reset time from headers because of error: {e}") + reset_time = 3600 + time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() + self.logger.info("Rate limit exceeded, checking for other available keys to use.") + + # We will be finding oauth with the highest rate limit left out of our list of oauths + new_oauth = self.oauths[0] + # Endpoint to hit solely to retrieve rate limit information from headers of the response + url = "https://api.github.com/users/gabe-heim" + + other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] + for oauth in other_oauths: + # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) + self.headers = {'Authorization': 'token %s' % oauth['access_token']} + + attempts = 3 + success = False + while attempts > 0 and not success: + response = requests.get(url=url, headers=self.headers) + try: + oauth['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) + oauth['seconds_to_reset'] = ( + datetime.datetime.fromtimestamp( + int(response.headers['X-RateLimit-Reset']) + ) - datetime.datetime.now() + ).total_seconds() + success = True + except Exception as e: + self.logger.info( + f"oath method ran into error getting info from headers: {e}\n" + ) + self.logger.info(f"{self.headers}\n{url}\n") + attempts -= 1 + if not success: + continue + + # Update oauth to switch to if a higher limit is found + if oauth['rate_limit'] > new_oauth['rate_limit']: + self.logger.info("Higher rate limit found in oauth: {}\n".format(oauth)) + new_oauth = oauth + elif ( + oauth['rate_limit'] == new_oauth['rate_limit'] + and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] + ): + self.logger.info( + f"Lower wait time found in oauth with same rate limit: {oauth}\n" + ) + new_oauth = oauth + + if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: + self.logger.info( + "No oauths with >0 rate limit were found, waiting for oauth with " + f"smallest wait time: {new_oauth}\n" + ) + time.sleep(new_oauth['seconds_to_reset']) + + # Make new oauth the 0th element in self.oauths so we know which one is in use + index = self.oauths.index(new_oauth) + self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] + self.logger.info("Using oauth: {}\n".format(self.oauths[0])) + + # Change headers to be using the new oauth's key + self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} + + def update_rate_limit( + self, response, bad_credentials=False, temporarily_disable=False, platform="gitlab" + ): + if platform == 'gitlab': + return self.update_gitlab_rate_limit( + response, bad_credentials=bad_credentials, temporarily_disable=temporarily_disable + ) + elif platform == 'github': + return self.update_gh_rate_limit( + response, bad_credentials=bad_credentials, temporarily_disable=temporarily_disable + ) + + + #Indexerror somewhere + def multi_thread_urls(self, all_urls, max_attempts=5, platform='github'): + """ + :param all_urls: list of tuples + """ + + if not len(all_urls): + self.logger.info("No urls to multithread, returning blank list.\n") + return [] + + def load_url(url, extra_data={}): + try: + html = requests.get(url, stream=True, headers=self.headers) + return html, extra_data + except requests.exceptions.RequestException as e: + self.logger.info(e, url) + + self.logger.info("Beginning to multithread API 
endpoints.") + + start = time.time() + + all_data = [] + valid_url_count = len(all_urls) + + partitions = math.ceil(len(all_urls) / 600) + self.logger.info(f"{len(all_urls)} urls to process. Trying {partitions} partitions. " + + f"Using {max(multiprocessing.cpu_count()//8, 1)} threads.") + for urls in numpy.array_split(all_urls, partitions): + attempts = 0 + self.logger.info(f"Total data points collected so far: {len(all_data)}") + while len(urls) > 0 and attempts < max_attempts: + with concurrent.futures.ThreadPoolExecutor( + max_workers=max(multiprocessing.cpu_count()//8, 1) + ) as executor: + # Start the load operations and mark each future with its URL + future_to_url = {executor.submit(load_url, *url): url for url in urls} + self.logger.info("Multithreaded urls and returned status codes:") + count = 0 + for future in concurrent.futures.as_completed(future_to_url): + + if count % 100 == 0: + self.logger.info( + f"Processed {len(all_data)} / {valid_url_count} urls. " + f"{len(urls)} remaining in this partition." + ) + count += 1 + + url = future_to_url[future] + try: + response, extra_data = future.result() + + if response.status_code != 200: + self.logger.info( + f"Url: {url[0]} ; Status code: {response.status_code}" + ) + + if response.status_code == 403 or response.status_code == 401: # 403 is rate limit, 404 is not found, 401 is bad credentials + self.update_rate_limit(response, platform=platform) + continue + + elif response.status_code == 200: + try: + page_data = response.json() + except: + page_data = json.loads(json.dumps(response.text)) + + page_data = [{**data, **extra_data} for data in page_data] + all_data += page_data + + if 'last' in response.links and "&page=" not in url[0]: + urls += [ + (url[0] + f"&page={page}", extra_data) for page in range( + 2, int(response.links['last']['url'].split('=')[-1]) + 1 + ) + ] + try: + # self.logger.info(f"urls boundry issue? for {urls} where they are equal to {url}.") + + urls = numpy.delete(urls, numpy.where(urls == url), axis=0) + except: + self.logger.info(f"ERROR with axis = 0 - Now attempting without setting axis for numpy.delete for {urls} where they are equal to {url}.") + urls = numpy.delete(urls, numpy.where(urls == url)) + + elif response.status_code == 404: + urls = numpy.delete(urls, numpy.where(urls == url), axis=0) + self.logger.info(f"Not found url: {url}\n") + else: + self.logger.info( + f"Unhandled response code: {response.status_code} {url}\n" + ) + + except Exception as e: + self.logger.info( + f"{url} generated an exception: {traceback.format_exc()}\n" + ) + + attempts += 1 + + self.logger.info( + f"Processed {valid_url_count} urls and got {len(all_data)} data points " + f"in {time.time() - start} seconds thanks to multithreading!\n" + ) + return all_data + + + #insertion_method and stagger are arguments that allow paginate_endpoint to insert at around ~500 pages at a time. 
+ def paginate_endpoint( + self, url, action_map={}, table=None, where_clause=True, platform='github', in_memory=True, stagger=False, insertion_method=None, insertion_threshold=500 + ): + + #Get augur columns using the action map along with the primary key + table_values = self.db.execute( + s.sql.select(self.get_relevant_columns(table, action_map)).where(where_clause) + ).fetchall() + + page_number = 1 + multiple_pages = False + need_insertion = [] + need_update = [] + + #Stores sum of page data + all_data = [] + forward_pagination = True + backwards_activation = False + last_page_number = -1 + + #Block to handle page queries and retry at least 10 times + while True: + + # Multiple attempts to hit endpoint + num_attempts = 0 + success = False + while num_attempts < 10: + self.logger.info(f"Hitting endpoint: {url.format(page_number)}...\n") + try: + response = requests.get(url=url.format(page_number), headers=self.headers) + except TimeoutError as e: + self.logger.info("Request timed out. Sleeping 10 seconds and trying again...\n") + time.sleep(10) + continue + + self.update_rate_limit(response, platform=platform) + + try: + page_data = response.json() + except: + page_data = json.loads(json.dumps(response.text)) + + if type(page_data) == list: + success = True + break + elif type(page_data) == dict: + self.logger.info("Request returned a dict: {}\n".format(page_data)) + if page_data['message'] == "Not Found": + self.logger.warning( + "Github repo was not found or does not exist for endpoint: " + f"{url.format(page_number)}\n" + ) + break + if "You have triggered an abuse detection mechanism." in page_data['message']: + num_attempts -= 1 + self.update_rate_limit(response, temporarily_disable=True,platform=platform) + if page_data['message'] == "Bad credentials": + self.update_rate_limit(response, bad_credentials=True, platform=platform) + elif type(page_data) == str: + self.logger.info(f"Warning! 
page_data was string: {page_data}\n") + if "" in page_data: + self.logger.info("HTML was returned, trying again...\n") + elif len(page_data) == 0: + self.logger.warning("Empty string, trying again...\n") + else: + try: + page_data = json.loads(page_data) + success = True + break + except: + pass + num_attempts += 1 + if not success: + break + + # Success + + # Determine if continued pagination is needed + + if len(page_data) == 0: + self.logger.info("Response was empty, breaking from pagination.\n") + break + + all_data += page_data + + if not forward_pagination: + + # Checking contents of requests with what we already have in the db + page_insertions, page_updates = self.organize_needed_data( + page_data, table_values, list(table.primary_key)[0].name, + action_map, in_memory=True + ) + + # Reached a page where we already have all tuples + if len(need_insertion) == 0 and len(need_update) == 0 and \ + backwards_activation: + self.logger.info( + "No more pages with unknown tuples, breaking from pagination.\n" + ) + break + + need_insertion += page_insertions + need_update += page_updates + + # Find last page so we can decrement from there + if 'last' in response.links and last_page_number == -1: + if platform == 'github': + last_page_number = int(response.links['last']['url'][-6:].split('=')[1]) + elif platform == 'gitlab': + last_page_number = int(response.links['last']['url'].split('&')[2].split('=')[1]) + + if not forward_pagination and not backwards_activation: + page_number = last_page_number + backwards_activation = True + + self.logger.info("Analyzation of page {} of {} complete\n".format(page_number, + int(last_page_number) if last_page_number != -1 else "*last page not known*")) + + if (page_number <= 1 and not forward_pagination) or \ + (page_number >= last_page_number and forward_pagination): + self.logger.info("No more pages to check, breaking from pagination.\n") + break + + #This is probably where we should insert at around ~500 at a time + #makes sure that stagger is enabled, we have an insertion method, and the insertion happens every 500 pages or so. + if stagger and insertion_method != None and page_number % insertion_threshold == 0: + #call insertion method passed as argument. + staggered_source_prs = { + 'insert' : need_insertion, + 'update' : need_update, + 'all' : all_data + } + + #Use the method the subclass needs in order to insert the data. + insertion_method(staggered_source_prs,action_map) + + #clear the data from memory and avoid duplicate insertions. + need_insertion = [] + need_update = [] + all_data = [] + + page_number = page_number + 1 if forward_pagination else page_number - 1 + + if forward_pagination: + need_insertion, need_update = self.organize_needed_data( + all_data, table_values, list(table.primary_key)[0].name, action_map, + in_memory=in_memory + ) + + return { + 'insert': need_insertion, + 'update': need_update, + 'all': all_data + } + + #TODO: deprecated but still used by the issues worker. + def paginate(self, url, duplicate_col_map, update_col_map, table, table_pkey, where_clause="", value_update_col_map={}, platform="github"): + """ DEPRECATED + Paginate either backwards or forwards (depending on the value of the worker's + finishing_task attribute) through all the GitHub or GitLab api endpoint pages. 
+ + :param url: String, the url of the API endpoint we are paginating through, expects + a curly brace string formatter within the string to format the Integer + representing the page number that is wanted to be returned + :param duplicate_col_map: Dictionary, maps the column names of the source data + to the field names in our database for columns that should be checked for + duplicates (if source data value == value in existing database row, then this + element is a duplicate and would not need an insertion). Key is source data + column name, value is database field name. Example: {'id': 'gh_issue_id'} + :param update_col_map: Dictionary, maps the column names of the source data + to the field names in our database for columns that should be checked for + updates (if source data value != value in existing database row, then an + update is needed). Key is source data column name, value is database field name. + Example: {'id': 'gh_issue_id'} + :param table: String, the name of the table that holds the values to check for + duplicates/updates against + :param table_pkey: String, the field name of the primary key of the table in + the database that we are getting the values for to cross-reference to check + for duplicates. + :param where_clause: String, optional where clause to filter the values + that are queried when preparing the values that will be cross-referenced + for duplicates/updates + :param value_update_col_map: Dictionary, sometimes we add a new field to a table, + and we want to trigger an update of that row in the database even if all of the + data values are the same and would not need an update ordinarily. Checking for + a specific existing value in the database field allows us to do this. The key is the + name of the field in the database we are checking for a specific value to trigger + an update, the value is the value we are checking for equality to trigger an update. + Example: {'cntrb_id': None} + :return: List of dictionaries, all data points from the pages of the specified API endpoint + each with a 'flag' key-value pair representing the required action to take with that + data point (i.e. 
'need_insertion', 'need_update', 'none') + """ + + update_keys = list(update_col_map.keys()) if update_col_map else [] + update_keys += list(value_update_col_map.keys()) if value_update_col_map else [] + cols_to_query = list(duplicate_col_map.keys()) + update_keys + [table_pkey] + table_values = self.get_table_values(cols_to_query, [table], where_clause) + + i = 1 + multiple_pages = False + tuples = [] + while True: + num_attempts = 0 + success = False + while num_attempts < 3: + self.logger.info(f'Hitting endpoint: {url.format(i)}...\n') + r = requests.get(url=url.format(i), headers=self.headers) + + self.update_rate_limit(r, platform=platform) + if 'last' not in r.links: + last_page = None + else: + if platform == "github": + last_page = r.links['last']['url'][-6:].split('=')[1] + elif platform == "gitlab": + last_page = r.links['last']['url'].split('&')[2].split("=")[1] + self.logger.info("Analyzing page {} of {}\n".format(i, int(last_page) + 1 if last_page is not None else '*last page not known*')) + + try: + j = r.json() + except: + j = json.loads(json.dumps(r.text)) + + if type(j) != dict and type(j) != str: + success = True + break + elif type(j) == dict: + self.logger.info("Request returned a dict: {}\n".format(j)) + if j['message'] == 'Not Found': + self.logger.warning("Github repo was not found or does not exist for endpoint: {}\n".format(url)) + break + if j['message'] == 'You have triggered an abuse detection mechanism. Please wait a few minutes before you try again.': + num_attempts -= 1 + self.logger.info("rate limit update code goes here") + self.update_rate_limit(r, temporarily_disable=True,platform=platform) + if j['message'] == 'Bad credentials': + self.logger.info("rate limit update code goes here") + self.update_rate_limit(r, bad_credentials=True, platform=platform) + elif type(j) == str: + self.logger.info(f'J was string: {j}\n') + if '' in j: + self.logger.info('HTML was returned, trying again...\n') + elif len(j) == 0: + self.logger.warning('Empty string, trying again...\n') + else: + try: + j = json.loads(j) + success = True + break + except: + pass + num_attempts += 1 + if not success: + break + + # Find last page so we can decrement from there + if 'last' in r.links and not multiple_pages and not self.finishing_task: + if platform == "github": + param = r.links['last']['url'][-6:] + i = int(param.split('=')[1]) + 1 + elif platform == "gitlab": + i = int(r.links['last']['url'].split('&')[2].split("=")[1]) + 1 + self.logger.info("Multiple pages of request, last page is " + str(i - 1) + "\n") + multiple_pages = True + elif not multiple_pages and not self.finishing_task: + self.logger.info("Only 1 page of request\n") + elif self.finishing_task: + self.logger.info("Finishing a previous task, paginating forwards ..." + " excess rate limit requests will be made\n") + + if len(j) == 0: + self.logger.info("Response was empty, breaking from pagination.\n") + break + + # Checking contents of requests with what we already have in the db + j = self.assign_tuple_action(j, table_values, update_col_map, duplicate_col_map, table_pkey, value_update_col_map) + + if not j: + self.logger.error("Assigning tuple action failed, moving to next page.\n") + i = i + 1 if self.finishing_task else i - 1 + continue + try: + to_add = [obj for obj in j if obj not in tuples and (obj['flag'] != 'none')] + except Exception as e: + self.logger.error("Failure accessing data of page: {}. 
Moving to next page.\n".format(e)) + i = i + 1 if self.finishing_task else i - 1 + continue + if len(to_add) == 0 and multiple_pages and 'last' in r.links: + self.logger.info("{}".format(r.links['last'])) + if platform == "github": + page_number = int(r.links['last']['url'][-6:].split('=')[1]) + elif platform == "gitlab": + page_number = int(r.links['last']['url'].split('&')[2].split("=")[1]) + if i - 1 != page_number: + self.logger.info("No more pages with unknown tuples, breaking from pagination.\n") + break + + tuples += to_add + + i = i + 1 if self.finishing_task else i - 1 + + # Since we already wouldve checked the first page... break + if (i == 1 and multiple_pages and not self.finishing_task) or i < 1 or len(j) == 0: + self.logger.info("No more pages to check, breaking from pagination.\n") + break + + return tuples + + def new_paginate_endpoint( + self, url, action_map={}, table=None, where_clause=True, platform='github' + ): + + page_number = 1 + multiple_pages = False + need_insertion = [] + need_update = [] + all_data = [] + forward_pagination = True + backwards_activation = False + last_page_number = -1 + while True: + + # Multiple attempts to hit endpoint + num_attempts = 0 + success = False + while num_attempts < 10: + self.logger.info("hitting an endpiont") + # f"Hitting endpoint: ...\n" + # f"{url.format(page_number)} on page number. \n") + try: + response = requests.get(url=url.format(page_number), headers=self.headers) + except TimeoutError as e: + self.logger.info("Request timed out. Sleeping 10 seconds and trying again...\n") + time.sleep(10) + continue + + self.update_rate_limit(response, platform=platform) + + try: + page_data = response.json() + except: + page_data = json.loads(json.dumps(response.text)) + + if type(page_data) == list: + success = True + break + elif type(page_data) == dict: + self.logger.info("Request returned a dict: {}\n".format(page_data)) + if page_data['message'] == "Not Found": + self.logger.warning( + "Github repo was not found or does not exist for endpoint: " + f"{url.format(page_number)}\n" + ) + break + if "You have triggered an abuse detection mechanism." in page_data['message']: + num_attempts -= 1 + self.update_rate_limit(response, temporarily_disable=True,platform=platform) + if page_data['message'] == "Bad credentials": + self.update_rate_limit(response, bad_credentials=True, platform=platform) + elif type(page_data) == str: + self.logger.info(f"Warning! 
page_data was string: {page_data}\n") + if "" in page_data: + self.logger.info("HTML was returned, trying again...\n") + elif len(page_data) == 0: + self.logger.warning("Empty string, trying again...\n") + else: + try: + page_data = json.loads(page_data) + success = True + break + except: + pass + num_attempts += 1 + if not success: + break + + # Success + + # Determine if continued pagination is needed + + if len(page_data) == 0: + self.logger.info("Response was empty, breaking from pagination.\n") + break + + all_data += page_data + + if not forward_pagination: + + # Checking contents of requests with what we already have in the db + page_insertions, page_updates = self.new_organize_needed_data( + page_data, augur_table=table, action_map=action_map + ) + + # Reached a page where we already have all tuples + if len(need_insertion) == 0 and len(need_update) == 0 and \ + backwards_activation: + self.logger.info( + "No more pages with unknown tuples, breaking from pagination.\n" + ) + break + + need_insertion += page_insertions + need_update += page_updates + + # Find last page so we can decrement from there + if 'last' in response.links and last_page_number == -1: + if platform == 'github': + last_page_number = int(response.links['last']['url'][-6:].split('=')[1]) + elif platform == 'gitlab': + last_page_number = int(response.links['last']['url'].split('&')[2].split('=')[1]) + + if not forward_pagination and not backwards_activation: + page_number = last_page_number + backwards_activation = True + + self.logger.info("Analyzation of page {} of {} complete\n".format(page_number, + int(last_page_number) if last_page_number != -1 else "*last page not known*")) + + if (page_number <= 1 and not forward_pagination) or \ + (page_number >= last_page_number and forward_pagination): + self.logger.info("No more pages to check, breaking from pagination.\n") + break + + page_number = page_number + 1 if forward_pagination else page_number - 1 + + if forward_pagination: + need_insertion, need_update = self.new_organize_needed_data( + all_data, augur_table=table, action_map=action_map + ) + + return { + 'insert': need_insertion, + 'update': need_update, + 'all': all_data + } From d2e81bd880c8b716dfe49497dd2851626d18903b Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 13 Aug 2021 12:37:20 -0500 Subject: [PATCH 2/5] Trying the same with differant tab size Signed-off-by: Isaac Milarsky --- workers/worker_git_integration.py | 3692 +++++++++++++++-------------- 1 file changed, 1847 insertions(+), 1845 deletions(-) diff --git a/workers/worker_git_integration.py b/workers/worker_git_integration.py index 10d2cde616..e43ddea867 100644 --- a/workers/worker_git_integration.py +++ b/workers/worker_git_integration.py @@ -5,1856 +5,1858 @@ #This is a worker base subclass that adds the ability to query github/gitlab with the api key class WorkerGitInterfaceable(Worker): - def __init__(self, worker_type, config={}, given=[], models=[], data_tables=[], operations_tables=[], platform="github"): - super().__init__(worker_type, config, given, models, data_tables, operations_tables) - - self.config.update({ - 'gh_api_key': self.augur_config.get_value('Database', 'key'), - 'gitlab_api_key': self.augur_config.get_value('Database', 'gitlab_api_key') - }) - - #Fix loose attribute definition - self.headers = None - self.platform = platform - self.given = given - self.models = models - - self.specs = { - 'id': self.config['id'], # what the broker knows this worker as - 'location': self.config['location'], # host + port 
worker is running on (so broker can send tasks here) - 'qualifications': [ - { - 'given': self.given, # type of repo this worker can be given as a task - 'models': self.models # models this worker can fill for a repo as a task - } - ], - 'config': self.config - } - - # Send broker hello message - if self.config['offline_mode'] is False: - self.connect_to_broker() + def __init__(self, worker_type, config={}, given=[], models=[], data_tables=[], operations_tables=[], platform="github"): + super().__init__(worker_type, config, given, models, data_tables, operations_tables) + + self.config.update({ + 'gh_api_key': self.augur_config.get_value('Database', 'key'), + 'gitlab_api_key': self.augur_config.get_value('Database', 'gitlab_api_key') + }) + + #Fix loose attribute definition + self.headers = None + self.platform = platform + self.given = given + self.models = models + + self.specs = { + 'id': self.config['id'], # what the broker knows this worker as + 'location': self.config['location'], # host + port worker is running on (so broker can send tasks here) + 'qualifications': [ + { + 'given': self.given, # type of repo this worker can be given as a task + 'models': self.models # models this worker can fill for a repo as a task + } + ], + 'config': self.config + } + + # Send broker hello message + if self.config['offline_mode'] is False: + self.connect_to_broker() # Attempts to determine if these attributes exist # If not, it creates them with default values - try: - self.tool_source - self.tool_version - self.data_source - except AttributeError: - self.tool_source = 'Augur Worker Testing' - self.tool_version = '0.0.0' - self.data_source = 'Augur Worker Testing' - - #database interface, additional functionality with github interface. - def initialize_database_connections(self): - super().initialize_database_connections() - # Organize different api keys/oauths available - self.logger.info("Initializing API key.") - if 'gh_api_key' in self.config or 'gitlab_api_key' in self.config: - try: - self.init_oauths(self.platform) - except AttributeError: - self.logger.error("Worker not configured to use API key!") - else: - self.oauths = [{'oauth_id': 0}] - - def find_id_from_login(self, login, platform='github'): - """ Retrieves our contributor table primary key value for the contributor with - the given GitHub login credentials, if this contributor is not there, then - they get inserted. - - :param login: String, the GitHub login username to find the primary key id for - :return: Integer, the id of the row in our database with the matching GitHub login - """ - idSQL = s.sql.text(""" - SELECT cntrb_id FROM contributors WHERE cntrb_login = '{}' \ - AND LOWER(data_source) = '{} api' - """.format(login, platform)) - - rs = pd.read_sql(idSQL, self.db, params={}) - data_list = [list(row) for row in rs.itertuples(index=False)] - try: - return data_list[0][0] - except: - self.logger.info('contributor needs to be added...') - - if platform == 'github': - cntrb_url = ("https://api.github.com/users/" + login) - elif platform == 'gitlab': - cntrb_url = ("https://gitlab.com/api/v4/users?username=" + login ) - self.logger.info("Hitting endpoint: {} ...\n".format(cntrb_url)) + try: + self.tool_source + self.tool_version + self.data_source + except AttributeError: + self.tool_source = 'Augur Worker Testing' + self.tool_version = '0.0.0' + self.data_source = 'Augur Worker Testing' + + #database interface, additional functionality with github interface. 
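# [Editor's sketch] The try/except AttributeError block above is an
# attribute-defaulting idiom: keep a value a subclass already set, otherwise fall
# back to a default. The same effect can be written with getattr; the class name
# below is hypothetical and only illustrates the idiom.
class ExampleWorker:
    def __init__(self):
        # Subclasses may set these before this runs; only fill in the ones that
        # are still missing.
        self.tool_source = getattr(self, 'tool_source', 'Augur Worker Testing')
        self.tool_version = getattr(self, 'tool_version', '0.0.0')
        self.data_source = getattr(self, 'data_source', 'Augur Worker Testing')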
+ def initialize_database_connections(self): + super().initialize_database_connections() + # Organize different api keys/oauths available + self.logger.info("Initializing API key.") + if 'gh_api_key' in self.config or 'gitlab_api_key' in self.config: + try: + self.init_oauths(self.platform) + except AttributeError: + self.logger.error("Worker not configured to use API key!") + else: + self.oauths = [{'oauth_id': 0}] + + def find_id_from_login(self, login, platform='github'): + """ Retrieves our contributor table primary key value for the contributor with + the given GitHub login credentials, if this contributor is not there, then + they get inserted. + + :param login: String, the GitHub login username to find the primary key id for + :return: Integer, the id of the row in our database with the matching GitHub login + """ + idSQL = s.sql.text(""" + SELECT cntrb_id FROM contributors WHERE cntrb_login = '{}' \ + AND LOWER(data_source) = '{} api' + """.format(login, platform)) + + rs = pd.read_sql(idSQL, self.db, params={}) + data_list = [list(row) for row in rs.itertuples(index=False)] + try: + return data_list[0][0] + except: + self.logger.info('contributor needs to be added...') + + if platform == 'github': + cntrb_url = ("https://api.github.com/users/" + login) + elif platform == 'gitlab': + cntrb_url = ("https://gitlab.com/api/v4/users?username=" + login ) + self.logger.info("Hitting endpoint: {} ...\n".format(cntrb_url)) # Possible infinite loop if this request never succeeds? - while True: - try: - r = requests.get(url=cntrb_url, headers=self.headers) - break - except TimeoutError as e: - self.logger.info("Request timed out. Sleeping 10 seconds and trying again...\n") - time.sleep(30) + while True: + try: + r = requests.get(url=cntrb_url, headers=self.headers) + break + except TimeoutError as e: + self.logger.info("Request timed out. 
Sleeping 10 seconds and trying again...\n") + time.sleep(30) - self.update_rate_limit(r) - contributor = r.json() + self.update_rate_limit(r) + contributor = r.json() # Used primarily for the Gitlab block below - company = None - location = None - email = None - if 'company' in contributor: - company = contributor['company'] - if 'location' in contributor: - location = contributor['location'] - if 'email' in contributor: - email = contributor['email'] - - - if platform == 'github': - cntrb = { - 'cntrb_login': contributor['login'] if 'login' in contributor else None, - 'cntrb_email': contributor['email'] if 'email' in contributor else None, - 'cntrb_company': contributor['company'] if 'company' in contributor else None, - 'cntrb_location': contributor['location'] if 'location' in contributor else None, - 'cntrb_created_at': contributor['created_at'] if 'created_at' in contributor else None, - 'cntrb_canonical': None, - 'gh_user_id': contributor['id'] if 'id' in contributor else None, - 'gh_login': contributor['login'] if 'login' in contributor else None, - 'gh_url': contributor['url'] if 'url' in contributor else None, - 'gh_html_url': contributor['html_url'] if 'html_url' in contributor else None, - 'gh_node_id': contributor['node_id'] if 'node_id' in contributor else None, - 'gh_avatar_url': contributor['avatar_url'] if 'avatar_url' in contributor else None, - 'gh_gravatar_id': contributor['gravatar_id'] if 'gravatar_id' in contributor else None, - 'gh_followers_url': contributor['followers_url'] if 'followers_url' in contributor else None, - 'gh_following_url': contributor['following_url'] if 'following_url' in contributor else None, - 'gh_gists_url': contributor['gists_url'] if 'gists_url' in contributor else None, - 'gh_starred_url': contributor['starred_url'] if 'starred_url' in contributor else None, - 'gh_subscriptions_url': contributor['subscriptions_url'] if 'subscriptions_url' in contributor else None, - 'gh_organizations_url': contributor['organizations_url'] if 'organizations_url' in contributor else None, - 'gh_repos_url': contributor['repos_url'] if 'repos_url' in contributor else None, - 'gh_events_url': contributor['events_url'] if 'events_url' in contributor else None, - 'gh_received_events_url': contributor['received_events_url'] if 'received_events_url' in contributor else None, - 'gh_type': contributor['type'] if 'type' in contributor else None, - 'gh_site_admin': contributor['site_admin'] if 'site_admin' in contributor else None, - 'tool_source': self.tool_source, - 'tool_version': self.tool_version, - 'data_source': self.data_source - } - - elif platform == 'gitlab': - cntrb = { - 'cntrb_login': contributor[0]['username'] if 'username' in contributor[0] else None, - 'cntrb_email': email, - 'cntrb_company': company, - 'cntrb_location': location, - 'cntrb_created_at': contributor[0]['created_at'] if 'created_at' in contributor[0] else None, - 'cntrb_canonical': None, - 'gh_user_id': contributor[0]['id'], - 'gh_login': contributor[0]['username'], - 'gh_url': contributor[0]['web_url'], - 'gh_html_url': None, - 'gh_node_id': None, - 'gh_avatar_url': contributor[0]['avatar_url'], - 'gh_gravatar_id': None, - 'gh_followers_url': None, - 'gh_following_url': None, - 'gh_gists_url': None, - 'gh_starred_url': None, - 'gh_subscriptions_url': None, - 'gh_organizations_url': None, - 'gh_repos_url': None, - 'gh_events_url': None, - 'gh_received_events_url': None, - 'gh_type': None, - 'gh_site_admin': None, - 'tool_source': self.tool_source, - 'tool_version': self.tool_version, - 
'data_source': self.data_source - } - result = self.db.execute(self.contributors_table.insert().values(cntrb)) - self.logger.info("Primary key inserted into the contributors table: " + str(result.inserted_primary_key)) - self.results_counter += 1 - self.cntrb_id_inc = int(result.inserted_primary_key[0]) - self.logger.info(f"Inserted contributor: {cntrb['cntrb_login']}\n") - - return self.find_id_from_login(login, platform) - - #Blatently only for api key usage - def init_oauths(self, platform='github'): - - self.oauths = [] - self.headers = None - self.logger.info("Trying initialization.") - # Make a list of api key in the config combined w keys stored in the database - # Select endpoint to hit solely to retrieve rate limit - # information from headers of the response - # Adjust header keys needed to fetch rate limit information from the API responses - if platform == 'github': - url = "https://api.github.com/users/gabe-heim" - oauthSQL = s.sql.text(""" - SELECT * FROM worker_oauth WHERE access_token <> '{}' and platform = 'github' - """.format(self.config['gh_api_key'])) - key_name = 'gh_api_key' - rate_limit_header_key = "X-RateLimit-Remaining" - rate_limit_reset_header_key = "X-RateLimit-Reset" - elif platform == 'gitlab': - url = "https://gitlab.com/api/v4/version" - oauthSQL = s.sql.text(""" - SELECT * FROM worker_oauth WHERE access_token <> '{}' and platform = 'gitlab' - """.format(self.config['gitlab_api_key'])) - key_name = 'gitlab_api_key' - rate_limit_header_key = 'ratelimit-remaining' - rate_limit_reset_header_key = 'ratelimit-reset' - - for oauth in [{'oauth_id': 0, 'access_token': self.config[key_name]}] + json.loads( - pd.read_sql(oauthSQL, self.helper_db, params={}).to_json(orient="records") - ): - if platform == 'github': - self.headers = {'Authorization': 'token %s' % oauth['access_token']} - elif platform == 'gitlab': - self.headers = {'Authorization': 'Bearer %s' % oauth['access_token']} - response = requests.get(url=url, headers=self.headers) - self.oauths.append({ - 'oauth_id': oauth['oauth_id'], - 'access_token': oauth['access_token'], - 'rate_limit': int(response.headers[rate_limit_header_key]), - 'seconds_to_reset': ( - datetime.datetime.fromtimestamp( - int(response.headers[rate_limit_reset_header_key]) - ) - datetime.datetime.now() - ).total_seconds() - }) - self.logger.debug("Found OAuth available for use: {}".format(self.oauths[-1])) - - if len(self.oauths) == 0: - self.logger.info( - "No API keys detected, please include one in your config or in the " - "worker_oauths table in the augur_operations schema of your database." 
- ) - - # First key to be used will be the one specified in the config (first element in - # self.oauths array will always be the key in use) - if platform == 'github': - self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} - elif platform == 'gitlab': - self.headers = {'Authorization': 'Bearer %s' % self.oauths[0]['access_token']} - - self.logger.info("OAuth initialized\n") - - def enrich_cntrb_id( - self, data, key, action_map_additions={'insert': {'source': [], 'augur': []}}, - platform='github', prefix='' - ): - - if not len(data): - self.logger.info(f"Enrich contrib data is empty for {len(data)}, for the key {key}.") - - raise ValueError - - self.logger.info(f"Enriching contributor ids for {len(data)} data points...") - - source_df = pd.DataFrame(data) - expanded_source_df = self._add_nested_columns( - source_df.copy(), [key] + action_map_additions['insert']['source'] - ) - - # Insert cntrbs that are not in db - - cntrb_action_map = { - 'insert': { - 'source': [key] + action_map_additions['insert']['source'] + [f'{prefix}id'], - 'augur': ['cntrb_login'] + action_map_additions['insert']['augur'] + ['gh_user_id'] - } - } - - table_values_cntrb = self.db.execute( - s.sql.select(self.get_relevant_columns(self.contributors_table,cntrb_action_map)) - ).fetchall() - - source_data = expanded_source_df.to_dict(orient='records') - - #Filter out bad data where we can't even hit the api. - source_data = [data for data in source_data if f'{prefix}login' in data and data[f'{prefix}login'] != None] - - self.logger.info(f"table_values_cntrb keys: {table_values_cntrb[0].keys()}") - self.logger.info(f"source_data keys: {source_data[0].keys()}") - - #We can't use this because of worker collisions - #TODO: seperate this method into it's own worker. - #cntrb_id_offset = self.get_max_id(self.contributors_table, 'cntrb_id') - 1 - - # loop through data to test if it is already in the database - for index, data in enumerate(source_data): - - self.logger.info(f"Enriching {index} of {len(source_data)}") - - - user_unique_ids = [] - - #Allow for alt identifiers to be checked if user.id is not present in source_data - try: - #This will trigger a KeyError if data has alt identifier. - data[f'{prefix}id'] - for row in table_values_cntrb: - try: - user_unique_ids.append(row['gh_user_id']) - except Exception as e: - self.logger.info(f"Error adding gh_user_id: {e}. Row: {row}") - except KeyError: - self.logger.info("Source data doesn't have user.id. Using node_id instead.") - for row in table_values_cntrb: - try: - user_unique_ids.append(row['gh_node_id']) - except Exception as e: - self.logger.info(f"Error adding gh_node_id: {e}. 
Row: {row}") - - - #self.logger.info(f"gh_user_ids: {gh_user_ids}") - - # self.logger.info(f"Users gh_user_id: {data['user.id']}") - # in_user_ids = False - # if data['user.id'] in gh_user_ids: - # in_user_ids = True - # self.logger.info(f"{data['user.id']} is in gh_user_ids") - - # self.logger.info(f"table_values_cntrb len: {len(table_values_cntrb)}") - - #Deal with if data - #See if we can check using the user.id - source_data_id = None - try: - source_data_id = data[f'{prefix}id'] - except KeyError: - source_data_id = data[f'{prefix}node_id'] - - - #if user.id is in the database then there is no need to add the contributor - if source_data_id in user_unique_ids: - - self.logger.info("{} found in database".format(source_data_id)) - - user_id_row = [] - try: - data[f'{prefix}id'] - #gets the dict from the table_values_cntrb that contains data['user.id'] - user_id_row = list(filter(lambda x: x['gh_user_id'] == source_data_id, table_values_cntrb))[0] - except KeyError: - user_id_row = list(filter(lambda x: x['gh_node_id'] == source_data_id, table_values_cntrb))[0] - - - #assigns the cntrb_id to the source data to be returned to the workers - data['cntrb_id'] = user_id_row['cntrb_id'] - self.logger.info(f"cntrb_id {data['cntrb_id']} found in database and assigned to enriched data") - - #contributor is not in the database - else: - - self.logger.info("{} not in database, making api call".format(source_data_id)) - - self.logger.info("login: {}".format(data[f'{prefix}login'])) - - try: - url = ("https://api.github.com/users/" + data[f'{prefix}login']) - except Exception as e: - self.logger.info(f"Error when creating url: {e}. Data: {data}") - - attempts = 0 - - while attempts < 10: - self.logger.info("Hitting endpoint: " + url + " ...\n") - - try: - response = requests.get(url=url , headers=self.headers) - except TimeoutError: - self.logger.info(f"User data request for enriching contributor data failed with {attempts} attempts! Trying again...") - time.sleep(10) - continue - - self.update_rate_limit(response,platform=platform) - - - try: - contributor = response.json() - except: - contributor = json.loads(json.dumps(response.text)) - - if type(contributor) == dict: - self.logger.info("Request returned a dict!") - self.logger.info(f"Contributor data: {contributor}") - success = True - break - elif type(contributor) == list: - self.logger.warning("Wrong type returned trying again.") - self.logger.info(f"Contributor data: {contributor}") - elif type(contributor) == str: - self.logger.info(f"Warning! page_data was string: {contributor}\n") - if "" in contributor: - self.logger.info("HTML was returned, trying again...\n") - elif len(contributor) == 0: - self.logger.warning("Empty string, trying again...\n") - else: - try: - contributor = json.loads(contributor) - success = True - break - except: - pass - attempts += 1 - if not success: - continue - - self.logger.info(f"Contributor data: {contributor}") - - cntrb = { - "cntrb_login": contributor['login'], - "cntrb_created_at": contributor['created_at'], - "cntrb_email": contributor['email'] if 'email' in contributor else None, - "cntrb_company": contributor['company'] if 'company' in contributor else None, - "cntrb_location": contributor['location'] if 'location' in contributor else None, - # "cntrb_type": , dont have a use for this as of now ... 
let it default to null - "cntrb_canonical": contributor['email'] if 'email' in contributor else None, - "gh_user_id": contributor['id'], - "gh_login": contributor['login'], - "gh_url": contributor['url'], - "gh_html_url": contributor['html_url'], - "gh_node_id": contributor['node_id'], - "gh_avatar_url": contributor['avatar_url'], - "gh_gravatar_id": contributor['gravatar_id'], - "gh_followers_url": contributor['followers_url'], - "gh_following_url": contributor['following_url'], - "gh_gists_url": contributor['gists_url'], - "gh_starred_url": contributor['starred_url'], - "gh_subscriptions_url": contributor['subscriptions_url'], - "gh_organizations_url": contributor['organizations_url'], - "gh_repos_url": contributor['repos_url'], - "gh_events_url": contributor['events_url'], - "gh_received_events_url": contributor['received_events_url'], - "gh_type": contributor['type'], - "gh_site_admin": contributor['site_admin'], - "tool_source": self.tool_source, - "tool_version": self.tool_version, - "data_source": self.data_source - } - - #insert new contributor into database - # TODO: make this method it's own worker. This errors because of collisions between github_worker and pull_request_worker. - #We can solve this by making another worker with a queue. It wouldn't have to be too complicated. - try: - self.db.execute(self.contributors_table.insert().values(cntrb)) - # except s.exc.IntegrityError: - except Exception as e: - self.logger.info(f"Contributor was unable to be added to table! Attempting to get cntrb_id from table anyway because of possible collision. Error: {e}") - - - #Get the contributor id from the newly inserted contributor. - cntrb_id_row = self.db.execute( - s.sql.select(self.get_relevant_columns(self.contributors_table,cntrb_action_map)).where( - self.contributors_table.c.gh_user_id==cntrb["gh_user_id"] - ) - ).fetchall() - - #Handle and log rare failure cases. If this part errors something is very wrong. - if len(cntrb_id_row) == 1: - data['cntrb_id'] = cntrb_id_row[0]['cntrb_id'] - self.logger.info(f"cntrb_id {data['cntrb_id']} found in database and assigned to enriched data") - elif len(cntrb_id_row) == 0: - self.logger.error("Couldn't find contributor in database. Something has gone very wrong. Augur ran into a contributor that is unable to be inserted into the contributors table but is also not present in that table.") - else: - self.logger.info(f"There are more than one contributors in the table with gh_user_id={cntrb['gh_user_id']}") - - - cntrb_data = { - 'cntrb_id': data['cntrb_id'], - 'gh_node_id': cntrb['gh_node_id'], - 'cntrb_login': cntrb['cntrb_login'], - 'gh_user_id': cntrb['gh_user_id'] - } - #This updates our list of who is already in the database as we iterate to avoid duplicates. - #People who make changes tend to make more than one in a row. 
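# [Editor's sketch] The loop above keeps table_values_cntrb as an in-memory record
# of contributors that are already known, so repeat authors inside the same batch
# are not re-fetched or re-inserted. A minimal version of that caching pattern is
# sketched below; fetch_contributor is a hypothetical stand-in for the GitHub users
# endpoint call plus the insert.
def enrich(records, known_by_login, fetch_contributor):
    """Attach a cntrb_id to each record, resolving each unknown login only once."""
    for record in records:
        login = record['login']
        if login not in known_by_login:
            # Only genuinely new contributors cost an API call and an insert.
            known_by_login[login] = fetch_contributor(login)
        record['cntrb_id'] = known_by_login[login]['cntrb_id']
    return records

# Hypothetical usage:
# enrich([{'login': 'octocat'}], {}, lambda login: {'cntrb_id': 1})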
- table_values_cntrb.append(cntrb_data) - - self.logger.info( - "Contributor id enrichment successful, result has " - f"{len(source_data)} data points.\n" - ) - return source_data - - - - - #old method - """ - # source_cntrb_insert, _ = self.organize_needed_data( - # expanded_source_df.to_dict(orient='records'), table_values=table_values_cntrb, - # action_map=cntrb_action_map - # ) - - # cntrb_insert = [ - # { - # 'cntrb_login': contributor[f'{prefix}login'], - # 'cntrb_created_at': None if ( - # f'{prefix}created_at' not in contributor - # ) else contributor[f'{prefix}created_at'], - # 'cntrb_email': None if f'{prefix}email' not in contributor else contributor[f'{prefix}email'], - # 'cntrb_company': None if f'{prefix}company' not in contributor else contributor[f'{prefix}company'], - # 'cntrb_location': None if ( - # f'{prefix}location' not in contributor - # ) else contributor[f'{prefix}location'], - # 'gh_user_id': None if ( - # not contributor[f'{prefix}id'] - # ) else int(float(contributor[f'{prefix}id'])), - # 'gh_login': contributor[f'{prefix}login'], - # 'gh_url': contributor[f'{prefix}url'], - # 'gh_html_url': contributor[f'{prefix}html_url'], - # 'gh_node_id': contributor[f'{prefix}node_id'], #valid for dup check - # 'gh_avatar_url': contributor[f'{prefix}avatar_url'], - # 'gh_gravatar_id': contributor[f'{prefix}gravatar_id'], - # 'gh_followers_url': contributor[f'{prefix}followers_url'], - # 'gh_following_url': contributor[f'{prefix}following_url'], - # 'gh_gists_url': contributor[f'{prefix}gists_url'], - # 'gh_starred_url': contributor[f'{prefix}starred_url'], - # 'gh_subscriptions_url': contributor[f'{prefix}subscriptions_url'], - # 'gh_organizations_url': contributor[f'{prefix}organizations_url'], - # 'gh_repos_url': contributor[f'{prefix}repos_url'], - # 'gh_events_url': contributor[f'{prefix}events_url'], - # 'gh_received_events_url': contributor[f'{prefix}received_events_url'], - # 'gh_type': contributor[f'{prefix}type'], - # 'gh_site_admin': contributor[f'{prefix}site_admin'], - # 'tool_source': self.tool_source, - # 'tool_version': self.tool_version, - # 'data_source': self.data_source - # } for contributor in source_cntrb_insert if contributor[f'{prefix}login'] - # ] - # - # try: - # self.bulk_insert(self.contributors_table, cntrb_insert) - # except s.exc.IntegrityError: - # self.logger.info("Unique Violation in contributors table! 
") - # - # # Query db for inserted cntrb pkeys and add to shallow level of data - # - # # Query - # cntrb_pk_name = list(self.contributors_table.primary_key)[0].name - # session = s.orm.Session(self.db) - # inserted_pks = pd.DataFrame( - # session.query( - # self.contributors_table.c[cntrb_pk_name], self.contributors_table.c.cntrb_login, - # self.contributors_table.c.gh_node_id - # ).distinct(self.contributors_table.c.cntrb_login).order_by( - # self.contributors_table.c.cntrb_login, self.contributors_table.c[cntrb_pk_name] - # ).all(), columns=[cntrb_pk_name, 'cntrb_login', 'gh_node_id'] - # ).to_dict(orient='records') - # session.close() - # - # # Prepare for merge - # source_columns = sorted(list(source_df.columns)) - # necessary_columns = sorted(list(set(source_columns + cntrb_action_map['insert']['source']))) - # (source_table, inserted_pks_table), metadata, session = self._setup_postgres_merge( - # [ - # expanded_source_df[necessary_columns].to_dict(orient='records'), - # inserted_pks - # ], sort=True - # ) - # final_columns = [cntrb_pk_name] + sorted(list(set(necessary_columns))) - # - # # Merge - # source_pk = pd.DataFrame( - # session.query( - # inserted_pks_table.c.cntrb_id, source_table - # ).join( - # source_table, - # eval( - # ' and '.join( - # [ - # ( - # f"inserted_pks_table.c['{table_column}'] " - # f"== source_table.c['{source_column}']" - # ) for table_column, source_column in zip( - # cntrb_action_map['insert']['augur'], - # cntrb_action_map['insert']['source'] - # ) - # ] - # ) - # ) - # ).all(), columns=final_columns - # ) - # - # # Cleanup merge - # source_pk = self._eval_json_columns(source_pk) - # self._close_postgres_merge(metadata, session) - - #self.logger.info( - # "Contributor id enrichment successful, result has " - # f"{len(source_pk)} data points.\n" - #) - - #return source_pk.to_dict(orient='records')""" - - def query_github_contributors(self, entry_info, repo_id): - - """ Data collection function - Query the GitHub API for contributors - """ - self.logger.info(f"Querying contributors with given entry info: {entry_info}\n") - - ## It absolutely doesn't matter if the contributor has already contributoed to a repo. it only matters that they exist in our table, and - ## if the DO, then we DO NOT want to insert them again in any GitHub Method. - github_url = entry_info['given']['github_url'] if 'github_url' in entry_info['given'] else entry_info['given']['git_url'] - - # Extract owner/repo from the url for the endpoint - owner, name = self.get_owner_repo(github_url) - - # Set the base of the url and place to hold contributors to insert - contributors_url = ( - f"https://api.github.com/repos/{owner}/{name}/" + - "contributors?per_page=100&page={}" - ) - - # Get contributors that we already have stored - # Set our duplicate and update column map keys (something other than PK) to - # check dupicates/needed column updates with - table = 'contributors' - table_pkey = 'cntrb_id' - update_col_map = {'cntrb_email': 'email'} - duplicate_col_map = {'cntrb_login': 'login'} - - #list to hold contributors needing insertion or update - contributors = self.paginate(contributors_url, duplicate_col_map, update_col_map, table, table_pkey) - - self.logger.info("Count of contributors needing insertion: " + str(len(contributors)) + "\n") - - for repo_contributor in contributors: - try: - # Need to hit this single contributor endpoint to get extra data including... 
- # `created at` - # i think that's it - cntrb_url = ("https://api.github.com/users/" + repo_contributor['login']) - self.logger.info("Hitting endpoint: " + cntrb_url + " ...\n") - r = requests.get(url=cntrb_url, headers=self.headers) - self.update_gh_rate_limit(r) - contributor = r.json() - - company = None - location = None - email = None - if 'company' in contributor: - company = contributor['company'] - if 'location' in contributor: - location = contributor['location'] - if 'email' in contributor: - email = contributor['email'] - canonical_email = contributor['email'] - - cntrb = { - "cntrb_login": contributor['login'], - "cntrb_created_at": contributor['created_at'], - "cntrb_email": email, - "cntrb_company": company, - "cntrb_location": location, - # "cntrb_type": , dont have a use for this as of now ... let it default to null - "cntrb_canonical": canonical_email, - "gh_user_id": contributor['id'], - "gh_login": contributor['login'], - "gh_url": contributor['url'], - "gh_html_url": contributor['html_url'], - "gh_node_id": contributor['node_id'], #This is what we are dup checking - "gh_avatar_url": contributor['avatar_url'], - "gh_gravatar_id": contributor['gravatar_id'], - "gh_followers_url": contributor['followers_url'], - "gh_following_url": contributor['following_url'], - "gh_gists_url": contributor['gists_url'], - "gh_starred_url": contributor['starred_url'], - "gh_subscriptions_url": contributor['subscriptions_url'], - "gh_organizations_url": contributor['organizations_url'], - "gh_repos_url": contributor['repos_url'], - "gh_events_url": contributor['events_url'], - "gh_received_events_url": contributor['received_events_url'], - "gh_type": contributor['type'], - "gh_site_admin": contributor['site_admin'], - "tool_source": self.tool_source, - "tool_version": self.tool_version, - "data_source": self.data_source - } - #dup check - #TODO: add additional fields to check if needed. 
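# [Editor's sketch] The dup check below looks for an existing row with the same
# gh_node_id before inserting. A self-contained version of that existence check,
# written in the SQLAlchemy 1.x Core style used throughout this file, is sketched
# here; the SQLite engine and trimmed-down table are illustrative assumptions.
import sqlalchemy as s

engine = s.create_engine('sqlite:///:memory:')
metadata = s.MetaData()
contributors = s.Table(
    'contributors', metadata,
    s.Column('cntrb_id', s.Integer, primary_key=True),
    s.Column('gh_node_id', s.String),
)
metadata.create_all(engine)

def already_stored(conn, node_id):
    """Return True when a contributor with this gh_node_id already exists."""
    rows = conn.execute(
        s.select([contributors.c.gh_node_id]).where(contributors.c.gh_node_id == node_id)
    ).fetchall()
    return len(rows) > 0

with engine.connect() as conn:
    conn.execute(contributors.insert().values(gh_node_id='MDQ6VXNlcjE='))
    assert already_stored(conn, 'MDQ6VXNlcjE=')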
- existingMatchingContributors = self.db.execute( - self.sql.select( - [self.contributors_table.c.gh_node_id] - ).where( - self.contributors_table.c.gh_node_id==cntrb["gh_node_id"] - ).fetchall() - ) - - if len(existingMatchingContributors) > 0: - break #if contributor already exists in table - - - # Commit insertion to table - if repo_contributor['flag'] == 'need_update': - result = self.db.execute(self.contributors_table.update().where( - self.worker_history_table.c.cntrb_email==email).values(cntrb)) - self.logger.info("Updated tuple in the contributors table with existing email: {}".format(email)) - self.cntrb_id_inc = repo_contributor['pkey'] - elif repo_contributor['flag'] == 'need_insertion': - result = self.db.execute(self.contributors_table.insert().values(cntrb)) - self.logger.info("Primary key inserted into the contributors table: {}".format(result.inserted_primary_key)) - self.results_counter += 1 - - self.logger.info("Inserted contributor: " + contributor['login'] + "\n") - - # Increment our global track of the cntrb id for the possibility of it being used as a FK - self.cntrb_id_inc = int(result.inserted_primary_key[0]) - - except Exception as e: - self.logger.error("Caught exception: {}".format(e)) - self.logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) - continue - - - def query_github_contributors_bulk(self, entry_info, repo_id): - - """ Data collection function - Query the GitHub API for contributors - """ - self.logger.info(f"Querying contributors with given entry info: {entry_info}\n") - - github_url = entry_info['given']['github_url'] if 'github_url' in entry_info['given'] else entry_info['given']['git_url'] - - owner, name = self.get_owner_repo(github_url) - - contributors_url = (f"https://api.github.com/repos/{owner}/{name}/" + - "contributors?per_page=100&page={}") - - action_map = { - 'insert': { - 'source': ['login'], - 'augur': ['cntrb_login'] - }, - 'update': { - 'source': ['email'], - 'augur': ['cntrb_email'] - } - } - - source_contributors = self.paginate_endpoint(contributors_url, action_map=action_map, - table=self.contributors_table) - - contributors_insert = [] - - for repo_contributor in source_contributors['insert']: - # Need to hit this single contributor endpoint to get extra data - cntrb_url = (f"https://api.github.com/users/{repo_contributor['login']}") - self.logger.info(f"Hitting endpoint: {cntrb_url} ...\n") - r = requests.get(url=cntrb_url, headers=self.headers) - self.update_gh_rate_limit(r) - contributor = r.json() - - contributors_insert.append({ - 'cntrb_login': contributor['login'], - 'cntrb_created_at': contributor['created_at'], - 'cntrb_email': contributor['email'] if 'email' in contributor else None, - 'cntrb_company': contributor['company'] if 'company' in contributor else None, - 'cntrb_location': contributor['location'] if 'location' in contributor else None, - 'cntrb_canonical': contributor['email'] if 'email' in contributor else None, - 'gh_user_id': contributor['id'], - 'gh_login': contributor['login'], - 'gh_url': contributor['url'], - 'gh_html_url': contributor['html_url'], - 'gh_node_id': contributor['node_id'], - 'gh_avatar_url': contributor['avatar_url'], - 'gh_gravatar_id': contributor['gravatar_id'], - 'gh_followers_url': contributor['followers_url'], - 'gh_following_url': contributor['following_url'], - 'gh_gists_url': contributor['gists_url'], - 'gh_starred_url': contributor['starred_url'], - 'gh_subscriptions_url': contributor['subscriptions_url'], - 
'gh_organizations_url': contributor['organizations_url'], - 'gh_repos_url': contributor['repos_url'], - 'gh_events_url': contributor['events_url'], - 'gh_received_events_url': contributor['received_events_url'], - 'gh_type': contributor['type'], - 'gh_site_admin': contributor['site_admin'], - 'tool_source': self.tool_source, - 'tool_version': self.tool_version, - 'data_source': self.data_source - }) - - contributors_insert_result, contributors_update_result = self.bulk_insert(self.contributors_table, - update=source_contributors['update'], unique_columns=action_map['insert']['augur'], - insert=contributors_insert, update_columns=action_map['update']['augur']) - - def query_github_contributors_fast(self, entry_info, repo_id): - """ Data collection function - Query the GitHub API for contributors - """ - self.logger.info(f"Querying contributors with given entry info: {entry_info}") - - github_url = ( - entry_info['given']['github_url'] if 'github_url' in entry_info['given'] - else entry_info['given']['git_url'] - ) - - contributors_url = ( - f"https://api.github.com/repos/{self.owner}/{self.name}/" - "contributors?per_page=100&page={}" - ) - - action_map = { - 'insert': { - 'source': ['login'], - 'augur': ['cntrb_login'] - }, - 'update': { - 'source': ['email'], - 'augur': ['cntrb_email'] - } - } - - source_contributors = self.paginate_endpoint( - contributors_url, action_map=action_map, table=self.contributors_table - ) - - contributors_insert = [ - { - 'cntrb_login': contributor['login'], - 'cntrb_created_at': ( - contributor['created_at'] if 'created_at' in contributor else None - ), - 'cntrb_email': contributor['email'] if 'email' in contributor else None, - 'cntrb_company': contributor['company'] if 'company' in contributor else None, - 'cntrb_location': contributor['location'] if 'location' in contributor else None, - 'cntrb_canonical': contributor['email'] if 'email' in contributor else None, - 'gh_user_id': contributor['id'], - 'gh_login': contributor['login'], - 'gh_url': contributor['url'], - 'gh_html_url': contributor['html_url'], - 'gh_node_id': contributor['node_id'], - 'gh_avatar_url': contributor['avatar_url'], - 'gh_gravatar_id': contributor['gravatar_id'], - 'gh_followers_url': contributor['followers_url'], - 'gh_following_url': contributor['following_url'], - 'gh_gists_url': contributor['gists_url'], - 'gh_starred_url': contributor['starred_url'], - 'gh_subscriptions_url': contributor['subscriptions_url'], - 'gh_organizations_url': contributor['organizations_url'], - 'gh_repos_url': contributor['repos_url'], - 'gh_events_url': contributor['events_url'], - 'gh_received_events_url': contributor['received_events_url'], - 'gh_type': contributor['type'], - 'gh_site_admin': contributor['site_admin'], - 'tool_source': self.tool_source, - 'tool_version': self.tool_version, - 'data_source': self.data_source - } for contributor in source_contributors['insert'] - ] - - self.bulk_insert( - self.contributors_table, update=source_contributors['update'], - unique_columns=action_map['insert']['augur'], - insert=contributors_insert, update_columns=action_map['update']['augur'] - ) - - def update_gitlab_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): - # Try to get rate limit from request headers, sometimes it does not work (GH's issue) - # In that case we just decrement from last recieved header count - if bad_credentials and len(self.oauths) > 1: - self.logger.info( - f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" - ) - del 
self.oauths[0] - - if temporarily_disable: - self.logger.info("Gitlab rate limit reached. Temp. disabling...") - self.oauths[0]['rate_limit'] = 0 - else: - try: - self.oauths[0]['rate_limit'] = int(response.headers['RateLimit-Remaining']) - except: - self.oauths[0]['rate_limit'] -= 1 - self.logger.info("Updated rate limit, you have: " + - str(self.oauths[0]['rate_limit']) + " requests remaining.") - if self.oauths[0]['rate_limit'] <= 0: - try: - reset_time = response.headers['RateLimit-Reset'] - except Exception as e: - self.logger.info(f"Could not get reset time from headers because of error: {e}") - reset_time = 3600 - time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() - self.logger.info("Rate limit exceeded, checking for other available keys to use.") - - # We will be finding oauth with the highest rate limit left out of our list of oauths - new_oauth = self.oauths[0] - # Endpoint to hit solely to retrieve rate limit information from headers of the response - url = "https://gitlab.com/api/v4/version" - - other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] - for oauth in other_oauths: - # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) - self.headers = {"PRIVATE-TOKEN" : oauth['access_token']} - response = requests.get(url=url, headers=self.headers) - oauth['rate_limit'] = int(response.headers['RateLimit-Remaining']) - oauth['seconds_to_reset'] = ( - datetime.datetime.fromtimestamp( - int(response.headers['RateLimit-Reset']) - ) - datetime.datetime.now() - ).total_seconds() - - # Update oauth to switch to if a higher limit is found - if oauth['rate_limit'] > new_oauth['rate_limit']: - self.logger.info(f"Higher rate limit found in oauth: {oauth}") - new_oauth = oauth - elif ( - oauth['rate_limit'] == new_oauth['rate_limit'] - and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] - ): - self.logger.info( - f"Lower wait time found in oauth with same rate limit: {oauth}" - ) - new_oauth = oauth - - if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: - self.logger.info( - "No oauths with >0 rate limit were found, waiting for oauth with " - f"smallest wait time: {new_oauth}\n" - ) - time.sleep(new_oauth['seconds_to_reset']) - - # Make new oauth the 0th element in self.oauths so we know which one is in use - index = self.oauths.index(new_oauth) - self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] - self.logger.info("Using oauth: {}\n".format(self.oauths[0])) - - # Change headers to be using the new oauth's key - self.headers = {"PRIVATE-TOKEN" : self.oauths[0]['access_token']} - - def update_gh_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): - # Try to get rate limit from request headers, sometimes it does not work (GH's issue) - # In that case we just decrement from last recieved header count - if bad_credentials and len(self.oauths) > 1: - self.logger.warning( - f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" - ) - del self.oauths[0] - - if temporarily_disable: - self.logger.debug( - "Github thinks we are abusing their api. Preventing use " - "of this key until its rate limit resets..." 
- ) - self.oauths[0]['rate_limit'] = 0 - else: - try: - self.oauths[0]['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) - # self.logger.info("Recieved rate limit from headers\n") - except: - self.oauths[0]['rate_limit'] -= 1 - self.logger.info("Headers did not work, had to decrement") - self.logger.info( - f"Updated rate limit, you have: {self.oauths[0]['rate_limit']} requests remaining." - ) - if self.oauths[0]['rate_limit'] <= 0: - try: - reset_time = response.headers['X-RateLimit-Reset'] - except Exception as e: - self.logger.error(f"Could not get reset time from headers because of error: {e}") - reset_time = 3600 - time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() - self.logger.info("Rate limit exceeded, checking for other available keys to use.") - - # We will be finding oauth with the highest rate limit left out of our list of oauths - new_oauth = self.oauths[0] - # Endpoint to hit solely to retrieve rate limit information from headers of the response - url = "https://api.github.com/users/gabe-heim" - - other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] - for oauth in other_oauths: - # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) - self.headers = {'Authorization': 'token %s' % oauth['access_token']} - - attempts = 3 - success = False - while attempts > 0 and not success: - response = requests.get(url=url, headers=self.headers) - try: - oauth['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) - oauth['seconds_to_reset'] = ( - datetime.datetime.fromtimestamp( - int(response.headers['X-RateLimit-Reset']) - ) - datetime.datetime.now() - ).total_seconds() - success = True - except Exception as e: - self.logger.info( - f"oath method ran into error getting info from headers: {e}\n" - ) - self.logger.info(f"{self.headers}\n{url}\n") - attempts -= 1 - if not success: - continue - - # Update oauth to switch to if a higher limit is found - if oauth['rate_limit'] > new_oauth['rate_limit']: - self.logger.info("Higher rate limit found in oauth: {}\n".format(oauth)) - new_oauth = oauth - elif ( - oauth['rate_limit'] == new_oauth['rate_limit'] - and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] - ): - self.logger.info( - f"Lower wait time found in oauth with same rate limit: {oauth}\n" - ) - new_oauth = oauth - - if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: - self.logger.info( - "No oauths with >0 rate limit were found, waiting for oauth with " - f"smallest wait time: {new_oauth}\n" - ) - time.sleep(new_oauth['seconds_to_reset']) - - # Make new oauth the 0th element in self.oauths so we know which one is in use - index = self.oauths.index(new_oauth) - self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] - self.logger.info("Using oauth: {}\n".format(self.oauths[0])) - - # Change headers to be using the new oauth's key - self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} - - #TODO: figure out if changing this typo breaks anything - def query_gitlab_contributors(self, entry_info, repo_id): - - gitlab_url = ( - entry_info['given']['gitlab_url'] if 'gitlab_url' in entry_info['given'] - else entry_info['given']['git_url'] - ) - - self.logger.info("Querying contributors with given entry info: " + str(entry_info) + "\n") - - path = urlparse(gitlab_url) - split = path[2].split('/') - - owner = split[1] - name = split[2] - - # Handles git url case by removing the extension - if ".git" in name: - name = name[:-4] - - 
url_encoded_format = quote(owner + '/' + name, safe='') - - table = 'contributors' - table_pkey = 'cntrb_id' - ### Here we are adding gitlab user information from the API - ### Following Gabe's rework of the contributor worker. - - ### The GitLab API will NEVER give you an email. It will let you - ### Query an email, but never give you one. - ### ## Gitlab email api: https://gitlab.com/api/v4/users?search=s@goggins.com - ### We don't need to update right now, so commenting out. - ### TODO: SOLVE LOGIC. - # update_col_map = {'cntrb_email': 'email'} - update_col_map = {} - duplicate_col_map = {'gl_username': 'username'} - - # list to hold contributors needing insertion or update - contributors = self.paginate("https://gitlab.com/api/v4/projects/" + url_encoded_format + "/repository/contributors?per_page=100&page={}", duplicate_col_map, update_col_map, table, table_pkey, platform='gitlab') - - for repo_contributor in contributors: - try: - cntrb_compressed_url = ("https://gitlab.com/api/v4/users?search=" + repo_contributor['email']) - self.logger.info("Hitting endpoint: " + cntrb_compressed_url + " ...\n") - r = requests.get(url=cntrb_compressed_url, headers=self.headers) - contributor_compressed = r.json() - - email = repo_contributor['email'] - self.logger.info(contributor_compressed) - if len(contributor_compressed) == 0 or type(contributor_compressed) is dict or "id" not in contributor_compressed[0]: - continue - - self.logger.info("Fetching for user: " + str(contributor_compressed[0]["id"])) - - cntrb_url = ("https://gitlab.com/api/v4/users/" + str(contributor_compressed[0]["id"])) - self.logger.info("Hitting end point to get complete contributor info now: " + cntrb_url + "...\n") - r = requests.get(url=cntrb_url, headers=self.headers) - contributor = r.json() - - cntrb = { - "gl_id": contributor.get('gl_id', None), - "gl_full_name": contributor.get('full_name', None), - "gl_username": contributor.get('username', None), - "gl_state": contributor.get('state', None), - "gl_avatar_url": contributor.get('avatar_url', None), - "gl_web_url": contributor.get('web_url', None), - #"cntrb_login": contributor.get('username', None), - #"cntrb_created_at": contributor.get('created_at', None), - "cntrb_email": ('email', None), - #"cntrb_company": contributor.get('organization', None), - #"cntrb_location": contributor.get('location', None), - # "cntrb_type": , dont have a use for this as of now ... 
let it default to null - #"cntrb_canonical": contributor.get('public_email', None), - #"gh_user_id": contributor.get('id', None), - #"gh_login": contributor.get('username', None), - #"gh_url": contributor.get('web_url', None), - #"gh_html_url": contributor.get('web_url', None), - #"gh_node_id": None, - #"gh_avatar_url": contributor.get('avatar_url', None), - #"gh_gravatar_id": None, - #"gh_followers_url": None, - #"gh_following_url": None, - #"gh_gists_url": None, - #"gh_starred_url": None, - #"gh_subscriptions_url": None, - #"gh_organizations_url": None, - #"gh_repos_url": None, - #"gh_events_url": None, - #"gh_received_events_url": None, - #"gh_type": None, - #"gh_site_admin": None, - "tool_source": self.tool_source, - "tool_version": self.tool_version, - "data_source": self.data_source - } - - # Commit insertion to table - if repo_contributor['flag'] == 'need_update': - result = self.db.execute(self.contributors_table.update().where( - self.worker_history_table.c.cntrb_email == email).values(cntrb)) - self.logger.info("Updated tuple in the contributors table with existing email: {}".format(email)) - self.cntrb_id_inc = repo_contributor['pkey'] - elif repo_contributor['flag'] == 'need_insertion': - result = self.db.execute(self.contributors_table.insert().values(cntrb)) - self.logger.info("Primary key inserted into the contributors table: {}".format(result.inserted_primary_key)) - self.results_counter += 1 - - self.logger.info("Inserted contributor: " + contributor['username'] + "\n") - - # Increment our global track of the cntrb id for the possibility of it being used as a FK - self.cntrb_id_inc = int(result.inserted_primary_key[0]) - - except Exception as e: - self.logger.info("Caught exception: {}".format(e)) - self.logger.info("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) - continue - - - def update_gitlab_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): - # Try to get rate limit from request headers, sometimes it does not work (GH's issue) - # In that case we just decrement from last recieved header count - if bad_credentials and len(self.oauths) > 1: - self.logger.info( - f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" - ) - del self.oauths[0] - - if temporarily_disable: - self.logger.info("Gitlab rate limit reached. Temp. 
disabling...") - self.oauths[0]['rate_limit'] = 0 - else: - try: - self.oauths[0]['rate_limit'] = int(response.headers['RateLimit-Remaining']) - except: - self.oauths[0]['rate_limit'] -= 1 - self.logger.info("Updated rate limit, you have: " + - str(self.oauths[0]['rate_limit']) + " requests remaining.") - if self.oauths[0]['rate_limit'] <= 0: - try: - reset_time = response.headers['RateLimit-Reset'] - except Exception as e: - self.logger.info(f"Could not get reset time from headers because of error: {e}") - reset_time = 3600 - time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() - self.logger.info("Rate limit exceeded, checking for other available keys to use.") - - # We will be finding oauth with the highest rate limit left out of our list of oauths - new_oauth = self.oauths[0] - # Endpoint to hit solely to retrieve rate limit information from headers of the response - url = "https://gitlab.com/api/v4/version" - - other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] - for oauth in other_oauths: - # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) - self.headers = {"PRIVATE-TOKEN" : oauth['access_token']} - response = requests.get(url=url, headers=self.headers) - oauth['rate_limit'] = int(response.headers['RateLimit-Remaining']) - oauth['seconds_to_reset'] = ( - datetime.datetime.fromtimestamp( - int(response.headers['RateLimit-Reset']) - ) - datetime.datetime.now() - ).total_seconds() - - # Update oauth to switch to if a higher limit is found - if oauth['rate_limit'] > new_oauth['rate_limit']: - self.logger.info(f"Higher rate limit found in oauth: {oauth}") - new_oauth = oauth - elif ( - oauth['rate_limit'] == new_oauth['rate_limit'] - and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] - ): - self.logger.info( - f"Lower wait time found in oauth with same rate limit: {oauth}" - ) - new_oauth = oauth - - if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: - self.logger.info( - "No oauths with >0 rate limit were found, waiting for oauth with " - f"smallest wait time: {new_oauth}\n" - ) - time.sleep(new_oauth['seconds_to_reset']) - - # Make new oauth the 0th element in self.oauths so we know which one is in use - index = self.oauths.index(new_oauth) - self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] - self.logger.info("Using oauth: {}\n".format(self.oauths[0])) - - # Change headers to be using the new oauth's key - self.headers = {"PRIVATE-TOKEN" : self.oauths[0]['access_token']} - - - def update_gh_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): - # Try to get rate limit from request headers, sometimes it does not work (GH's issue) - # In that case we just decrement from last recieved header count - if bad_credentials and len(self.oauths) > 1: - self.logger.warning( - f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" - ) - del self.oauths[0] - - if temporarily_disable: - self.logger.debug( - "Github thinks we are abusing their api. Preventing use " - "of this key until its rate limit resets..." - ) - self.oauths[0]['rate_limit'] = 0 - else: - try: - self.oauths[0]['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) - # self.logger.info("Recieved rate limit from headers\n") - except: - self.oauths[0]['rate_limit'] -= 1 - self.logger.info("Headers did not work, had to decrement") - self.logger.info( - f"Updated rate limit, you have: {self.oauths[0]['rate_limit']} requests remaining." 
- ) - if self.oauths[0]['rate_limit'] <= 0: - try: - reset_time = response.headers['X-RateLimit-Reset'] - except Exception as e: - self.logger.error(f"Could not get reset time from headers because of error: {e}") - reset_time = 3600 - time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() - self.logger.info("Rate limit exceeded, checking for other available keys to use.") - - # We will be finding oauth with the highest rate limit left out of our list of oauths - new_oauth = self.oauths[0] - # Endpoint to hit solely to retrieve rate limit information from headers of the response - url = "https://api.github.com/users/gabe-heim" - - other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] - for oauth in other_oauths: - # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) - self.headers = {'Authorization': 'token %s' % oauth['access_token']} - - attempts = 3 - success = False - while attempts > 0 and not success: - response = requests.get(url=url, headers=self.headers) - try: - oauth['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) - oauth['seconds_to_reset'] = ( - datetime.datetime.fromtimestamp( - int(response.headers['X-RateLimit-Reset']) - ) - datetime.datetime.now() - ).total_seconds() - success = True - except Exception as e: - self.logger.info( - f"oath method ran into error getting info from headers: {e}\n" - ) - self.logger.info(f"{self.headers}\n{url}\n") - attempts -= 1 - if not success: - continue - - # Update oauth to switch to if a higher limit is found - if oauth['rate_limit'] > new_oauth['rate_limit']: - self.logger.info("Higher rate limit found in oauth: {}\n".format(oauth)) - new_oauth = oauth - elif ( - oauth['rate_limit'] == new_oauth['rate_limit'] - and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] - ): - self.logger.info( - f"Lower wait time found in oauth with same rate limit: {oauth}\n" - ) - new_oauth = oauth - - if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: - self.logger.info( - "No oauths with >0 rate limit were found, waiting for oauth with " - f"smallest wait time: {new_oauth}\n" - ) - time.sleep(new_oauth['seconds_to_reset']) - - # Make new oauth the 0th element in self.oauths so we know which one is in use - index = self.oauths.index(new_oauth) - self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] - self.logger.info("Using oauth: {}\n".format(self.oauths[0])) - - # Change headers to be using the new oauth's key - self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} - - def update_rate_limit( - self, response, bad_credentials=False, temporarily_disable=False, platform="gitlab" - ): - if platform == 'gitlab': - return self.update_gitlab_rate_limit( - response, bad_credentials=bad_credentials, temporarily_disable=temporarily_disable - ) - elif platform == 'github': - return self.update_gh_rate_limit( - response, bad_credentials=bad_credentials, temporarily_disable=temporarily_disable - ) - - - #Indexerror somewhere - def multi_thread_urls(self, all_urls, max_attempts=5, platform='github'): - """ - :param all_urls: list of tuples - """ - - if not len(all_urls): - self.logger.info("No urls to multithread, returning blank list.\n") - return [] - - def load_url(url, extra_data={}): - try: - html = requests.get(url, stream=True, headers=self.headers) - return html, extra_data - except requests.exceptions.RequestException as e: - self.logger.info(e, url) - - self.logger.info("Beginning to multithread API 
endpoints.") - - start = time.time() - - all_data = [] - valid_url_count = len(all_urls) - - partitions = math.ceil(len(all_urls) / 600) - self.logger.info(f"{len(all_urls)} urls to process. Trying {partitions} partitions. " + - f"Using {max(multiprocessing.cpu_count()//8, 1)} threads.") - for urls in numpy.array_split(all_urls, partitions): - attempts = 0 - self.logger.info(f"Total data points collected so far: {len(all_data)}") - while len(urls) > 0 and attempts < max_attempts: - with concurrent.futures.ThreadPoolExecutor( - max_workers=max(multiprocessing.cpu_count()//8, 1) - ) as executor: - # Start the load operations and mark each future with its URL - future_to_url = {executor.submit(load_url, *url): url for url in urls} - self.logger.info("Multithreaded urls and returned status codes:") - count = 0 - for future in concurrent.futures.as_completed(future_to_url): - - if count % 100 == 0: - self.logger.info( - f"Processed {len(all_data)} / {valid_url_count} urls. " - f"{len(urls)} remaining in this partition." - ) - count += 1 - - url = future_to_url[future] - try: - response, extra_data = future.result() - - if response.status_code != 200: - self.logger.info( - f"Url: {url[0]} ; Status code: {response.status_code}" - ) - - if response.status_code == 403 or response.status_code == 401: # 403 is rate limit, 404 is not found, 401 is bad credentials - self.update_rate_limit(response, platform=platform) - continue - - elif response.status_code == 200: - try: - page_data = response.json() - except: - page_data = json.loads(json.dumps(response.text)) - - page_data = [{**data, **extra_data} for data in page_data] - all_data += page_data - - if 'last' in response.links and "&page=" not in url[0]: - urls += [ - (url[0] + f"&page={page}", extra_data) for page in range( - 2, int(response.links['last']['url'].split('=')[-1]) + 1 - ) - ] - try: - # self.logger.info(f"urls boundry issue? for {urls} where they are equal to {url}.") - - urls = numpy.delete(urls, numpy.where(urls == url), axis=0) - except: - self.logger.info(f"ERROR with axis = 0 - Now attempting without setting axis for numpy.delete for {urls} where they are equal to {url}.") - urls = numpy.delete(urls, numpy.where(urls == url)) - - elif response.status_code == 404: - urls = numpy.delete(urls, numpy.where(urls == url), axis=0) - self.logger.info(f"Not found url: {url}\n") - else: - self.logger.info( - f"Unhandled response code: {response.status_code} {url}\n" - ) - - except Exception as e: - self.logger.info( - f"{url} generated an exception: {traceback.format_exc()}\n" - ) - - attempts += 1 - - self.logger.info( - f"Processed {valid_url_count} urls and got {len(all_data)} data points " - f"in {time.time() - start} seconds thanks to multithreading!\n" - ) - return all_data - - - #insertion_method and stagger are arguments that allow paginate_endpoint to insert at around ~500 pages at a time. 
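# A minimal sketch of the staggered-insertion hook the comment above describes.
# `insert_pr_batch`, `collect_prs`, `pull_requests_table`, `prs_url`, and
# `pr_action_map` are hypothetical placeholders (sketched as methods of a worker
# subclass), not names from this file. The callback contract — a dict with
# 'insert', 'update', and 'all' plus the action_map — mirrors what
# paginate_endpoint() passes roughly every `insertion_threshold` pages, so a long
# pagination run is flushed to the database instead of being held in memory.

def insert_pr_batch(self, staggered_source_prs, pr_action_map):
    # Persist one partial batch using the same call shape used for the final result.
    self.bulk_insert(
        self.pull_requests_table,                    # hypothetical target table
        insert=staggered_source_prs['insert'],
        update=staggered_source_prs['update'],
        unique_columns=pr_action_map['insert']['augur'],
        update_columns=pr_action_map['update']['augur']
    )

def collect_prs(self, prs_url, pr_action_map):
    # Batches are handed to insert_pr_batch every ~500 pages (the default
    # insertion_threshold); whatever remains is still returned at the end.
    return self.paginate_endpoint(
        prs_url, action_map=pr_action_map, table=self.pull_requests_table,
        stagger=True, insertion_method=self.insert_pr_batch
    )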
- def paginate_endpoint( - self, url, action_map={}, table=None, where_clause=True, platform='github', in_memory=True, stagger=False, insertion_method=None, insertion_threshold=500 - ): - - #Get augur columns using the action map along with the primary key - table_values = self.db.execute( - s.sql.select(self.get_relevant_columns(table, action_map)).where(where_clause) - ).fetchall() - - page_number = 1 - multiple_pages = False - need_insertion = [] - need_update = [] - - #Stores sum of page data - all_data = [] - forward_pagination = True - backwards_activation = False - last_page_number = -1 - - #Block to handle page queries and retry at least 10 times - while True: - - # Multiple attempts to hit endpoint - num_attempts = 0 - success = False - while num_attempts < 10: - self.logger.info(f"Hitting endpoint: {url.format(page_number)}...\n") - try: - response = requests.get(url=url.format(page_number), headers=self.headers) - except TimeoutError as e: - self.logger.info("Request timed out. Sleeping 10 seconds and trying again...\n") - time.sleep(10) - continue - - self.update_rate_limit(response, platform=platform) - - try: - page_data = response.json() - except: - page_data = json.loads(json.dumps(response.text)) - - if type(page_data) == list: - success = True - break - elif type(page_data) == dict: - self.logger.info("Request returned a dict: {}\n".format(page_data)) - if page_data['message'] == "Not Found": - self.logger.warning( - "Github repo was not found or does not exist for endpoint: " - f"{url.format(page_number)}\n" - ) - break - if "You have triggered an abuse detection mechanism." in page_data['message']: - num_attempts -= 1 - self.update_rate_limit(response, temporarily_disable=True,platform=platform) - if page_data['message'] == "Bad credentials": - self.update_rate_limit(response, bad_credentials=True, platform=platform) - elif type(page_data) == str: - self.logger.info(f"Warning! 
page_data was string: {page_data}\n") - if "" in page_data: - self.logger.info("HTML was returned, trying again...\n") - elif len(page_data) == 0: - self.logger.warning("Empty string, trying again...\n") - else: - try: - page_data = json.loads(page_data) - success = True - break - except: - pass - num_attempts += 1 - if not success: - break - - # Success - - # Determine if continued pagination is needed - - if len(page_data) == 0: - self.logger.info("Response was empty, breaking from pagination.\n") - break - - all_data += page_data - - if not forward_pagination: - - # Checking contents of requests with what we already have in the db - page_insertions, page_updates = self.organize_needed_data( - page_data, table_values, list(table.primary_key)[0].name, - action_map, in_memory=True - ) - - # Reached a page where we already have all tuples - if len(need_insertion) == 0 and len(need_update) == 0 and \ - backwards_activation: - self.logger.info( - "No more pages with unknown tuples, breaking from pagination.\n" - ) - break - - need_insertion += page_insertions - need_update += page_updates - - # Find last page so we can decrement from there - if 'last' in response.links and last_page_number == -1: - if platform == 'github': - last_page_number = int(response.links['last']['url'][-6:].split('=')[1]) - elif platform == 'gitlab': - last_page_number = int(response.links['last']['url'].split('&')[2].split('=')[1]) - - if not forward_pagination and not backwards_activation: - page_number = last_page_number - backwards_activation = True - - self.logger.info("Analyzation of page {} of {} complete\n".format(page_number, - int(last_page_number) if last_page_number != -1 else "*last page not known*")) - - if (page_number <= 1 and not forward_pagination) or \ - (page_number >= last_page_number and forward_pagination): - self.logger.info("No more pages to check, breaking from pagination.\n") - break - - #This is probably where we should insert at around ~500 at a time - #makes sure that stagger is enabled, we have an insertion method, and the insertion happens every 500 pages or so. - if stagger and insertion_method != None and page_number % insertion_threshold == 0: - #call insertion method passed as argument. - staggered_source_prs = { - 'insert' : need_insertion, - 'update' : need_update, - 'all' : all_data - } - - #Use the method the subclass needs in order to insert the data. - insertion_method(staggered_source_prs,action_map) - - #clear the data from memory and avoid duplicate insertions. - need_insertion = [] - need_update = [] - all_data = [] - - page_number = page_number + 1 if forward_pagination else page_number - 1 - - if forward_pagination: - need_insertion, need_update = self.organize_needed_data( - all_data, table_values, list(table.primary_key)[0].name, action_map, - in_memory=in_memory - ) - - return { - 'insert': need_insertion, - 'update': need_update, - 'all': all_data - } - - #TODO: deprecated but still used by the issues worker. - def paginate(self, url, duplicate_col_map, update_col_map, table, table_pkey, where_clause="", value_update_col_map={}, platform="github"): - """ DEPRECATED - Paginate either backwards or forwards (depending on the value of the worker's - finishing_task attribute) through all the GitHub or GitLab api endpoint pages. 
- - :param url: String, the url of the API endpoint we are paginating through, expects - a curly brace string formatter within the string to format the Integer - representing the page number that is wanted to be returned - :param duplicate_col_map: Dictionary, maps the column names of the source data - to the field names in our database for columns that should be checked for - duplicates (if source data value == value in existing database row, then this - element is a duplicate and would not need an insertion). Key is source data - column name, value is database field name. Example: {'id': 'gh_issue_id'} - :param update_col_map: Dictionary, maps the column names of the source data - to the field names in our database for columns that should be checked for - updates (if source data value != value in existing database row, then an - update is needed). Key is source data column name, value is database field name. - Example: {'id': 'gh_issue_id'} - :param table: String, the name of the table that holds the values to check for - duplicates/updates against - :param table_pkey: String, the field name of the primary key of the table in - the database that we are getting the values for to cross-reference to check - for duplicates. - :param where_clause: String, optional where clause to filter the values - that are queried when preparing the values that will be cross-referenced - for duplicates/updates - :param value_update_col_map: Dictionary, sometimes we add a new field to a table, - and we want to trigger an update of that row in the database even if all of the - data values are the same and would not need an update ordinarily. Checking for - a specific existing value in the database field allows us to do this. The key is the - name of the field in the database we are checking for a specific value to trigger - an update, the value is the value we are checking for equality to trigger an update. - Example: {'cntrb_id': None} - :return: List of dictionaries, all data points from the pages of the specified API endpoint - each with a 'flag' key-value pair representing the required action to take with that - data point (i.e. 
'need_insertion', 'need_update', 'none') - """ - - update_keys = list(update_col_map.keys()) if update_col_map else [] - update_keys += list(value_update_col_map.keys()) if value_update_col_map else [] - cols_to_query = list(duplicate_col_map.keys()) + update_keys + [table_pkey] - table_values = self.get_table_values(cols_to_query, [table], where_clause) - - i = 1 - multiple_pages = False - tuples = [] - while True: - num_attempts = 0 - success = False - while num_attempts < 3: - self.logger.info(f'Hitting endpoint: {url.format(i)}...\n') - r = requests.get(url=url.format(i), headers=self.headers) - - self.update_rate_limit(r, platform=platform) - if 'last' not in r.links: - last_page = None - else: - if platform == "github": - last_page = r.links['last']['url'][-6:].split('=')[1] - elif platform == "gitlab": - last_page = r.links['last']['url'].split('&')[2].split("=")[1] - self.logger.info("Analyzing page {} of {}\n".format(i, int(last_page) + 1 if last_page is not None else '*last page not known*')) - - try: - j = r.json() - except: - j = json.loads(json.dumps(r.text)) - - if type(j) != dict and type(j) != str: - success = True - break - elif type(j) == dict: - self.logger.info("Request returned a dict: {}\n".format(j)) - if j['message'] == 'Not Found': - self.logger.warning("Github repo was not found or does not exist for endpoint: {}\n".format(url)) - break - if j['message'] == 'You have triggered an abuse detection mechanism. Please wait a few minutes before you try again.': - num_attempts -= 1 - self.logger.info("rate limit update code goes here") - self.update_rate_limit(r, temporarily_disable=True,platform=platform) - if j['message'] == 'Bad credentials': - self.logger.info("rate limit update code goes here") - self.update_rate_limit(r, bad_credentials=True, platform=platform) - elif type(j) == str: - self.logger.info(f'J was string: {j}\n') - if '' in j: - self.logger.info('HTML was returned, trying again...\n') - elif len(j) == 0: - self.logger.warning('Empty string, trying again...\n') - else: - try: - j = json.loads(j) - success = True - break - except: - pass - num_attempts += 1 - if not success: - break - - # Find last page so we can decrement from there - if 'last' in r.links and not multiple_pages and not self.finishing_task: - if platform == "github": - param = r.links['last']['url'][-6:] - i = int(param.split('=')[1]) + 1 - elif platform == "gitlab": - i = int(r.links['last']['url'].split('&')[2].split("=")[1]) + 1 - self.logger.info("Multiple pages of request, last page is " + str(i - 1) + "\n") - multiple_pages = True - elif not multiple_pages and not self.finishing_task: - self.logger.info("Only 1 page of request\n") - elif self.finishing_task: - self.logger.info("Finishing a previous task, paginating forwards ..." - " excess rate limit requests will be made\n") - - if len(j) == 0: - self.logger.info("Response was empty, breaking from pagination.\n") - break - - # Checking contents of requests with what we already have in the db - j = self.assign_tuple_action(j, table_values, update_col_map, duplicate_col_map, table_pkey, value_update_col_map) - - if not j: - self.logger.error("Assigning tuple action failed, moving to next page.\n") - i = i + 1 if self.finishing_task else i - 1 - continue - try: - to_add = [obj for obj in j if obj not in tuples and (obj['flag'] != 'none')] - except Exception as e: - self.logger.error("Failure accessing data of page: {}. 
Moving to next page.\n".format(e)) - i = i + 1 if self.finishing_task else i - 1 - continue - if len(to_add) == 0 and multiple_pages and 'last' in r.links: - self.logger.info("{}".format(r.links['last'])) - if platform == "github": - page_number = int(r.links['last']['url'][-6:].split('=')[1]) - elif platform == "gitlab": - page_number = int(r.links['last']['url'].split('&')[2].split("=")[1]) - if i - 1 != page_number: - self.logger.info("No more pages with unknown tuples, breaking from pagination.\n") - break - - tuples += to_add - - i = i + 1 if self.finishing_task else i - 1 - - # Since we already wouldve checked the first page... break - if (i == 1 and multiple_pages and not self.finishing_task) or i < 1 or len(j) == 0: - self.logger.info("No more pages to check, breaking from pagination.\n") - break - - return tuples - - def new_paginate_endpoint( - self, url, action_map={}, table=None, where_clause=True, platform='github' - ): - - page_number = 1 - multiple_pages = False - need_insertion = [] - need_update = [] - all_data = [] - forward_pagination = True - backwards_activation = False - last_page_number = -1 - while True: - - # Multiple attempts to hit endpoint - num_attempts = 0 - success = False - while num_attempts < 10: - self.logger.info("hitting an endpiont") - # f"Hitting endpoint: ...\n" - # f"{url.format(page_number)} on page number. \n") - try: - response = requests.get(url=url.format(page_number), headers=self.headers) - except TimeoutError as e: - self.logger.info("Request timed out. Sleeping 10 seconds and trying again...\n") - time.sleep(10) - continue - - self.update_rate_limit(response, platform=platform) - - try: - page_data = response.json() - except: - page_data = json.loads(json.dumps(response.text)) - - if type(page_data) == list: - success = True - break - elif type(page_data) == dict: - self.logger.info("Request returned a dict: {}\n".format(page_data)) - if page_data['message'] == "Not Found": - self.logger.warning( - "Github repo was not found or does not exist for endpoint: " - f"{url.format(page_number)}\n" - ) - break - if "You have triggered an abuse detection mechanism." in page_data['message']: - num_attempts -= 1 - self.update_rate_limit(response, temporarily_disable=True,platform=platform) - if page_data['message'] == "Bad credentials": - self.update_rate_limit(response, bad_credentials=True, platform=platform) - elif type(page_data) == str: - self.logger.info(f"Warning! 
page_data was string: {page_data}\n") - if "" in page_data: - self.logger.info("HTML was returned, trying again...\n") - elif len(page_data) == 0: - self.logger.warning("Empty string, trying again...\n") - else: - try: - page_data = json.loads(page_data) - success = True - break - except: - pass - num_attempts += 1 - if not success: - break - - # Success - - # Determine if continued pagination is needed - - if len(page_data) == 0: - self.logger.info("Response was empty, breaking from pagination.\n") - break - - all_data += page_data - - if not forward_pagination: - - # Checking contents of requests with what we already have in the db - page_insertions, page_updates = self.new_organize_needed_data( - page_data, augur_table=table, action_map=action_map - ) - - # Reached a page where we already have all tuples - if len(need_insertion) == 0 and len(need_update) == 0 and \ - backwards_activation: - self.logger.info( - "No more pages with unknown tuples, breaking from pagination.\n" - ) - break - - need_insertion += page_insertions - need_update += page_updates - - # Find last page so we can decrement from there - if 'last' in response.links and last_page_number == -1: - if platform == 'github': - last_page_number = int(response.links['last']['url'][-6:].split('=')[1]) - elif platform == 'gitlab': - last_page_number = int(response.links['last']['url'].split('&')[2].split('=')[1]) - - if not forward_pagination and not backwards_activation: - page_number = last_page_number - backwards_activation = True - - self.logger.info("Analyzation of page {} of {} complete\n".format(page_number, - int(last_page_number) if last_page_number != -1 else "*last page not known*")) - - if (page_number <= 1 and not forward_pagination) or \ - (page_number >= last_page_number and forward_pagination): - self.logger.info("No more pages to check, breaking from pagination.\n") - break - - page_number = page_number + 1 if forward_pagination else page_number - 1 - - if forward_pagination: - need_insertion, need_update = self.new_organize_needed_data( - all_data, augur_table=table, action_map=action_map - ) - - return { - 'insert': need_insertion, - 'update': need_update, - 'all': all_data - } + company = None + location = None + email = None + if 'company' in contributor: + company = contributor['company'] + if 'location' in contributor: + location = contributor['location'] + if 'email' in contributor: + email = contributor['email'] + + + if platform == 'github': + cntrb = { + 'cntrb_login': contributor['login'] if 'login' in contributor else None, + 'cntrb_email': contributor['email'] if 'email' in contributor else None, + 'cntrb_company': contributor['company'] if 'company' in contributor else None, + 'cntrb_location': contributor['location'] if 'location' in contributor else None, + 'cntrb_created_at': contributor['created_at'] if 'created_at' in contributor else None, + 'cntrb_canonical': None, + 'gh_user_id': contributor['id'] if 'id' in contributor else None, + 'gh_login': contributor['login'] if 'login' in contributor else None, + 'gh_url': contributor['url'] if 'url' in contributor else None, + 'gh_html_url': contributor['html_url'] if 'html_url' in contributor else None, + 'gh_node_id': contributor['node_id'] if 'node_id' in contributor else None, + 'gh_avatar_url': contributor['avatar_url'] if 'avatar_url' in contributor else None, + 'gh_gravatar_id': contributor['gravatar_id'] if 'gravatar_id' in contributor else None, + 'gh_followers_url': contributor['followers_url'] if 'followers_url' in contributor else None, 
+ 'gh_following_url': contributor['following_url'] if 'following_url' in contributor else None, + 'gh_gists_url': contributor['gists_url'] if 'gists_url' in contributor else None, + 'gh_starred_url': contributor['starred_url'] if 'starred_url' in contributor else None, + 'gh_subscriptions_url': contributor['subscriptions_url'] if 'subscriptions_url' in contributor else None, + 'gh_organizations_url': contributor['organizations_url'] if 'organizations_url' in contributor else None, + 'gh_repos_url': contributor['repos_url'] if 'repos_url' in contributor else None, + 'gh_events_url': contributor['events_url'] if 'events_url' in contributor else None, + 'gh_received_events_url': contributor['received_events_url'] if 'received_events_url' in contributor else None, + 'gh_type': contributor['type'] if 'type' in contributor else None, + 'gh_site_admin': contributor['site_admin'] if 'site_admin' in contributor else None, + 'tool_source': self.tool_source, + 'tool_version': self.tool_version, + 'data_source': self.data_source + } + + elif platform == 'gitlab': + cntrb = { + 'cntrb_login': contributor[0]['username'] if 'username' in contributor[0] else None, + 'cntrb_email': email, + 'cntrb_company': company, + 'cntrb_location': location, + 'cntrb_created_at': contributor[0]['created_at'] if 'created_at' in contributor[0] else None, + 'cntrb_canonical': None, + 'gh_user_id': contributor[0]['id'], + 'gh_login': contributor[0]['username'], + 'gh_url': contributor[0]['web_url'], + 'gh_html_url': None, + 'gh_node_id': None, + 'gh_avatar_url': contributor[0]['avatar_url'], + 'gh_gravatar_id': None, + 'gh_followers_url': None, + 'gh_following_url': None, + 'gh_gists_url': None, + 'gh_starred_url': None, + 'gh_subscriptions_url': None, + 'gh_organizations_url': None, + 'gh_repos_url': None, + 'gh_events_url': None, + 'gh_received_events_url': None, + 'gh_type': None, + 'gh_site_admin': None, + 'tool_source': self.tool_source, + 'tool_version': self.tool_version, + 'data_source': self.data_source + } + result = self.db.execute(self.contributors_table.insert().values(cntrb)) + self.logger.info("Primary key inserted into the contributors table: " + str(result.inserted_primary_key)) + self.results_counter += 1 + self.cntrb_id_inc = int(result.inserted_primary_key[0]) + self.logger.info(f"Inserted contributor: {cntrb['cntrb_login']}\n") + + return self.find_id_from_login(login, platform) + + #Blatently only for api key usage + def init_oauths(self, platform='github'): + + self.oauths = [] + self.headers = None + self.logger.info("Trying initialization.") + # Make a list of api key in the config combined w keys stored in the database + # Select endpoint to hit solely to retrieve rate limit + # information from headers of the response + # Adjust header keys needed to fetch rate limit information from the API responses + if platform == 'github': + url = "https://api.github.com/users/gabe-heim" + oauthSQL = s.sql.text(""" + SELECT * FROM worker_oauth WHERE access_token <> '{}' and platform = 'github' + """.format(self.config['gh_api_key'])) + key_name = 'gh_api_key' + rate_limit_header_key = "X-RateLimit-Remaining" + rate_limit_reset_header_key = "X-RateLimit-Reset" + elif platform == 'gitlab': + url = "https://gitlab.com/api/v4/version" + oauthSQL = s.sql.text(""" + SELECT * FROM worker_oauth WHERE access_token <> '{}' and platform = 'gitlab' + """.format(self.config['gitlab_api_key'])) + key_name = 'gitlab_api_key' + rate_limit_header_key = 'ratelimit-remaining' + rate_limit_reset_header_key = 'ratelimit-reset' + 
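# The loop below records, for every available key, how many requests remain and
# how long until the window resets, by probing a cheap endpoint and reading the
# two headers configured above. A standalone sketch of that bookkeeping (the
# function name is a placeholder; requests' response.headers is case-insensitive,
# so both the GitHub and GitLab spellings used above resolve):

import datetime

def summarize_rate_limit(headers, remaining_key, reset_key):
    # Remaining calls in the current window, plus seconds until it resets
    # (the reset header is epoch seconds on both platforms).
    remaining = int(headers[remaining_key])
    seconds_to_reset = (
        datetime.datetime.fromtimestamp(int(headers[reset_key]))
        - datetime.datetime.now()
    ).total_seconds()
    return remaining, seconds_to_reset

# e.g. summarize_rate_limit(response.headers, 'X-RateLimit-Remaining', 'X-RateLimit-Reset')  # GitHub
# e.g. summarize_rate_limit(response.headers, 'ratelimit-remaining', 'ratelimit-reset')      # GitLab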
+ for oauth in [{'oauth_id': 0, 'access_token': self.config[key_name]}] + json.loads( + pd.read_sql(oauthSQL, self.helper_db, params={}).to_json(orient="records") + ): + if platform == 'github': + self.headers = {'Authorization': 'token %s' % oauth['access_token']} + elif platform == 'gitlab': + self.headers = {'Authorization': 'Bearer %s' % oauth['access_token']} + response = requests.get(url=url, headers=self.headers) + self.oauths.append({ + 'oauth_id': oauth['oauth_id'], + 'access_token': oauth['access_token'], + 'rate_limit': int(response.headers[rate_limit_header_key]), + 'seconds_to_reset': ( + datetime.datetime.fromtimestamp( + int(response.headers[rate_limit_reset_header_key]) + ) - datetime.datetime.now() + ).total_seconds() + }) + self.logger.debug("Found OAuth available for use: {}".format(self.oauths[-1])) + + if len(self.oauths) == 0: + self.logger.info( + "No API keys detected, please include one in your config or in the " + "worker_oauths table in the augur_operations schema of your database." + ) + + # First key to be used will be the one specified in the config (first element in + # self.oauths array will always be the key in use) + if platform == 'github': + self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} + elif platform == 'gitlab': + self.headers = {'Authorization': 'Bearer %s' % self.oauths[0]['access_token']} + + self.logger.info("OAuth initialized\n") + + def enrich_cntrb_id( + self, data, key, action_map_additions={'insert': {'source': [], 'augur': []}}, + platform='github', prefix='' + ): + + if not len(data): + self.logger.info(f"Enrich contrib data is empty for {len(data)}, for the key {key}.") + + raise ValueError + + self.logger.info(f"Enriching contributor ids for {len(data)} data points...") + + source_df = pd.DataFrame(data) + expanded_source_df = self._add_nested_columns( + source_df.copy(), [key] + action_map_additions['insert']['source'] + ) + + # Insert cntrbs that are not in db + + cntrb_action_map = { + 'insert': { + 'source': [key] + action_map_additions['insert']['source'] + [f'{prefix}id'], + 'augur': ['cntrb_login'] + action_map_additions['insert']['augur'] + ['gh_user_id'] + } + } + + table_values_cntrb = self.db.execute( + s.sql.select(self.get_relevant_columns(self.contributors_table,cntrb_action_map)) + ).fetchall() + + source_data = expanded_source_df.to_dict(orient='records') + + #Filter out bad data where we can't even hit the api. + source_data = [data for data in source_data if f'{prefix}login' in data and data[f'{prefix}login'] != None] + + self.logger.info(f"table_values_cntrb keys: {table_values_cntrb[0].keys()}") + self.logger.info(f"source_data keys: {source_data[0].keys()}") + + #We can't use this because of worker collisions + #TODO: seperate this method into it's own worker. + #cntrb_id_offset = self.get_max_id(self.contributors_table, 'cntrb_id') - 1 + + # loop through data to test if it is already in the database + for index, data in enumerate(source_data): + + self.logger.info(f"Enriching {index} of {len(source_data)}") + + + user_unique_ids = [] + + #Allow for alt identifiers to be checked if user.id is not present in source_data + try: + #This will trigger a KeyError if data has alt identifier. + data[f'{prefix}id'] + for row in table_values_cntrb: + try: + user_unique_ids.append(row['gh_user_id']) + except Exception as e: + self.logger.info(f"Error adding gh_user_id: {e}. Row: {row}") + except KeyError: + self.logger.info("Source data doesn't have user.id. 
Using node_id instead.") + for row in table_values_cntrb: + try: + user_unique_ids.append(row['gh_node_id']) + except Exception as e: + self.logger.info(f"Error adding gh_node_id: {e}. Row: {row}") + + + #self.logger.info(f"gh_user_ids: {gh_user_ids}") + + # self.logger.info(f"Users gh_user_id: {data['user.id']}") + # in_user_ids = False + # if data['user.id'] in gh_user_ids: + # in_user_ids = True + # self.logger.info(f"{data['user.id']} is in gh_user_ids") + + # self.logger.info(f"table_values_cntrb len: {len(table_values_cntrb)}") + + #Deal with if data + #See if we can check using the user.id + source_data_id = None + try: + source_data_id = data[f'{prefix}id'] + except KeyError: + source_data_id = data[f'{prefix}node_id'] + + + #if user.id is in the database then there is no need to add the contributor + if source_data_id in user_unique_ids: + + self.logger.info("{} found in database".format(source_data_id)) + + user_id_row = [] + try: + data[f'{prefix}id'] + #gets the dict from the table_values_cntrb that contains data['user.id'] + user_id_row = list(filter(lambda x: x['gh_user_id'] == source_data_id, table_values_cntrb))[0] + except KeyError: + user_id_row = list(filter(lambda x: x['gh_node_id'] == source_data_id, table_values_cntrb))[0] + + + #assigns the cntrb_id to the source data to be returned to the workers + data['cntrb_id'] = user_id_row['cntrb_id'] + self.logger.info(f"cntrb_id {data['cntrb_id']} found in database and assigned to enriched data") + + #contributor is not in the database + else: + + self.logger.info("{} not in database, making api call".format(source_data_id)) + + self.logger.info("login: {}".format(data[f'{prefix}login'])) + + try: + url = ("https://api.github.com/users/" + data[f'{prefix}login']) + except Exception as e: + self.logger.info(f"Error when creating url: {e}. Data: {data}") + + attempts = 0 + contributor = None + success = False + + while attempts < 10: + self.logger.info("Hitting endpoint: " + url + " ...\n") + try: + response = requests.get(url=url , headers=self.headers) + except TimeoutError: + self.logger.info(f"User data request for enriching contributor data failed with {attempts} attempts! Trying again...") + time.sleep(10) + continue + + self.update_rate_limit(response,platform=platform) + + try: + contributor = response.json() + except: + contributor = json.loads(json.dumps(response.text)) + + + if type(contributor) == dict: + self.logger.info("Request returned a dict!") + self.logger.info(f"Contributor data: {contributor}") + success = True + break + elif type(contributor) == list: + self.logger.warning("Wrong type returned trying again...") + self.logger.info(f"Contributor data: {contributor}") + elif type(contributor) == str: + self.logger.info(f"Warning! page_data was string: {contributor}\n") + if "" in contributor: + self.logger.info("HTML was returned, trying again...\n") + elif len(contributor) == 0: + self.logger.warning("Empty string, trying again...\n") + else: + try: + contributor = json.loads(contributor) + success = True + break + except: + pass + attempts += 1 + + + + + self.logger.info(f"Contributor data: {contributor}") + + cntrb = { + "cntrb_login": contributor['login'], + "cntrb_created_at": contributor['created_at'], + "cntrb_email": contributor['email'] if 'email' in contributor else None, + "cntrb_company": contributor['company'] if 'company' in contributor else None, + "cntrb_location": contributor['location'] if 'location' in contributor else None, + # "cntrb_type": , dont have a use for this as of now ... 
let it default to null + "cntrb_canonical": contributor['email'] if 'email' in contributor else None, + "gh_user_id": contributor['id'], + "gh_login": contributor['login'], + "gh_url": contributor['url'], + "gh_html_url": contributor['html_url'], + "gh_node_id": contributor['node_id'], + "gh_avatar_url": contributor['avatar_url'], + "gh_gravatar_id": contributor['gravatar_id'], + "gh_followers_url": contributor['followers_url'], + "gh_following_url": contributor['following_url'], + "gh_gists_url": contributor['gists_url'], + "gh_starred_url": contributor['starred_url'], + "gh_subscriptions_url": contributor['subscriptions_url'], + "gh_organizations_url": contributor['organizations_url'], + "gh_repos_url": contributor['repos_url'], + "gh_events_url": contributor['events_url'], + "gh_received_events_url": contributor['received_events_url'], + "gh_type": contributor['type'], + "gh_site_admin": contributor['site_admin'], + "tool_source": self.tool_source, + "tool_version": self.tool_version, + "data_source": self.data_source + } + + #insert new contributor into database + # TODO: make this method it's own worker. This errors because of collisions between github_worker and pull_request_worker. + #We can solve this by making another worker with a queue. It wouldn't have to be too complicated. + try: + self.db.execute(self.contributors_table.insert().values(cntrb)) + # except s.exc.IntegrityError: + except Exception as e: + self.logger.info(f"Contributor was unable to be added to table! Attempting to get cntrb_id from table anyway because of possible collision. Error: {e}") + + + #Get the contributor id from the newly inserted contributor. + cntrb_id_row = self.db.execute( + s.sql.select(self.get_relevant_columns(self.contributors_table,cntrb_action_map)).where( + self.contributors_table.c.gh_user_id==cntrb["gh_user_id"] + ) + ).fetchall() + + #Handle and log rare failure cases. If this part errors something is very wrong. + if len(cntrb_id_row) == 1: + data['cntrb_id'] = cntrb_id_row[0]['cntrb_id'] + self.logger.info(f"cntrb_id {data['cntrb_id']} found in database and assigned to enriched data") + elif len(cntrb_id_row) == 0: + self.logger.error("Couldn't find contributor in database. Something has gone very wrong. Augur ran into a contributor that is unable to be inserted into the contributors table but is also not present in that table.") + else: + self.logger.info(f"There are more than one contributors in the table with gh_user_id={cntrb['gh_user_id']}") + + + cntrb_data = { + 'cntrb_id': data['cntrb_id'], + 'gh_node_id': cntrb['gh_node_id'], + 'cntrb_login': cntrb['cntrb_login'], + 'gh_user_id': cntrb['gh_user_id'] + } + #This updates our list of who is already in the database as we iterate to avoid duplicates. + #People who make changes tend to make more than one in a row. 
+ table_values_cntrb.append(cntrb_data) + + self.logger.info( + "Contributor id enrichment successful, result has " + f"{len(source_data)} data points.\n" + ) + return source_data + + + + + #old method + """ + # source_cntrb_insert, _ = self.organize_needed_data( + # expanded_source_df.to_dict(orient='records'), table_values=table_values_cntrb, + # action_map=cntrb_action_map + # ) + + # cntrb_insert = [ + # { + # 'cntrb_login': contributor[f'{prefix}login'], + # 'cntrb_created_at': None if ( + # f'{prefix}created_at' not in contributor + # ) else contributor[f'{prefix}created_at'], + # 'cntrb_email': None if f'{prefix}email' not in contributor else contributor[f'{prefix}email'], + # 'cntrb_company': None if f'{prefix}company' not in contributor else contributor[f'{prefix}company'], + # 'cntrb_location': None if ( + # f'{prefix}location' not in contributor + # ) else contributor[f'{prefix}location'], + # 'gh_user_id': None if ( + # not contributor[f'{prefix}id'] + # ) else int(float(contributor[f'{prefix}id'])), + # 'gh_login': contributor[f'{prefix}login'], + # 'gh_url': contributor[f'{prefix}url'], + # 'gh_html_url': contributor[f'{prefix}html_url'], + # 'gh_node_id': contributor[f'{prefix}node_id'], #valid for dup check + # 'gh_avatar_url': contributor[f'{prefix}avatar_url'], + # 'gh_gravatar_id': contributor[f'{prefix}gravatar_id'], + # 'gh_followers_url': contributor[f'{prefix}followers_url'], + # 'gh_following_url': contributor[f'{prefix}following_url'], + # 'gh_gists_url': contributor[f'{prefix}gists_url'], + # 'gh_starred_url': contributor[f'{prefix}starred_url'], + # 'gh_subscriptions_url': contributor[f'{prefix}subscriptions_url'], + # 'gh_organizations_url': contributor[f'{prefix}organizations_url'], + # 'gh_repos_url': contributor[f'{prefix}repos_url'], + # 'gh_events_url': contributor[f'{prefix}events_url'], + # 'gh_received_events_url': contributor[f'{prefix}received_events_url'], + # 'gh_type': contributor[f'{prefix}type'], + # 'gh_site_admin': contributor[f'{prefix}site_admin'], + # 'tool_source': self.tool_source, + # 'tool_version': self.tool_version, + # 'data_source': self.data_source + # } for contributor in source_cntrb_insert if contributor[f'{prefix}login'] + # ] + # + # try: + # self.bulk_insert(self.contributors_table, cntrb_insert) + # except s.exc.IntegrityError: + # self.logger.info("Unique Violation in contributors table! 
") + # + # # Query db for inserted cntrb pkeys and add to shallow level of data + # + # # Query + # cntrb_pk_name = list(self.contributors_table.primary_key)[0].name + # session = s.orm.Session(self.db) + # inserted_pks = pd.DataFrame( + # session.query( + # self.contributors_table.c[cntrb_pk_name], self.contributors_table.c.cntrb_login, + # self.contributors_table.c.gh_node_id + # ).distinct(self.contributors_table.c.cntrb_login).order_by( + # self.contributors_table.c.cntrb_login, self.contributors_table.c[cntrb_pk_name] + # ).all(), columns=[cntrb_pk_name, 'cntrb_login', 'gh_node_id'] + # ).to_dict(orient='records') + # session.close() + # + # # Prepare for merge + # source_columns = sorted(list(source_df.columns)) + # necessary_columns = sorted(list(set(source_columns + cntrb_action_map['insert']['source']))) + # (source_table, inserted_pks_table), metadata, session = self._setup_postgres_merge( + # [ + # expanded_source_df[necessary_columns].to_dict(orient='records'), + # inserted_pks + # ], sort=True + # ) + # final_columns = [cntrb_pk_name] + sorted(list(set(necessary_columns))) + # + # # Merge + # source_pk = pd.DataFrame( + # session.query( + # inserted_pks_table.c.cntrb_id, source_table + # ).join( + # source_table, + # eval( + # ' and '.join( + # [ + # ( + # f"inserted_pks_table.c['{table_column}'] " + # f"== source_table.c['{source_column}']" + # ) for table_column, source_column in zip( + # cntrb_action_map['insert']['augur'], + # cntrb_action_map['insert']['source'] + # ) + # ] + # ) + # ) + # ).all(), columns=final_columns + # ) + # + # # Cleanup merge + # source_pk = self._eval_json_columns(source_pk) + # self._close_postgres_merge(metadata, session) + + #self.logger.info( + # "Contributor id enrichment successful, result has " + # f"{len(source_pk)} data points.\n" + #) + + #return source_pk.to_dict(orient='records')""" + + def query_github_contributors(self, entry_info, repo_id): + + """ Data collection function + Query the GitHub API for contributors + """ + self.logger.info(f"Querying contributors with given entry info: {entry_info}\n") + + ## It absolutely doesn't matter if the contributor has already contributoed to a repo. it only matters that they exist in our table, and + ## if the DO, then we DO NOT want to insert them again in any GitHub Method. + github_url = entry_info['given']['github_url'] if 'github_url' in entry_info['given'] else entry_info['given']['git_url'] + + # Extract owner/repo from the url for the endpoint + owner, name = self.get_owner_repo(github_url) + + # Set the base of the url and place to hold contributors to insert + contributors_url = ( + f"https://api.github.com/repos/{owner}/{name}/" + + "contributors?per_page=100&page={}" + ) + + # Get contributors that we already have stored + # Set our duplicate and update column map keys (something other than PK) to + # check dupicates/needed column updates with + table = 'contributors' + table_pkey = 'cntrb_id' + update_col_map = {'cntrb_email': 'email'} + duplicate_col_map = {'cntrb_login': 'login'} + + #list to hold contributors needing insertion or update + contributors = self.paginate(contributors_url, duplicate_col_map, update_col_map, table, table_pkey) + + self.logger.info("Count of contributors needing insertion: " + str(len(contributors)) + "\n") + + for repo_contributor in contributors: + try: + # Need to hit this single contributor endpoint to get extra data including... 
+ # `created at` + # i think that's it + cntrb_url = ("https://api.github.com/users/" + repo_contributor['login']) + self.logger.info("Hitting endpoint: " + cntrb_url + " ...\n") + r = requests.get(url=cntrb_url, headers=self.headers) + self.update_gh_rate_limit(r) + contributor = r.json() + + company = None + location = None + email = None + if 'company' in contributor: + company = contributor['company'] + if 'location' in contributor: + location = contributor['location'] + if 'email' in contributor: + email = contributor['email'] + canonical_email = contributor['email'] + + cntrb = { + "cntrb_login": contributor['login'], + "cntrb_created_at": contributor['created_at'], + "cntrb_email": email, + "cntrb_company": company, + "cntrb_location": location, + # "cntrb_type": , dont have a use for this as of now ... let it default to null + "cntrb_canonical": canonical_email, + "gh_user_id": contributor['id'], + "gh_login": contributor['login'], + "gh_url": contributor['url'], + "gh_html_url": contributor['html_url'], + "gh_node_id": contributor['node_id'], #This is what we are dup checking + "gh_avatar_url": contributor['avatar_url'], + "gh_gravatar_id": contributor['gravatar_id'], + "gh_followers_url": contributor['followers_url'], + "gh_following_url": contributor['following_url'], + "gh_gists_url": contributor['gists_url'], + "gh_starred_url": contributor['starred_url'], + "gh_subscriptions_url": contributor['subscriptions_url'], + "gh_organizations_url": contributor['organizations_url'], + "gh_repos_url": contributor['repos_url'], + "gh_events_url": contributor['events_url'], + "gh_received_events_url": contributor['received_events_url'], + "gh_type": contributor['type'], + "gh_site_admin": contributor['site_admin'], + "tool_source": self.tool_source, + "tool_version": self.tool_version, + "data_source": self.data_source + } + #dup check + #TODO: add additional fields to check if needed. 
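+                # Duplicate guard: gh_node_id is the field this worker treats as the stable
+                # GitHub identity (see the dict built above), so before deciding anything we
+                # ask the contributors table whether a row with this node id is already stored.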
+ existingMatchingContributors = self.db.execute( + self.sql.select( + [self.contributors_table.c.gh_node_id] + ).where( + self.contributors_table.c.gh_node_id==cntrb["gh_node_id"] + ).fetchall() + ) + + if len(existingMatchingContributors) > 0: + break #if contributor already exists in table + + + # Commit insertion to table + if repo_contributor['flag'] == 'need_update': + result = self.db.execute(self.contributors_table.update().where( + self.worker_history_table.c.cntrb_email==email).values(cntrb)) + self.logger.info("Updated tuple in the contributors table with existing email: {}".format(email)) + self.cntrb_id_inc = repo_contributor['pkey'] + elif repo_contributor['flag'] == 'need_insertion': + result = self.db.execute(self.contributors_table.insert().values(cntrb)) + self.logger.info("Primary key inserted into the contributors table: {}".format(result.inserted_primary_key)) + self.results_counter += 1 + + self.logger.info("Inserted contributor: " + contributor['login'] + "\n") + + # Increment our global track of the cntrb id for the possibility of it being used as a FK + self.cntrb_id_inc = int(result.inserted_primary_key[0]) + + except Exception as e: + self.logger.error("Caught exception: {}".format(e)) + self.logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) + continue + + + def query_github_contributors_bulk(self, entry_info, repo_id): + + """ Data collection function + Query the GitHub API for contributors + """ + self.logger.info(f"Querying contributors with given entry info: {entry_info}\n") + + github_url = entry_info['given']['github_url'] if 'github_url' in entry_info['given'] else entry_info['given']['git_url'] + + owner, name = self.get_owner_repo(github_url) + + contributors_url = (f"https://api.github.com/repos/{owner}/{name}/" + + "contributors?per_page=100&page={}") + + action_map = { + 'insert': { + 'source': ['login'], + 'augur': ['cntrb_login'] + }, + 'update': { + 'source': ['email'], + 'augur': ['cntrb_email'] + } + } + + source_contributors = self.paginate_endpoint(contributors_url, action_map=action_map, + table=self.contributors_table) + + contributors_insert = [] + + for repo_contributor in source_contributors['insert']: + # Need to hit this single contributor endpoint to get extra data + cntrb_url = (f"https://api.github.com/users/{repo_contributor['login']}") + self.logger.info(f"Hitting endpoint: {cntrb_url} ...\n") + r = requests.get(url=cntrb_url, headers=self.headers) + self.update_gh_rate_limit(r) + contributor = r.json() + + contributors_insert.append({ + 'cntrb_login': contributor['login'], + 'cntrb_created_at': contributor['created_at'], + 'cntrb_email': contributor['email'] if 'email' in contributor else None, + 'cntrb_company': contributor['company'] if 'company' in contributor else None, + 'cntrb_location': contributor['location'] if 'location' in contributor else None, + 'cntrb_canonical': contributor['email'] if 'email' in contributor else None, + 'gh_user_id': contributor['id'], + 'gh_login': contributor['login'], + 'gh_url': contributor['url'], + 'gh_html_url': contributor['html_url'], + 'gh_node_id': contributor['node_id'], + 'gh_avatar_url': contributor['avatar_url'], + 'gh_gravatar_id': contributor['gravatar_id'], + 'gh_followers_url': contributor['followers_url'], + 'gh_following_url': contributor['following_url'], + 'gh_gists_url': contributor['gists_url'], + 'gh_starred_url': contributor['starred_url'], + 'gh_subscriptions_url': contributor['subscriptions_url'], + 
'gh_organizations_url': contributor['organizations_url'], + 'gh_repos_url': contributor['repos_url'], + 'gh_events_url': contributor['events_url'], + 'gh_received_events_url': contributor['received_events_url'], + 'gh_type': contributor['type'], + 'gh_site_admin': contributor['site_admin'], + 'tool_source': self.tool_source, + 'tool_version': self.tool_version, + 'data_source': self.data_source + }) + + contributors_insert_result, contributors_update_result = self.bulk_insert(self.contributors_table, + update=source_contributors['update'], unique_columns=action_map['insert']['augur'], + insert=contributors_insert, update_columns=action_map['update']['augur']) + + def query_github_contributors_fast(self, entry_info, repo_id): + """ Data collection function + Query the GitHub API for contributors + """ + self.logger.info(f"Querying contributors with given entry info: {entry_info}") + + github_url = ( + entry_info['given']['github_url'] if 'github_url' in entry_info['given'] + else entry_info['given']['git_url'] + ) + + contributors_url = ( + f"https://api.github.com/repos/{self.owner}/{self.name}/" + "contributors?per_page=100&page={}" + ) + + action_map = { + 'insert': { + 'source': ['login'], + 'augur': ['cntrb_login'] + }, + 'update': { + 'source': ['email'], + 'augur': ['cntrb_email'] + } + } + + source_contributors = self.paginate_endpoint( + contributors_url, action_map=action_map, table=self.contributors_table + ) + + contributors_insert = [ + { + 'cntrb_login': contributor['login'], + 'cntrb_created_at': ( + contributor['created_at'] if 'created_at' in contributor else None + ), + 'cntrb_email': contributor['email'] if 'email' in contributor else None, + 'cntrb_company': contributor['company'] if 'company' in contributor else None, + 'cntrb_location': contributor['location'] if 'location' in contributor else None, + 'cntrb_canonical': contributor['email'] if 'email' in contributor else None, + 'gh_user_id': contributor['id'], + 'gh_login': contributor['login'], + 'gh_url': contributor['url'], + 'gh_html_url': contributor['html_url'], + 'gh_node_id': contributor['node_id'], + 'gh_avatar_url': contributor['avatar_url'], + 'gh_gravatar_id': contributor['gravatar_id'], + 'gh_followers_url': contributor['followers_url'], + 'gh_following_url': contributor['following_url'], + 'gh_gists_url': contributor['gists_url'], + 'gh_starred_url': contributor['starred_url'], + 'gh_subscriptions_url': contributor['subscriptions_url'], + 'gh_organizations_url': contributor['organizations_url'], + 'gh_repos_url': contributor['repos_url'], + 'gh_events_url': contributor['events_url'], + 'gh_received_events_url': contributor['received_events_url'], + 'gh_type': contributor['type'], + 'gh_site_admin': contributor['site_admin'], + 'tool_source': self.tool_source, + 'tool_version': self.tool_version, + 'data_source': self.data_source + } for contributor in source_contributors['insert'] + ] + + self.bulk_insert( + self.contributors_table, update=source_contributors['update'], + unique_columns=action_map['insert']['augur'], + insert=contributors_insert, update_columns=action_map['update']['augur'] + ) + + def update_gitlab_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): + # Try to get rate limit from request headers, sometimes it does not work (GH's issue) + # In that case we just decrement from last recieved header count + if bad_credentials and len(self.oauths) > 1: + self.logger.info( + f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" + ) + del 
self.oauths[0] + + if temporarily_disable: + self.logger.info("Gitlab rate limit reached. Temp. disabling...") + self.oauths[0]['rate_limit'] = 0 + else: + try: + self.oauths[0]['rate_limit'] = int(response.headers['RateLimit-Remaining']) + except: + self.oauths[0]['rate_limit'] -= 1 + self.logger.info("Updated rate limit, you have: " + + str(self.oauths[0]['rate_limit']) + " requests remaining.") + if self.oauths[0]['rate_limit'] <= 0: + try: + reset_time = response.headers['RateLimit-Reset'] + except Exception as e: + self.logger.info(f"Could not get reset time from headers because of error: {e}") + reset_time = 3600 + time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() + self.logger.info("Rate limit exceeded, checking for other available keys to use.") + + # We will be finding oauth with the highest rate limit left out of our list of oauths + new_oauth = self.oauths[0] + # Endpoint to hit solely to retrieve rate limit information from headers of the response + url = "https://gitlab.com/api/v4/version" + + other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] + for oauth in other_oauths: + # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) + self.headers = {"PRIVATE-TOKEN" : oauth['access_token']} + response = requests.get(url=url, headers=self.headers) + oauth['rate_limit'] = int(response.headers['RateLimit-Remaining']) + oauth['seconds_to_reset'] = ( + datetime.datetime.fromtimestamp( + int(response.headers['RateLimit-Reset']) + ) - datetime.datetime.now() + ).total_seconds() + + # Update oauth to switch to if a higher limit is found + if oauth['rate_limit'] > new_oauth['rate_limit']: + self.logger.info(f"Higher rate limit found in oauth: {oauth}") + new_oauth = oauth + elif ( + oauth['rate_limit'] == new_oauth['rate_limit'] + and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] + ): + self.logger.info( + f"Lower wait time found in oauth with same rate limit: {oauth}" + ) + new_oauth = oauth + + if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: + self.logger.info( + "No oauths with >0 rate limit were found, waiting for oauth with " + f"smallest wait time: {new_oauth}\n" + ) + time.sleep(new_oauth['seconds_to_reset']) + + # Make new oauth the 0th element in self.oauths so we know which one is in use + index = self.oauths.index(new_oauth) + self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] + self.logger.info("Using oauth: {}\n".format(self.oauths[0])) + + # Change headers to be using the new oauth's key + self.headers = {"PRIVATE-TOKEN" : self.oauths[0]['access_token']} + + def update_gh_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): + # Try to get rate limit from request headers, sometimes it does not work (GH's issue) + # In that case we just decrement from last recieved header count + if bad_credentials and len(self.oauths) > 1: + self.logger.warning( + f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" + ) + del self.oauths[0] + + if temporarily_disable: + self.logger.debug( + "Github thinks we are abusing their api. Preventing use " + "of this key until its rate limit resets..." 
+ ) + self.oauths[0]['rate_limit'] = 0 + else: + try: + self.oauths[0]['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) + # self.logger.info("Recieved rate limit from headers\n") + except: + self.oauths[0]['rate_limit'] -= 1 + self.logger.info("Headers did not work, had to decrement") + self.logger.info( + f"Updated rate limit, you have: {self.oauths[0]['rate_limit']} requests remaining." + ) + if self.oauths[0]['rate_limit'] <= 0: + try: + reset_time = response.headers['X-RateLimit-Reset'] + except Exception as e: + self.logger.error(f"Could not get reset time from headers because of error: {e}") + reset_time = 3600 + time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() + self.logger.info("Rate limit exceeded, checking for other available keys to use.") + + # We will be finding oauth with the highest rate limit left out of our list of oauths + new_oauth = self.oauths[0] + # Endpoint to hit solely to retrieve rate limit information from headers of the response + url = "https://api.github.com/users/gabe-heim" + + other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] + for oauth in other_oauths: + # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) + self.headers = {'Authorization': 'token %s' % oauth['access_token']} + + attempts = 3 + success = False + while attempts > 0 and not success: + response = requests.get(url=url, headers=self.headers) + try: + oauth['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) + oauth['seconds_to_reset'] = ( + datetime.datetime.fromtimestamp( + int(response.headers['X-RateLimit-Reset']) + ) - datetime.datetime.now() + ).total_seconds() + success = True + except Exception as e: + self.logger.info( + f"oath method ran into error getting info from headers: {e}\n" + ) + self.logger.info(f"{self.headers}\n{url}\n") + attempts -= 1 + if not success: + continue + + # Update oauth to switch to if a higher limit is found + if oauth['rate_limit'] > new_oauth['rate_limit']: + self.logger.info("Higher rate limit found in oauth: {}\n".format(oauth)) + new_oauth = oauth + elif ( + oauth['rate_limit'] == new_oauth['rate_limit'] + and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] + ): + self.logger.info( + f"Lower wait time found in oauth with same rate limit: {oauth}\n" + ) + new_oauth = oauth + + if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: + self.logger.info( + "No oauths with >0 rate limit were found, waiting for oauth with " + f"smallest wait time: {new_oauth}\n" + ) + time.sleep(new_oauth['seconds_to_reset']) + + # Make new oauth the 0th element in self.oauths so we know which one is in use + index = self.oauths.index(new_oauth) + self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] + self.logger.info("Using oauth: {}\n".format(self.oauths[0])) + + # Change headers to be using the new oauth's key + self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} + + #TODO: figure out if changing this typo breaks anything + def query_gitlab_contributors(self, entry_info, repo_id): + + gitlab_url = ( + entry_info['given']['gitlab_url'] if 'gitlab_url' in entry_info['given'] + else entry_info['given']['git_url'] + ) + + self.logger.info("Querying contributors with given entry info: " + str(entry_info) + "\n") + + path = urlparse(gitlab_url) + split = path[2].split('/') + + owner = split[1] + name = split[2] + + # Handles git url case by removing the extension + if ".git" in name: + name = name[:-4] + + 
url_encoded_format = quote(owner + '/' + name, safe='') + + table = 'contributors' + table_pkey = 'cntrb_id' + ### Here we are adding gitlab user information from the API + ### Following Gabe's rework of the contributor worker. + + ### The GitLab API will NEVER give you an email. It will let you + ### Query an email, but never give you one. + ### ## Gitlab email api: https://gitlab.com/api/v4/users?search=s@goggins.com + ### We don't need to update right now, so commenting out. + ### TODO: SOLVE LOGIC. + # update_col_map = {'cntrb_email': 'email'} + update_col_map = {} + duplicate_col_map = {'gl_username': 'username'} + + # list to hold contributors needing insertion or update + contributors = self.paginate("https://gitlab.com/api/v4/projects/" + url_encoded_format + "/repository/contributors?per_page=100&page={}", duplicate_col_map, update_col_map, table, table_pkey, platform='gitlab') + + for repo_contributor in contributors: + try: + cntrb_compressed_url = ("https://gitlab.com/api/v4/users?search=" + repo_contributor['email']) + self.logger.info("Hitting endpoint: " + cntrb_compressed_url + " ...\n") + r = requests.get(url=cntrb_compressed_url, headers=self.headers) + contributor_compressed = r.json() + + email = repo_contributor['email'] + self.logger.info(contributor_compressed) + if len(contributor_compressed) == 0 or type(contributor_compressed) is dict or "id" not in contributor_compressed[0]: + continue + + self.logger.info("Fetching for user: " + str(contributor_compressed[0]["id"])) + + cntrb_url = ("https://gitlab.com/api/v4/users/" + str(contributor_compressed[0]["id"])) + self.logger.info("Hitting end point to get complete contributor info now: " + cntrb_url + "...\n") + r = requests.get(url=cntrb_url, headers=self.headers) + contributor = r.json() + + cntrb = { + "gl_id": contributor.get('gl_id', None), + "gl_full_name": contributor.get('full_name', None), + "gl_username": contributor.get('username', None), + "gl_state": contributor.get('state', None), + "gl_avatar_url": contributor.get('avatar_url', None), + "gl_web_url": contributor.get('web_url', None), + #"cntrb_login": contributor.get('username', None), + #"cntrb_created_at": contributor.get('created_at', None), + "cntrb_email": ('email', None), + #"cntrb_company": contributor.get('organization', None), + #"cntrb_location": contributor.get('location', None), + # "cntrb_type": , dont have a use for this as of now ... 
let it default to null + #"cntrb_canonical": contributor.get('public_email', None), + #"gh_user_id": contributor.get('id', None), + #"gh_login": contributor.get('username', None), + #"gh_url": contributor.get('web_url', None), + #"gh_html_url": contributor.get('web_url', None), + #"gh_node_id": None, + #"gh_avatar_url": contributor.get('avatar_url', None), + #"gh_gravatar_id": None, + #"gh_followers_url": None, + #"gh_following_url": None, + #"gh_gists_url": None, + #"gh_starred_url": None, + #"gh_subscriptions_url": None, + #"gh_organizations_url": None, + #"gh_repos_url": None, + #"gh_events_url": None, + #"gh_received_events_url": None, + #"gh_type": None, + #"gh_site_admin": None, + "tool_source": self.tool_source, + "tool_version": self.tool_version, + "data_source": self.data_source + } + + # Commit insertion to table + if repo_contributor['flag'] == 'need_update': + result = self.db.execute(self.contributors_table.update().where( + self.worker_history_table.c.cntrb_email == email).values(cntrb)) + self.logger.info("Updated tuple in the contributors table with existing email: {}".format(email)) + self.cntrb_id_inc = repo_contributor['pkey'] + elif repo_contributor['flag'] == 'need_insertion': + result = self.db.execute(self.contributors_table.insert().values(cntrb)) + self.logger.info("Primary key inserted into the contributors table: {}".format(result.inserted_primary_key)) + self.results_counter += 1 + + self.logger.info("Inserted contributor: " + contributor['username'] + "\n") + + # Increment our global track of the cntrb id for the possibility of it being used as a FK + self.cntrb_id_inc = int(result.inserted_primary_key[0]) + + except Exception as e: + self.logger.info("Caught exception: {}".format(e)) + self.logger.info("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) + continue + + + def update_gitlab_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): + # Try to get rate limit from request headers, sometimes it does not work (GH's issue) + # In that case we just decrement from last recieved header count + if bad_credentials and len(self.oauths) > 1: + self.logger.info( + f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" + ) + del self.oauths[0] + + if temporarily_disable: + self.logger.info("Gitlab rate limit reached. Temp. 
disabling...") + self.oauths[0]['rate_limit'] = 0 + else: + try: + self.oauths[0]['rate_limit'] = int(response.headers['RateLimit-Remaining']) + except: + self.oauths[0]['rate_limit'] -= 1 + self.logger.info("Updated rate limit, you have: " + + str(self.oauths[0]['rate_limit']) + " requests remaining.") + if self.oauths[0]['rate_limit'] <= 0: + try: + reset_time = response.headers['RateLimit-Reset'] + except Exception as e: + self.logger.info(f"Could not get reset time from headers because of error: {e}") + reset_time = 3600 + time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() + self.logger.info("Rate limit exceeded, checking for other available keys to use.") + + # We will be finding oauth with the highest rate limit left out of our list of oauths + new_oauth = self.oauths[0] + # Endpoint to hit solely to retrieve rate limit information from headers of the response + url = "https://gitlab.com/api/v4/version" + + other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] + for oauth in other_oauths: + # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) + self.headers = {"PRIVATE-TOKEN" : oauth['access_token']} + response = requests.get(url=url, headers=self.headers) + oauth['rate_limit'] = int(response.headers['RateLimit-Remaining']) + oauth['seconds_to_reset'] = ( + datetime.datetime.fromtimestamp( + int(response.headers['RateLimit-Reset']) + ) - datetime.datetime.now() + ).total_seconds() + + # Update oauth to switch to if a higher limit is found + if oauth['rate_limit'] > new_oauth['rate_limit']: + self.logger.info(f"Higher rate limit found in oauth: {oauth}") + new_oauth = oauth + elif ( + oauth['rate_limit'] == new_oauth['rate_limit'] + and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] + ): + self.logger.info( + f"Lower wait time found in oauth with same rate limit: {oauth}" + ) + new_oauth = oauth + + if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: + self.logger.info( + "No oauths with >0 rate limit were found, waiting for oauth with " + f"smallest wait time: {new_oauth}\n" + ) + time.sleep(new_oauth['seconds_to_reset']) + + # Make new oauth the 0th element in self.oauths so we know which one is in use + index = self.oauths.index(new_oauth) + self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] + self.logger.info("Using oauth: {}\n".format(self.oauths[0])) + + # Change headers to be using the new oauth's key + self.headers = {"PRIVATE-TOKEN" : self.oauths[0]['access_token']} + + + def update_gh_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): + # Try to get rate limit from request headers, sometimes it does not work (GH's issue) + # In that case we just decrement from last recieved header count + if bad_credentials and len(self.oauths) > 1: + self.logger.warning( + f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" + ) + del self.oauths[0] + + if temporarily_disable: + self.logger.debug( + "Github thinks we are abusing their api. Preventing use " + "of this key until its rate limit resets..." + ) + self.oauths[0]['rate_limit'] = 0 + else: + try: + self.oauths[0]['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) + # self.logger.info("Recieved rate limit from headers\n") + except: + self.oauths[0]['rate_limit'] -= 1 + self.logger.info("Headers did not work, had to decrement") + self.logger.info( + f"Updated rate limit, you have: {self.oauths[0]['rate_limit']} requests remaining." 
+ ) + if self.oauths[0]['rate_limit'] <= 0: + try: + reset_time = response.headers['X-RateLimit-Reset'] + except Exception as e: + self.logger.error(f"Could not get reset time from headers because of error: {e}") + reset_time = 3600 + time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() + self.logger.info("Rate limit exceeded, checking for other available keys to use.") + + # We will be finding oauth with the highest rate limit left out of our list of oauths + new_oauth = self.oauths[0] + # Endpoint to hit solely to retrieve rate limit information from headers of the response + url = "https://api.github.com/users/gabe-heim" + + other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] + for oauth in other_oauths: + # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) + self.headers = {'Authorization': 'token %s' % oauth['access_token']} + + attempts = 3 + success = False + while attempts > 0 and not success: + response = requests.get(url=url, headers=self.headers) + try: + oauth['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) + oauth['seconds_to_reset'] = ( + datetime.datetime.fromtimestamp( + int(response.headers['X-RateLimit-Reset']) + ) - datetime.datetime.now() + ).total_seconds() + success = True + except Exception as e: + self.logger.info( + f"oath method ran into error getting info from headers: {e}\n" + ) + self.logger.info(f"{self.headers}\n{url}\n") + attempts -= 1 + if not success: + continue + + # Update oauth to switch to if a higher limit is found + if oauth['rate_limit'] > new_oauth['rate_limit']: + self.logger.info("Higher rate limit found in oauth: {}\n".format(oauth)) + new_oauth = oauth + elif ( + oauth['rate_limit'] == new_oauth['rate_limit'] + and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] + ): + self.logger.info( + f"Lower wait time found in oauth with same rate limit: {oauth}\n" + ) + new_oauth = oauth + + if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: + self.logger.info( + "No oauths with >0 rate limit were found, waiting for oauth with " + f"smallest wait time: {new_oauth}\n" + ) + time.sleep(new_oauth['seconds_to_reset']) + + # Make new oauth the 0th element in self.oauths so we know which one is in use + index = self.oauths.index(new_oauth) + self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] + self.logger.info("Using oauth: {}\n".format(self.oauths[0])) + + # Change headers to be using the new oauth's key + self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} + + def update_rate_limit( + self, response, bad_credentials=False, temporarily_disable=False, platform="gitlab" + ): + if platform == 'gitlab': + return self.update_gitlab_rate_limit( + response, bad_credentials=bad_credentials, temporarily_disable=temporarily_disable + ) + elif platform == 'github': + return self.update_gh_rate_limit( + response, bad_credentials=bad_credentials, temporarily_disable=temporarily_disable + ) + + + #Indexerror somewhere + def multi_thread_urls(self, all_urls, max_attempts=5, platform='github'): + """ + :param all_urls: list of tuples + """ + + if not len(all_urls): + self.logger.info("No urls to multithread, returning blank list.\n") + return [] + + def load_url(url, extra_data={}): + try: + html = requests.get(url, stream=True, headers=self.headers) + return html, extra_data + except requests.exceptions.RequestException as e: + self.logger.info(e, url) + + self.logger.info("Beginning to multithread API 
endpoints.") + + start = time.time() + + all_data = [] + valid_url_count = len(all_urls) + + partitions = math.ceil(len(all_urls) / 600) + self.logger.info(f"{len(all_urls)} urls to process. Trying {partitions} partitions. " + + f"Using {max(multiprocessing.cpu_count()//8, 1)} threads.") + for urls in numpy.array_split(all_urls, partitions): + attempts = 0 + self.logger.info(f"Total data points collected so far: {len(all_data)}") + while len(urls) > 0 and attempts < max_attempts: + with concurrent.futures.ThreadPoolExecutor( + max_workers=max(multiprocessing.cpu_count()//8, 1) + ) as executor: + # Start the load operations and mark each future with its URL + future_to_url = {executor.submit(load_url, *url): url for url in urls} + self.logger.info("Multithreaded urls and returned status codes:") + count = 0 + for future in concurrent.futures.as_completed(future_to_url): + + if count % 100 == 0: + self.logger.info( + f"Processed {len(all_data)} / {valid_url_count} urls. " + f"{len(urls)} remaining in this partition." + ) + count += 1 + + url = future_to_url[future] + try: + response, extra_data = future.result() + + if response.status_code != 200: + self.logger.info( + f"Url: {url[0]} ; Status code: {response.status_code}" + ) + + if response.status_code == 403 or response.status_code == 401: # 403 is rate limit, 404 is not found, 401 is bad credentials + self.update_rate_limit(response, platform=platform) + continue + + elif response.status_code == 200: + try: + page_data = response.json() + except: + page_data = json.loads(json.dumps(response.text)) + + page_data = [{**data, **extra_data} for data in page_data] + all_data += page_data + + if 'last' in response.links and "&page=" not in url[0]: + urls += [ + (url[0] + f"&page={page}", extra_data) for page in range( + 2, int(response.links['last']['url'].split('=')[-1]) + 1 + ) + ] + try: + # self.logger.info(f"urls boundry issue? for {urls} where they are equal to {url}.") + + urls = numpy.delete(urls, numpy.where(urls == url), axis=0) + except: + self.logger.info(f"ERROR with axis = 0 - Now attempting without setting axis for numpy.delete for {urls} where they are equal to {url}.") + urls = numpy.delete(urls, numpy.where(urls == url)) + + elif response.status_code == 404: + urls = numpy.delete(urls, numpy.where(urls == url), axis=0) + self.logger.info(f"Not found url: {url}\n") + else: + self.logger.info( + f"Unhandled response code: {response.status_code} {url}\n" + ) + + except Exception as e: + self.logger.info( + f"{url} generated an exception: {traceback.format_exc()}\n" + ) + + attempts += 1 + + self.logger.info( + f"Processed {valid_url_count} urls and got {len(all_data)} data points " + f"in {time.time() - start} seconds thanks to multithreading!\n" + ) + return all_data + + + #insertion_method and stagger are arguments that allow paginate_endpoint to insert at around ~500 pages at a time. 
+ def paginate_endpoint( + self, url, action_map={}, table=None, where_clause=True, platform='github', in_memory=True, stagger=False, insertion_method=None, insertion_threshold=500 + ): + + #Get augur columns using the action map along with the primary key + table_values = self.db.execute( + s.sql.select(self.get_relevant_columns(table, action_map)).where(where_clause) + ).fetchall() + + page_number = 1 + multiple_pages = False + need_insertion = [] + need_update = [] + + #Stores sum of page data + all_data = [] + forward_pagination = True + backwards_activation = False + last_page_number = -1 + + #Block to handle page queries and retry at least 10 times + while True: + + # Multiple attempts to hit endpoint + num_attempts = 0 + success = False + while num_attempts < 10: + self.logger.info(f"Hitting endpoint: {url.format(page_number)}...\n") + try: + response = requests.get(url=url.format(page_number), headers=self.headers) + except TimeoutError as e: + self.logger.info("Request timed out. Sleeping 10 seconds and trying again...\n") + time.sleep(10) + continue + + self.update_rate_limit(response, platform=platform) + + try: + page_data = response.json() + except: + page_data = json.loads(json.dumps(response.text)) + + if type(page_data) == list: + success = True + break + elif type(page_data) == dict: + self.logger.info("Request returned a dict: {}\n".format(page_data)) + if page_data['message'] == "Not Found": + self.logger.warning( + "Github repo was not found or does not exist for endpoint: " + f"{url.format(page_number)}\n" + ) + break + if "You have triggered an abuse detection mechanism." in page_data['message']: + num_attempts -= 1 + self.update_rate_limit(response, temporarily_disable=True,platform=platform) + if page_data['message'] == "Bad credentials": + self.update_rate_limit(response, bad_credentials=True, platform=platform) + elif type(page_data) == str: + self.logger.info(f"Warning! 
page_data was string: {page_data}\n") + if "" in page_data: + self.logger.info("HTML was returned, trying again...\n") + elif len(page_data) == 0: + self.logger.warning("Empty string, trying again...\n") + else: + try: + page_data = json.loads(page_data) + success = True + break + except: + pass + num_attempts += 1 + if not success: + break + + # Success + + # Determine if continued pagination is needed + + if len(page_data) == 0: + self.logger.info("Response was empty, breaking from pagination.\n") + break + + all_data += page_data + + if not forward_pagination: + + # Checking contents of requests with what we already have in the db + page_insertions, page_updates = self.organize_needed_data( + page_data, table_values, list(table.primary_key)[0].name, + action_map, in_memory=True + ) + + # Reached a page where we already have all tuples + if len(need_insertion) == 0 and len(need_update) == 0 and \ + backwards_activation: + self.logger.info( + "No more pages with unknown tuples, breaking from pagination.\n" + ) + break + + need_insertion += page_insertions + need_update += page_updates + + # Find last page so we can decrement from there + if 'last' in response.links and last_page_number == -1: + if platform == 'github': + last_page_number = int(response.links['last']['url'][-6:].split('=')[1]) + elif platform == 'gitlab': + last_page_number = int(response.links['last']['url'].split('&')[2].split('=')[1]) + + if not forward_pagination and not backwards_activation: + page_number = last_page_number + backwards_activation = True + + self.logger.info("Analyzation of page {} of {} complete\n".format(page_number, + int(last_page_number) if last_page_number != -1 else "*last page not known*")) + + if (page_number <= 1 and not forward_pagination) or \ + (page_number >= last_page_number and forward_pagination): + self.logger.info("No more pages to check, breaking from pagination.\n") + break + + #This is probably where we should insert at around ~500 at a time + #makes sure that stagger is enabled, we have an insertion method, and the insertion happens every 500 pages or so. + if stagger and insertion_method != None and page_number % insertion_threshold == 0: + #call insertion method passed as argument. + staggered_source_prs = { + 'insert' : need_insertion, + 'update' : need_update, + 'all' : all_data + } + + #Use the method the subclass needs in order to insert the data. + insertion_method(staggered_source_prs,action_map) + + #clear the data from memory and avoid duplicate insertions. + need_insertion = [] + need_update = [] + all_data = [] + + page_number = page_number + 1 if forward_pagination else page_number - 1 + + if forward_pagination: + need_insertion, need_update = self.organize_needed_data( + all_data, table_values, list(table.primary_key)[0].name, action_map, + in_memory=in_memory + ) + + return { + 'insert': need_insertion, + 'update': need_update, + 'all': all_data + } + + #TODO: deprecated but still used by the issues worker. + def paginate(self, url, duplicate_col_map, update_col_map, table, table_pkey, where_clause="", value_update_col_map={}, platform="github"): + """ DEPRECATED + Paginate either backwards or forwards (depending on the value of the worker's + finishing_task attribute) through all the GitHub or GitLab api endpoint pages. 
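+        Only data points whose computed 'flag' is 'need_insertion' or 'need_update' are
+        returned; tuples already present and unchanged in the database are dropped.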
+
+        :param url: String, the url of the API endpoint we are paginating through; expects
+            a curly-brace placeholder within the string that gets formatted with the
+            number of the page to request
+        :param duplicate_col_map: Dictionary, maps the column names of the source data
+            to the field names in our database for columns that should be checked for
+            duplicates (if source data value == value in existing database row, then this
+            element is a duplicate and would not need an insertion). Key is source data
+            column name, value is database field name. Example: {'id': 'gh_issue_id'}
+        :param update_col_map: Dictionary, maps the column names of the source data
+            to the field names in our database for columns that should be checked for
+            updates (if source data value != value in existing database row, then an
+            update is needed). Key is source data column name, value is database field name.
+            Example: {'id': 'gh_issue_id'}
+        :param table: String, the name of the table that holds the values to check for
+            duplicates/updates against
+        :param table_pkey: String, the field name of the primary key of the table whose
+            values are being cross-referenced to check for duplicates
+        :param where_clause: String, optional where clause to filter the values
+            that are queried when preparing the values that will be cross-referenced
+            for duplicates/updates
+        :param value_update_col_map: Dictionary, used when a new field has been added to a
+            table and we want to trigger an update of a row even though its other data
+            values are unchanged and would not ordinarily need an update. The key is the
+            name of the database field to check, the value is the existing value that
+            should trigger an update when found. Example: {'cntrb_id': None}
+        :return: List of dictionaries, all data points from the pages of the specified API
+            endpoint, each with a 'flag' key-value pair representing the required action
+            to take with that data point (i.e.
'need_insertion', 'need_update', 'none') + """ + + update_keys = list(update_col_map.keys()) if update_col_map else [] + update_keys += list(value_update_col_map.keys()) if value_update_col_map else [] + cols_to_query = list(duplicate_col_map.keys()) + update_keys + [table_pkey] + table_values = self.get_table_values(cols_to_query, [table], where_clause) + + i = 1 + multiple_pages = False + tuples = [] + while True: + num_attempts = 0 + success = False + while num_attempts < 3: + self.logger.info(f'Hitting endpoint: {url.format(i)}...\n') + r = requests.get(url=url.format(i), headers=self.headers) + + self.update_rate_limit(r, platform=platform) + if 'last' not in r.links: + last_page = None + else: + if platform == "github": + last_page = r.links['last']['url'][-6:].split('=')[1] + elif platform == "gitlab": + last_page = r.links['last']['url'].split('&')[2].split("=")[1] + self.logger.info("Analyzing page {} of {}\n".format(i, int(last_page) + 1 if last_page is not None else '*last page not known*')) + + try: + j = r.json() + except: + j = json.loads(json.dumps(r.text)) + + if type(j) != dict and type(j) != str: + success = True + break + elif type(j) == dict: + self.logger.info("Request returned a dict: {}\n".format(j)) + if j['message'] == 'Not Found': + self.logger.warning("Github repo was not found or does not exist for endpoint: {}\n".format(url)) + break + if j['message'] == 'You have triggered an abuse detection mechanism. Please wait a few minutes before you try again.': + num_attempts -= 1 + self.logger.info("rate limit update code goes here") + self.update_rate_limit(r, temporarily_disable=True,platform=platform) + if j['message'] == 'Bad credentials': + self.logger.info("rate limit update code goes here") + self.update_rate_limit(r, bad_credentials=True, platform=platform) + elif type(j) == str: + self.logger.info(f'J was string: {j}\n') + if '' in j: + self.logger.info('HTML was returned, trying again...\n') + elif len(j) == 0: + self.logger.warning('Empty string, trying again...\n') + else: + try: + j = json.loads(j) + success = True + break + except: + pass + num_attempts += 1 + if not success: + break + + # Find last page so we can decrement from there + if 'last' in r.links and not multiple_pages and not self.finishing_task: + if platform == "github": + param = r.links['last']['url'][-6:] + i = int(param.split('=')[1]) + 1 + elif platform == "gitlab": + i = int(r.links['last']['url'].split('&')[2].split("=")[1]) + 1 + self.logger.info("Multiple pages of request, last page is " + str(i - 1) + "\n") + multiple_pages = True + elif not multiple_pages and not self.finishing_task: + self.logger.info("Only 1 page of request\n") + elif self.finishing_task: + self.logger.info("Finishing a previous task, paginating forwards ..." + " excess rate limit requests will be made\n") + + if len(j) == 0: + self.logger.info("Response was empty, breaking from pagination.\n") + break + + # Checking contents of requests with what we already have in the db + j = self.assign_tuple_action(j, table_values, update_col_map, duplicate_col_map, table_pkey, value_update_col_map) + + if not j: + self.logger.error("Assigning tuple action failed, moving to next page.\n") + i = i + 1 if self.finishing_task else i - 1 + continue + try: + to_add = [obj for obj in j if obj not in tuples and (obj['flag'] != 'none')] + except Exception as e: + self.logger.error("Failure accessing data of page: {}. 
Moving to next page.\n".format(e)) + i = i + 1 if self.finishing_task else i - 1 + continue + if len(to_add) == 0 and multiple_pages and 'last' in r.links: + self.logger.info("{}".format(r.links['last'])) + if platform == "github": + page_number = int(r.links['last']['url'][-6:].split('=')[1]) + elif platform == "gitlab": + page_number = int(r.links['last']['url'].split('&')[2].split("=")[1]) + if i - 1 != page_number: + self.logger.info("No more pages with unknown tuples, breaking from pagination.\n") + break + + tuples += to_add + + i = i + 1 if self.finishing_task else i - 1 + + # Since we already wouldve checked the first page... break + if (i == 1 and multiple_pages and not self.finishing_task) or i < 1 or len(j) == 0: + self.logger.info("No more pages to check, breaking from pagination.\n") + break + + return tuples + + def new_paginate_endpoint( + self, url, action_map={}, table=None, where_clause=True, platform='github' + ): + + page_number = 1 + multiple_pages = False + need_insertion = [] + need_update = [] + all_data = [] + forward_pagination = True + backwards_activation = False + last_page_number = -1 + while True: + + # Multiple attempts to hit endpoint + num_attempts = 0 + success = False + while num_attempts < 10: + self.logger.info("hitting an endpiont") + # f"Hitting endpoint: ...\n" + # f"{url.format(page_number)} on page number. \n") + try: + response = requests.get(url=url.format(page_number), headers=self.headers) + except TimeoutError as e: + self.logger.info("Request timed out. Sleeping 10 seconds and trying again...\n") + time.sleep(10) + continue + + self.update_rate_limit(response, platform=platform) + + try: + page_data = response.json() + except: + page_data = json.loads(json.dumps(response.text)) + + if type(page_data) == list: + success = True + break + elif type(page_data) == dict: + self.logger.info("Request returned a dict: {}\n".format(page_data)) + if page_data['message'] == "Not Found": + self.logger.warning( + "Github repo was not found or does not exist for endpoint: " + f"{url.format(page_number)}\n" + ) + break + if "You have triggered an abuse detection mechanism." in page_data['message']: + num_attempts -= 1 + self.update_rate_limit(response, temporarily_disable=True,platform=platform) + if page_data['message'] == "Bad credentials": + self.update_rate_limit(response, bad_credentials=True, platform=platform) + elif type(page_data) == str: + self.logger.info(f"Warning! 
page_data was string: {page_data}\n") + if "" in page_data: + self.logger.info("HTML was returned, trying again...\n") + elif len(page_data) == 0: + self.logger.warning("Empty string, trying again...\n") + else: + try: + page_data = json.loads(page_data) + success = True + break + except: + pass + num_attempts += 1 + if not success: + break + + # Success + + # Determine if continued pagination is needed + + if len(page_data) == 0: + self.logger.info("Response was empty, breaking from pagination.\n") + break + + all_data += page_data + + if not forward_pagination: + + # Checking contents of requests with what we already have in the db + page_insertions, page_updates = self.new_organize_needed_data( + page_data, augur_table=table, action_map=action_map + ) + + # Reached a page where we already have all tuples + if len(need_insertion) == 0 and len(need_update) == 0 and \ + backwards_activation: + self.logger.info( + "No more pages with unknown tuples, breaking from pagination.\n" + ) + break + + need_insertion += page_insertions + need_update += page_updates + + # Find last page so we can decrement from there + if 'last' in response.links and last_page_number == -1: + if platform == 'github': + last_page_number = int(response.links['last']['url'][-6:].split('=')[1]) + elif platform == 'gitlab': + last_page_number = int(response.links['last']['url'].split('&')[2].split('=')[1]) + + if not forward_pagination and not backwards_activation: + page_number = last_page_number + backwards_activation = True + + self.logger.info("Analyzation of page {} of {} complete\n".format(page_number, + int(last_page_number) if last_page_number != -1 else "*last page not known*")) + + if (page_number <= 1 and not forward_pagination) or \ + (page_number >= last_page_number and forward_pagination): + self.logger.info("No more pages to check, breaking from pagination.\n") + break + + page_number = page_number + 1 if forward_pagination else page_number - 1 + + if forward_pagination: + need_insertion, need_update = self.new_organize_needed_data( + all_data, augur_table=table, action_map=action_map + ) + + return { + 'insert': need_insertion, + 'update': need_update, + 'all': all_data + } From 1f649109135802964424d8f8dfdb8b7f3318dba3 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 13 Aug 2021 15:03:37 -0500 Subject: [PATCH 3/5] Remove weird duplicate method Signed-off-by: Isaac Milarsky --- workers/worker_git_integration.py | 94 ------------------------------- 1 file changed, 94 deletions(-) diff --git a/workers/worker_git_integration.py b/workers/worker_git_integration.py index e43ddea867..5b5c617661 100644 --- a/workers/worker_git_integration.py +++ b/workers/worker_git_integration.py @@ -1211,100 +1211,6 @@ def update_gitlab_rate_limit(self, response, bad_credentials=False, temporarily_ # Change headers to be using the new oauth's key self.headers = {"PRIVATE-TOKEN" : self.oauths[0]['access_token']} - - def update_gh_rate_limit(self, response, bad_credentials=False, temporarily_disable=False): - # Try to get rate limit from request headers, sometimes it does not work (GH's issue) - # In that case we just decrement from last recieved header count - if bad_credentials and len(self.oauths) > 1: - self.logger.warning( - f"Removing oauth with bad credentials from consideration: {self.oauths[0]}" - ) - del self.oauths[0] - - if temporarily_disable: - self.logger.debug( - "Github thinks we are abusing their api. Preventing use " - "of this key until its rate limit resets..." 
- ) - self.oauths[0]['rate_limit'] = 0 - else: - try: - self.oauths[0]['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) - # self.logger.info("Recieved rate limit from headers\n") - except: - self.oauths[0]['rate_limit'] -= 1 - self.logger.info("Headers did not work, had to decrement") - self.logger.info( - f"Updated rate limit, you have: {self.oauths[0]['rate_limit']} requests remaining." - ) - if self.oauths[0]['rate_limit'] <= 0: - try: - reset_time = response.headers['X-RateLimit-Reset'] - except Exception as e: - self.logger.error(f"Could not get reset time from headers because of error: {e}") - reset_time = 3600 - time_diff = datetime.datetime.fromtimestamp(int(reset_time)) - datetime.datetime.now() - self.logger.info("Rate limit exceeded, checking for other available keys to use.") - - # We will be finding oauth with the highest rate limit left out of our list of oauths - new_oauth = self.oauths[0] - # Endpoint to hit solely to retrieve rate limit information from headers of the response - url = "https://api.github.com/users/gabe-heim" - - other_oauths = self.oauths[0:] if len(self.oauths) > 1 else [] - for oauth in other_oauths: - # self.logger.info("Inspecting rate limit info for oauth: {}\n".format(oauth)) - self.headers = {'Authorization': 'token %s' % oauth['access_token']} - - attempts = 3 - success = False - while attempts > 0 and not success: - response = requests.get(url=url, headers=self.headers) - try: - oauth['rate_limit'] = int(response.headers['X-RateLimit-Remaining']) - oauth['seconds_to_reset'] = ( - datetime.datetime.fromtimestamp( - int(response.headers['X-RateLimit-Reset']) - ) - datetime.datetime.now() - ).total_seconds() - success = True - except Exception as e: - self.logger.info( - f"oath method ran into error getting info from headers: {e}\n" - ) - self.logger.info(f"{self.headers}\n{url}\n") - attempts -= 1 - if not success: - continue - - # Update oauth to switch to if a higher limit is found - if oauth['rate_limit'] > new_oauth['rate_limit']: - self.logger.info("Higher rate limit found in oauth: {}\n".format(oauth)) - new_oauth = oauth - elif ( - oauth['rate_limit'] == new_oauth['rate_limit'] - and oauth['seconds_to_reset'] < new_oauth['seconds_to_reset'] - ): - self.logger.info( - f"Lower wait time found in oauth with same rate limit: {oauth}\n" - ) - new_oauth = oauth - - if new_oauth['rate_limit'] <= 0 and new_oauth['seconds_to_reset'] > 0: - self.logger.info( - "No oauths with >0 rate limit were found, waiting for oauth with " - f"smallest wait time: {new_oauth}\n" - ) - time.sleep(new_oauth['seconds_to_reset']) - - # Make new oauth the 0th element in self.oauths so we know which one is in use - index = self.oauths.index(new_oauth) - self.oauths[0], self.oauths[index] = self.oauths[index], self.oauths[0] - self.logger.info("Using oauth: {}\n".format(self.oauths[0])) - - # Change headers to be using the new oauth's key - self.headers = {'Authorization': 'token %s' % self.oauths[0]['access_token']} - def update_rate_limit( self, response, bad_credentials=False, temporarily_disable=False, platform="gitlab" ): From 56bf3bd2cfe9753ec344a4e0799f8092550d4230 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 13 Aug 2021 16:51:27 -0500 Subject: [PATCH 4/5] Base failure condition Signed-off-by: Isaac Milarsky --- workers/worker_git_integration.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workers/worker_git_integration.py b/workers/worker_git_integration.py index 5b5c617661..a0c7e8d328 100644 --- a/workers/worker_git_integration.py 
+++ b/workers/worker_git_integration.py @@ -400,6 +400,8 @@ def enrich_cntrb_id( except: pass attempts += 1 + if not success: + break From df99c8f4a09a9f8c7e05b282138486fcfee6b9fc Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 13 Aug 2021 16:51:59 -0500 Subject: [PATCH 5/5] typo Signed-off-by: Isaac Milarsky --- workers/worker_git_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workers/worker_git_integration.py b/workers/worker_git_integration.py index a0c7e8d328..98e82339fc 100644 --- a/workers/worker_git_integration.py +++ b/workers/worker_git_integration.py @@ -1405,7 +1405,7 @@ def paginate_endpoint( pass num_attempts += 1 if not success: - break + break # Success
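For context on the last two fixes above, here is a distilled, stand-alone sketch of the attempts/success pattern they tighten; the helper name and its defaults are illustrative, not part of the worker.

import requests

def get_json_with_retries(url, headers, max_attempts=10):
    # Same shape as the worker's num_attempts/success loops: keep retrying until a
    # response parses as JSON or the attempt budget runs out.
    attempts, success, page_data = 0, False, None
    while attempts < max_attempts and not success:
        try:
            page_data = requests.get(url=url, headers=headers).json()
            success = True
        except (requests.exceptions.RequestException, ValueError):
            attempts += 1
    if not success:
        # The base failure condition added in these patches: stop here instead of
        # pretending the page was fetched.
        return None
    return page_data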