feat(openchallenges): update data in the local OC database #2968

Merged: 33 commits into main on Feb 21, 2025

Commits
be8f6e3
update project readme
vpchung Jan 16, 2025
712cc7d
update `statusCode` to return appropriate code
vpchung Jan 16, 2025
e0470f1
extract data pull functions into a separate module
vpchung Jan 16, 2025
6772f63
add default "" value for PRIVATE_KEY to allow for encoding
vpchung Jan 16, 2025
6399f7c
comment out data pulls from non-platform worksheets (for now)
vpchung Jan 16, 2025
10e56c1
reorder variables for consistency with other code blocks
vpchung Jan 16, 2025
14f4a67
Merge branch 'main' into CHALLENGE-585
vpchung Feb 5, 2025
1924ee9
Merge branch 'main' into CHALLENGE-585
vpchung Feb 11, 2025
00d32a5
add mariadb creds to .env.example
vpchung Feb 13, 2025
84fa6e4
update paths in Dockerfile
vpchung Feb 13, 2025
72874e8
add mariadb dependency
vpchung Feb 13, 2025
f6ddb0d
add connection to mariadb
vpchung Feb 14, 2025
136c56e
add functions for accessing and updating db tables
vpchung Feb 14, 2025
7cdbe82
reformat; remove commented out lines
vpchung Feb 14, 2025
1060147
add remaining table updates
vpchung Feb 17, 2025
46ec674
update error logging message
vpchung Feb 17, 2025
5e3e74b
replace \\N with None; rename select columns
vpchung Feb 17, 2025
fe46e6b
reassign `platform_id` value after reformatting to prevent float down…
vpchung Feb 18, 2025
2542a76
fix exception handling
vpchung Feb 18, 2025
a9a61f2
add data insertions for `organization_service` tables
vpchung Feb 18, 2025
8e4041d
extract db functions into separate module
vpchung Feb 18, 2025
6f094d2
add type hinting
vpchung Feb 18, 2025
fe29366
update Dockerfile to install MariaDB Connector/C
vpchung Feb 19, 2025
77f82e1
lint Dockerfile
vpchung Feb 19, 2025
69b0b29
Revert "update Dockerfile to install MariaDB Connector/C"
vpchung Feb 20, 2025
8df96d1
replace `mariadb` with `PyMySql`
vpchung Feb 20, 2025
e164ef1
replace `UPDATE_ME` with `CHANGEME` for consistency
vpchung Feb 20, 2025
aa24d61
update docstring for `lambda_handler()`
vpchung Feb 21, 2025
261483f
add type hinting to `oc_data_sheet` module
vpchung Feb 21, 2025
8dcd150
add logic to remove extraneous rows between `challenge_contribution` …
vpchung Feb 21, 2025
952ecf2
update comment
vpchung Feb 21, 2025
bceb261
restrict PyMySql version to 1.1.1 for project
vpchung Feb 21, 2025
0a4104f
add logging message to explain why rows are being removed
vpchung Feb 21, 2025
19 changes: 13 additions & 6 deletions apps/openchallenges/data-lambda/.env.example
@@ -1,11 +1,18 @@
 # Google Service Account credentials
 TYPE="service_account"
-PROJECT_ID="UPDATE_ME"
-PRIVATE_KEY_ID="UPDATE_ME"
-PRIVATE_KEY="UPDATE_ME"
-CLIENT_EMAIL="UPDATE_ME"
-CLIENT_ID="UPDATE_ME"
+PROJECT_ID="CHANGEME"
+PRIVATE_KEY_ID="CHANGEME"
+PRIVATE_KEY="CHANGEME"
+CLIENT_EMAIL="CHANGEME"
+CLIENT_ID="CHANGEME"
 AUTH_URI="https://accounts.google.com/o/oauth2/auth"
 TOKEN_URI="https://oauth2.googleapis.com/token"
 AUTH_PROVIDER_X509_CERT_URL="https://www.googleapis.com/oauth2/v1/certs"
-CLIENT_X509_CERT_URL="UPDATE_ME"
+CLIENT_X509_CERT_URL="CHANGEME"
 UNIVERSE_DOMAIN="googleapis.com"
+
+# MariaDB credentials
+MARIADB_HOST=openchallenges-mariadb
+MARIADB_USER=maria
+MARIADB_PASSWORD=changeme
+MARIADB_PORT=3306
5 changes: 3 additions & 2 deletions apps/openchallenges/data-lambda/Dockerfile
@@ -29,6 +29,7 @@ FROM public.ecr.aws/lambda/python:3.13
COPY --from=builder ${LAMBDA_TASK_ROOT} ${LAMBDA_TASK_ROOT}

# Copy the application code.
COPY ./openchallenges_data_lambda ${LAMBDA_TASK_ROOT}/app
COPY ./openchallenges_data_lambda ${LAMBDA_TASK_ROOT}

CMD ["app.lambda_handler"]

CMD ["app.app.lambda_handler"]
16 changes: 16 additions & 0 deletions apps/openchallenges/data-lambda/README.md
@@ -12,6 +12,22 @@ nx build openchallenges-data-lambda
 nx build-image openchallenges-data-lambda
 ```
 
+## Update .env with credentials to utilize Google Sheets API
+
+Before running the Lambda function locally (see next section), update the `.env` file and replace
+all "CHANGEME" values with real credentials.
+
+Failing to update `.env` will result in the following output during invocation:
+
+```json
+{
+  "statusCode": 401,
+  "body": {
+    "message": "Private key not found in the credentials file. Please try again."
+  }
+}
+```
+
 ## Start the Lambda function locally with Docker Compose
 
 Starts the Lambda function in the foreground, allowing you to view logs and interact with it
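For reference, a minimal sketch of invoking the locally running function, assuming the Docker Compose setup maps the AWS Lambda Runtime Interface Emulator to port 9000 (the emulator's conventional mapping; the actual port is defined in the project's compose file, not confirmed by this diff):

```python
# Hypothetical local invocation; the 9000 port mapping is an assumption.
import json
import urllib.request

url = "http://localhost:9000/2015-03-31/functions/function/invocations"
req = urllib.request.Request(url, data=json.dumps({}).encode(), method="POST")
with urllib.request.urlopen(req) as resp:
    # With an unedited .env, this prints the 401 payload shown in the README.
    print(json.loads(resp.read()))
```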
329 changes: 108 additions & 221 deletions apps/openchallenges/data-lambda/openchallenges_data_lambda/app.py
@@ -1,17 +1,43 @@
import os
import json
import logging

import gspread
import numpy as np
import pandas as pd

import db_utils
import oc_data_sheet


GOOGLE_SHEET_CREDENTIALS_FILE = "service_account.json"
GOOGLE_SHEET_TITLE = "OpenChallenges Data"


def lambda_handler(event, context):
"""Sample pure Lambda function
def write_credentials_file(output_json):
"""Write credentials JSON file for Google Sheets API authentication."""
with open(output_json, "w") as out:
credentials = {
"type": os.getenv("TYPE"),
"project_id": os.getenv("PROJECT_ID"),
"private_key_id": os.getenv("PRIVATE_KEY_ID"),
"private_key": os.getenv("PRIVATE_KEY", "")
.encode()
.decode("unicode_escape"),
"client_email": os.getenv("CLIENT_EMAIL"),
"client_id": os.getenv("CLIENT_ID"),
"auth_uri": os.getenv("AUTH_URI"),
"token_uri": os.getenv("TOKEN_URI"),
"auth_provider_x509_cert_url": os.getenv("AUTH_PROVIDER_X509_CERT_URL"),
"client_x509_cert_url": os.getenv("CLIENT_X509_CERT_URL"),
"universe_domain": os.getenv("UNIVERSE_DOMAIN"),
}
out.write(json.dumps(credentials))
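Aside: the `.encode().decode("unicode_escape")` step above is needed because multi-line PEM keys usually reach the environment with literal `\n` sequences rather than real newlines. A small self-contained illustration (toy key string, not a real credential):

```python
# Toy illustration: turn literal "\n" sequences from an env var into newlines.
raw = "-----BEGIN PRIVATE KEY-----\\nMIIE...\\n-----END PRIVATE KEY-----"
fixed = raw.encode().decode("unicode_escape")
print(fixed.count("\n"))  # 2 -> the key is now a proper multi-line string
```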


def lambda_handler(event, context) -> dict:
"""Main function.
Pulls data from the OC Data Google sheet (https://shorturl.at/pf3Mr) and "syncs" the data
to the OpenChallenges database.
Parameters
----------
@@ -37,240 +63,101 @@ def lambda_handler(event, context):
try:
google_client = gspread.service_account(filename=GOOGLE_SHEET_CREDENTIALS_FILE)
except Exception as err:
status_code = 401
message = "Private key not found in the credentials file. Please try again."
else:
try:
wks = google_client.open(GOOGLE_SHEET_TITLE)

platforms = get_platform_data(wks)
print(platforms.head())

roles = get_roles(wks)
print(roles.head())

categories = get_challenge_categories(wks)
print(categories.head())

organizations = get_organization_data(wks)
print(organizations.head())

edam_data_annotations = get_edam_annotations(wks)
print(edam_data_annotations.head())

challenges, incentives, sub_types = get_challenge_data(wks)
print(challenges.head())
print(incentives.head())
print(sub_types.head())

message = "Data successfully pulled from OC Data google sheet."
platforms = oc_data_sheet.get_platform_data(wks)
platforms["avatar_url"] = (
"" # FIXME: table has this column for some reason?
)

roles = oc_data_sheet.get_roles(wks)
categories = oc_data_sheet.get_challenge_categories(wks)
organizations = oc_data_sheet.get_organization_data(wks)
edam_data_annotations = oc_data_sheet.get_edam_annotations(wks)
challenges, incentives, sub_types = oc_data_sheet.get_challenge_data(wks)
except Exception as err:
status_code = 400
message = f"Something went wrong with pulling the data: {err}."

data = {"message": message}
return {
"statusCode": 200,
"body": json.dumps(data),
}


def write_credentials_file(output_json):
"""Write credentials JSON file for Google Sheets API authentication."""
with open(output_json, "w") as out:
credentials = {
"type": os.getenv("TYPE"),
"project_id": os.getenv("PROJECT_ID"),
"private_key_id": os.getenv("PRIVATE_KEY_ID"),
"private_key": os.getenv("PRIVATE_KEY").encode().decode("unicode_escape"),
"client_email": os.getenv("CLIENT_EMAIL"),
"client_id": os.getenv("CLIENT_ID"),
"auth_uri": os.getenv("AUTH_URI"),
"token_uri": os.getenv("TOKEN_URI"),
"auth_provider_x509_cert_url": os.getenv("AUTH_PROVIDER_X509_CERT_URL"),
"client_x509_cert_url": os.getenv("CLIENT_X509_CERT_URL"),
"universe_domain": os.getenv("UNIVERSE_DOMAIN"),
}
out.write(json.dumps(credentials))


def get_challenge_data(wks, sheet_name="challenges"):
"""Get challenges data and clean up as needed.
Output:
- challenges
- challenge incentives
- challenge submission types
"""
df = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
df.loc[df._platform == "Other", "platform"] = "\\N"

challenges = df[
[
"id",
"slug",
"name",
"headline",
"description",
"avatar_url",
"website_url",
"status",
"platform",
"doi",
"start_date",
"end_date",
"operation_id",
"created_at",
"updated_at",
]
]
challenges = (
challenges.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
.replace(r"\n", " ", regex=True)
.replace("'", "''")
.replace("\u2019", "''", regex=True) # replace curly right-quote
.replace("\u202f", " ", regex=True) # replace narrow no-break space
.replace("\u2060", "", regex=True) # remove word joiner
)
challenges["headline"] = (
challenges["headline"]
.astype(str)
.apply(lambda x: x[:76] + "..." if len(x) > 80 else x)
)
challenges["description"] = (
challenges["description"]
.astype(str)
.apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
)
challenges.loc[challenges.start_date == "", "start_date"] = "\\N"
challenges.loc[challenges.end_date == "", "end_date"] = "\\N"
challenges.loc[challenges.operation_id == "", "operation_id"] = "\\N"

incentives = pd.concat(
[
df[df.monetary_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="monetary"
),
df[df.publication_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="publication"
),
df[df.speaking_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="speaking_engagement"
),
df[df.other_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="other"
),
]
).rename(columns={"id": "challenge_id"})
incentives["incentives"] = pd.Categorical(
incentives["incentives"],
categories=["monetary", "publication", "speaking_engagement", "other"],
)
incentives = incentives.sort_values(["challenge_id", "incentives"])
incentives.index = np.arange(1, len(incentives) + 1)

sub_types = pd.concat(
[
df[df.file_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="prediction_file"
),
df[df.container_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="container_image"
),
df[df.notebook_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="notebook"
),
df[df.mlcube_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="mlcube"
),
df[df.other_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="other"
),
]
).rename(columns={"id": "challenge_id"})
sub_types["submission_types"] = pd.Categorical(
sub_types["submission_types"],
categories=[
"prediction_file",
"container_image",
"notebook",
"mlcube",
"other",
# output logs to stdout and logfile
logging.basicConfig(
level=logging.DEBUG,
format="%(levelname)s | %(asctime)s | %(message)s",
handlers=[
logging.FileHandler("oc_database_update.log"),
logging.StreamHandler(),
],
)
sub_types = sub_types.sort_values(["challenge_id", "submission_types"])
sub_types.index = np.arange(1, len(sub_types) + 1)

return (
challenges,
incentives[["incentives", "challenge_id", "created_at"]],
sub_types[["submission_types", "challenge_id", "created_at"]],
)


def get_challenge_categories(wks, sheet_name="challenge_category"):
"""Get challenge categories."""
return pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")[
["id", "challenge_id", "category"]
]


def get_platform_data(wks, sheet_name="platforms"):
"""Get platform data and clean up as needed."""
platforms = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
return platforms[platforms._public == "TRUE"][
["id", "slug", "name", "avatar_url", "website_url", "created_at", "updated_at"]
]


def get_organization_data(wks, sheet_name="organizations"):
"""Get organization data and clean up as needed."""
organizations = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
organizations = organizations[organizations._public == "TRUE"][
[
"id",
"name",
"login",
"avatar_url",
"website_url",
"description",
"challenge_count",
"created_at",
"updated_at",
"acronym",
]
]
organizations = (
organizations.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
.replace(r"\n", " ", regex=True)
.replace("'", "''")
.replace("\u2019", "''", regex=True) # replace curly right-quote
.replace("\u202f", " ", regex=True) # replace narrow no-break space
.replace("\u2060", "", regex=True) # remove word joiner
# Update `challenge_service` tables
conn = db_utils.connect_to_db()
db_utils.update_table(conn, table_name="challenge_platform", data=platforms)
db_utils.update_table(conn, table_name="challenge", data=challenges)
db_utils.update_table(conn, table_name="challenge_contribution", data=roles)
db_utils.update_table(conn, table_name="challenge_incentive", data=incentives)
db_utils.update_table(conn, table_name="challenge_submission_type", data=sub_types)
db_utils.update_table(
conn, table_name="challenge_input_data_type", data=edam_data_annotations
)
organizations["description"] = (
organizations["description"]
.astype(str)
.apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
db_utils.update_table(conn, table_name="challenge_category", data=categories)

# Get the newly-updated `challenge_contribution` table before closing the connection for
# later comparison.
challenge_service_roles = db_utils.get_table(conn, "challenge_contribution")
conn.close()

# Update `organization_service` tables
conn = db_utils.connect_to_db("organization_service")
db_utils.update_table(conn, table_name="organization", data=organizations)
db_utils.update_table(conn, table_name="challenge_contribution", data=roles)
organization_service_roles = db_utils.get_table(conn, "challenge_contribution")
conn.close()

# Identify rows that differ between the two tables, and remove as needed.
challenge_service_ids = set(challenge_service_roles["id"].tolist())
organization_service_ids = set(organization_service_roles["id"].tolist())
rows_to_remove_from_challenge_service = list(
challenge_service_ids - organization_service_ids
)
return organizations


def get_roles(wks, sheet_name="contribution_role"):
"""Get data on organization's role(s) in challenges."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.drop(["_challenge", "_organization"], axis=1)
rows_to_remove_from_organization_service = list(
organization_service_ids - challenge_service_ids
)

if rows_to_remove_from_challenge_service:
logging.warning(
f"Mismatch found in the `challenge_contribution` tables; removing extra rows from "
"the `challenge_service` database"
)
conn = db_utils.connect_to_db()
db_utils.delete_rows_by_id(
conn, "challenge_contribution", rows_to_remove_from_challenge_service
)
conn.close()

if rows_to_remove_from_organization_service:
logging.warning(
f"Mismatch found in the `challenge_contribution` tables; removing extra rows from "
"the `organization_service` database"
)
conn = db_utils.connect_to_db("organization_service")
db_utils.delete_rows_by_id(
conn, "challenge_contribution", rows_to_remove_from_organization_service
)
conn.close()

logging.info("FIN. ✅")
status_code = 200
message = "Data from the OC Data Sheet successfully added to the database."

def get_edam_annotations(wks, sheet_name="challenge_data"):
"""Get data on challenge's EDAM annotations."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.drop(["_challenge", "_edam_name"], axis=1)
)
data = {"message": message}
return {
"statusCode": status_code,
"body": json.dumps(data),
}


if __name__ == "__main__":
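The reconciliation step in `lambda_handler()` reduces to a set difference over the `id` columns of the two `challenge_contribution` tables. A standalone sketch of the same logic, with fabricated ids:

```python
# Toy illustration of the reconciliation: ids present in one service's
# `challenge_contribution` table but not the other are slated for deletion.
import pandas as pd

challenge_service_roles = pd.DataFrame({"id": [1, 2, 3, 4]})
organization_service_roles = pd.DataFrame({"id": [2, 3, 4, 5]})

challenge_service_ids = set(challenge_service_roles["id"].tolist())
organization_service_ids = set(organization_service_roles["id"].tolist())

print(sorted(challenge_service_ids - organization_service_ids))  # [1] -> remove from challenge_service
print(sorted(organization_service_ids - challenge_service_ids))  # [5] -> remove from organization_service
```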
106 changes: 106 additions & 0 deletions apps/openchallenges/data-lambda/openchallenges_data_lambda/db_utils.py
@@ -0,0 +1,106 @@
import os
import sys
import logging

import pymysql
import pymysql.cursors
import pandas as pd


def connect_to_db(db: str = "challenge_service") -> pymysql.Connection:
"""Establishes connection to the MariaDB database."""
credentials = {
"host": os.getenv("MARIADB_HOST"),
"port": int(os.getenv("MARIADB_PORT", 3306)),
"user": os.getenv("MARIADB_USER"),
"password": os.getenv("MARIADB_PASSWORD"),
"database": db,
"cursorclass": pymysql.cursors.DictCursor,
}
try:
conn = pymysql.connect(**credentials)
logging.info(f"Connected to `{db}` database")
return conn
except pymysql.Error as err:
logging.error(f"Error connecting to the database: {err}")
sys.exit(1)


def get_table(conn: pymysql.Connection, table_name: str) -> pd.DataFrame:
"""Returns all records from the specified table."""
query = f"SELECT * FROM {table_name}"
try:
with conn.cursor() as cursor:
cursor.execute(query)
records = cursor.fetchall()
colnames = [val[0] for val in cursor.description]
return pd.DataFrame(records, columns=colnames)
except pymysql.Error as err:
logging.error(f"Error executing query: {err}")
return pd.DataFrame()


def truncate_table(conn: pymysql.Connection, table_name: str):
"""Deletes all rows from the specified table.
Temporarily disables foreign key checks for this operation.
"""
logging.info(f"Truncating table `{table_name}`")
try:
with conn.cursor() as cursor:
cursor.execute("SET FOREIGN_KEY_CHECKS = 0")
cursor.execute(f"TRUNCATE TABLE {table_name}")
cursor.execute("SET FOREIGN_KEY_CHECKS = 1")
conn.commit() # Save changes made to table.
except pymysql.Error as err:
logging.error(f"Error truncating: {err}")
conn.rollback() # Revert any changes made to data.


def delete_rows_by_id(conn: pymysql.Connection, table_name: str, row_ids: list):
"""Delete rows from the specified table, one row at a time."""
logging.info(f"Removing rows from `{table_name}`")
with conn.cursor() as cursor:
query = f"DELETE FROM {table_name} WHERE id = %s"
for row_id in row_ids:
try:
cursor.execute(query, (row_id,))
conn.commit()
logging.info(f" → Removed row {row_id}")
except pymysql.Error as err:
logging.error(f" → Error removing row {row_id}: {err}")
conn.rollback()


def insert_data(conn: pymysql.Connection, table_name: str, data_df: pd.DataFrame):
"""Adds data to the specified table, one row at a time.
This iterative approach allows for logging invalid rows for later review.
"""
logging.info(f"Adding data to table `{table_name}`")
with conn.cursor() as cursor:
for _, row in data_df.iterrows():
colnames = ", ".join(row.index)
placeholders = ", ".join(["%s"] * len(row))
query = f"INSERT INTO {table_name} ({colnames}) VALUES ({placeholders})"
try:
cursor.execute(query, tuple(row))
conn.commit()
except (pymysql.IntegrityError, pymysql.DataError) as err:
id_colname = "id" if row.get("id") else "challenge_id"
id_value = row.get("id", row.get("challenge_id"))
logging.error(
f"Invalid row to table `{table_name}`\n"
+ f" → {id_colname} in Google Sheet: {id_value}\n"
+ f" → Error: {err}"
)
conn.rollback()
except pymysql.Error as err:
logging.error(f"Error adding row to table `{table_name}`: {err}")
conn.rollback()


def update_table(conn: pymysql.Connection, table_name: str, data: pd.DataFrame):
"""Updates the specified table."""
truncate_table(conn, table_name)
insert_data(conn, table_name, data)
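A minimal usage sketch of these helpers, assuming the MariaDB container from Docker Compose is running and the `MARIADB_*` variables from `.env.example` are set. The example row and its columns are illustrative guesses at the `challenge_platform` schema, not confirmed by this diff:

```python
import pandas as pd

import db_utils

# Illustrative data only; column names follow the diff above but are assumptions.
platforms = pd.DataFrame(
    [{"id": 1, "slug": "synapse", "name": "Synapse", "avatar_key": "",
      "website_url": "https://www.synapse.org", "created_at": "2025-01-01 00:00:00",
      "updated_at": "2025-01-01 00:00:00"}]
)

conn = db_utils.connect_to_db()  # defaults to the `challenge_service` database
db_utils.update_table(conn, table_name="challenge_platform", data=platforms)  # truncate, then insert
print(db_utils.get_table(conn, "challenge_platform").head())
conn.close()
```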
202 changes: 202 additions & 0 deletions apps/openchallenges/data-lambda/openchallenges_data_lambda/oc_data_sheet.py
@@ -0,0 +1,202 @@
import gspread
import numpy as np
import pandas as pd


def _reformat_df_values(df: pd.DataFrame) -> pd.DataFrame:
df = (
df.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
.replace(r"\n", " ", regex=True)
.replace("'", "''")
.replace("\u2019", "''", regex=True) # replace curly right-quote
.replace("\u202f", " ", regex=True) # replace narrow no-break space
.replace("\u2060", "", regex=True) # remove word joiner
)
return df


def get_challenge_data(wks: gspread.Worksheet, sheet_name: str = "challenges") -> tuple:
"""Get challenges data and clean up as needed.
Output:
- challenges
- challenge incentives
- challenge submission types
"""
df = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")

# Challenges
challenges = df[
[
"id",
"slug",
"name",
"headline",
"description",
"avatar_url",
"website_url",
"status",
"_platform",
"platform",
"doi",
"start_date",
"end_date",
"operation_id",
"created_at",
"updated_at",
]
]
challenges = _reformat_df_values(challenges)
challenges["headline"] = (
challenges["headline"]
.astype(str)
.apply(lambda x: x[:75] + "..." if len(x) > 80 else x)
)
challenges["description"] = (
challenges["description"]
.astype(str)
.apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
)
challenges.loc[challenges._platform == "Other", "platform"] = None
challenges.loc[challenges.start_date == "", "start_date"] = None
challenges.loc[challenges.end_date == "", "end_date"] = None
challenges.loc[challenges.operation_id == "", "operation_id"] = None

# Challenge incentive(s)
incentives = pd.concat(
[
df[df.monetary_incentive == "TRUE"][["id", "created_at"]].assign(
name="monetary"
),
df[df.publication_incentive == "TRUE"][["id", "created_at"]].assign(
name="publication"
),
df[df.speaking_incentive == "TRUE"][["id", "created_at"]].assign(
name="speaking_engagement"
),
df[df.other_incentive == "TRUE"][["id", "created_at"]].assign(name="other"),
]
).rename(columns={"id": "challenge_id"})
incentives["name"] = pd.Categorical(
incentives["name"],
categories=["monetary", "publication", "speaking_engagement", "other"],
)
incentives = incentives.sort_values(["challenge_id", "name"])
incentives.index = np.arange(1, len(incentives) + 1)

# Challenge submission type(s)
sub_types = pd.concat(
[
df[df.file_submission == "TRUE"][["id", "created_at"]].assign(
name="prediction_file"
),
df[df.container_submission == "TRUE"][["id", "created_at"]].assign(
name="container_image"
),
df[df.notebook_submission == "TRUE"][["id", "created_at"]].assign(
name="notebook"
),
df[df.mlcube_submission == "TRUE"][["id", "created_at"]].assign(
name="mlcube"
),
df[df.other_submission == "TRUE"][["id", "created_at"]].assign(
name="other"
),
]
).rename(columns={"id": "challenge_id"})
sub_types["name"] = pd.Categorical(
sub_types["name"],
categories=[
"prediction_file",
"container_image",
"notebook",
"mlcube",
"other",
],
)
sub_types = sub_types.sort_values(["challenge_id", "name"])
sub_types.index = np.arange(1, len(sub_types) + 1)

return (
challenges.rename(columns={"platform": "platform_id"}).drop(
columns=["_platform"]
),
incentives[["name", "challenge_id", "created_at"]],
sub_types[["name", "challenge_id", "created_at"]],
)


def get_challenge_categories(
wks: gspread.Worksheet, sheet_name: str = "challenge_category"
) -> pd.DataFrame:
"""Get challenge categories."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.rename(columns={"category": "name"})[["id", "challenge_id", "name"]]
)


def get_platform_data(
wks: gspread.Worksheet, sheet_name: str = "platforms"
) -> pd.DataFrame:
"""Get platform data and clean up as needed."""
platforms = (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.rename(columns={"avatar_url": "avatar_key"})
)
return platforms[platforms._public == "TRUE"][
["id", "slug", "name", "avatar_key", "website_url", "created_at", "updated_at"]
]


def get_organization_data(
wks: gspread.Worksheet, sheet_name: str = "organizations"
) -> pd.DataFrame:
"""Get organization data and clean up as needed."""
organizations = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
organizations = organizations[organizations._public == "TRUE"][
[
"id",
"name",
"login",
"avatar_url",
"website_url",
"description",
"challenge_count",
"created_at",
"updated_at",
"acronym",
]
]
organizations = _reformat_df_values(organizations)
organizations["description"] = (
organizations["description"]
.astype(str)
.apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
)
return organizations.rename(columns={"avatar_url": "avatar_key"})


def get_roles(
wks: gspread.Worksheet, sheet_name: str = "contribution_role"
) -> pd.DataFrame:
"""Get data on organization's role(s) in challenges."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.drop(["_challenge", "_organization"], axis=1)
)


def get_edam_annotations(
wks: gspread.Worksheet, sheet_name: str = "challenge_data"
) -> pd.DataFrame:
"""Get data on challenge's EDAM annotations."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.drop(["_challenge", "_edam_name"], axis=1)
.rename(columns={"edam_id": "edam_concept_id"})
)
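The incentive and submission-type handling in `get_challenge_data()` is a wide-to-long reshape: each boolean worksheet column becomes rows in a categorical `name` column. A toy run of the same pattern (fabricated ids and dates, two incentive columns only):

```python
import numpy as np
import pandas as pd

# Fabricated rows standing in for the "challenges" worksheet.
df = pd.DataFrame({
    "id": [101, 102],
    "created_at": ["2025-01-01", "2025-01-02"],
    "monetary_incentive": ["TRUE", "FALSE"],
    "publication_incentive": ["TRUE", "TRUE"],
})

incentives = pd.concat([
    df[df.monetary_incentive == "TRUE"][["id", "created_at"]].assign(name="monetary"),
    df[df.publication_incentive == "TRUE"][["id", "created_at"]].assign(name="publication"),
]).rename(columns={"id": "challenge_id"})

# Categorical dtype makes sort_values() order rows by incentive kind, not alphabetically.
incentives["name"] = pd.Categorical(
    incentives["name"],
    categories=["monetary", "publication", "speaking_engagement", "other"],
)
incentives = incentives.sort_values(["challenge_id", "name"])
incentives.index = np.arange(1, len(incentives) + 1)  # 1-based index, as in the source
print(incentives)  # rows: 101/monetary, 101/publication, 102/publication
```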
3 changes: 2 additions & 1 deletion apps/openchallenges/data-lambda/pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
   "gspread==6.1.4",
   "pandas==2.2.3",
   "numpy==2.1.0",
+  "pymysql==1.1.1",
 ]
 name = "openchallenges-data-lambda"
 version = "0.1.0"
@@ -24,4 +25,4 @@ prod = []
 test = []
 
 [tool.uv]
-default-groups = []
\ No newline at end of file
+default-groups = []
11 changes: 11 additions & 0 deletions apps/openchallenges/data-lambda/uv.lock