feat(openchallenges): update data in the local OC database #2968

Merged: 33 commits into main on Feb 21, 2025

Commits
be8f6e3
update project readme
vpchung Jan 16, 2025
712cc7d
update `statusCode` to return appropriate code
vpchung Jan 16, 2025
e0470f1
extract data pull functions into a separate module
vpchung Jan 16, 2025
6772f63
add default "" value for PRIVATE_KEY to allow for encoding
vpchung Jan 16, 2025
6399f7c
comment out data pulls from non-platform worksheets (for now)
vpchung Jan 16, 2025
10e56c1
reorder variables for consistency with other code blocks
vpchung Jan 16, 2025
14f4a67
Merge branch 'main' into CHALLENGE-585
vpchung Feb 5, 2025
1924ee9
Merge branch 'main' into CHALLENGE-585
vpchung Feb 11, 2025
00d32a5
add mariadb creds to .env.example
vpchung Feb 13, 2025
84fa6e4
update paths in Dockerfile
vpchung Feb 13, 2025
72874e8
add mariadb dependency
vpchung Feb 13, 2025
f6ddb0d
add connection to mariadb
vpchung Feb 14, 2025
136c56e
add functions for accessing and updating db tables
vpchung Feb 14, 2025
7cdbe82
reformat; remove commented out lines
vpchung Feb 14, 2025
1060147
add remaining table updates
vpchung Feb 17, 2025
46ec674
update error logging message
vpchung Feb 17, 2025
5e3e74b
replace \\N with None; rename select columns
vpchung Feb 17, 2025
fe46e6b
reassign `platform_id` value after reformatting to prevent float down…
vpchung Feb 18, 2025
2542a76
fix exception handling
vpchung Feb 18, 2025
a9a61f2
add data insertions for `organization_service` tables
vpchung Feb 18, 2025
8e4041d
extract db functions into separate module
vpchung Feb 18, 2025
6f094d2
add type hinting
vpchung Feb 18, 2025
fe29366
update Dockerfile to install MariaDB Connector/C
vpchung Feb 19, 2025
77f82e1
lint Dockerfile
vpchung Feb 19, 2025
69b0b29
Revert "update Dockerfile to install MariaDB Connector/C"
vpchung Feb 20, 2025
8df96d1
replace `mariadb` with `PyMySql`
vpchung Feb 20, 2025
e164ef1
replace `UPDATE_ME` with `CHANGEME` for consistency
vpchung Feb 20, 2025
aa24d61
update docstring for `lambda_handler()`
vpchung Feb 21, 2025
261483f
add type hinting to `oc_data_sheet` module
vpchung Feb 21, 2025
8dcd150
add logic to remove extraneous rows between `challenge_contribution` …
vpchung Feb 21, 2025
952ecf2
update comment
vpchung Feb 21, 2025
bceb261
restrict PyMySql version to 1.1.1 for project
vpchung Feb 21, 2025
0a4104f
add logging message to explain why rows are being removed
vpchung Feb 21, 2025
19 changes: 13 additions & 6 deletions apps/openchallenges/data-lambda/.env.example
@@ -1,11 +1,18 @@
 # Google Service Account credentials
 TYPE="service_account"
-PROJECT_ID="UPDATE_ME"
-PRIVATE_KEY_ID="UPDATE_ME"
-PRIVATE_KEY="UPDATE_ME"
-CLIENT_EMAIL="UPDATE_ME"
-CLIENT_ID="UPDATE_ME"
+PROJECT_ID="CHANGEME"
+PRIVATE_KEY_ID="CHANGEME"
+PRIVATE_KEY="CHANGEME"
+CLIENT_EMAIL="CHANGEME"
+CLIENT_ID="CHANGEME"
 AUTH_URI="https://accounts.google.com/o/oauth2/auth"
 TOKEN_URI="https://oauth2.googleapis.com/token"
 AUTH_PROVIDER_X509_CERT_URL="https://www.googleapis.com/oauth2/v1/certs"
-CLIENT_X509_CERT_URL="UPDATE_ME"
+CLIENT_X509_CERT_URL="CHANGEME"
 UNIVERSE_DOMAIN="googleapis.com"
+
+# MariaDB credentials
+MARIADB_HOST=openchallenges-mariadb
+MARIADB_USER=maria
+MARIADB_PASSWORD=changeme
+MARIADB_PORT=3306
5 changes: 3 additions & 2 deletions apps/openchallenges/data-lambda/Dockerfile
@@ -29,6 +29,7 @@ FROM public.ecr.aws/lambda/python:3.13
COPY --from=builder ${LAMBDA_TASK_ROOT} ${LAMBDA_TASK_ROOT}

# Copy the application code.
COPY ./openchallenges_data_lambda ${LAMBDA_TASK_ROOT}/app
COPY ./openchallenges_data_lambda ${LAMBDA_TASK_ROOT}

CMD ["app.lambda_handler"]

CMD ["app.app.lambda_handler"]
16 changes: 16 additions & 0 deletions apps/openchallenges/data-lambda/README.md
@@ -12,6 +12,22 @@ nx build openchallenges-data-lambda
 nx build-image openchallenges-data-lambda
 ```
 
+## Update .env with credentials to utilize Google Sheets API
+
+Before running the Lambda function locally (see next section), update the `.env` file and replace
+all "CHANGEME" values with real credentials.
+
+Failing to update `.env` will result in the following output during invocation:
+
+```json
+{
+  "statusCode": 401,
+  "body": {
+    "message": "Private key not found in the credentials file. Please try again."
+  }
+}
+```
+
 ## Start the Lambda function locally with Docker Compose
 
 Starts the Lambda function in the foreground, allowing you to view logs and interact with it
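For reference, a minimal sketch of invoking the locally running function, assuming the Docker Compose setup maps the AWS Lambda Runtime Interface Emulator to port 9000 (the emulator's conventional mapping; the actual port is defined in the project's compose file, not confirmed by this diff):

```python
# Hypothetical local invocation; the 9000 port mapping is an assumption.
import json
import urllib.request

url = "http://localhost:9000/2015-03-31/functions/function/invocations"
req = urllib.request.Request(url, data=json.dumps({}).encode(), method="POST")
with urllib.request.urlopen(req) as resp:
    # With an unedited .env, this prints the 401 payload shown in the README.
    print(json.loads(resp.read()))
```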
329 changes: 108 additions & 221 deletions apps/openchallenges/data-lambda/openchallenges_data_lambda/app.py
@@ -1,17 +1,43 @@
import os
import json
import logging

import gspread
import numpy as np
import pandas as pd

import db_utils
import oc_data_sheet


GOOGLE_SHEET_CREDENTIALS_FILE = "service_account.json"
GOOGLE_SHEET_TITLE = "OpenChallenges Data"


def lambda_handler(event, context):
"""Sample pure Lambda function
def write_credentials_file(output_json):
"""Write credentials JSON file for Google Sheets API authentication."""
with open(output_json, "w") as out:
credentials = {
"type": os.getenv("TYPE"),
"project_id": os.getenv("PROJECT_ID"),
"private_key_id": os.getenv("PRIVATE_KEY_ID"),
"private_key": os.getenv("PRIVATE_KEY", "")
.encode()
.decode("unicode_escape"),
"client_email": os.getenv("CLIENT_EMAIL"),
"client_id": os.getenv("CLIENT_ID"),
"auth_uri": os.getenv("AUTH_URI"),
"token_uri": os.getenv("TOKEN_URI"),
"auth_provider_x509_cert_url": os.getenv("AUTH_PROVIDER_X509_CERT_URL"),
"client_x509_cert_url": os.getenv("CLIENT_X509_CERT_URL"),
"universe_domain": os.getenv("UNIVERSE_DOMAIN"),
}
out.write(json.dumps(credentials))
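Aside: the `.encode().decode("unicode_escape")` step above is needed because multi-line PEM keys usually reach the environment with literal `\n` sequences rather than real newlines. A small self-contained illustration (toy key string, not a real credential):

```python
# Toy illustration: turn literal "\n" sequences from an env var into newlines.
raw = "-----BEGIN PRIVATE KEY-----\\nMIIE...\\n-----END PRIVATE KEY-----"
fixed = raw.encode().decode("unicode_escape")
print(fixed.count("\n"))  # 2 -> the key is now a proper multi-line string
```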


def lambda_handler(event, context) -> dict:
"""Main function.
Pulls data from the OC Data Google sheet (https://shorturl.at/pf3Mr) and "syncs" the data
to the OpenChallenges database.
Parameters
----------
@@ -37,240 +63,101 @@ def lambda_handler(event, context):
try:
google_client = gspread.service_account(filename=GOOGLE_SHEET_CREDENTIALS_FILE)
except Exception as err:
status_code = 401
message = "Private key not found in the credentials file. Please try again."
else:
try:
wks = google_client.open(GOOGLE_SHEET_TITLE)

platforms = get_platform_data(wks)
print(platforms.head())

roles = get_roles(wks)
print(roles.head())

categories = get_challenge_categories(wks)
print(categories.head())

organizations = get_organization_data(wks)
print(organizations.head())

edam_data_annotations = get_edam_annotations(wks)
print(edam_data_annotations.head())

challenges, incentives, sub_types = get_challenge_data(wks)
print(challenges.head())
print(incentives.head())
print(sub_types.head())

message = "Data successfully pulled from OC Data google sheet."
platforms = oc_data_sheet.get_platform_data(wks)
platforms["avatar_url"] = (
"" # FIXME: table has this column for some reason?
)

roles = oc_data_sheet.get_roles(wks)
categories = oc_data_sheet.get_challenge_categories(wks)
organizations = oc_data_sheet.get_organization_data(wks)
edam_data_annotations = oc_data_sheet.get_edam_annotations(wks)
challenges, incentives, sub_types = oc_data_sheet.get_challenge_data(wks)
except Exception as err:
status_code = 400
message = f"Something went wrong with pulling the data: {err}."

data = {"message": message}
return {
"statusCode": 200,
"body": json.dumps(data),
}


def write_credentials_file(output_json):
"""Write credentials JSON file for Google Sheets API authentication."""
with open(output_json, "w") as out:
credentials = {
"type": os.getenv("TYPE"),
"project_id": os.getenv("PROJECT_ID"),
"private_key_id": os.getenv("PRIVATE_KEY_ID"),
"private_key": os.getenv("PRIVATE_KEY").encode().decode("unicode_escape"),
"client_email": os.getenv("CLIENT_EMAIL"),
"client_id": os.getenv("CLIENT_ID"),
"auth_uri": os.getenv("AUTH_URI"),
"token_uri": os.getenv("TOKEN_URI"),
"auth_provider_x509_cert_url": os.getenv("AUTH_PROVIDER_X509_CERT_URL"),
"client_x509_cert_url": os.getenv("CLIENT_X509_CERT_URL"),
"universe_domain": os.getenv("UNIVERSE_DOMAIN"),
}
out.write(json.dumps(credentials))


def get_challenge_data(wks, sheet_name="challenges"):
"""Get challenges data and clean up as needed.
Output:
- challenges
- challenge incentives
- challenge submission types
"""
df = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
df.loc[df._platform == "Other", "platform"] = "\\N"

challenges = df[
[
"id",
"slug",
"name",
"headline",
"description",
"avatar_url",
"website_url",
"status",
"platform",
"doi",
"start_date",
"end_date",
"operation_id",
"created_at",
"updated_at",
]
]
challenges = (
challenges.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
.replace(r"\n", " ", regex=True)
.replace("'", "''")
.replace("\u2019", "''", regex=True) # replace curly right-quote
.replace("\u202f", " ", regex=True) # replace narrow no-break space
.replace("\u2060", "", regex=True) # remove word joiner
)
challenges["headline"] = (
challenges["headline"]
.astype(str)
.apply(lambda x: x[:76] + "..." if len(x) > 80 else x)
)
challenges["description"] = (
challenges["description"]
.astype(str)
.apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
)
challenges.loc[challenges.start_date == "", "start_date"] = "\\N"
challenges.loc[challenges.end_date == "", "end_date"] = "\\N"
challenges.loc[challenges.operation_id == "", "operation_id"] = "\\N"

incentives = pd.concat(
[
df[df.monetary_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="monetary"
),
df[df.publication_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="publication"
),
df[df.speaking_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="speaking_engagement"
),
df[df.other_incentive == "TRUE"][["id", "created_at"]].assign(
incentives="other"
),
]
).rename(columns={"id": "challenge_id"})
incentives["incentives"] = pd.Categorical(
incentives["incentives"],
categories=["monetary", "publication", "speaking_engagement", "other"],
)
incentives = incentives.sort_values(["challenge_id", "incentives"])
incentives.index = np.arange(1, len(incentives) + 1)

sub_types = pd.concat(
[
df[df.file_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="prediction_file"
),
df[df.container_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="container_image"
),
df[df.notebook_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="notebook"
),
df[df.mlcube_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="mlcube"
),
df[df.other_submission == "TRUE"][["id", "created_at"]].assign(
submission_types="other"
),
]
).rename(columns={"id": "challenge_id"})
sub_types["submission_types"] = pd.Categorical(
sub_types["submission_types"],
categories=[
"prediction_file",
"container_image",
"notebook",
"mlcube",
"other",
# output logs to stdout and logfile
logging.basicConfig(
level=logging.DEBUG,
format="%(levelname)s | %(asctime)s | %(message)s",
handlers=[
logging.FileHandler("oc_database_update.log"),
logging.StreamHandler(),
],
)
sub_types = sub_types.sort_values(["challenge_id", "submission_types"])
sub_types.index = np.arange(1, len(sub_types) + 1)

return (
challenges,
incentives[["incentives", "challenge_id", "created_at"]],
sub_types[["submission_types", "challenge_id", "created_at"]],
)


def get_challenge_categories(wks, sheet_name="challenge_category"):
"""Get challenge categories."""
return pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")[
["id", "challenge_id", "category"]
]


def get_platform_data(wks, sheet_name="platforms"):
"""Get platform data and clean up as needed."""
platforms = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
return platforms[platforms._public == "TRUE"][
["id", "slug", "name", "avatar_url", "website_url", "created_at", "updated_at"]
]


def get_organization_data(wks, sheet_name="organizations"):
"""Get organization data and clean up as needed."""
organizations = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
organizations = organizations[organizations._public == "TRUE"][
[
"id",
"name",
"login",
"avatar_url",
"website_url",
"description",
"challenge_count",
"created_at",
"updated_at",
"acronym",
]
]
organizations = (
organizations.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
.replace(r"\n", " ", regex=True)
.replace("'", "''")
.replace("\u2019", "''", regex=True) # replace curly right-quote
.replace("\u202f", " ", regex=True) # replace narrow no-break space
.replace("\u2060", "", regex=True) # remove word joiner
# Update `challenge_service` tables
conn = db_utils.connect_to_db()
db_utils.update_table(conn, table_name="challenge_platform", data=platforms)
db_utils.update_table(conn, table_name="challenge", data=challenges)
db_utils.update_table(conn, table_name="challenge_contribution", data=roles)
db_utils.update_table(conn, table_name="challenge_incentive", data=incentives)
db_utils.update_table(conn, table_name="challenge_submission_type", data=sub_types)
db_utils.update_table(
conn, table_name="challenge_input_data_type", data=edam_data_annotations
)
organizations["description"] = (
organizations["description"]
.astype(str)
.apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
db_utils.update_table(conn, table_name="challenge_category", data=categories)

# Get the newly-updated `challenge_contribution` table before closing the connection for
# later comparison.
challenge_service_roles = db_utils.get_table(conn, "challenge_contribution")
conn.close()

# Update `organization_service` tables
conn = db_utils.connect_to_db("organization_service")
db_utils.update_table(conn, table_name="organization", data=organizations)
db_utils.update_table(conn, table_name="challenge_contribution", data=roles)
organization_service_roles = db_utils.get_table(conn, "challenge_contribution")
conn.close()

# Identify rows that differ between the two tables, and remove as needed.
challenge_service_ids = set(challenge_service_roles["id"].tolist())
organization_service_ids = set(organization_service_roles["id"].tolist())
rows_to_remove_from_challenge_service = list(
challenge_service_ids - organization_service_ids
)
return organizations


def get_roles(wks, sheet_name="contribution_role"):
"""Get data on organization's role(s) in challenges."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.drop(["_challenge", "_organization"], axis=1)
rows_to_remove_from_organization_service = list(
organization_service_ids - challenge_service_ids
)

if rows_to_remove_from_challenge_service:
logging.warning(
f"Mismatch found in the `challenge_contribution` tables; removing extra rows from "
"the `challenge_service` database"
)
conn = db_utils.connect_to_db()
db_utils.delete_rows_by_id(
conn, "challenge_contribution", rows_to_remove_from_challenge_service
)
conn.close()

if rows_to_remove_from_organization_service:
logging.warning(
f"Mismatch found in the `challenge_contribution` tables; removing extra rows from "
"the `organization_service` database"
)
conn = db_utils.connect_to_db("organization_service")
db_utils.delete_rows_by_id(
conn, "challenge_contribution", rows_to_remove_from_organization_service
)
conn.close()

logging.info("FIN. ✅")
status_code = 200
message = "Data from the OC Data Sheet successfully added to the database."

def get_edam_annotations(wks, sheet_name="challenge_data"):
"""Get data on challenge's EDAM annotations."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.drop(["_challenge", "_edam_name"], axis=1)
)
data = {"message": message}
return {
"statusCode": status_code,
"body": json.dumps(data),
}


if __name__ == "__main__":
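The reconciliation step in `lambda_handler()` reduces to a set difference over the `id` columns of the two `challenge_contribution` tables. A standalone sketch of the same logic, with fabricated ids:

```python
# Toy illustration of the reconciliation: ids present in one service's
# `challenge_contribution` table but not the other are slated for deletion.
import pandas as pd

challenge_service_roles = pd.DataFrame({"id": [1, 2, 3, 4]})
organization_service_roles = pd.DataFrame({"id": [2, 3, 4, 5]})

challenge_service_ids = set(challenge_service_roles["id"].tolist())
organization_service_ids = set(organization_service_roles["id"].tolist())

print(sorted(challenge_service_ids - organization_service_ids))  # [1] -> remove from challenge_service
print(sorted(organization_service_ids - challenge_service_ids))  # [5] -> remove from organization_service
```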
106 changes: 106 additions & 0 deletions apps/openchallenges/data-lambda/openchallenges_data_lambda/db_utils.py
@@ -0,0 +1,106 @@
import os
import sys
import logging

import pymysql
import pymysql.cursors
import pandas as pd


def connect_to_db(db: str = "challenge_service") -> pymysql.Connection:
"""Establishes connection to the MariaDB database."""
credentials = {
"host": os.getenv("MARIADB_HOST"),
"port": int(os.getenv("MARIADB_PORT", 3306)),
"user": os.getenv("MARIADB_USER"),
"password": os.getenv("MARIADB_PASSWORD"),
"database": db,
"cursorclass": pymysql.cursors.DictCursor,
}
try:
conn = pymysql.connect(**credentials)
logging.info(f"Connected to `{db}` database")
return conn
except pymysql.Error as err:
logging.error(f"Error connecting to the database: {err}")
sys.exit(1)


def get_table(conn: pymysql.Connection, table_name: str) -> pd.DataFrame:
"""Returns all records from the specified table."""
query = f"SELECT * FROM {table_name}"
try:
with conn.cursor() as cursor:
cursor.execute(query)
records = cursor.fetchall()
colnames = [val[0] for val in cursor.description]
return pd.DataFrame(records, columns=colnames)
except pymysql.Error as err:
logging.error(f"Error executing query: {err}")
return pd.DataFrame()


def truncate_table(conn: pymysql.Connection, table_name: str):
"""Deletes all rows from the specified table.
Temporarily disables foreign key checks for this operation.
"""
logging.info(f"Truncating table `{table_name}`")
try:
with conn.cursor() as cursor:
cursor.execute("SET FOREIGN_KEY_CHECKS = 0")
cursor.execute(f"TRUNCATE TABLE {table_name}")
cursor.execute("SET FOREIGN_KEY_CHECKS = 1")
conn.commit() # Save changes made to table.
except pymysql.Error as err:
logging.error(f"Error truncating: {err}")
conn.rollback() # Revert any changes made to data.


def delete_rows_by_id(conn: pymysql.Connection, table_name: str, row_ids: list):
"""Delete rows from the specified table, one row at a time."""
logging.info(f"Removing rows from `{table_name}`")
with conn.cursor() as cursor:
query = f"DELETE FROM {table_name} WHERE id = %s"
for row_id in row_ids:
try:
cursor.execute(query, (row_id,))
conn.commit()
logging.info(f" → Removed row {row_id}")
except pymysql.Error as err:
logging.error(f" → Error removing row {row_id}: {err}")
conn.rollback()


def insert_data(conn: pymysql.Connection, table_name: str, data_df: pd.DataFrame):
"""Adds data to the specified table, one row at a time.
This iterative approach allows for logging invalid rows for later review.
"""
logging.info(f"Adding data to table `{table_name}`")
with conn.cursor() as cursor:
for _, row in data_df.iterrows():
colnames = ", ".join(row.index)
placeholders = ", ".join(["%s"] * len(row))
query = f"INSERT INTO {table_name} ({colnames}) VALUES ({placeholders})"
try:
cursor.execute(query, tuple(row))
conn.commit()
except (pymysql.IntegrityError, pymysql.DataError) as err:
id_colname = "id" if row.get("id") else "challenge_id"
id_value = row.get("id", row.get("challenge_id"))
logging.error(
f"Invalid row to table `{table_name}`\n"
+ f" → {id_colname} in Google Sheet: {id_value}\n"
+ f" → Error: {err}"
)
conn.rollback()
except pymysql.Error as err:
logging.error(f"Error adding row to table `{table_name}`: {err}")
conn.rollback()


def update_table(conn: pymysql.Connection, table_name: str, data: pd.DataFrame):
"""Updates the specified table."""
truncate_table(conn, table_name)
insert_data(conn, table_name, data)
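A minimal usage sketch of these helpers, assuming the MariaDB container from Docker Compose is running and the `MARIADB_*` variables from `.env.example` are set. The example row and its columns are illustrative guesses at the `challenge_platform` schema, not confirmed by this diff:

```python
import pandas as pd

import db_utils

# Illustrative data only; column names follow the diff above but are assumptions.
platforms = pd.DataFrame(
    [{"id": 1, "slug": "synapse", "name": "Synapse", "avatar_key": "",
      "website_url": "https://www.synapse.org", "created_at": "2025-01-01 00:00:00",
      "updated_at": "2025-01-01 00:00:00"}]
)

conn = db_utils.connect_to_db()  # defaults to the `challenge_service` database
db_utils.update_table(conn, table_name="challenge_platform", data=platforms)  # truncate, then insert
print(db_utils.get_table(conn, "challenge_platform").head())
conn.close()
```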
202 changes: 202 additions & 0 deletions apps/openchallenges/data-lambda/openchallenges_data_lambda/oc_data_sheet.py
@@ -0,0 +1,202 @@
import gspread
import numpy as np
import pandas as pd


def _reformat_df_values(df: pd.DataFrame) -> pd.DataFrame:
df = (
df.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
.replace(r"\n", " ", regex=True)
.replace("'", "''")
.replace("\u2019", "''", regex=True) # replace curly right-quote
.replace("\u202f", " ", regex=True) # replace narrow no-break space
.replace("\u2060", "", regex=True) # remove word joiner
)
return df


def get_challenge_data(wks: gspread.Worksheet, sheet_name: str = "challenges") -> tuple:
"""Get challenges data and clean up as needed.
Output:
- challenges
- challenge incentives
- challenge submission types
"""
df = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")

# Challenges
challenges = df[
[
"id",
"slug",
"name",
"headline",
"description",
"avatar_url",
"website_url",
"status",
"_platform",
"platform",
"doi",
"start_date",
"end_date",
"operation_id",
"created_at",
"updated_at",
]
]
challenges = _reformat_df_values(challenges)
challenges["headline"] = (
challenges["headline"]
.astype(str)
.apply(lambda x: x[:75] + "..." if len(x) > 80 else x)
)
challenges["description"] = (
challenges["description"]
.astype(str)
.apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
)
challenges.loc[challenges._platform == "Other", "platform"] = None
challenges.loc[challenges.start_date == "", "start_date"] = None
challenges.loc[challenges.end_date == "", "end_date"] = None
challenges.loc[challenges.operation_id == "", "operation_id"] = None

# Challenge incentive(s)
incentives = pd.concat(
[
df[df.monetary_incentive == "TRUE"][["id", "created_at"]].assign(
name="monetary"
),
df[df.publication_incentive == "TRUE"][["id", "created_at"]].assign(
name="publication"
),
df[df.speaking_incentive == "TRUE"][["id", "created_at"]].assign(
name="speaking_engagement"
),
df[df.other_incentive == "TRUE"][["id", "created_at"]].assign(name="other"),
]
).rename(columns={"id": "challenge_id"})
incentives["name"] = pd.Categorical(
incentives["name"],
categories=["monetary", "publication", "speaking_engagement", "other"],
)
incentives = incentives.sort_values(["challenge_id", "name"])
incentives.index = np.arange(1, len(incentives) + 1)

# Challenge submission type(s)
sub_types = pd.concat(
[
df[df.file_submission == "TRUE"][["id", "created_at"]].assign(
name="prediction_file"
),
df[df.container_submission == "TRUE"][["id", "created_at"]].assign(
name="container_image"
),
df[df.notebook_submission == "TRUE"][["id", "created_at"]].assign(
name="notebook"
),
df[df.mlcube_submission == "TRUE"][["id", "created_at"]].assign(
name="mlcube"
),
df[df.other_submission == "TRUE"][["id", "created_at"]].assign(
name="other"
),
]
).rename(columns={"id": "challenge_id"})
sub_types["name"] = pd.Categorical(
sub_types["name"],
categories=[
"prediction_file",
"container_image",
"notebook",
"mlcube",
"other",
],
)
sub_types = sub_types.sort_values(["challenge_id", "name"])
sub_types.index = np.arange(1, len(sub_types) + 1)

return (
challenges.rename(columns={"platform": "platform_id"}).drop(
columns=["_platform"]
),
incentives[["name", "challenge_id", "created_at"]],
sub_types[["name", "challenge_id", "created_at"]],
)


def get_challenge_categories(
wks: gspread.Worksheet, sheet_name: str = "challenge_category"
) -> pd.DataFrame:
"""Get challenge categories."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.rename(columns={"category": "name"})[["id", "challenge_id", "name"]]
)


def get_platform_data(
wks: gspread.Worksheet, sheet_name: str = "platforms"
) -> pd.DataFrame:
"""Get platform data and clean up as needed."""
platforms = (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.rename(columns={"avatar_url": "avatar_key"})
)
return platforms[platforms._public == "TRUE"][
["id", "slug", "name", "avatar_key", "website_url", "created_at", "updated_at"]
]


def get_organization_data(
wks: gspread.Worksheet, sheet_name: str = "organizations"
) -> pd.DataFrame:
"""Get organization data and clean up as needed."""
organizations = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
organizations = organizations[organizations._public == "TRUE"][
[
"id",
"name",
"login",
"avatar_url",
"website_url",
"description",
"challenge_count",
"created_at",
"updated_at",
"acronym",
]
]
organizations = _reformat_df_values(organizations)
organizations["description"] = (
organizations["description"]
.astype(str)
.apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
)
return organizations.rename(columns={"avatar_url": "avatar_key"})


def get_roles(
wks: gspread.Worksheet, sheet_name: str = "contribution_role"
) -> pd.DataFrame:
"""Get data on organization's role(s) in challenges."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.drop(["_challenge", "_organization"], axis=1)
)


def get_edam_annotations(
wks: gspread.Worksheet, sheet_name: str = "challenge_data"
) -> pd.DataFrame:
"""Get data on challenge's EDAM annotations."""
return (
pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
.fillna("")
.drop(["_challenge", "_edam_name"], axis=1)
.rename(columns={"edam_id": "edam_concept_id"})
)
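The incentive and submission-type handling in `get_challenge_data()` is a wide-to-long reshape: each boolean worksheet column becomes rows in a categorical `name` column. A toy run of the same pattern (fabricated ids and dates, two incentive columns only):

```python
import numpy as np
import pandas as pd

# Fabricated rows standing in for the "challenges" worksheet.
df = pd.DataFrame({
    "id": [101, 102],
    "created_at": ["2025-01-01", "2025-01-02"],
    "monetary_incentive": ["TRUE", "FALSE"],
    "publication_incentive": ["TRUE", "TRUE"],
})

incentives = pd.concat([
    df[df.monetary_incentive == "TRUE"][["id", "created_at"]].assign(name="monetary"),
    df[df.publication_incentive == "TRUE"][["id", "created_at"]].assign(name="publication"),
]).rename(columns={"id": "challenge_id"})

# Categorical dtype makes sort_values() order rows by incentive kind, not alphabetically.
incentives["name"] = pd.Categorical(
    incentives["name"],
    categories=["monetary", "publication", "speaking_engagement", "other"],
)
incentives = incentives.sort_values(["challenge_id", "name"])
incentives.index = np.arange(1, len(incentives) + 1)  # 1-based index, as in the source
print(incentives)  # rows: 101/monetary, 101/publication, 102/publication
```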
3 changes: 2 additions & 1 deletion apps/openchallenges/data-lambda/pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
   "gspread==6.1.4",
   "pandas==2.2.3",
   "numpy==2.1.0",
+  "pymysql==1.1.1",
 ]
 name = "openchallenges-data-lambda"
 version = "0.1.0"
@@ -24,4 +25,4 @@ prod = []
 test = []
 
 [tool.uv]
-default-groups = []
\ No newline at end of file
+default-groups = []
11 changes: 11 additions & 0 deletions apps/openchallenges/data-lambda/uv.lock