Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SNOW-103] Create a streamlit app template #68

Merged
merged 32 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
4fc91ad
Initial commit for streamlit app template
jaymedina Jul 19, 2024
4bf59d4
example_secrets.toml
jaymedina Jul 19, 2024
df7a308
refactor main app script
jaymedina Jul 19, 2024
619d2c3
update example_secrets.toml
jaymedina Jul 19, 2024
efad37b
New queries.py. Working with snowflake data. Moved out sample data.
jaymedina Jul 19, 2024
40693fd
New widgets.py
jaymedina Jul 20, 2024
ad77817
New utils.py. Some reformatting of app.py
jaymedina Jul 22, 2024
d61dc38
Fixing imports
jaymedina Jul 22, 2024
387ffb6
Turn queries.py vars into global vars. Finish appy.py
jaymedina Jul 22, 2024
41ce43c
New tests/ and toolkit/ folder. Moving files
jaymedina Jul 22, 2024
dbd08c2
New requirements.txt. Small updates to app.py
jaymedina Jul 22, 2024
bb6e076
New Dockerfile
jaymedina Jul 22, 2024
76e86cb
Updated requirements.txt
jaymedina Jul 23, 2024
865eba4
Updated Dockerfile to use specific server address and port
jaymedina Jul 24, 2024
63818e5
Create README.md
jaymedina Jul 24, 2024
957d67e
Add examples
jaymedina Jul 24, 2024
5c262b9
New .dockerignore to prevent secrets.toml from being added to docker …
jaymedina Jul 24, 2024
5836a7e
Update README.md
jaymedina Jul 24, 2024
9c03639
Add docker-compose file
jaymedina Jul 24, 2024
b2017e3
Adding steps to "Launch your app"
jaymedina Jul 24, 2024
11e6044
Update EC2 instructions
jaymedina Jul 24, 2024
7207fde
Update to build your app section
jaymedina Jul 25, 2024
e53376e
.
jaymedina Jul 31, 2024
748a814
New test suite. Updated documentation
jaymedina Aug 3, 2024
02bf472
Final updates to README
jaymedina Aug 3, 2024
3ecf2c7
Ignore pycache
jaymedina Aug 3, 2024
68cd091
Adding pre-commit hook for black and isort
jaymedina Aug 13, 2024
c741ba1
Introduce SYNID global var for queries.py
jaymedina Aug 13, 2024
a72097b
Updated example in README.md
jaymedina Aug 13, 2024
903a3df
Separate .gitignore
jaymedina Aug 13, 2024
36aaa59
Untracked __pycache__ directories as specified in .gitignore
jaymedina Aug 13, 2024
d96317e
Updated syntax in .gitignore
jaymedina Aug 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions streamlit_template/.pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Pre-commit hooks: keep imports sorted (isort) and code formatted (black).
# Versions are pinned to match requirements.txt (isort==5.13.2, black==24.3.0).
repos:
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
        name: isort (python)

  - repo: https://github.com/psf/black
    rev: 24.3.0
    hooks:
      - id: black
        language_version: python3
8 changes: 3 additions & 5 deletions streamlit_template/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,11 @@ QUERY_NUMBER_OF_FILES = """
select
count(*) as number_of_files
from
node_latest
synapse_data_warehouse.synapse.node_latest
where
project_id = '53214489'
and
node_type = 'file' // we want files, not folders or any other entity
and
annotations is not NULL;
node_type = 'file';
"""
```

Expand Down Expand Up @@ -114,7 +112,7 @@ as you see fit.
### 6. Dockerize your Application

- Update the `requirements.txt` file with the packages used in any of the scripts above.
- Ensure you have pushed all your changes to your fork of the repository that you are working in.
- Ensure you have pushed all your changes to your fork of the repository that you are working in (remember not to commit your `secrets.toml` file).
jaymedina marked this conversation as resolved.
Show resolved Hide resolved
- **_(Optional)_** You can choose to push a Docker image to the GitHub Container Registry to pull it directly from the container registry when ready to deploy.
For instructions on how to deploy your Docker image to the GitHub Container Registry, [see here](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry).

Expand Down
Empty file.
35 changes: 23 additions & 12 deletions streamlit_template/app.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import numpy as np
import streamlit as st

# Project-local helpers: SQL query strings, Snowflake access, and chart widgets.
# NOTE(review): the pre-isort import lines were left in alongside the formatted
# ones, producing duplicate imports — deduplicated here.
from toolkit.queries import (
    QUERY_ENTITY_DISTRIBUTION,
    QUERY_PROJECT_DOWNLOADS,
    QUERY_PROJECT_SIZES,
    QUERY_UNIQUE_USERS,
)
from toolkit.utils import get_data_from_snowflake
from toolkit.widgets import plot_download_sizes, plot_unique_users_trend

# Custom CSS for styling: inject style.css into the page once at startup.
# (The pre-black single-quoted version of this block was duplicated alongside
# the formatted one — only one read/inject is needed.)
with open("style.css") as f:
    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


def main():

Expand All @@ -21,14 +24,21 @@ def main():

# 2. Transform the data as needed
convert_to_gib = 1024 * 1024 * 1024
project_sizes = dict(PROJECT_ID=list(project_sizes_df['PROJECT_ID']), TOTAL_CONTENT_SIZE=list(project_sizes_df['TOTAL_CONTENT_SIZE']))
total_data_size = sum(project_sizes['TOTAL_CONTENT_SIZE']) #round(sum(project_sizes['TOTAL_CONTENT_SIZE']) / convert_to_gib, 2)
average_project_size = round(np.mean(project_sizes['TOTAL_CONTENT_SIZE']) / convert_to_gib, 2)
project_sizes = dict(
PROJECT_ID=list(project_sizes_df["PROJECT_ID"]),
TOTAL_CONTENT_SIZE=list(project_sizes_df["TOTAL_CONTENT_SIZE"]),
)
total_data_size = sum(
project_sizes["TOTAL_CONTENT_SIZE"]
) # round(sum(project_sizes['TOTAL_CONTENT_SIZE']) / convert_to_gib, 2)
average_project_size = round(
np.mean(project_sizes["TOTAL_CONTENT_SIZE"]) / convert_to_gib, 2
)

# 3. Format the app, and visualize the data with your widgets in widgets.py
# -------------------------------------------------------------------------
# Row 1 -------------------------------------------------------------------
st.markdown('### Monthly Overview :calendar:')
st.markdown("### Monthly Overview :calendar:")
col1, col2, col3 = st.columns([1, 1, 1])
col1.metric("Total Storage Occupied", f"{total_data_size} GB", "7.2 GB")
col2.metric("Avg. Project Size", f"{average_project_size} GB", "8.0 GB")
Expand All @@ -45,5 +55,6 @@ def main():
st.markdown("### Entity Trends :pencil:")
st.dataframe(entity_distribution_df)


if __name__ == "__main__":
    # Run the app exactly once; the duplicated call (old line retained next to
    # the reformatted one) would have executed main() twice.
    main()
3 changes: 3 additions & 0 deletions streamlit_template/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
black==24.3.0
isort==5.13.2
numpy==1.26.3
streamlit==1.36.0
pandas==2.2.2
plotly==5.22.0
pytest==8.3.2
pre-commit==3.6.0
snowflake-connector-python==3.9.1
snowflake-snowpark-python==1.15.0
Empty file.
10 changes: 8 additions & 2 deletions streamlit_template/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,18 @@
from streamlit.testing.v1 import AppTest

# Ensure that the base directory is in PYTHONPATH so ``toolkit`` and other
# project modules can be imported by the app under test.
# (The pre-format duplicate of this append was retained in the diff — the
# path only needs to be added once.)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

# Timeout (in seconds) to wait for the app to load before shutdown.
DEFAULT_TIMEOUT = 30


@pytest.fixture(scope="module")
def app():
    """Load and run the main Streamlit app once per test module.

    Returns the executed ``AppTest`` instance so individual tests can
    inspect rendered elements. The old single-line return was left in
    above the reformatted one, making the latter unreachable — removed.
    """
    # Point to your main Streamlit app file
    return AppTest.from_file(
        "app.py", default_timeout=DEFAULT_TIMEOUT
    ).run()


def test_monthly_overview(app):
"""
Expand All @@ -44,6 +48,7 @@ def test_monthly_overview(app):
assert avg_project_size.label == "Avg. Project Size"
assert annual_cost.label == "Annual Cost"


def test_plotly_charts(app):
"""Ensure both plotly charts are being displayed."""

Expand All @@ -52,6 +57,7 @@ def test_plotly_charts(app):
assert plotly_charts is not None
assert len(plotly_charts) == 2


def test_dataframe(app):
"""Ensure that the dataframe is being displayed."""

Expand Down
2 changes: 1 addition & 1 deletion streamlit_template/toolkit/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# Package initializer. The diff retained both the old (no trailing newline)
# and new copies of this line — a single import suffices.
# NOTE(review): importing ``toolkit`` from within toolkit/__init__.py is a
# self-import; confirm whether this line is needed at all.
import toolkit
7 changes: 4 additions & 3 deletions streamlit_template/toolkit/queries.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
QUERY_ENTITY_DISTRIBUTION = """
SYNID = 20446927
QUERY_ENTITY_DISTRIBUTION = f"""
with htan_projects as (
// select distinct cast(replace(NF.projectid, 'syn', '') as INTEGER) as project_id from sage.portal_raw.HTAN
select
Expand All @@ -7,7 +8,7 @@
synapse_data_warehouse.synapse.node_latest,
lateral flatten(input => node_latest.scope_ids) scopes
where
id = 20446927
id = {SYNID}
)
SELECT
node_type,
Expand Down Expand Up @@ -163,4 +164,4 @@
ORDER BY
project_id,
access_month;
"""
"""
4 changes: 3 additions & 1 deletion streamlit_template/toolkit/utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import streamlit as st
from snowflake.snowpark import Session


@st.cache_resource
def connect_to_snowflake():
    """Build a Snowpark session from the credentials in Streamlit secrets.

    ``st.cache_resource`` caches the session object, so all callers share
    a single connection across Streamlit reruns.
    """
    return Session.builder.configs(st.secrets.snowflake).create()


@st.cache_data
def get_data_from_snowflake(query=""):
    """Execute *query* against Snowflake and return the result.

    Parameters
    ----------
    query : str
        SQL text to run via the cached Snowpark session.

    Returns
    -------
    pandas.DataFrame
        The query result converted with ``to_pandas()``; results are
        cached by ``st.cache_data`` keyed on the query text.
    """
    session = connect_to_snowflake()
    node_latest = session.sql(query).to_pandas()
    # The diff retained a duplicated, unreachable second ``return`` — removed.
    return node_latest
Loading