Merge pull request #3 from abkfenris/pixi-dev-tools
Packaging cleanup
marceloandrioni committed Sep 2, 2024
2 parents 6342938 + 4e99fcd commit 7b64ff5
Showing 14 changed files with 591 additions and 355 deletions.
30 changes: 14 additions & 16 deletions .pre-commit-config.yaml
@@ -5,12 +5,6 @@ ci:
exclude: ^.cruft.json|.copier-answers.yml$

repos:
  - repo: https://github.com/adamchainz/blacken-docs
    rev: "1.18.0"
    hooks:
      - id: blacken-docs
        additional_dependencies: [black==24.*]

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: "v4.6.0"
    hooks:
@@ -42,25 +36,28 @@ repos:
        args: [--prose-wrap=always]

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: "v0.6.1"
    rev: "v0.6.3"
    hooks:
      - id: ruff
        args: ["--fix", "--show-fixes"]
        exclude_types: [jupyter]
      - id: ruff-format
        exclude_types: [jupyter]

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: "v1.11.1"
    hooks:
      - id: mypy
        files: src|tests
        args: []
        additional_dependencies:
          - pytest
  # - repo: https://github.com/pre-commit/mirrors-mypy
  #   rev: "v1.11.2"
  #   hooks:
  #     - id: mypy
  #       files: src|tests
  #       args: []
  #       additional_dependencies:
  #         - pytest

  - repo: https://github.com/codespell-project/codespell
    rev: "v2.3.0"
    hooks:
      - id: codespell
        exclude_types: [jupyter]

  - repo: https://github.com/shellcheck-py/shellcheck-py
    rev: "v0.10.0.1"
@@ -74,6 +71,7 @@ repos:
        language: pygrep
        entry: PyBind|Numpy|Cmake|CCache|Github|PyTest
        exclude: .pre-commit-config.yaml
        exclude_types: [jupyter]

  - repo: https://github.com/abravalheri/validate-pyproject
    rev: "v0.19"
@@ -82,7 +80,7 @@ repos:
        additional_dependencies: ["validate-pyproject-schema-store[all]"]

  - repo: https://github.com/python-jsonschema/check-jsonschema
    rev: "0.29.1"
    rev: "0.29.2"
    hooks:
      - id: check-dependabot
      - id: check-github-workflows
42 changes: 26 additions & 16 deletions README.md
@@ -1,38 +1,49 @@
# XArray load_by_step: An accessor for loading huge files from a THREDDS server
# XArray load_by_step: An accessor for loading huge files from a THREDDS server

OceanHackWeek 2024 Project done by [Marcelo Andrioni](https://github.com/marceloandrioni) and [João Pedro Amorim](https://github.com/joaopedroamorim) in which the main goal was to develop a xarray function capable of loading huge amounts of data accessed trough a THREDDS server.
OceanHackWeek 2024 Project done by
[Marcelo Andrioni](https://github.com/marceloandrioni) and
[João Pedro Amorim](https://github.com/joaopedroamorim) in which the main goal
was to develop a xarray function capable of loading huge amounts of data
accessed through a THREDDS server.

**Folder Structure**

* `src/load_by_step` Contains the main functions necessary to use the **_load_by_step()_** accessor
* `final_notebooks` Contains the main notebooks with examples of using the **_load_by_step()_** accessor
- `src/load_by_step` Contains the main functions necessary to use the
**_load_by_step()_** accessor
- `final_notebooks` Contains the main notebooks with examples of using the
**_load_by_step()_** accessor

## Project Name

xarray load-by-step

## One-line Description
A Xarray accessor to download large quantities of data from THREDDS server automatically breaking a large request in smaller requests to avoid server timeout.

A Xarray accessor to download large quantities of data from THREDDS server
automatically breaking a large request in smaller requests to avoid server
timeout.

## How to install?

`pip install load_by_step@git+https://github.com/oceanhackweek/ohw24_proj_xarray_load_by_step_us`
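
A minimal, hypothetical usage sketch of the accessor described above is shown
below. The accessor registration name, its keyword arguments, and the
placeholder THREDDS URL are assumptions for illustration only, not the
project's documented API:

```python
# Hypothetical sketch: accessor/method names, keyword arguments, and the
# placeholder URL are assumptions, not the documented load_by_step API.
import xarray as xr

import load_by_step  # noqa: F401  (importing registers the xarray accessor)

url = "https://<thredds-server>/thredds/dodsC/<dataset>"  # placeholder URL
ds = xr.open_dataset(url)

# Select a variable and a time window, then let the accessor break the
# download into several smaller OPeNDAP requests instead of one huge
# request that could hit the server's timeout.
da = ds["water_u"].sel(time=slice("2024-01-01", "2024-01-10"))
da = da.load_by_step.load(time=10)  # assumed API: ~10 time steps per request
```

The idea is that each smaller request stays within the server's response
limits, and the pieces are concatenated back into a single in-memory
DataArray.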

## Planning

## Collaborators

| Name | Location | Role |
|---------------------|------------|---------------------|
| Marcelo Andrioni | Bigelow | Participant |
| João Pedro Amorim | Bigelow | Participant |
| Name | Location | Role |
| ----------------- | -------- | ----------- |
| Marcelo Andrioni | Bigelow | Participant |
| João Pedro Amorim | Bigelow | Participant |

## Planning

* Initial idea: "short description"
* Ideation jam board: Add link
* Ideation Presentation: Add link
* Slack channel: ohw24_proj_name
* Project google drive: Add link
* Final presentation: Add link
- Initial idea: "short description"
- Ideation jam board: Add link
- Ideation Presentation: Add link
- Slack channel: ohw24_proj_name
- Project google drive: Add link
- Final presentation: Add link

## Background

@@ -47,4 +58,3 @@ A Xarray accessor to download large quantities of data from THREDDS server autom
## Lessons Learned

## References

Empty file.
119 changes: 72 additions & 47 deletions contributor_folders/team_member_2/ErrorDescription.ipynb
@@ -18,14 +18,16 @@
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from __future__ import annotations\n",
"\n",
"import re\n",
"from collections import Counter\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import xarray as xr\n",
"import cf_xarray\n",
"import numpy as np\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from collections import Counter\n",
"import re"
"import xarray as xr\n",
"from bs4 import BeautifulSoup"
]
},
{
@@ -51,13 +53,15 @@
"ds = xr.open_dataset(url, drop_variables=\"tau\")\n",
"\n",
"### Domain Specification\n",
"lonmin=-54\n",
"lonmax=-31\n",
"latmin=-36\n",
"latmax=7\n",
"selvar = 'water_u'\n",
"lonmin = -54\n",
"lonmax = -31\n",
"latmin = -36\n",
"latmax = 7\n",
"selvar = \"water_u\"\n",
"\n",
"da = ds[selvar].sel(lon=slice(convert_to_360(lonmin), convert_to_360(lonmax)), lat=slice(latmin,latmax))"
"da = ds[selvar].sel(\n",
" lon=slice(convert_to_360(lonmin), convert_to_360(lonmax)), lat=slice(latmin, latmax)\n",
")"
]
},
{
@@ -77,8 +81,10 @@
],
"source": [
"unit = 1e9\n",
"print(f'The present dataset is about {np.round(da.nbytes/unit)}GB in size')\n",
"print(f'The amount of data present in a single level is about {np.round(da.sel(depth=0).nbytes/unit)}GB')"
"print(f\"The present dataset is about {np.round(da.nbytes/unit)}GB in size\")\n",
"print(\n",
" f\"The amount of data present in a single level is about {np.round(da.sel(depth=0).nbytes/unit)}GB\"\n",
")"
]
},
{
@@ -165,7 +171,7 @@
],
"source": [
"test2 = da.sel(time=slice(\"2024-01-01\", \"2024-01-10\"))\n",
"print(f'The present dataset is about {np.round(test2.nbytes/unit)}GB in size')"
"print(f\"The present dataset is about {np.round(test2.nbytes/unit)}GB in size\")"
]
},
{
@@ -225,29 +231,29 @@
],
"source": [
"try:\n",
" from wordcloud import WordCloud\n",
" from nltk.corpus import stopwords\n",
" from nltk import pos_tag # Import pos_tag for POS tagging\n",
" from nltk.tokenize import word_tokenize \n",
" import nltk\n",
" nltk.download('stopwords')\n",
" nltk.download('punkt')\n",
" nltk.download('averaged_perceptron_tagger')\n",
" nltk.download('averaged_perceptron_tagger_eng')\n",
" from nltk import pos_tag # Import pos_tag for POS tagging\n",
" from nltk.corpus import stopwords\n",
" from wordcloud import WordCloud\n",
"\n",
" nltk.download(\"stopwords\")\n",
" nltk.download(\"punkt\")\n",
" nltk.download(\"averaged_perceptron_tagger\")\n",
" nltk.download(\"averaged_perceptron_tagger_eng\")\n",
"\n",
"except:\n",
" %pip install wordcloud\n",
" %pip install nltk\n",
" %pip install selenium\n",
" from wordcloud import WordCloud\n",
" from nltk.corpus import stopwords\n",
" from nltk import pos_tag # Import pos_tag for POS tagging\n",
" from nltk.tokenize import word_tokenize \n",
" import nltk\n",
" nltk.download('stopwords')\n",
" nltk.download('punkt')\n",
" nltk.download('averaged_perceptron_tagger')\n",
" nltk.download('averaged_perceptron_tagger_eng') "
" from nltk import pos_tag # Import pos_tag for POS tagging\n",
" from nltk.corpus import stopwords\n",
" from wordcloud import WordCloud\n",
"\n",
" nltk.download(\"stopwords\")\n",
" nltk.download(\"punkt\")\n",
" nltk.download(\"averaged_perceptron_tagger\")\n",
" nltk.download(\"averaged_perceptron_tagger_eng\")"
]
},
{
@@ -265,31 +271,34 @@
" return []\n",
"\n",
" # Extrai o conteúdo da página\n",
" soup = BeautifulSoup(response.text, 'html.parser')\n",
" \n",
" soup = BeautifulSoup(response.text, \"html.parser\")\n",
"\n",
" # Encontra todo o texto nos tópicos (ajustar seletor conforme necessário)\n",
" text_content = soup.get_text()\n",
" \n",
"\n",
" # Remove pontuação e números, e converte para minúsculas\n",
" words = re.findall(r'\\b[a-zá-úà-ùãõâêîôûç]+\\b', text_content.lower())\n",
" \n",
" words = re.findall(r\"\\b[a-zá-úà-ùãõâêîôûç]+\\b\", text_content.lower())\n",
"\n",
" return words\n",
"\n",
"\n",
"def filter_words(words):\n",
" # Get the list of English stopwords and extend with common articles/prepositions\n",
" stop_words = set(stopwords.words('english'))\n",
" stop_words = set(stopwords.words(\"english\"))\n",
"\n",
" # POS tagging\n",
" words_with_pos = pos_tag(words)\n",
"\n",
" # Filter out stopwords and nouns\n",
" filtered_words = [\n",
" word for word, pos in words_with_pos\n",
" if word not in stop_words and pos not in ['NN', 'NNS', 'NNP', 'NNPS']\n",
" word\n",
" for word, pos in words_with_pos\n",
" if word not in stop_words and pos not in [\"NN\", \"NNS\", \"NNP\", \"NNPS\"]\n",
" ]\n",
" \n",
"\n",
" return filtered_words\n",
"\n",
"\n",
"def main_url_search(urls):\n",
" # Lista de URLs dos tópicos do fórum (adicione mais URLs conforme necessário)\n",
" all_words = []\n",
Expand All @@ -298,16 +307,17 @@
" words = get_words_from_topic(url)\n",
" filtered_words = filter_words(words)\n",
" all_words.extend(filtered_words)\n",
" \n",
"\n",
" # Contagem das palavras\n",
" word_count = Counter(all_words)\n",
"\n",
" return word_count\n",
"\n",
"url = 'https://groups.google.com/a/hycom.org/g/forum'\n",
"\n",
"url = \"https://groups.google.com/a/hycom.org/g/forum\"\n",
"response = requests.get(url)\n",
"# Extrai o conteúdo da página\n",
"soup = BeautifulSoup(response.text, 'html.parser')"
"soup = BeautifulSoup(response.text, \"html.parser\")"
]
},
{
@@ -318,7 +328,20 @@
"outputs": [],
"source": [
"# Define the list of words to color differently\n",
"highlight_words = {\"runtime\":\"red\",\"via\":\"red\",\"opendap\":\"red\",\"https\":\"red\",\"read\":\"red\",\"netcdf\":\"red\",\"download\":\"red\", \"downloading\": \"red\", \"thredds\": \"red\",\"accessing\":\"red\",\"access\":\"red\"}\n",
"highlight_words = {\n",
" \"runtime\": \"red\",\n",
" \"via\": \"red\",\n",
" \"opendap\": \"red\",\n",
" \"https\": \"red\",\n",
" \"read\": \"red\",\n",
" \"netcdf\": \"red\",\n",
" \"download\": \"red\",\n",
" \"downloading\": \"red\",\n",
" \"thredds\": \"red\",\n",
" \"accessing\": \"red\",\n",
" \"access\": \"red\",\n",
"}\n",
"\n",
"\n",
"# Custom color function\n",
"def color_func(word, font_size, position, orientation, random_state=None, **kwargs):\n",
Expand All @@ -344,10 +367,12 @@
],
"source": [
"word_counters = main_url_search([url])\n",
"wordcloud = WordCloud(width=800, height=400, background_color='white',color_func=color_func).generate_from_frequencies(word_counters)\n",
"wordcloud = WordCloud(\n",
" width=800, height=400, background_color=\"white\", color_func=color_func\n",
").generate_from_frequencies(word_counters)\n",
"plt.figure(figsize=(10, 5))\n",
"plt.imshow(wordcloud)#, interpolation='bilinear')\n",
"plt.axis('off')\n",
"plt.imshow(wordcloud) # , interpolation='bilinear')\n",
"plt.axis(\"off\")\n",
"plt.show()"
]
},
Empty file.
2 changes: 1 addition & 1 deletion data/data.csv
@@ -1,3 +1,3 @@
site, value
a, 1
b, 2
b, 2