From 68aa0c9dd4da9f94a5b01b20fa4045da3702401a Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 16 Nov 2023 10:49:31 -0700 Subject: [PATCH] updates-after-2023.10-release (#3324) * updates-after-2023.10-release * fix bug in has_human --- notebooks/resource-allocation/102023.1.ipynb | 499 ++++++++++++ notebooks/resource-allocation/112023.ipynb | 744 ++++++++++++++++++ .../generate-allocation-summary.py | 4 - qiita_db/artifact.py | 3 +- qiita_db/processing_job.py | 10 +- 5 files changed, 1251 insertions(+), 9 deletions(-) create mode 100644 notebooks/resource-allocation/102023.1.ipynb create mode 100644 notebooks/resource-allocation/112023.ipynb diff --git a/notebooks/resource-allocation/102023.1.ipynb b/notebooks/resource-allocation/102023.1.ipynb new file mode 100644 index 000000000..fce3cfd46 --- /dev/null +++ b/notebooks/resource-allocation/102023.1.ipynb @@ -0,0 +1,499 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "from datetime import datetime, timedelta, date\n", + "from humanize import naturalsize\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Qiita's resource allocation - quick update from previous version\n", + "\n", + "After the 2023.10 release we noticed that:\n", + "1. `job-output-folder` `VALIDATE` command didn't have valid request because those jobs do not have sample/column values\n", + "2. The default during a resource allocation for time is minutes and the calculations were done in seconds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loading data\n", + "\n", + "First you will need to run `generate-allocation-summary.py` in Qiita as the qiita user (or whatever user runs qiita in your system). 
The resulting file would be: `jobs_[date].tsv.gz`.\n", + "\n", + "The generated file will have these columns: `['JobID', 'ElapsedRaw', 'MaxRSS', 'Submit', 'Start', 'MaxRSS.1', 'CPUTimeRAW', 'ReqMem', 'AllocCPUS', 'AveVMSize', 'QiitaID', 'external_id', 'sId', 'sName', 'sVersion', 'cId', 'cName', 'samples', 'columns', 'input_size', 'extra_info'],`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "m1g = 2**30\n", + "df = pd.read_csv('jobs_2023-10-04.tsv.gz', sep='\\t', dtype={'extra_info': str})\n", + "df['ElapsedRawTime'] = pd.to_timedelta(df.ElapsedRawTime)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'There are 101147 successful jobs since we moved to barnacle2 and the largest external_id is: 1581986'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# for reference for the next iteration of this notebook\n", + "f'There are {df.shape[0]} successful jobs since we moved to barnacle2 and the largest external_id is: {df.external_id.max()}'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Getting the default values for `job-output-folder` `VALIDATE`" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ElapsedRawTimeMaxRSSRaw
countminmaxcountminmax
cNamesName
ValidateBIOM type - BIOM6870 days 00:00:550 days 01:03:49687222.8 MB82.0 GB
Diversity types - FeatureData60 days 00:01:200 days 00:02:496331.4 MB384.3 MB
Diversity types - alpha_vector1230 days 00:01:123 days 04:36:54123289.3 MB101.5 GB
Diversity types - distance_matrix1170 days 00:00:370 days 00:03:55117122.7 MB12.5 GB
Diversity types - ordination_results1070 days 00:00:390 days 00:03:19107117.2 MB2.9 GB
Sequencing Data Type - Demultiplexed430 days 00:00:350 days 00:12:234383.4 MB517.4 MB
Sequencing Data Type - FASTA20 days 00:00:560 days 00:02:23279.8 MB83.6 MB
Sequencing Data Type - FASTQ320 days 00:00:410 days 01:50:443278.7 MB84.4 MB
Sequencing Data Type - SFF10 days 00:01:090 days 00:01:09179.6 MB79.6 MB
Sequencing Data Type - per_sample_FASTQ730 days 00:00:360 days 18:13:217377.6 MB83.6 MB
Visualization types - q2_visualization1330 days 00:00:360 days 00:24:5613351.5 MB67.5 MB
qtp-job-output-folder - job-output-folder2280 days 00:00:310 days 00:04:0622818.3 MB46.7 MB
\n", + "
" + ], + "text/plain": [ + " ElapsedRawTime \\\n", + " count \n", + "cName sName \n", + "Validate BIOM type - BIOM 687 \n", + " Diversity types - FeatureData 6 \n", + " Diversity types - alpha_vector 123 \n", + " Diversity types - distance_matrix 117 \n", + " Diversity types - ordination_results 107 \n", + " Sequencing Data Type - Demultiplexed 43 \n", + " Sequencing Data Type - FASTA 2 \n", + " Sequencing Data Type - FASTQ 32 \n", + " Sequencing Data Type - SFF 1 \n", + " Sequencing Data Type - per_sample_FASTQ 73 \n", + " Visualization types - q2_visualization 133 \n", + " qtp-job-output-folder - job-output-folder 228 \n", + "\n", + " \\\n", + " min \n", + "cName sName \n", + "Validate BIOM type - BIOM 0 days 00:00:55 \n", + " Diversity types - FeatureData 0 days 00:01:20 \n", + " Diversity types - alpha_vector 0 days 00:01:12 \n", + " Diversity types - distance_matrix 0 days 00:00:37 \n", + " Diversity types - ordination_results 0 days 00:00:39 \n", + " Sequencing Data Type - Demultiplexed 0 days 00:00:35 \n", + " Sequencing Data Type - FASTA 0 days 00:00:56 \n", + " Sequencing Data Type - FASTQ 0 days 00:00:41 \n", + " Sequencing Data Type - SFF 0 days 00:01:09 \n", + " Sequencing Data Type - per_sample_FASTQ 0 days 00:00:36 \n", + " Visualization types - q2_visualization 0 days 00:00:36 \n", + " qtp-job-output-folder - job-output-folder 0 days 00:00:31 \n", + "\n", + " MaxRSSRaw \\\n", + " max count \n", + "cName sName \n", + "Validate BIOM type - BIOM 0 days 01:03:49 687 \n", + " Diversity types - FeatureData 0 days 00:02:49 6 \n", + " Diversity types - alpha_vector 3 days 04:36:54 123 \n", + " Diversity types - distance_matrix 0 days 00:03:55 117 \n", + " Diversity types - ordination_results 0 days 00:03:19 107 \n", + " Sequencing Data Type - Demultiplexed 0 days 00:12:23 43 \n", + " Sequencing Data Type - FASTA 0 days 00:02:23 2 \n", + " Sequencing Data Type - FASTQ 0 days 01:50:44 32 \n", + " Sequencing Data Type - SFF 0 days 00:01:09 1 \n", + " Sequencing 
Data Type - per_sample_FASTQ 0 days 18:13:21 73 \n", + " Visualization types - q2_visualization 0 days 00:24:56 133 \n", + " qtp-job-output-folder - job-output-folder 0 days 00:04:06 228 \n", + "\n", + " \n", + " min max \n", + "cName sName \n", + "Validate BIOM type - BIOM 222.8 MB 82.0 GB \n", + " Diversity types - FeatureData 331.4 MB 384.3 MB \n", + " Diversity types - alpha_vector 289.3 MB 101.5 GB \n", + " Diversity types - distance_matrix 122.7 MB 12.5 GB \n", + " Diversity types - ordination_results 117.2 MB 2.9 GB \n", + " Sequencing Data Type - Demultiplexed 83.4 MB 517.4 MB \n", + " Sequencing Data Type - FASTA 79.8 MB 83.6 MB \n", + " Sequencing Data Type - FASTQ 78.7 MB 84.4 MB \n", + " Sequencing Data Type - SFF 79.6 MB 79.6 MB \n", + " Sequencing Data Type - per_sample_FASTQ 77.6 MB 83.6 MB \n", + " Visualization types - q2_visualization 51.5 MB 67.5 MB \n", + " qtp-job-output-folder - job-output-folder 18.3 MB 46.7 MB " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cname = 'Validate'\n", + "_df = df[(df.cName == cname)].copy()\n", + "\n", + "summary = _df[_df['samples'].isnull() & _df['columns'].isnull()].groupby(\n", + " ['cName', 'sName'])[['ElapsedRawTime', 'MaxRSSRaw']].agg(['count', 'min', 'max']).copy()\n", + "\n", + "summary[('MaxRSSRaw', 'min')] = summary[('MaxRSSRaw', 'min')].apply(naturalsize)\n", + "summary[('MaxRSSRaw', 'max')] = summary[('MaxRSSRaw', 'max')].apply(naturalsize)\n", + "\n", + "display(summary)\n", + "\n", + "# New allocation: -p qiita -N 1 -n 1 --mem 100mb --time 00:40:00" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. 
Updates for the seconds to minute confusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "=============\n", + "=============\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem (2**30)+({samples}*150000) '\n", + " || '--time 240'\n", + "\n", + "UPDATE qiita.processing_job_resource_allocation set\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem (2**30)+({samples}*150000) '\n", + " || '--time 4'\n", + " WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n", + " name = 'delete_sample_or_column';\n", + "\n", + "=============\n", + "=============\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem {samples}*10000000'\n", + " || '--time 61200'\n", + "\n", + "UPDATE qiita.processing_job_resource_allocation set\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem {samples}*10000000'\n", + " || '--time 1020'\n", + " WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n", + " name = 'Sequence Processing Pipeline';\n", + "\n", + "=============\n", + "=============\n", + " allocation = '-p qiita -N 1 -n 1 --mem 4g --time 900'\n", + "\n", + "UPDATE qiita.processing_job_resource_allocation set\n", + " allocation = '-p qiita -N 1 -n 1 --mem 4g --time 15'\n", + " WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n", + " name = 'Filter samples from table [filter_samples]';\n", + "\n", + "=============\n", + "=============\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem (2**31)+({input_size}*6) if\n", + "(2**31)+({input_size}*6) < 13958643712 else 13958643712 '\n", + " || '--time 2400'\n", + "\n", + "UPDATE qiita.processing_job_resource_allocation set\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem (2**31)+({input_size}*6) if\n", + "(2**31)+({input_size}*6) < 13958643712 else 13958643712 '\n", + " || '--time 40'\n", + " WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n", + " name = 'Rarefy table [rarefy]';\n", + "\n", + "=============\n", + "=============\n", + " allocation = '-p qiita -N 1 -n 1 
'\n", + " || '--mem 14g'\n", + " || '--time 360'\n", + "\n", + "UPDATE qiita.processing_job_resource_allocation set\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem 14g '\n", + " || '--time 6'\n", + " WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n", + " name = 'Alpha diversity (phylogenetic) [alpha_phylogenetic]';\n", + "\n", + "\n", + "=============\n", + "=============\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem (2**33)+(2**30)+(({samples}*{columns}*{input_size})/4500000)'\n", + " || '--time 1800'\n", + "\n", + "UPDATE qiita.processing_job_resource_allocation set\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem\n", + "(2**33)+(2**30)+(({samples}*{columns}*{input_size})/4500000) '\n", + " || '--time 30'\n", + " WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n", + " name = 'Visualize and Interact with Principal Coordinates Analysis\n", + "Plots [plot]';\n", + "\n", + "=============\n", + "=============\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem (2**32)+(({samples}*{columns}*{input_size}')/20000)'\n", + " || '--time 90000'\n", + "\n", + "UPDATE qiita.processing_job_resource_allocation set\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem (2**32)+(({samples}*{columns}*{input_size})/20000) '\n", + " || '--time 1500'\n", + " WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n", + " name = 'Alpha rarefaction curves [alpha_rarefaction]';\n", + "\n", + "=============\n", + "=============\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem 2*(2**30)+{input_size} if 2*(2**30)+{input_size} < 16*(2**30) else 16*(2**30)'\n", + " || '--time 36000'\n", + "\n", + "UPDATE qiita.processing_job_resource_allocation set\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem 2*(2**30)+{input_size} if 2*(2**30)+{input_size}\n", + "< 16*(2**30) else 16*(2**30) '\n", + " || '--time 600'\n", + " WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n", + " name = 'Trimming';\n", + "\n", + "=============\n", + 
"=============\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem (2**30)+({samples}*{columns}*2000)'\n", + " || '--time 2300'\n", + "\n", + "UPDATE qiita.processing_job_resource_allocation set\n", + " allocation = '-p qiita -N 1 -n 1 '\n", + " || '--mem (2**30)+({samples}*{columns}*2000) '\n", + " || '--time 39'\n", + " WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n", + " name = 'update_sample_template';" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/resource-allocation/112023.ipynb b/notebooks/resource-allocation/112023.ipynb new file mode 100644 index 000000000..08b2d9a25 --- /dev/null +++ b/notebooks/resource-allocation/112023.ipynb @@ -0,0 +1,744 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import seaborn as sns\n", + "from datetime import datetime, timedelta, date\n", + "from humanize import naturalsize\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Qiita's resource allocation\n", + "\n", + "This notebook walks through how to load & parse the job stats from Qiita. It additionally tries to split the different commands by their resource utilization and make sure to be as accurate/fair to request them. 
Here resource allocations are mainly walltime (`ElapsedRawTime`), memory (`MaxRSSRaw`) and the time a job took to start running (`WaitTime`: Start - Submit). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loading data\n", + "\n", + "First you will need to run `generate-allocation-summary.py` in Qiita as the qiita user (or whatever user runs qiita in your system). The resulting file would be: `jobs_[date].tsv.gz`.\n", + "\n", + "The generated file will have these columns: `['JobID', 'ElapsedRaw', 'MaxRSS', 'Submit', 'Start', 'MaxRSS.1', 'CPUTimeRAW', 'ReqMem', 'AllocCPUS', 'AveVMSize', 'QiitaID', 'external_id', 'sId', 'sName', 'sVersion', 'cId', 'cName', 'samples', 'columns', 'input_size', 'extra_info'],`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "m1g = 2**30\n", + "df = pd.read_csv('jobs_2023-10-31.tsv.gz', sep='\\t', dtype={'extra_info': str})\n", + "df['ElapsedRawTime'] = pd.to_timedelta(df.ElapsedRawTime)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'There are 106548 successful jobs since we moved to barnacle2 and the largest external_id is: 1614116'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# for reference for the next iteration of this notebook\n", + "f'There are {df.shape[0]} successful jobs since we moved to barnacle2 and the largest external_id is: {df.external_id.max()}'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Deciding what to optimize and what to leave with a default value\n", + "\n", + "In the previous versions (072023, 102023, 102023.1) we decided to only optimize things that are using more than 4gb or 4hrs and now we want to review commands that are below 4g and 4hrs so we add specific parameters for them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "qiita: 10\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ElapsedRawTimeMaxRSSRaw
countminmaxminmax
cNamesName
delete_artifactQiita15340 days 00:00:030 days 02:48:080 Bytes122.2 MB
create_sample_templateQiita5690 days 00:00:030 days 00:11:220 Bytes415.8 MB
delete_analysisQiita3200 days 00:00:030 days 00:06:130 Bytes120.8 MB
download_remote_filesQiita1940 days 00:00:070 days 03:29:360 Bytes128.9 MB
delete_sample_templateQiita1810 days 00:00:040 days 00:19:310 Bytes120.6 MB
delete_studyQiita1360 days 00:00:030 days 00:16:090 Bytes125.5 MB
update_prep_templateQiita1260 days 00:00:030 days 00:02:250 Bytes125.3 MB
copy_artifactQiita1010 days 00:00:060 days 00:33:160 Bytes124.1 MB
Generate HTML summarySequencing Data Type780 days 00:00:350 days 02:18:5456.6 MB85.7 MB
list_remote_filesQiita470 days 00:00:050 days 00:02:210 Bytes121.7 MB
\n", + "
" + ], + "text/plain": [ + " ElapsedRawTime \\\n", + " count min \n", + "cName sName \n", + "delete_artifact Qiita 1534 0 days 00:00:03 \n", + "create_sample_template Qiita 569 0 days 00:00:03 \n", + "delete_analysis Qiita 320 0 days 00:00:03 \n", + "download_remote_files Qiita 194 0 days 00:00:07 \n", + "delete_sample_template Qiita 181 0 days 00:00:04 \n", + "delete_study Qiita 136 0 days 00:00:03 \n", + "update_prep_template Qiita 126 0 days 00:00:03 \n", + "copy_artifact Qiita 101 0 days 00:00:06 \n", + "Generate HTML summary Sequencing Data Type 78 0 days 00:00:35 \n", + "list_remote_files Qiita 47 0 days 00:00:05 \n", + "\n", + " MaxRSSRaw \\\n", + " max min \n", + "cName sName \n", + "delete_artifact Qiita 0 days 02:48:08 0 Bytes \n", + "create_sample_template Qiita 0 days 00:11:22 0 Bytes \n", + "delete_analysis Qiita 0 days 00:06:13 0 Bytes \n", + "download_remote_files Qiita 0 days 03:29:36 0 Bytes \n", + "delete_sample_template Qiita 0 days 00:19:31 0 Bytes \n", + "delete_study Qiita 0 days 00:16:09 0 Bytes \n", + "update_prep_template Qiita 0 days 00:02:25 0 Bytes \n", + "copy_artifact Qiita 0 days 00:33:16 0 Bytes \n", + "Generate HTML summary Sequencing Data Type 0 days 02:18:54 56.6 MB \n", + "list_remote_files Qiita 0 days 00:02:21 0 Bytes \n", + "\n", + " \n", + " max \n", + "cName sName \n", + "delete_artifact Qiita 122.2 MB \n", + "create_sample_template Qiita 415.8 MB \n", + "delete_analysis Qiita 120.8 MB \n", + "download_remote_files Qiita 128.9 MB \n", + "delete_sample_template Qiita 120.6 MB \n", + "delete_study Qiita 125.5 MB \n", + "update_prep_template Qiita 125.3 MB \n", + "copy_artifact Qiita 124.1 MB \n", + "Generate HTML summary Sequencing Data Type 85.7 MB \n", + "list_remote_files Qiita 121.7 MB " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "qiime2: 8\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ElapsedRawTimeMaxRSSRaw
countminmaxminmax
cNamesName
adonis PERMANOVA test for beta group significance [adonis]qiime25520 days 00:00:570 days 00:39:12147.4 MB3.5 GB
Core diversity metrics (non-phylogenetic) [core_metrics]qiime21000 days 00:02:170 days 00:25:31213.5 MB4.3 GB
Taxonomy-based feature table filter. [filter_table]qiime2740 days 00:00:520 days 00:19:37214.9 MB2.6 GB
Summarize table [summarize]qiime2640 days 00:00:560 days 00:05:54229.8 MB3.0 GB
Add pseudocount to table. [add_pseudocount]qiime2550 days 00:01:040 days 00:06:14242.5 MB2.9 GB
Filter features from a table based on abundance and prevalence [filter_features_conditionally]qiime2530 days 00:00:530 days 00:02:33212.4 MB553.3 MB
Identify core features in table [core_features]qiime2490 days 00:01:030 days 00:59:29212.9 MB2.6 GB
Filter features from table [filter_features]qiime2480 days 00:00:470 days 00:03:34208.3 MB398.4 MB
\n", + "
" + ], + "text/plain": [ + " ElapsedRawTime \\\n", + " count \n", + "cName sName \n", + "adonis PERMANOVA test for beta group significan... qiime2 552 \n", + "Core diversity metrics (non-phylogenetic) [core... qiime2 100 \n", + "Taxonomy-based feature table filter. [filter_ta... qiime2 74 \n", + "Summarize table [summarize] qiime2 64 \n", + "Add pseudocount to table. [add_pseudocount] qiime2 55 \n", + "Filter features from a table based on abundance... qiime2 53 \n", + "Identify core features in table [core_features] qiime2 49 \n", + "Filter features from table [filter_features] qiime2 48 \n", + "\n", + " \\\n", + " min \n", + "cName sName \n", + "adonis PERMANOVA test for beta group significan... qiime2 0 days 00:00:57 \n", + "Core diversity metrics (non-phylogenetic) [core... qiime2 0 days 00:02:17 \n", + "Taxonomy-based feature table filter. [filter_ta... qiime2 0 days 00:00:52 \n", + "Summarize table [summarize] qiime2 0 days 00:00:56 \n", + "Add pseudocount to table. [add_pseudocount] qiime2 0 days 00:01:04 \n", + "Filter features from a table based on abundance... qiime2 0 days 00:00:53 \n", + "Identify core features in table [core_features] qiime2 0 days 00:01:03 \n", + "Filter features from table [filter_features] qiime2 0 days 00:00:47 \n", + "\n", + " \\\n", + " max \n", + "cName sName \n", + "adonis PERMANOVA test for beta group significan... qiime2 0 days 00:39:12 \n", + "Core diversity metrics (non-phylogenetic) [core... qiime2 0 days 00:25:31 \n", + "Taxonomy-based feature table filter. [filter_ta... qiime2 0 days 00:19:37 \n", + "Summarize table [summarize] qiime2 0 days 00:05:54 \n", + "Add pseudocount to table. [add_pseudocount] qiime2 0 days 00:06:14 \n", + "Filter features from a table based on abundance... 
qiime2 0 days 00:02:33 \n", + "Identify core features in table [core_features] qiime2 0 days 00:59:29 \n", + "Filter features from table [filter_features] qiime2 0 days 00:03:34 \n", + "\n", + " MaxRSSRaw \n", + " min max \n", + "cName sName \n", + "adonis PERMANOVA test for beta group significan... qiime2 147.4 MB 3.5 GB \n", + "Core diversity metrics (non-phylogenetic) [core... qiime2 213.5 MB 4.3 GB \n", + "Taxonomy-based feature table filter. [filter_ta... qiime2 214.9 MB 2.6 GB \n", + "Summarize table [summarize] qiime2 229.8 MB 3.0 GB \n", + "Add pseudocount to table. [add_pseudocount] qiime2 242.5 MB 2.9 GB \n", + "Filter features from a table based on abundance... qiime2 212.4 MB 553.3 MB \n", + "Identify core features in table [core_features] qiime2 212.9 MB 2.6 GB \n", + "Filter features from table [filter_features] qiime2 208.3 MB 398.4 MB " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "summary = df.groupby(['cName', 'sName'])[\n", + " ['ElapsedRawTime', 'MaxRSSRaw']].agg(['count', 'min', 'max']).copy()\n", + "\n", + "# We are gonna focus on jobs that use less than 4gb and take less than 4 hrs.\n", + "summary = summary[(summary[('MaxRSSRaw', 'max')] < 4*m1g) & \n", + " (summary[('ElapsedRawTime', 'max')] < timedelta(hours=4))]\n", + "\n", + "summary.sort_values(('MaxRSSRaw', 'count'), inplace=True, ascending=False)\n", + "summary.drop(columns=[('MaxRSSRaw', 'count')], inplace=True)\n", + "\n", + "# ignore commands with less than 40 jobs to avoid over fitting early\n", + "summary = summary[summary[('ElapsedRawTime', 'count')] > 40]\n", + "\n", + "# ignore commands that were optimized on the previous notebooks - as part of larger sets\n", + "# summary = summary[]\n", + "summary = summary[summary.index.get_level_values('cName') != 'Validate']\n", + "\n", + "summary[('MaxRSSRaw', 'min')] = summary[('MaxRSSRaw', 'min')].apply(naturalsize)\n", + "summary[('MaxRSSRaw', 'max')] = summary[('MaxRSSRaw', 
'max')].apply(naturalsize)\n", + "\n", + "_df = summary[summary.index.get_level_values('sName') != 'qiime2']\n", + "print (\"qiita:\", _df.shape[0])\n", + "display(_df)\n", + "\n", + "_df = summary[summary.index.get_level_values('sName') == 'qiime2']\n", + "print (\"qiime2:\", _df.shape[0])\n", + "display(_df)\n", + "\n", + "# *** RESOURCE ALLOCATION ***\n", + "\n", + "# Qiita jobs \n", + "# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation) \n", + "# VALUES \n", + "# ('delete_artifact', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 3:00:00'),\n", + "# ('create_sample_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 600mb --time 00:20:00'),\n", + "# ('delete_analysis', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:10:00'),\n", + "# ('download_remote_files', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 4:00:00'),\n", + "# ('delete_sample_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:30:00'),\n", + "# ('delete_study', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:30:00'),\n", + "# ('update_prep_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:05:00'),\n", + "# ('copy_artifact', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 1:00:00'),\n", + "# ('list_remote_files', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:05:00');\n", + "\n", + "# Q2 jobs\n", + "# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation) \n", + "# VALUES \n", + "# ('adonis PERMANOVA test for beta group significance [adonis]', 'RESOURCE_PARAMS_COMMAND', \n", + "# '-p qiita -N 1 -n 1 --mem 4gb --time 4:00:00'),\n", + "# ('Core diversity metrics (non-phylogenetic) [core_metrics]', 'RESOURCE_PARAMS_COMMAND', \n", + "# '-p qiita -N 1 -n 1 --mem 6gb --time 1:00:00'),\n", + "# ('Taxonomy-based feature table filter. 
[filter_table]', 'RESOURCE_PARAMS_COMMAND', \n", + "# '-p qiita -N 1 -n 1 --mem 4gb --time 00:20:00'),\n", + "# ('Summarize table [summarize]', 'RESOURCE_PARAMS_COMMAND', \n", + "# '-p qiita -N 1 -n 1 --mem 4gb --time 00:10:00'),\n", + "# ('Add pseudocount to table. [add_pseudocount]', 'RESOURCE_PARAMS_COMMAND', \n", + "# '-p qiita -N 1 -n 1 --mem 3.5gb --time 00:15:00'),\n", + "# ('Filter features from a table based on abundance and prevalence [filter_features_conditionally]', \n", + "# 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 1gb --time 00:10:00'),\n", + "# ('Identify core features in table [core_features]', 'RESOURCE_PARAMS_COMMAND', \n", + "# '-p qiita -N 1 -n 1 --mem 3.5gb --time 2:00:00'),\n", + "# ('Filter features from table [filter_features]', 'RESOURCE_PARAMS_COMMAND', \n", + "# '-p qiita -N 1 -n 1 --mem 500mb --time 00:10:00'); " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Optimizing Qiita processing jobs.\n", + "\n", + "As a reminder, we can use:\n", + "- 'samples'\n", + "- 'columns'\n", + "- 'input_size'\n", + "- 'extra_info': this is when the current method doesn't provide the required info or we need to update it; this info comes from `job_stats_generation.py`\n", + "\n", + "Additionally, from the list of commands we should take a close look at `Generate HTML summary`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Generate HTML summary" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ElapsedRawTimeMaxRSSRawWaitTime
countminmaxminmaxminmax
cNamesNameextra_info
Generate HTML summarySequencing Data TypeNaN780 days 00:00:350 days 02:18:5456.6 MB85.7 MB0 days 00:00:000 days 06:22:26
BIOM typeNaN20 days 00:01:430 days 00:02:23278.1 MB315.8 MB0 days 00:00:000 days 00:00:01
\n", + "
" + ], + "text/plain": [ + " ElapsedRawTime \\\n", + " count \n", + "cName sName extra_info \n", + "Generate HTML summary Sequencing Data Type NaN 78 \n", + " BIOM type NaN 2 \n", + "\n", + " \\\n", + " min \n", + "cName sName extra_info \n", + "Generate HTML summary Sequencing Data Type NaN 0 days 00:00:35 \n", + " BIOM type NaN 0 days 00:01:43 \n", + "\n", + " \\\n", + " max \n", + "cName sName extra_info \n", + "Generate HTML summary Sequencing Data Type NaN 0 days 02:18:54 \n", + " BIOM type NaN 0 days 00:02:23 \n", + "\n", + " MaxRSSRaw \\\n", + " min max \n", + "cName sName extra_info \n", + "Generate HTML summary Sequencing Data Type NaN 56.6 MB 85.7 MB \n", + " BIOM type NaN 278.1 MB 315.8 MB \n", + "\n", + " WaitTime \\\n", + " min \n", + "cName sName extra_info \n", + "Generate HTML summary Sequencing Data Type NaN 0 days 00:00:00 \n", + " BIOM type NaN 0 days 00:00:00 \n", + "\n", + " \n", + " max \n", + "cName sName extra_info \n", + "Generate HTML summary Sequencing Data Type NaN 0 days 06:22:26 \n", + " BIOM type NaN 0 days 00:00:01 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Generate HTML summary\n", + "cmd = 'Generate HTML summary'\n", + "summary = df[df.cName == cmd].groupby(\n", + " ['cName', 'sName', 'extra_info'], dropna=False)[\n", + " ['ElapsedRawTime', 'MaxRSSRaw', 'WaitTime']].agg(['count', 'min', 'max']).copy()\n", + "summary[('MaxRSSRaw', 'min')] = summary[('MaxRSSRaw', 'min')].apply(naturalsize)\n", + "summary[('MaxRSSRaw', 'max')] = summary[('MaxRSSRaw', 'max')].apply(naturalsize)\n", + "summary.drop(columns=[('MaxRSSRaw', 'count')], inplace=True)\n", + "summary.drop(columns=[('WaitTime', 'count')], inplace=True)\n", + "summary.sort_values(('ElapsedRawTime', 'max'), inplace=True, ascending=False)\n", + "\n", + "display(summary)\n", + "\n", + "# As a little background: in multiple cases the `Generate HTML summary` command is run as part of the\n", + "# Validate command\n", + "# Note: there is 
no special case (like for `Validate`) for `Generate HTML summary` but the jobs are small \n", + "# enough to be binned together\n", + "\n", + "# *** RESOURCE ALLOCATION ***\n", + "\n", + "# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation) \n", + "# VALUES ('Generate HTML summary', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 500mb --time 3:00:00');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Rest of Qiita jobs" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/resource-allocation/generate-allocation-summary.py b/notebooks/resource-allocation/generate-allocation-summary.py index 1c3e081b4..e081a5d12 100644 --- a/notebooks/resource-allocation/generate-allocation-summary.py +++ b/notebooks/resource-allocation/generate-allocation-summary.py @@ -68,10 +68,6 @@ extra_info = j.parameters.values[ ('The number of rarefaction depths to include between min_depth ' 'and max_depth. (steps)')] - elif cmd.name == 'build_analysis_files': - extra_info = j.parameters.values[ - ('The number of rarefaction depths to include between min_depth ' - 'and max_depth. 
(steps)')] _d['external_id'] = eid _d['sId'] = s.id diff --git a/qiita_db/artifact.py b/qiita_db/artifact.py index 79cf64de0..2604ee6ef 100644 --- a/qiita_db/artifact.py +++ b/qiita_db/artifact.py @@ -1566,7 +1566,8 @@ def has_human(self): with qdb.sql_connection.TRN: qdb.sql_connection.TRN.add(sql) for v in qdb.sql_connection.TRN.execute_fetchflatten(): - if v.startswith('human-'): + # str is needed as v could be None + if str(v).startswith('human-'): has_human = True break diff --git a/qiita_db/processing_job.py b/qiita_db/processing_job.py index fdb30db94..8c87fceff 100644 --- a/qiita_db/processing_job.py +++ b/qiita_db/processing_job.py @@ -1756,10 +1756,12 @@ def _update_and_launch_children(self, mapping): ready = self._update_children(mapping) # Submit all the children that already have all the input parameters for c in ready: - c.submit() - # some jobs create several children jobs/validators and this can - # clog the submission process; giving it a second to avoid this - sleep(1) + if c.status in {'in_construction', 'waiting'}: + c.submit() + # some jobs create several children jobs/validators and this + # can clog the submission process; giving it a second to + # avoid this + sleep(1) @property def outputs(self):