Skip to content

Commit

Permalink
feat: Add Build_Upset module (#204)
Browse files Browse the repository at this point in the history
* feat: Add module for Build_Upset.py script

* refact: Change genome to genome_id

* feat: Add build upset to annotation subwf

* docs: Update param docs

* docs: Update schema and param docs

* feat: Add dummy groups to test_full table
  • Loading branch information
jvfe authored Jul 25, 2024
1 parent 5d1250e commit 9c16708
Show file tree
Hide file tree
Showing 9 changed files with 213 additions and 19 deletions.
137 changes: 137 additions & 0 deletions bin/Build_Upset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#!/usr/bin/env python

import argparse
import pandas as pd
from matplotlib import cm
from matplotlib import pyplot as plt
from upsetplot import UpSet, from_memberships
import warnings

# Suppress the UserWarning for tight_layout
warnings.simplefilter(action='ignore', category=FutureWarning)

def load_data(samplesheet_file, feature_profile_file):
    """
    Load the samplesheet and the feature profile table.

    Note: the samplesheet is read as comma-separated (CSV), while the
    feature profile is read as tab-separated (TSV) — they use different
    delimiters on purpose.

    Parameters:
        samplesheet_file (str): Path to the samplesheet file (CSV).
        feature_profile_file (str): Path to the feature profile file (TSV).

    Returns:
        pd.DataFrame: Samplesheet dataframe (all columns coerced to str).
        pd.DataFrame: Feature profile dataframe.

    Raises:
        FileNotFoundError: If either input file does not exist.
    """
    try:
        samplesheet_df = pd.read_csv(samplesheet_file)
        feature_df = pd.read_csv(feature_profile_file, sep='\t')
    except FileNotFoundError as e:
        print(f"Error: {e}")
        raise

    # Coerce every samplesheet column to string so group labels merge and
    # group consistently regardless of how pandas inferred their dtypes.
    samplesheet_df = samplesheet_df.astype(str)

    return samplesheet_df, feature_df

def process_data(samplesheet_df, feature_df, column_name):
    """
    Merge samplesheet metadata into the feature profile and build a
    per-feature group-membership table.

    Parameters:
        samplesheet_df (pd.DataFrame): Samplesheet dataframe; must contain a
            'sample' column matching the feature profile's 'genome_id'.
        feature_df (pd.DataFrame): Feature profile dataframe whose first
            column is 'genome_id' and remaining columns are features.
        column_name (str): Samplesheet column whose values define the groups.

    Returns:
        pd.DataFrame or None: Dataframe with columns ['feature', 'category',
        'groups'], where 'groups' is a comma-joined list of the group values
        in which the feature is present. Returns None (with a warning) when
        the column is missing or has only one unique value.
    """
    merged_df = feature_df.merge(samplesheet_df, left_on='genome_id', right_on='sample', how='left')

    if column_name not in merged_df.columns:
        print(f"Warning: Column '{column_name}' not found in samplesheet. Skipping.")
        return None

    if merged_df[column_name].nunique() <= 1:
        print(f"Warning: Column '{column_name}' contains only a single unique value. Skipping.")
        return None

    selected_columns = ['genome_id', column_name] + list(feature_df.columns[1:])
    feature_df = merged_df[selected_columns]

    # A feature is "present" in a group if any genome of that group has it.
    presence = feature_df.groupby(column_name).any().astype(int)
    presence.drop('genome_id', axis=1, inplace=True, errors='ignore')

    # Transpose to one row per feature, one column per group value.
    presence_transposed = presence.transpose()
    presence_transposed.reset_index(inplace=True)
    presence_transposed.rename(columns={'index': 'feature'}, inplace=True)

    # Feature names are prefixed with their source category, e.g. "rgi_...".
    presence_transposed['category'] = presence_transposed['feature'].str.split('_', expand=True)[0]

    # Bug fix: the original used presence_transposed.columns[2:], which
    # silently dropped the first group column (column 0 is 'feature',
    # column 1 is the first group) and uselessly scanned 'category'.
    # Select the group columns explicitly instead.
    group_columns = [col for col in presence_transposed.columns
                     if col not in ('feature', 'category')]
    presence_transposed['groups'] = presence_transposed.apply(
        lambda row: ','.join(col for col in group_columns if row[col] == 1), axis=1)

    return presence_transposed.loc[:, ['feature', 'category', 'groups']]

def create_upset_plot(samplesheet_file, feature_profile_file, column_names, output_file_prefix):
    """
    Create one Upset plot PNG per requested samplesheet column.

    Columns that are missing from the samplesheet, or that contain a single
    unique value, are skipped with a warning (see process_data).

    Parameters:
        samplesheet_file (str): Path to the samplesheet file (CSV).
        feature_profile_file (str): Path to the feature profile file (TSV).
        column_names (list of str): Samplesheet columns to build plots for.
        output_file_prefix (str): Prefix for the output Upset plot files;
            each output is named "<prefix>_<column>.png".
    """
    samplesheet_df, feature_df = load_data(samplesheet_file, feature_profile_file)

    for column_name in column_names:
        presence_transposed = process_data(samplesheet_df, feature_df, column_name)

        # process_data returns None for missing or single-valued columns.
        if presence_transposed is None:
            continue

        if 'groups' not in presence_transposed.columns:
            print(f"Error: 'groups' column was not created for {column_name}. Skipping.")
            continue

        # Each feature's membership is the set of groups it appears in.
        # NOTE(review): a feature present in no group yields frozenset({''})
        # here, which would show up as a phantom '' set — confirm whether
        # all-absent features can reach this point.
        memberships = presence_transposed['groups'].str.split(",").apply(lambda x: frozenset(x))
        rows_by_group = from_memberships(memberships, data=presence_transposed.set_index('feature'))

        # print("Rows by group:\n", rows_by_group.head()) # Debug print to check rows_by_group structure

        # Create UpSet plot; intersection bars are replaced by stacked bars
        # coloured by feature category (intersection_plot_elements=0).
        fig, ax = plt.subplots(figsize=(24, 12))  # Increased width for better layout
        upset = UpSet(rows_by_group, show_counts=True, intersection_plot_elements=0, min_subset_size=1)
        upset.add_stacked_bars(by="category", colors=cm.Pastel1, title="category", elements=10)
        upset.plot(fig=fig)

        # Hide the placeholder axes created by plt.subplots: UpSet draws its
        # own axes into the figure, so 'ax' would otherwise show as an empty
        # black frame. Turn off all spines, ticks, and tick labels.
        ax.spines[['top', 'right', 'left', 'bottom']].set_visible(False)
        ax.tick_params(left=False)
        ax.tick_params(bottom=False)
        ax.tick_params(labelleft=False)
        ax.tick_params(labelbottom=False)

        # Adjust layout to leave space on the right for the legend.
        plt.tight_layout(rect=[0.1, 0.1, 0.75, 1])
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

        output_file = f"{output_file_prefix}_{column_name}.png"
        # savefig overwrites an existing file of the same name.
        plt.savefig(output_file)
        plt.close(fig)

if __name__ == '__main__':
    # Command-line driver: parse arguments and delegate to create_upset_plot.
    arg_parser = argparse.ArgumentParser(description='Create Upset plot from samplesheet and feature profile files.')
    arg_parser.add_argument('-s', '--samplesheet_file', type=str, required=True, help='Path to the samplesheet file in TSV format')
    arg_parser.add_argument('-f', '--feature_profile_file', type=str, required=True, help='Path to the feature profile file in TSV format')
    arg_parser.add_argument('-c', '--columns', type=str, required=True, help='Comma-separated list of column names for features')
    arg_parser.add_argument('-o', '--output_file_prefix', type=str, required=True, help='Prefix for the output Upset plot files (e.g., upset_plot)')
    parsed = arg_parser.parse_args()

    create_upset_plot(
        parsed.samplesheet_file,
        parsed.feature_profile_file,
        parsed.columns.split(','),
        parsed.output_file_prefix,
    )
7 changes: 7 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,13 @@ process {
]
}

withName: BUILD_UPSET {
publishDir = [
path: { "${params.outdir}/annotation/upsets" },
mode: params.publish_dir_mode,
]
}

// Recombination
withName: VERTICALL_PAIRWISE {
ext.prefix = { "cluster_${cluster}" }
Expand Down
1 change: 1 addition & 0 deletions conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ params {
use_prokka = true
use_ppanggolin = true
skip_poppunk = true
upset_plot_columns = "group_one,group_two"
}

process {
Expand Down
11 changes: 6 additions & 5 deletions docs/params.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ Parameters for the annotation subworkflow
| `min_qcover` | Minimum coverage of each match for filtering | `number` | 0.6 |
| `skip_profile_creation` | Skip annotation feature profile creation | `boolean` | |
| `feature_profile_columns` | Columns to include in the feature profile | `string` | mobsuite,rgi,cazy,vfdb,iceberg,bacmet |
| `upset_plot_columns` | Columns to use for making Upset plots of genome features | `string` | |

## Phylogenomics

Expand All @@ -71,8 +72,8 @@ Parameters for the lineage subworkflow
| `poppunk_model` | Which PopPunk model to use (bgmm, dbscan, refine, threshold or lineage) | `string` | |
| `run_poppunk_qc` | Whether to run the QC step for PopPunk | `boolean` | |
| `enable_subsetting` | Enable subsetting workflow based on genome similarity | `boolean` | |
| `core_similarity` | Similarity threshold for core genomes | `number` | 99.99 |
| `accessory_similarity` | Similarity threshold for accessory genes | `number` | 99 |
| `core_similarity` | Similarity threshold for core genomes | `number` | 99.9 |
| `accessory_similarity` | Similarity threshold for accessory genes | `number` | 99.0 |

## Gene Order

Expand Down Expand Up @@ -140,9 +141,9 @@ Set the top limit for requested resources for any single job.

| Parameter | Description | Type | Default |
|-----------|-----------|-----------|-----------|
| `max_cpus` | Maximum number of CPUs that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`</small></details>| `integer` | 16 |
| `max_memory` | Maximum amount of memory that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`</small></details>| `string` | 128.GB |
| `max_time` | Maximum amount of time that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`</small></details>| `string` | 240.h |
| `max_cpus` | Maximum number of CPUs that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`</small></details>| `integer` | 72 |
| `max_memory` | Maximum amount of memory that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`</small></details>| `string` | 125.GB |
| `max_time` | Maximum amount of time that can be requested for any single job. <details><summary>Help</summary><small>Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`</small></details>| `string` | 168.h |

## Generic options

Expand Down
32 changes: 32 additions & 0 deletions modules/local/build_upset/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Render Upset plots of genome-feature presence/absence, grouped by one or
// more samplesheet columns, using the bundled bin/Build_Upset.py script.
process BUILD_UPSET {
    label 'process_single'

    conda "conda-forge::upsetplot=0.9.0"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'docker://docker.io/jvfe/arete_upset:v0.1.0':
        'docker.io/jvfe/arete_upset:v0.1.0' }"

    input:
    path feature_profile    // TSV feature profile (genome_id + feature columns)
    path samplesheet        // CSV samplesheet with the grouping columns
    val samplesheet_columns // comma-separated column names to plot by

    output:
    // one PNG per usable column, named BuildUpset_<column>.png
    path "BuildUpset*png", emit: pngs

    when:
    task.ext.when == null || task.ext.when

    script:
    """
    Build_Upset.py \\
        --output_file_prefix BuildUpset \\
        --feature_profile_file $feature_profile \\
        --samplesheet_file $samplesheet \\
        --columns $samplesheet_columns
    """

    stub:
    """
    touch BuildUpset_dummy.png
    """
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ params {
annotation_tools = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet,islandpath,phispy,report'
feature_profile_columns = 'mobsuite,rgi,cazy,vfdb,iceberg,bacmet'
feature_dispersion_columns = null
upset_plot_columns = null
min_pident = 60
min_qcover = 0.6
skip_profile_creation = false
Expand Down
15 changes: 10 additions & 5 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,11 @@
"fa_icon": "fas fa-columns",
"description": "Columns to include in the feature profile",
"pattern": "^((rgi|mobsuite|vfdb|cazy|bacmet|iceberg)?,?)*(?<!,)$"
},
"upset_plot_columns": {
"type": "string",
"fa_icon": "fas fa-columns",
"description": "Columns to use for making Upset plots of genome features"
}
}
},
Expand Down Expand Up @@ -215,13 +220,13 @@
},
"core_similarity": {
"type": "number",
"default": 99.99,
"default": 99.9,
"fa_icon": "fas fa-clone",
"description": "Similarity threshold for core genomes"
},
"accessory_similarity": {
"type": "number",
"default": 99,
"default": 99.0,
"fa_icon": "far fa-clone",
"description": "Similarity threshold for accessory genes"
}
Expand Down Expand Up @@ -435,15 +440,15 @@
"max_cpus": {
"type": "integer",
"description": "Maximum number of CPUs that can be requested for any single job.",
"default": 16,
"default": 72,
"fa_icon": "fas fa-microchip",
"hidden": true,
"help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`"
},
"max_memory": {
"type": "string",
"description": "Maximum amount of memory that can be requested for any single job.",
"default": "128.GB",
"default": "125.GB",
"fa_icon": "fas fa-memory",
"pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$",
"hidden": true,
Expand All @@ -452,7 +457,7 @@
"max_time": {
"type": "string",
"description": "Maximum amount of time that can be requested for any single job.",
"default": "240.h",
"default": "168.h",
"fa_icon": "far fa-clock",
"pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$",
"hidden": true,
Expand Down
10 changes: 10 additions & 0 deletions subworkflows/local/annotation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ include { CONCAT_OUTPUT as CONCAT_MOBSUITE;
CONCAT_OUTPUT as CONCAT_INTEGRONS } from '../../modules/local/concat_output.nf'
include { FILTER_AND_CONCAT_MATCHES as FILTER_RGI } from '../../modules/local/filter_matches'
include { CREATE_REPORT } from '../../modules/local/create_report'
include { BUILD_UPSET } from '../../modules/local/build_upset/main'

//
// SUBWORKFLOWS
Expand Down Expand Up @@ -356,6 +357,15 @@ workflow ANNOTATE_ASSEMBLIES {
profile = (params.skip_profile_creation) ? profile : CREATE_REPORT.out.profile
}

if (params.upset_plot_columns) {
BUILD_UPSET(
profile,
file(params.input_sample_table),
params.upset_plot_columns
)
}


emit:
annotation = concat_annotation
annotation_software = ch_software_versions
Expand Down
18 changes: 9 additions & 9 deletions test/test_full_samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
sample,fna_file_path
ED073,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED073.fa
ED142,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED142.fa
ED195,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED195.fa
ED287,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED287.fa
ED413,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED413.fa
ED423,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED423.fa
ED603,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED603.fa
ED644,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED644.fa
sample,fna_file_path,group_one,group_two
ED073,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED073.fa,test2,group1
ED142,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED142.fa,test2,group1
ED195,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED195.fa,test2,group1
ED287,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED287.fa,test1,group1
ED413,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED413.fa,test3,group2
ED423,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED423.fa,test3,group2
ED603,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED603.fa,test1,group2
ED644,https://raw.githubusercontent.com/jvfe/arete-test-data/main/test_full/ED644.fa,test1,group2

0 comments on commit 9c16708

Please sign in to comment.