fmalmeida · fmalmeida · Mar 28, 2022 · Nov 18, 2021 · Nov 19, 2021 · Nov 19, 2021
diff --git a/.github/workflows/test_pr.yml → .github/workflows/test_pr_docker.yml b/.github/workflows/test_pr.yml → .github/workflows/test_pr_docker.yml
@@ -1,8 +1,8 @@
-name: Testing pipeline's core for the new PR
+name: Testing new PR with docker
 on:
   pull_request:
-    branches: master
-    types: [ opened, synchronize, reopened ]
+    branches: [ master, dev, develop ]
+    types: [ ready_for_review, synchronize, reopened ]
 
 jobs:
   run_nextflow:
@@ -23,9 +23,17 @@ jobs:
         run: |
           wget -qO- get.nextflow.io | bash
           sudo mv nextflow /usr/local/bin/
-
-      - name: Run the pipeline
+      
+      - name: Clean environment
         run: |
           sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
           sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
-          nextflow run main.nf -profile docker,quicktest --threads 2
+
+      - name: Build bacannot database
+        run: |
+          nextflow run main.nf -profile docker --get_dbs --output bacannot_dbs --max_cpus 2 --max_memory '6.GB' --max_time '6.h'
+          rm -rf bacannot_dbs/antismash_db bacannot_dbs/kofamscan_db bacannot_dbs/prokka_db/PGAP_NCBI.hmm # remove unused in quicktest to diminish size
+
+      - name: Run the pipeline
+        run: |
+          nextflow run main.nf -profile docker,quicktest --bacannot_db bacannot_dbs
diff --git a/.github/workflows/test_pr_singularity.yml b/.github/workflows/test_pr_singularity.yml
@@ -0,0 +1,41 @@
+# CI workflow: runs the quicktest profile with singularity for pull requests.
+name: Testing new PR with singularity
+on:
+  pull_request:
+    branches:
+      - master
+      - dev
+      - develop
+    # NOTE(review): `opened` is not listed, so a PR created directly as non-draft is
+    # presumably not tested until its next push -- confirm this is intended.
+    types:
+      - ready_for_review
+      - synchronize
+      - reopened
+
+jobs:
+  run_nextflow:
+    name: Run pipeline for the upcoming PR
+    runs-on: ubuntu-latest
+
+    steps:
+
+      - name: Check out pipeline code
+        uses: actions/checkout@v2
+
+      # Downloads the nextflow launcher and moves it into /usr/local/bin
+      - name: Install Nextflow
+        env:
+          CAPSULE_LOG: none
+        run: |
+          wget -qO- get.nextflow.io | bash
+          sudo mv nextflow /usr/local/bin/
+
+      - name: Install Singularity
+        uses: eWaterCycle/setup-singularity@v7
+        with:
+          # quoted so the version is always handled as a string
+          singularity-version: '3.8.3'
+
+      # Removes two directories from the runner to free disk space (see per-line notes)
+      - name: Clean environment
+        run: |
+          sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
+          sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
+
+      # Downloads the databases into bacannot_dbs, then drops the parts quicktest does not use
+      - name: Build bacannot database
+        run: |
+          nextflow run main.nf -profile singularity --get_dbs --output bacannot_dbs --max_cpus 2 --max_memory '6.GB' --max_time '6.h'
+          rm -rf bacannot_dbs/antismash_db bacannot_dbs/kofamscan_db bacannot_dbs/prokka_db/PGAP_NCBI.hmm # remove unused in quicktest to diminish size
+
+      - name: Run the pipeline
+        run: |
+          nextflow run main.nf -profile singularity,quicktest --bacannot_db bacannot_dbs
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,3 @@
 .Ruserdata
 TESTE
 docs/_html
-teste
diff --git a/.gitpod.yml b/.gitpod.yml
@@ -0,0 +1,29 @@
+# Gitpod workspace definition: base image, start-up task, editor extensions and ports.
+image: nfcore/gitpod:latest
+
+# The `before` task installs nextflow, tiptop and nf-core, then prepares a
+# world-writable /testing directory and links it into the workspace.
+# /testing is created with sudo because it sits directly under / and the
+# following chmod on the same path already relies on sudo.
+tasks:
+  - before: |
+        wget -qO- get.nextflow.io | bash
+        chmod 777 nextflow
+        sudo mv nextflow /usr/local/bin/
+        pip install tiptop
+        pip install nf-core
+        sudo mkdir -p /testing
+        sudo chmod 777 -R /testing
+        ln -rs /testing .
+
+vscode:
+  extensions: # based on nf-core.nf-core-extensionpack
+    - codezombiech.gitignore                 # Language support for .gitignore files
+    # - cssho.vscode-svgviewer                 # SVG viewer
+    - davidanson.vscode-markdownlint         # Markdown/CommonMark linting and style checking for Visual Studio Code
+    - eamodio.gitlens                        # Quickly glimpse into whom, why, and when a line or code block was changed
+    - EditorConfig.EditorConfig              # override user/workspace settings with settings found in .editorconfig files
+    - Gruntfuggly.todo-tree                  # Display TODO and FIXME in a tree view in the activity bar
+    - mechatroner.rainbow-csv                # Highlight columns in csv files in different colors
+    # - nextflow.nextflow                      # Nextflow syntax highlighting
+    - oderwat.indent-rainbow                 # Highlight indentation level
+    - streetsidesoftware.code-spell-checker  # Spelling checker for source code
+
+# Port 3000 is opened in the preview pane when it becomes available
+ports:
+  - port: 3000
+    onOpen: open-preview
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -1,23 +1,20 @@
-# .readthedocs.yml
+# .readthedocs.yaml
 # Read the Docs configuration file
 # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 
 # Required
 version: 2
 
-# Build documentation in the docs/ directory with Sphinx
-sphinx:
-  configuration: docs/conf.py
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-20.04
+  tools:
+    python: "3.9"
 
-# Build documentation with MkDocs
-# mkdocs:
-#  configuration: mkdocs.yml
+mkdocs:
+  configuration: mkdocs.yml
 
-# Optionally build your docs in additional formats such as PDF and ePub
-formats: all
-
-# Optionally set the version of Python and requirements required to build your docs
+# Optionally declare the Python requirements required to build your docs
 python:
-  version: 3.7
-  install:
-    - requirements: docs/requirements.txt
+  install:
+    - requirements: docs/requirements.txt
diff --git a/.zenodo.json b/.zenodo.json
@@ -1,8 +1,8 @@
 {
-    "description": "<p>The pipeline</p>\n\n<p>bacannot, is a customisable, easy to use, pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, secondary metabolites, genomic islands, prophages, ICEs, KO, and more.</p>", 
+    "description": "<p>The pipeline</p>\n\n<p>bacannot, is a customisable, easy to use, pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, secondary metabolites, genomic islands, prophages, ICEs, KO, and more, while providing nice and beautiful interactive documents for results exploration.</p>", 
     "license": "other-open", 
     "title": "fmalmeida/bacannot: A generic but comprehensive bacterial annotation pipeline", 
-    "version": "v3.0", 
+    "version": "v3.1", 
     "upload_type": "software",
     "creators": [
         {

diff --git a/README.md b/README.md
@@ -7,6 +7,8 @@
 [![Nextflow version](https://img.shields.io/badge/Nextflow%20>=-v20.07-important)](https://www.nextflow.io/docs/latest/getstarted.html)
 [![License](https://img.shields.io/badge/License-GPL%203-black)](https://github.com/fmalmeida/bacannot/blob/master/LICENSE)
 
+[![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/github.com/fmalmeida/bacannot)
+
 <p align="center">
 
   <h1 align="center">bacannot pipeline</h2>

diff --git a/bin/addBedtoolsIntersect.R b/bin/addBedtoolsIntersect.R
@@ -0,0 +1,121 @@
+#!/usr/bin/Rscript
+# Setting Help
+'usage: addBedtoolsIntersect.R [--txt=<file> --gff=<file> --type=<chr> --source=<chr> --out=<chr>]
+
+options:
+-g, --gff=<file>      GFF file to merge annotation
+-t, --txt=<file>      Bedtools intersect file
+--type=<chr>          Feature type [default: BLAST]
+--source=<chr>        Feature source [default: CDS]
+-o, --out=<chr>       Output file name [default: out.gff]' -> doc
+
+# Parse the command line against the usage text stored in `doc`
+suppressMessages(library(docopt))
+opt <- docopt(doc)
+
+# Both input files are mandatory: abort with a message when either is missing
+if (is.null(opt$gff)) stop("At least one argument must be supplied (gff file)\n", call.=FALSE)
+
+if (is.null(opt$txt)) stop("At least one argument must be supplied (intersection file)\n", call.=FALSE)
+
+# Load libraries
+suppressMessages(library(ballgown))
+suppressMessages(library(DataCombine))
+suppressMessages(library(dplyr))
+suppressMessages(library(stringr))
+suppressMessages(library(tidyr))
+
+# Collapse a comma-separated string so that every value appears only once,
+# keeping the order of first appearance (e.g. "CDS,BLAST,CDS" -> "CDS,BLAST")
+reduce_row <- function(i) {
+  values <- unlist(strsplit(i, split = ","))
+  deduplicated <- unique(values)
+  return(paste(deduplicated, collapse = ","))
+}
+
+# Extract one `key=value` attribute from attribute strings.
+#   x       : vector of attribute strings (coerced to character)
+#   field   : attribute key to look up (e.g. "ID")
+#   attrsep : separator between the attributes of one string
+# For every element of x the result is the text found right after the first
+# "=" of the first attribute whose key equals `field`, or NA (character)
+# when no attribute has that key.
+getAttributeField <- function (x, field, attrsep = ";") {
+  attribute_lists <- strsplit(as.character(x), split = attrsep, fixed = TRUE)
+  sapply(attribute_lists, function(atts) {
+    pairs <- strsplit(atts, split = "=", fixed = TRUE)
+    keys <- sapply(pairs, "[", 1)
+    hit <- match(field, keys)
+    if (is.na(hit)) {
+      return(as.character(NA))
+    }
+    return(pairs[[hit]][2])
+  })
+}
+
+# "Not in" operator: the logical negation of %in%, i.e. TRUE for the elements
+# of the left-hand side that are absent from the right-hand side
+'%ni%' <- Negate('%in%')
+
+# Main logic: when the intersection file has content, annotate the GFF lines
+# whose ID appears in it; otherwise write the GFF back out as read by gffRead.
+if (file.info(opt$txt)$size > 0) {
+
+  # Load GFF file
+  gff <- gffRead(opt$gff)
+
+  # Create a column in the GFF with ids
+  gff$ID <- getAttributeField(gff$attributes, "ID", ";")
+
+  # Load intersection file
+  # Read as headerless tab-separated text and named as two 9-column GFF-like
+  # records plus a final "len" column
+  # NOTE(review): presumably the output of bedtools intersect on two GFF files -- confirm
+  bedtools_intersect <- read.csv(opt$txt, header = F, sep = "\t")
+  colnames(bedtools_intersect) <- c("seqname1", "source1", "feature1", "start1", "end1", "score1", "strand1", "frame1", "attributes1",
+                     "seqname2", "source2", "feature2", "start2", "end2", "score2", "strand2", "frame2", "attributes2",
+                     "len")
+
+  # Create a column in the intersection file with ids
+  # (the ID is taken from the attributes of the second record of each line)
+  bedtools_intersect$ID <- getAttributeField(bedtools_intersect$attributes2, "ID", ";")
+
+  # save ids
+  ids <- bedtools_intersect$ID
+
+  # Subset based on gene IDs
+  ## Lines with our IDs
+  sub <- gff %>%
+    filter(ID %in% ids) %>%
+    select(seqname, source, feature, start, end, score, strand, frame, attributes, ID)
+  ## Lines without our IDs
+  not <- gff %>%
+    filter(ID %ni% ids) %>%
+    select(seqname, source, feature, start, end, score, strand, frame, attributes)
+
+  # Change fields values
+  ## source
+  ## (the --source value is appended, comma-separated, to the existing source)
+  ## NOTE(review): the usage text gives --source the default "CDS" and --type the
+  ## default "BLAST"; in GFF terms these two defaults look swapped -- confirm
+  s <- sub$source
+  sn <- as.character(opt$source)
+  snew <- paste(s, sn, sep = ",")
+  sub$source <- snew
+
+  ## feature
+  ## (the --type value is appended, comma-separated, to the existing feature)
+  f <- sub$feature
+  fn <- as.character(opt$type)
+  fnew <- paste(f, fn, sep = ",")
+  sub$feature <- fnew
+
+  ## attributes
+  ## Join the intersection columns onto the selected GFF lines by ID, rewrite
+  ## "ID=" in attributes1 to "<source>_ID=" and append the result to the
+  ## original attributes with ";"
+  ## NOTE(review): gsub replaces every "ID=" substring of attributes1, which
+  ## would include keys that merely end in "ID" -- confirm this is acceptable
+  sub <- merge.data.frame(sub, bedtools_intersect, by = "ID", all = TRUE)
+  new_ID <- paste(opt$source, "_ID=", sep = "", collapse = "")
+  sub$attributes1 <- gsub(pattern = "ID=", replacement = as.character(new_ID), x=sub$attributes1)
+  sub <- unite(sub, "attributes", c("attributes", "attributes1"), sep = ";") %>%
+    select(seqname, source, feature, start, end, score, strand, frame, attributes)
+
+  # Merge files
+  # Recombine annotated and untouched lines, drop repeated values inside the
+  # feature and source fields, then order the rows by their attributes column
+  # (str_order with numeric = TRUE)
+  merged_df <- merge.data.frame(sub, not, all = TRUE)
+  feat <- merged_df$feature
+  merged_df$feature <- sapply(feat, reduce_row)
+  source <- merged_df$source
+  merged_df$source <- sapply(source, reduce_row)
+  merged_df <- merged_df[str_order(merged_df$attributes, numeric = TRUE), ]
+
+  # Write output
+  # (tab-separated, without quotes, header or row names)
+  write.table(merged_df, file = opt$out, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
+
+} else {
+  # Empty intersection file: nothing to add
+  # Load GFF file
+  gff <- gffRead(opt$gff)
+  # Write output
+  write.table(gff, file = opt$out, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
+}
diff --git a/docker/scripts/rscripts/addBlast2Gff.R → bin/addBlast2Gff.R b/docker/scripts/rscripts/addBlast2Gff.R → bin/addBlast2Gff.R
diff --git a/docker/scripts/rscripts/addCardDescription.R → bin/addCardDescription.R b/docker/scripts/rscripts/addCardDescription.R → bin/addCardDescription.R
diff --git a/docker/scripts/rscripts/addKO2Gff.R → bin/addKO2Gff.R b/docker/scripts/rscripts/addKO2Gff.R → bin/addKO2Gff.R
diff --git a/docker/scripts/rscripts/addNCBIamr2Gff.R → bin/addNCBIamr2Gff.R b/docker/scripts/rscripts/addNCBIamr2Gff.R → bin/addNCBIamr2Gff.R
diff --git a/docker/scripts/rscripts/addRGI2gff.R → bin/addRGI2gff.R b/docker/scripts/rscripts/addRGI2gff.R → bin/addRGI2gff.R
diff --git a/bin/build_image.sh b/bin/build_image.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+name=$(basename $(pwd))
+docker build -t fmalmeida/bacannot:${1}_${name} .
diff --git a/bin/calculate_methylation_frequency.py b/bin/calculate_methylation_frequency.py
@@ -0,0 +1,78 @@
+#! /usr/bin/env python3
+
+import sys
+import csv
+import argparse
+import gzip
+
+class SiteStats:
+    def __init__(self, g_size, g_seq):
+        self.num_reads = 0
+        self.called_sites = 0
+        self.called_sites_methylated = 0
+        self.group_size = g_size
+        self.sequence = g_seq
+
+def update_call_stats(key, num_called_cpg_sites, is_methylated, sequence):
+    if key not in sites:
+        sites[key] = SiteStats(num_called_cpg_sites, sequence)
+
+    sites[key].num_reads += 1
+    sites[key].called_sites += num_called_cpg_sites
+    if is_methylated > 0:
+        sites[key].called_sites_methylated += num_called_cpg_sites
+
+parser = argparse.ArgumentParser( description='Calculate methylation frequency at genomic CpG sites')
+parser.add_argument('-c', '--call-threshold', type=float, required=False, default=2.0)
+parser.add_argument('-s', '--split-groups', action='store_true')
+args, input_files = parser.parse_known_args()
+assert(args.call_threshold is not None)
+
+sites = dict()
+# iterate over input files and collect per-site stats
+for f in input_files:
+    if f[-3:] == ".gz":
+        in_fh = gzip.open(f, 'rt')
+    else:
+        in_fh = open(f)
+    csv_reader = csv.DictReader(in_fh, delimiter='\t')
+    for record in csv_reader:
+
+        num_sites = int(record['num_motifs'])
+        llr = float(record['log_lik_ratio'])
+
+        # Skip ambiguous call
+        if abs(llr) < args.call_threshold * num_sites:
+            continue
+        sequence = record['sequence']
+
+        is_methylated = llr > 0
+
+        # if this is a multi-cpg group and split_groups is set, break up these sites
+        if args.split_groups and num_sites > 1:
+            c = str(record['chromosome'])
+            s = int(record['start'])
+            e = int(record['end'])
+
+            # find the position of the first CG dinucleotide
+            sequence = record['sequence']
+            cg_pos = sequence.find("CG")
+            first_cg_pos = cg_pos
+            while cg_pos != -1:
+                key = (c, s + cg_pos - first_cg_pos, s + cg_pos - first_cg_pos)
+                update_call_stats(key, 1, is_methylated, "split-group")
+                cg_pos = sequence.find("CG", cg_pos + 1)
+        else:
+            key = (str(record['chromosome']), int(record['start']), int(record['end']))
+            update_call_stats(key, num_sites, is_methylated, sequence)
+
+# header
+print("\t".join(["chromosome", "start", "end", "num_motifs_in_group", "called_sites", "called_sites_methylated", "methylated_frequency", "group_sequence"]))
+
+sorted_keys = sorted(list(sites.keys()), key = lambda x: x)
+
+for key in sorted_keys:
+    if sites[key].called_sites > 0:
+        (c, s, e) = key
+        f = float(sites[key].called_sites_methylated) / sites[key].called_sites
+        print("%s\t%s\t%s\t%d\t%d\t%d\t%.3f\t%s" % (c, s, e, sites[key].group_size, sites[key].called_sites, sites[key].called_sites_methylated, f, sites[key].sequence))
diff --git a/bin/config.yml b/bin/config.yml
@@ -0,0 +1,17 @@
+# Settings grouped in three sections: hmmer, phigaro and prodigal.
+# NOTE(review): the upper-case `bin` / `pvog_path` values look like placeholders
+# that are substituted before this file is used -- confirm against the code
+# that consumes it.
+hmmer:
+  bin: CHANGE_HMMSEARCH
+  e_value_threshold: 0.00445
+  pvog_path: CHANGE_PVOG
+# NOTE(review): the numeric values below are presumably tool defaults -- verify
+# before changing any of them.
+phigaro:
+  mean_gc: 0.46354823199323625
+  penalty_black: 2.2
+  penalty_white: 0.7
+  threshold_max_abs: 52.96
+  threshold_max_basic: 46.0
+  threshold_max_without_gc: 11.42
+  threshold_min_abs: 50.32
+  threshold_min_basic: 45.39
+  threshold_min_without_gc: 11.28
+  window_len: 32
+prodigal:
+  bin: CHANGE_PRODIGAL
diff --git a/docker/scripts/bscripts/draw_gis.sh → bin/draw_gis.sh b/docker/scripts/bscripts/draw_gis.sh → bin/draw_gis.sh