diff --git a/.editorconfig b/.editorconfig index dd9ffa5..72dda28 100644 --- a/.editorconfig +++ b/.editorconfig @@ -28,10 +28,6 @@ indent_style = unset [/assets/email*] indent_size = unset -# ignore Readme -[README.md] -indent_style = unset - -# ignore python +# ignore python and markdown [*.{py,md}] indent_style = unset diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6f3b92d..a0314bc 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -18,7 +18,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/scna - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/scnanoseq/tree/master/.github/CONTRIBUTING.md) - [ ] If necessary, also make a PR on the nf-core/scnanoseq _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). -- [ ] Ensure the test suite passes (`nf-test test main.nf.test -profile test,docker`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 480e4ea..490d087 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -8,12 +8,12 @@ on: types: [published] workflow_dispatch: jobs: - run-tower: + run-platform: name: Run AWS full tests if: github.repository == 'nf-core/scnanoseq' runs-on: ubuntu-latest steps: - - name: Launch workflow via tower + - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} @@ -30,7 +30,7 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: Tower debug log file + name: Seqera Platform debug log file path: | - tower_action_*.log - tower_action_*.json + seqera_platform_action_*.log + seqera_platform_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 11e0fdb..46091f9 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -5,13 +5,13 @@ name: nf-core AWS test on: workflow_dispatch: jobs: - run-tower: + run-platform: name: Run AWS tests if: github.repository == 'nf-core/scnanoseq' runs-on: ubuntu-latest steps: - # Launch workflow using Tower CLI tool action - - name: Launch workflow via tower + # Launch workflow using Seqera Platform CLI tool action + - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} @@ -27,7 +27,7 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: Tower debug log file + name: Seqera Platform debug log file path: | - tower_action_*.log - tower_action_*.json + seqera_platform_action_*.log + seqera_platform_action_*.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 47e8e1b..3670946 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,10 +29,10 @@ jobs: - "latest-everything" steps: - name: Check out pipeline code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + 
uses: nf-core/setup-nextflow@v2 with: version: "${{ matrix.NXF_VER }}" diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index 08622fd..2d20d64 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -14,6 +14,8 @@ on: pull_request: types: - opened + - edited + - synchronize branches: - master pull_request_target: @@ -28,11 +30,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - name: Disk space cleanup + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: "3.11" + python-version: "3.12" architecture: "x64" - uses: eWaterCycle/setup-singularity@931d4e31109e875b13309ae1d07c70ca8fbc8537 # v7 with: @@ -65,8 +70,17 @@ jobs: - name: Inspect download run: tree ./${{ env.REPOTITLE_LOWERCASE }} - - name: Run the downloaded pipeline + - name: Run the downloaded pipeline (stub) + id: stub_run_pipeline + continue-on-error: true env: NXF_SINGULARITY_CACHEDIR: ./ NXF_SINGULARITY_HOME_MOUNT: true run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results + - name: Run the downloaded pipeline (stub run not supported) + id: run_pipeline + if: ${{ job.steps.stub_run_pipeline.status == failure() }} + env: + NXF_SINGULARITY_CACHEDIR: ./ + NXF_SINGULARITY_HOME_MOUNT: true + run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -profile test,singularity --outdir ./results diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index abfa0e4..edd374a 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 with: token: ${{ secrets.nf_core_bot_auth_token }} @@ -32,9 +32,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} # Install and run pre-commit - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: 3.11 + python-version: "3.12" - name: Install pre-commit run: pip install pre-commit diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index cf13947..1a08eb1 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -15,13 +15,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - name: Set up Python 3.11 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - name: Set up Python 3.12 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: 3.11 - cache: "pip" + python-version: "3.12" - name: Install pre-commit run: pip install pre-commit @@ -33,14 +32,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + 
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: "3.11" + python-version: "3.12" architecture: "x64" - name: Install dependencies @@ -61,7 +60,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index b706875..40acc23 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 + uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 with: workflow: linting.yml workflow_conclusion: completed diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml index d468aea..03ecfcf 100644 --- a/.github/workflows/release-announcements.yml +++ b/.github/workflows/release-announcements.yml @@ -12,7 +12,7 @@ jobs: - name: get topics and convert to hashtags id: get_topics run: | - curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .topics[]' | awk '{print "#"$0}' | tr '\n' ' ' >> $GITHUB_OUTPUT + echo "topics=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .topics[]' | awk '{print "#"$0}' | tr '\n' ' ')" >> $GITHUB_OUTPUT - uses: rzr/fediverse-action@master with: @@ -25,13 +25,13 @@ jobs: Please see the changelog: ${{ github.event.release.html_url }} - ${{ steps.get_topics.outputs.GITHUB_OUTPUT }} #nfcore #openscience #nextflow #bioinformatics + ${{ steps.get_topics.outputs.topics }} #nfcore #openscience #nextflow #bioinformatics send-tweet: runs-on: ubuntu-latest steps: - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: python-version: "3.10" - name: Install dependencies diff --git a/.nf-core.yml b/.nf-core.yml index 465b8c9..9480b62 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,4 +1,5 @@ repository_type: pipeline +nf_core_version: "2.14.1" pipeline_todos: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index af57081..4dc0f1d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,6 +3,9 @@ repos: rev: "v3.1.0" hooks: - id: prettier + additional_dependencies: + - prettier@3.2.5 + - repo: https://github.com/editorconfig-checker/editorconfig-checker.python rev: "2.7.3" hooks: diff --git a/CITATIONS.md b/CITATIONS.md index a593d9d..3e3c488 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -50,7 +50,7 @@ - [ToulligQC](https://github.com/GenomiqueENS/toulligQC) - >Karine Dias, Bérengère Laffay, Lionel Ferrato-Berberian, Sophie Lemoine, Ali Hamraoui, Morgane Thomas-Chollier, Stéphane Le Crom and Laurent Jourdren. + > Karine Dias, Bérengère Laffay, Lionel Ferrato-Berberian, Sophie Lemoine, Ali Hamraoui, Morgane Thomas-Chollier, Stéphane Le Crom and Laurent Jourdren. 
- [UMI-tools](https://pubmed.ncbi.nlm.nih.gov/28100584/) diff --git a/README.md b/README.md index 94d82ee..0ec4c7e 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) -[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/scnanoseq) +[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/scnanoseq) [![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23scnanoseq-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/scnanoseq)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) @@ -29,7 +29,7 @@ On release, automated continuous integration tests run the pipeline on a full-si ![scnanoseq diagram](assets/scnanoseq_diagram.png) -1. Raw read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), [`NanoPlot`](https://github.com/wdecoster/NanoPlot), [`NanoComp`](https://github.com/wdecoster/nanocomp) and [`ToulligQC`](https://github.com/GenomiqueENS/toulligQC)) +1. Raw read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), [`NanoPlot`](https://github.com/wdecoster/NanoPlot), [`NanoComp`](https://github.com/wdecoster/nanocomp) and [`ToulligQC`](https://github.com/GenomiqueENS/toulligQC)) 2. Unzip and split FastQ ([`gunzip`](https://linux.die.net/man/1/gunzip)) 1. Optional: Split fastq for faster processing ([`split`](https://linux.die.net/man/1/split)) 3. Trim and filter reads. ([`Nanofilt`](https://github.com/wdecoster/nanofilt)) @@ -41,11 +41,8 @@ On release, automated continuous integration tests run the pipeline on a full-si 7. Barcode correction (custom script `./bin/correct_barcodes.py`) 8. Post-extraction QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), [`NanoPlot`](https://github.com/wdecoster/NanoPlot) and [`ToulligQC`](https://github.com/GenomiqueENS/toulligQC)) 9. Alignment ([`minimap2`](https://github.com/lh3/minimap2)) -10. SAMtools processing including ([`SAMtools`](http://www.htslib.org/doc/samtools.html)): - 1. SAM to BAM - 2. Filtering of mapped only reads - 3. Sorting, indexing and obtain mapping metrics -11. Post-mapping QC in unfiltered BAM files ([`NanoComp`](https://github.com/wdecoster/nanocomp), [`RSeQC`](https://rseqc.sourceforge.net/)) +10. Post-alignment filtering of mapped reads and gathering mapping QC ([`SAMtools`](http://www.htslib.org/doc/samtools.html)) +11. Post-alignment QC in unfiltered BAM files ([`NanoComp`](https://github.com/wdecoster/nanocomp), [`RSeQC`](https://rseqc.sourceforge.net/)) 12. 
Barcode tagging with read quality, BC, BC quality, UMI, and UMI quality (custom script `./bin/tag_barcodes.py`) 13. UMI-based deduplication [`UMI-tools`](https://github.com/CGATOxford/UMI-tools) 14. Gene and transcript level matrices generation. [`IsoQuant`](https://github.com/ablab/IsoQuant) @@ -110,9 +107,9 @@ If you experience any issues, please make sure to submit an issue above. However process { withName: '.*:.*FASTQC.*' - { + { cpus = 20 - } + } } //NOTE: reminder that params set in modules.config need to be copied over to a custom config diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index f7e381f..690a26e 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -57,8 +57,7 @@ top_modules: custom_content: read_counts_section: parent_id: read_counts_section - order: - read_counts_module + order: read_counts_module seurat_section: parent_id: seurat_section @@ -74,7 +73,7 @@ custom_data: section_name: "Read Counts" file_format: "csv" plot_type: "table" - + gene_seurat_stats_module: parent_id: seurat_section parent_name: "Seurat Section" diff --git a/assets/nf-core-scnanoseq_logo_light.png b/assets/nf-core-scnanoseq_logo_light.png index 4233867..f400846 100644 Binary files a/assets/nf-core-scnanoseq_logo_light.png and b/assets/nf-core-scnanoseq_logo_light.png differ diff --git a/bin/config.py b/bin/config.py index a5b2892..790eeea 100644 --- a/bin/config.py +++ b/bin/config.py @@ -15,7 +15,7 @@ ADPT_MAC_MATCH_ED=2 #minimum proportion of match required when searching ## format suffix -SEQ_SUFFIX_WIN=200 +SEQ_SUFFIX_WIN=200 SEQ_SUFFIX_MIN_MATCH_PROP=1 SEQ_SUFFIX_AFT_ADPT=(20,50) @@ -61,13 +61,12 @@ def high_sensitivity_threshold_calculation(count_array, exp_cells): DEFAULT_BC_STAT_FN = "summary.txt" DEFAULT_EMPTY_DROP_MIN_ED = 5 # minimum edit distance from emtpy drop BC to selected BC DEFAULT_EMPTY_DROP_NUM = 2000 # number of BC in the output - #################################################### ##### DEFAULT in Demultiplexing ###### #################################################### -DEFAULT_ASSIGNMENT_ED = 2 +DEFAULT_ASSIGNMENT_ED = 2 # Make sure this is smaller than DEFAULT_GRB_FLANKING_SIZE assert DEFAULT_GRB_FLANKING_SIZE >= DEFAULT_ASSIGNMENT_ED DEFAULT_ED_FLANKING = DEFAULT_ASSIGNMENT_ED diff --git a/bin/find_reads.py b/bin/find_reads.py index 45a52c2..9a4bd1a 100644 --- a/bin/find_reads.py +++ b/bin/find_reads.py @@ -1,69 +1,70 @@ -# find specific reads with given read id and output to a new fastq file - - -import argparse -import Bio.SeqIO -import multiprocessing as mp -import textwrap -from pathlib import Path - - -import helper - -def parse_arg(): - parser = argparse.ArgumentParser( - description=textwrap.dedent( - ''' - Find specific reads with given read id and output to a new fastq file - ''')) - - # Required positional argument - parser.add_argument('input_fastq_dir', type=str, - help='Fastq directory, Note that this should be a folder.') - - # required name argment - requiredNamed = parser.add_argument_group('These Arguments are required') - requiredNamed.add_argument( - '--output_file', type=str, required = True, - help='Filename for the output fastq.') - requiredNamed.add_argument('--id_file', type=str, required = True, - help='A file containing all the read ids to look for.') - - parser.add_argument('--threads', type=int, - help='Number of threads. 
Default: # of CPU - 1') - - - args = parser.parse_args() - - return args - -def find_reads(in_fastq, id_list): - fastq = Bio.SeqIO.parse(in_fastq, "fastq") - read_list = [r for r in fastq if r.id in id_list] - return read_list - # check id -def main(args): - - # get ids (from args.id_file) - ids = [] - with open (args.id_file, 'r') as f: - for line in f: - ids.append(line.strip()) - - - fastq_fns = list(Path(args.input_fastq_dir).rglob('*.fastq')) - rst_futures = helper.multiprocessing_submit(find_reads, - fastq_fns, - n_process=args.threads, - id_list = ids) - - rst_ls = [] - for f in rst_futures: - rst_ls+=f.result() - - print(len(rst_ls)) - Bio.SeqIO.write(rst_ls, args.output_file, 'fastq') - -if __name__ == '__main__': - args = parse_arg() - main(args) \ No newline at end of file +# find specific reads with given read id and output to a new fastq file + + +import argparse +import Bio.SeqIO +import multiprocessing as mp +import textwrap +from pathlib import Path + + +import helper + +def parse_arg(): + parser = argparse.ArgumentParser( + description=textwrap.dedent( + ''' + Find specific reads with given read id and output to a new fastq file + ''')) + + # Required positional argument + parser.add_argument('input_fastq_dir', type=str, + help='Fastq directory, Note that this should be a folder.') + + # required name argment + requiredNamed = parser.add_argument_group('These Arguments are required') + requiredNamed.add_argument( + '--output_file', type=str, required = True, + help='Filename for the output fastq.') + requiredNamed.add_argument('--id_file', type=str, required = True, + help='A file containing all the read ids to look for.') + + parser.add_argument('--threads', type=int, + help='Number of threads. Default: # of CPU - 1') + + + args = parser.parse_args() + + return args + +def find_reads(in_fastq, id_list): + fastq = Bio.SeqIO.parse(in_fastq, "fastq") + read_list = [r for r in fastq if r.id in id_list] + return read_list + # check id +def main(args): + + # get ids (from args.id_file) + ids = [] + with open (args.id_file, 'r') as f: + for line in f: + ids.append(line.strip()) + + + fastq_fns = list(Path(args.input_fastq_dir).rglob('*.fastq')) + rst_futures = helper.multiprocessing_submit(find_reads, + fastq_fns, + n_process=args.threads, + id_list = ids) + + rst_ls = [] + for f in rst_futures: + rst_ls+=f.result() + + print(len(rst_ls)) + Bio.SeqIO.write(rst_ls, args.output_file, 'fastq') + +if __name__ == '__main__': + args = parse_arg() + main(args) + diff --git a/bin/generate_read_counts.sh b/bin/generate_read_counts.sh index aea42cc..4c42399 100755 --- a/bin/generate_read_counts.sh +++ b/bin/generate_read_counts.sh @@ -39,7 +39,7 @@ do data="$(basename $sample_name)" # RAW FASTQ COUNTS - + if [[ -s "$raw_fastqc" ]] then fastqc_counts=$(get_fastqc_counts "$raw_fastqc") @@ -47,9 +47,9 @@ do else data="$data," fi - + # TRIM COUNTS - + if [[ -s "$trim_fastqc" ]] then trim_counts=$(get_fastqc_counts "$trim_fastqc") @@ -57,9 +57,9 @@ do else data="$data," fi - + # PREEXTRACT COUNTS - + if [ -s "$extract_fastqc" ] then extract_counts=$(get_fastqc_counts "$extract_fastqc") @@ -67,10 +67,9 @@ do else data="$data," fi - + # CORRECT COUNTS - - + if [ -s $correct_csv ] then correct_counts=$(cut -f6 $correct_csv | awk '{if ($0 != "") {print $0}}' | wc -l) diff --git a/bin/helper.py b/bin/helper.py index fd068a1..3cdfa59 100644 --- a/bin/helper.py +++ b/bin/helper.py @@ -19,7 +19,7 @@ def reverse_complement(seq): Returns: reverse_complement seq ''' - comp = {'A': 'T', 'C': 'G', 'G': 
'C', 'T': 'A', + comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a'} letters = \ [comp[base] if base in comp.keys() else base for base in seq] @@ -28,8 +28,8 @@ def reverse_complement(seq): def err_msg(msg): CRED = '\033[91m' CEND = '\033[0m' - print(CRED + msg + CEND) - + print(CRED + msg + CEND) + def warning_msg(msg, printit = True): CRED = '\033[93m' CEND = '\033[0m' @@ -47,11 +47,11 @@ def green_msg(msg, printit = True): return CRED + msg + CEND def sliding_window_sum(array, window) : - cum = np.cumsum(array) + cum = np.cumsum(array) return cum[window:] - cum[:-window] def sliding_window_mean(array, window) : - cum = np.cumsum(array) + cum = np.cumsum(array) return (cum[window:] - cum[:-window]) / window @@ -62,22 +62,22 @@ def __init__(self, **kwargs): def add(self, attr_name, attr_val, overwrite = True): if attr_name in __dict__.keys() and not overwrite: pass - else: + else: self.__dict__[attr_name] = attr_val def rm(self, attr_name): if attr_name in self.__dict__.keys(): del self.__dict__[attr_name] def __str__(self): return str(self.__dict__) - + def check(self, attr_list, add_none = True, silent = True): """ - Check whether the attributes in a given list are present. + Check whether the attributes in a given list are present. Parameters ---------- attr_list : LIST - list of strings of the attributes to check + list of strings of the attributes to check add_none : BOOL whether or not to create the missed attributes with value None silent : BOOL @@ -91,19 +91,19 @@ def check(self, attr_list, add_none = True, silent = True): assert isinstance(silent, bool) except (AttributeError, TypeError): raise AssertionError("add_none and silent must be bool variable.") - + check_res = True for attr in attr_list: if attr not in self.__dict__.keys(): check_res = False self.__dict__[attr] = None return check_res if not silent else True - + def multiprocessing_submit(func, iterator, n_process=mp.cpu_count()-1 , - pbar=True, pbar_unit='Read',pbar_func=len, + pbar=True, pbar_unit='Read',pbar_func=len, schduler = 'process', *arg, **kwargs): - """multiple processing or threading, + """multiple processing or threading, Args: func: function to be run parallely @@ -138,7 +138,7 @@ def result(self): if pbar: _pbar = tqdm(unit=pbar_unit, desc='Processed') - + # run in single process/thread if n_process < 1 if n_process <= 1: for it in iterator: @@ -151,7 +151,7 @@ def result(self): max_queue = n_process futures = {} n_job_in_queue = 0 - + # make sure the result is yield in the order of submit. 
job_idx = 0 job_completed = {} @@ -171,12 +171,12 @@ def result(self): # batch size as value, release the cpu as soon as one job finished job = next(as_completed(futures), None) - # yield the completed job in the order of submit + # yield the completed job in the order of submit if job is not None: job_completed[futures[job][1]] = job, futures[job][0] del futures[job] - # + # if job is None and i is None and len(job_completed)==0: break @@ -187,25 +187,25 @@ def result(self): _pbar.update(job_completed[job_to_yield][1]) yield job_completed[job_to_yield][0] del job_completed[job_to_yield] - + # submit new job i = next(iterator, None) if i is not None: futures[executor.submit(func, i, *arg, **kwargs)] = (pbar_func(i),job_idx) job_idx += 1 - + job_to_yield += 1 -# multiproces panda data frame +# multiproces panda data frame def procee_batch(df, row_func, *arg, **kwargs): return df.apply(row_func, axis=1, *arg, **kwargs) def df_multiproceccing_apply(df, func, n_process, aggr_func=pd.concat, pbar = True, pbar_unit='Read',pbar_func=len,*arg, **kwargs): - """This is a re-implementation which is very similar to dask apply. + """This is a re-implementation which is very similar to dask apply. """ def sub_df_generator(df, n_part): - """ - Generator: split a large df by row into multiple + """ + Generator: split a large df by row into multiple """ sub_size = int(np.ceil(len(df) / n_part)) batch_start, batch_end = 0, sub_size @@ -215,25 +215,25 @@ def sub_df_generator(df, n_part): batch_end += sub_size #run 100 batch per process on average but ensure at least 1000 reads per batch - num_batch = min(int(len(df)/1000)+1, n_process*100) + num_batch = min(int(len(df)/1000)+1, n_process*100) df_iter = sub_df_generator(df, n_part=num_batch) - + rst_futures = multiprocessing_submit(procee_batch, df_iter, n_process=n_process , - pbar = pbar, pbar_unit=pbar_unit,pbar_func=pbar_func, + pbar = pbar, pbar_unit=pbar_unit,pbar_func=pbar_func, schduler = 'process', row_func = func, *arg, **kwargs) for i in rst_futures: yield i.result() # rsts = [x.result() for x in rst_futures] # return aggr_func(rsts) - + # concatenate multiple files def concatenate_files(output_file, *input_files): with open(output_file, 'wb') as outfile: for input_file in input_files: with open(input_file, 'rb') as infile: shutil.copyfileobj(infile, outfile) - os.remove(input_file) + os.remove(input_file) # get file with a certian extensions def get_files(search_dir, extensions, recursive=True): @@ -257,7 +257,7 @@ def check_exist(file_list): if exit_code == 1: sys.exit() -# split any iterator in to batches +# split any iterator in to batches def batch_iterator(iterator, batch_size): """generateor of batches of items in a iterator with batch_size. """ @@ -266,7 +266,7 @@ def batch_iterator(iterator, batch_size): for entry in iterator: i += 1 batch.append(entry) - + if i == batch_size: yield batch batch = [] @@ -285,7 +285,7 @@ def fastq_parser(file_handle): next(file_handle) # skip '+' q_letter = next(file_handle) yield read_tuple(id[1:].split()[0], seq.strip(), q_letter.strip()) - + # validate filename (check suffix): @@ -300,6 +300,6 @@ def check_suffix(filename, suffix_lst): return filename.endswith(suffix_lst) if any([filename.endswith(x) for x in suffix_lst]): return True - else: + else: return False diff --git a/bin/main.py b/bin/main.py index cab5d5b..51b1611 100755 --- a/bin/main.py +++ b/bin/main.py @@ -13,7 +13,7 @@ 3. BC rank plot 4. 
csv file of the BC whitelist """ - + import sys import shlex import os @@ -48,23 +48,23 @@ logger.setLevel(logging.DEBUG) def parse_arg(argv): - + def print_help(): help_message=\ f''' Description: BLAZE2 is a tool for demultiplexing 10X single cell long-read RNA-seq data. - It takes fastq files as input and output a whitelist of barcodes and a fastq + It takes fastq files as input and output a whitelist of barcodes and a fastq with demultiplexed reads. Usage: blaze --expect-cells [OPTIONS] Required argument: One of the following two options is required unless whitelisting step is turned off: - --expect-cells + --expect-cells Expected number of cells. --count-threshold - Count threshold of high-quality putative barcodes used to determine the whitelist. + Count threshold of high-quality putative barcodes used to determine the whitelist. Note that the --count-threshold option is ignored if --expect-cells is specified. Options: @@ -75,7 +75,7 @@ def print_help(): Filename of output files. Default: --output-prefix {DEFAULT_PREFIX} Note that the output can be directed to a different directory by specifying the path in the prefix. E.g., --output-prefix /path/to/output/prefix - + --output-fastq Filename of output fastq file name. Default: --output-fastq {DEFAULT_GRB_OUT_FASTQ} Note that if the filename has to end with .fastq, .fq, .fastq.gz or .fq.gz. @@ -85,35 +85,35 @@ def print_help(): --no-whitelisting: Skip the whitelisting step and assign reads to full whitelist specified by --full-bc-whitelist. - + --max-edit-distance - Maximum edit distance allowed between a putative barcode and a barcode + Maximum edit distance allowed between a putative barcode and a barcode for a read/putative barcdoe to be assigned to the barcode. Default: --max-edit-distance {DEFAULT_ASSIGNMENT_ED} - + --kit-version <3v2 or 3v3 or 5v2>: Choose from 10X Single Cell 3ʹ gene expression v2 or v3 or 5' gene expression. If using other - protocols, please do not specify this option and specify --full-bc-whitelist instead. By default, + protocols, please do not specify this option and specify --full-bc-whitelist instead. By default, `--kit_version v3` will be used if --full-bc-whitelist is not specified. --minQ : - Putative BC contains one or more bases with Q - txt file containing all the possible BCs. You may provide your own whitelist. - No need to specify this if users want to use the 10X whilelist. The correct + txt file containing all the possible BCs. You may provide your own whitelist. + No need to specify this if users want to use the 10X whilelist. The correct version of 10X whilelist will be determined based on 10X kit version. --overwrite - Overwrite the old file when the output file(s) exist. If not specified, + Overwrite the old file when the output file(s) exist. If not specified, the steps generating the existing file(s) will be skipped. --threads : Number of threads used --batch-size - : Number of reads this program process together as a batch. Not that if - the specified number larger than the number of reads in each fastq files, the + : Number of reads this program process together as a batch. Not that if + the specified number larger than the number of reads in each fastq files, the batch size will be forced to be the number of reads in the file. --minimal_stdout @@ -123,7 +123,7 @@ def print_help(): --high-sensitivity-mode: Turn on the sensitivity mode, which increases the sensitivity of barcode detections but potentially increase the number false/uninformative BC in - the whitelist. + the whitelist. 
Note that --emptydrop is recommanded specified with this mode (See details below). Empty droplet BCs @@ -135,7 +135,7 @@ def print_help(): ''' print(textwrap.dedent(help_message)) - + if not argv: argv = sys.argv else: @@ -144,11 +144,11 @@ def print_help(): elif isinstance(argv, list): argv = ['blaze.py'] + argv else: - helper.err_msg("Error: Invalid argument input, the argument should be a string or a list.") + helper.err_msg("Error: Invalid argument input, the argument should be a string or a list.") sys.exit(1) - - - # Default + + + # Default out_fastq_fn = DEFAULT_GRB_OUT_FASTQ prefix = DEFAULT_PREFIX overwrite = False @@ -158,7 +158,7 @@ def print_help(): min_phred_score = DEFAULT_GRB_MIN_SCORE kit = DEFAULT_GRB_KIT batch_size=1000 - high_sensitivity_mode = False + high_sensitivity_mode = False emptydrop_max_count = np.inf do_demultiplexing = True do_whitelisting = True @@ -167,7 +167,7 @@ def print_help(): count_t = None # Read from options - try: + try: opts, args = getopt.getopt(argv[1:],"h", ["help","threads=","minQ=","full-bc-whitelist=","high-sensitivity-mode", "output-prefix=", "expect-cells=", "overwrite", @@ -175,10 +175,10 @@ def print_help(): "no-demultiplexing", "max-edit-distance=", "minimal_stdout", "no-whitelisting", "count-threshold="]) except getopt.GetoptError: - helper.err_msg("Error: Invalid argument input") + helper.err_msg("Error: Invalid argument input") print_help() - sys.exit(1) - + sys.exit(1) + for opt, arg in opts: if opt in ("-h", "--help"): print_help() @@ -194,11 +194,11 @@ def print_help(): elif opt == '--threads': n_process = int(arg) elif opt == '--minQ': - min_phred_score = int(arg) + min_phred_score = int(arg) elif opt == '--full-bc-whitelist': - full_bc_whitelist = arg + full_bc_whitelist = arg elif opt == "--out-putative-bc": - out_raw_bc_fn = arg + out_raw_bc_fn = arg elif opt == "--kit-version": kit = arg.lower() elif opt == "--high-sensitivity-mode": @@ -226,7 +226,7 @@ def print_help(): summary_fn = prefix + DEFAULT_BC_STAT_FN if kit not in ['3v2', '3v3', '5v2']: - helper.err_msg("Error: Invalid value of --kit-version (" + kit + "), please choose from 3v3 or 3v2 or 5v2") + helper.err_msg("Error: Invalid value of --kit-version (" + kit + "), please choose from 3v3 or 3v2 or 5v2") sys.exit() if full_bc_whitelist: @@ -241,13 +241,13 @@ def print_help(): # Read from args if not args: - helper.err_msg("Error: Missing fastq directory.") + helper.err_msg("Error: Missing fastq directory.") print_help()# print help doc when no command line args provided sys.exit(0) # check required argument if not exp_cells and count_t is None and do_whitelisting: - helper.err_msg("--expect-cells or --count-threshold is required to build the whitelist!") + helper.err_msg("--expect-cells or --count-threshold is required to build the whitelist!") sys.exit(1) if exp_cells and count_t is not None: helper.warning_msg("--expect-cells and --count-threshold are both specified. --expect-cells will be used.") @@ -256,7 +256,7 @@ def print_help(): # check input fastq_dir = args[0] helper.check_exist([full_bc_whitelist, fastq_dir]) - + if os.path.isdir(fastq_dir): fastq_fns = helper.get_files(fastq_dir, ['*.fastq', '*.fq', '*.fastq.gz', '*.fq.gz']) elif os.path.isfile(fastq_dir): @@ -300,7 +300,7 @@ def print_help(): f"Error in filename configuration:" f"Filename '{out_plot_fn}' should end with '.png'. 
Please check the config.py file in BLAZE.") sys.exit(1) - + # helper.warning_msg(f"Filename {out_fastq_fn} exists, will output demultiplexed reads into {out_fastq_fn}.") return fastq_fns, out_fastq_fn, n_process, exp_cells ,min_phred_score, \ @@ -327,7 +327,7 @@ def get_raw_bc_from_reads(reads, min_q=0, kit=None): ------- 1. Counter of high-confidence putative BC 2. Counter of high-confidence putative BC - 3. pd.DataFrame containing all putative BCs + 3. pd.DataFrame containing all putative BCs """ # init read_ids = [] @@ -341,11 +341,11 @@ def get_raw_bc_from_reads(reads, min_q=0, kit=None): post_umi_flankings = [] for i,r in enumerate(reads): - - # create read object - read = polyT_adaptor_finder.Read(read_id = r.id, sequence=str(r.seq), - phred_score=r.q_letter, kit=kit) - + + # create read object + read = polyT_adaptor_finder.Read(read_id = r.id, sequence=str(r.seq), + phred_score=r.q_letter, kit=kit) + read.get_strand_and_raw_bc() read_ids.append(read.id) @@ -355,11 +355,11 @@ def get_raw_bc_from_reads(reads, min_q=0, kit=None): trim_idxs.append(read.polyT_trimming_idx) pre_bc_flankings.append(read.pre_bc_flanking) post_umi_flankings.append(read.post_umi_flanking) - - if read.raw_bc_min_q and read.raw_bc_min_q >= min_q: + + if read.raw_bc_min_q and read.raw_bc_min_q >= min_q: raw_bc.append(read.raw_bc) - - if read.raw_bc_min_q and read.raw_bc_min_q < min_q: + + if read.raw_bc_min_q and read.raw_bc_min_q < min_q: raw_bc_pass.append(100) #tag for low quality putative bc else: raw_bc_pass.append(read.adaptor_polyT_pass) @@ -379,7 +379,7 @@ def get_raw_bc_from_reads(reads, min_q=0, kit=None): def qc_report(pass_count, min_phred_score, stdout=True, out_fn=None): ''' Generate report for the putative barcode detection. - Print stats for + Print stats for a. # of input read in total (only look at primary alignment if it's BAM) b. # and % read pass the polyT and adaptor searching process. c. 
# and % with poly T and adaptor find in both ends @@ -392,20 +392,20 @@ def qc_report(pass_count, min_phred_score, stdout=True, out_fn=None): min score used to filter out putative BC ''' total_read = sum(pass_count.values()) - + print_message=textwrap.dedent( f''' \n----------------------stats of the putative barcodes-------------------------- - Total number of reads: + Total number of reads: {total_read:,} - Reads with unambiguous polyT and adapter positions found: + Reads with unambiguous polyT and adapter positions found: {pass_count[0]+ pass_count[100]:,} ({(pass_count[0]+ pass_count[100])/total_read*100:.2f}% of all reads) {pass_count[0]:,} in which all bases in the putative BC have Q>={min_phred_score} - Failed Reads: - no polyT and adapter positions found: + Failed Reads: + no polyT and adapter positions found: {pass_count[1]:,} ({pass_count[1]/total_read*100:.2f}% of all reads) - polyT and adapter positions found in both end (fail to determine strand): + polyT and adapter positions found in both end (fail to determine strand): {pass_count[2]:,} ({pass_count[2]/total_read*100:.2f}% of all reads) multiple polyT and adapter found in one end {pass_count[10]:,} ({pass_count[10]/total_read*100:.2f}% of all reads) @@ -417,22 +417,22 @@ def qc_report(pass_count, min_phred_score, stdout=True, out_fn=None): with open(out_fn, 'w') as f: f.write(print_message + '\n') - -def get_bc_whitelist(raw_bc_count, full_bc_whitelist, exp_cells=None, - count_t=None, high_sensitivity_mode=False, - output_empty = False, empty_max_count = np.inf, + +def get_bc_whitelist(raw_bc_count, full_bc_whitelist, exp_cells=None, + count_t=None, high_sensitivity_mode=False, + output_empty = False, empty_max_count = np.inf, out_plot_fn = DEFAULT_KNEE_PLOT_FN): - f""" - Get a whitelist from all putative cell bc with high-confidence putative bc counts. - If the expect number is provided (default), a quantile-based threshold will be - calculated to determine the exact cells to be output. Otherwise, a user-specified + """ + Get a whitelist from all putative cell bc with high-confidence putative bc counts. + If the expect number is provided (default), a quantile-based threshold will be + calculated to determine the exact cells to be output. Otherwise, a user-specified ount threshold will be used and the cells/Barocdes with counts above the threshold will be output. - + If high_sensitivity_mode = True, the high sensitivity (HS) mode is turned on which uses - more relaxed threshold + more relaxed threshold - If in output_empty=True, a list of BCs that are most likely corresponding to - empty droplets will also be produced autimatically , which might be useful in + If in output_empty=True, a list of BCs that are most likely corresponding to + empty droplets will also be produced autimatically , which might be useful in downstream analysis. Criteria of selecting these BC: 1. 
BC in 10x full whitelist, and @@ -449,10 +449,10 @@ def get_bc_whitelist(raw_bc_count, full_bc_whitelist, exp_cells=None, ValueError: No valid exp_cells or count_t specified Returns: - dict 1: + dict 1: key: barcodes selected values: high-confidence putative BC counts - + list (high_sensitivity_mode only): barcodes most likely associated to empty droplet """ @@ -462,15 +462,15 @@ def get_bc_whitelist(raw_bc_count, full_bc_whitelist, exp_cells=None, percentile_count_thres = high_sensitivity_threshold_calculation else: percentile_count_thres = default_count_threshold_calculation - - whole_whitelist = [] + + whole_whitelist = [] if full_bc_whitelist.endswith('.zip'): with zipfile.ZipFile(full_bc_whitelist) as zf: # check if there is only 1 file assert len(zf.namelist()) == 1 - with io.TextIOWrapper(zf.open(zf.namelist()[0]), + with io.TextIOWrapper(zf.open(zf.namelist()[0]), encoding="utf-8") as f: for line in f: whole_whitelist.append(line.strip()) @@ -478,11 +478,11 @@ def get_bc_whitelist(raw_bc_count, full_bc_whitelist, exp_cells=None, with open(full_bc_whitelist, 'r') as f: for line in f: whole_whitelist.append(line.strip()) - + whole_whitelist = set(whole_whitelist) raw_bc_count = {k:v for k,v in raw_bc_count.items() if k in whole_whitelist} - + # determine real bc based on the count threshold if count_t is not None: knee_plot(list(raw_bc_count.values()), count_t, out_plot_fn) @@ -504,7 +504,7 @@ def get_bc_whitelist(raw_bc_count, full_bc_whitelist, exp_cells=None, # we don't need too much BC in this list if len(ept_bc) > DEFAULT_EMPTY_DROP_NUM: - break + break return cells_bc, ept_bc elif exp_cells: @@ -512,7 +512,7 @@ def get_bc_whitelist(raw_bc_count, full_bc_whitelist, exp_cells=None, knee_plot(list(raw_bc_count.values()), t, out_plot_fn) cells_bc = {k:v for k,v in raw_bc_count.items() if v > t} - + if not output_empty: return cells_bc, [] else: @@ -532,7 +532,7 @@ def get_bc_whitelist(raw_bc_count, full_bc_whitelist, exp_cells=None, break return cells_bc, ept_bc - else: + else: raise ValueError('Invalid value of count_t and/or exp_cells.') def knee_plot(counts, threshold=None, out_fn = 'knee_plot.png'): @@ -578,18 +578,18 @@ def read_batch_generator(fastq_fns, batch_size): def main(argv=None): - + fastq_fns, out_fastq_fn, n_process, exp_cells ,min_phred_score, \ full_bc_whitelist, out_raw_bc_fn, out_whitelist_fn, \ high_sensitivity_mode, batch_size, out_emptydrop_fn, \ emptydrop_max_count, overwrite, out_plot_fn, do_demultiplexing, \ max_edit_distance, summary_fn,minimal_out, do_whitelisting, \ count_t, kit = parse_arg(argv) - + # Start running: Welcome logo if not minimal_out: print(textwrap.dedent( - f'''\n\nWelcome to + f'''\n\nWelcome to {BLAZE_LOGO} ''')) @@ -603,9 +603,9 @@ def main(argv=None): printit = False)) logger.info(f'Getting putative barcodes from {len(fastq_fns)} FASTQ files...') read_batchs = read_batch_generator(fastq_fns, batch_size=batch_size) - + rst_futures = helper.multiprocessing_submit(get_raw_bc_from_reads, - read_batchs, n_process=n_process, + read_batchs, n_process=n_process, min_q=min_phred_score, kit=kit) raw_bc_pass_count = defaultdict(int) @@ -618,35 +618,35 @@ def main(argv=None): rst_df.to_csv(out_raw_bc_fn, index=False) else: rst_df.to_csv(out_raw_bc_fn, mode='a', index=False, header=False) - + helper.green_msg(f'Putative barcode table saved in {out_raw_bc_fn}') - + # ----------------------stats of the putative barcodes-------------------------- - qc_report(raw_bc_pass_count, min_phred_score=min_phred_score, + qc_report(raw_bc_pass_count, 
min_phred_score=min_phred_score, out_fn=summary_fn, stdout= not minimal_out) - + else: logger.info(helper.warning_msg(textwrap.dedent( f""" - Warning: `{out_raw_bc_fn}` exists. BLAZE would NOT re-generate the file and the existing file + Warning: `{out_raw_bc_fn}` exists. BLAZE would NOT re-generate the file and the existing file will be directly used for downstream steps. If you believe it needs to be updated, please - change the --output_prefix or remove/rename the existing file. - + change the --output_prefix or remove/rename the existing file. + Note: there is no need to update this file if the input data remain the same and the previous - run that generated this file finished successfully. It wouldn't change with other specified + run that generated this file finished successfully. It wouldn't change with other specified arguments . However if you are running using a modified config.py file or the existing `{out_raw_bc_fn}` was generated by a different version of BLAZE, updating the file is suggested. """ ), printit = False)) - + ###################### ###### Whitelisting ###################### if do_whitelisting and (not os.path.exists(out_whitelist_fn) or overwrite): dfs = pd.read_csv(out_raw_bc_fn, chunksize=1_000_000) - + # get bc count dict (filtered by minQ) raw_bc_count = Counter() for df in tqdm(dfs, desc = 'Counting high-quality putative BC'): @@ -670,7 +670,7 @@ def main(argv=None): try: bc_whitelist, ept_bc = get_bc_whitelist(raw_bc_count, - full_bc_whitelist, + full_bc_whitelist, exp_cells=exp_cells, count_t=count_t, high_sensitivity_mode=high_sensitivity_mode, @@ -684,7 +684,7 @@ def main(argv=None): with open(out_emptydrop_fn, 'w') as f: for k in ept_bc: f.write(k+'\n') - helper.green_msg(f'Empty droplet barcode list saved as `{out_emptydrop_fn}`.') + helper.green_msg(f'Empty droplet barcode list saved as `{out_emptydrop_fn}`.') except Exception as e: logger.exception(e) @@ -708,7 +708,7 @@ def main(argv=None): f"NOTE: BLAZE will use existing `{out_whitelist_fn}` for the downstread steps and will not re-generate the {out_plot_fn}." f"If the file is required, please remove/rename the existing `{out_whitelist_fn}` and rerun." , printit = False)) - + elif not do_whitelisting: logger.info(helper.warning_msg( f"NOTE: Whitelisting step is skipped as specified by --no-whitelisting. BLAZE will assign reads to {full_bc_whitelist}." 
@@ -725,13 +725,13 @@ def main(argv=None): printit = False )) logger.info("Assigning reads to whitelist.\n") - read_assignment.assign_read(fastq_fns, - out_fastq_fn, - out_raw_bc_fn, + read_assignment.assign_read(fastq_fns, + out_fastq_fn, + out_raw_bc_fn, out_whitelist_fn, max_edit_distance, n_process, - out_fastq_fn.endswith('.gz'), + out_fastq_fn.endswith('.gz'), batch_size) else: logger.info(helper.warning_msg( diff --git a/bin/polyT_adaptor_finder.py b/bin/polyT_adaptor_finder.py index 026876d..c331b19 100644 --- a/bin/polyT_adaptor_finder.py +++ b/bin/polyT_adaptor_finder.py @@ -33,17 +33,17 @@ def __init__(self, read_id, sequence, phred_score=None, strand = None, kit=None, self.kit = kit for key in kwargs.keys(): self.__dict__[key] = kwargs[key] - + def add(self, attr_name, attr_val, overwrite = True): if attr_name in __dict__.keys() and not overwrite: pass - else: + else: self.__dict__[attr_name] = attr_val - + def find_adapter_5_prime(self, read=None, strand=None, adapter_seq=ADPT_SEQ, num_nt=ADPT_WIN,max_ed=ADPT_MAC_MATCH_ED, tso_seq=TSO_SEQ): - + strand = self._strand if not strand else strand read = self.seq if not read else read @@ -61,7 +61,7 @@ def find_adapter_5_prime(self, read=None, strand=None, adapter_seq=ADPT_SEQ, return {'-':[max(0,tso_seq_start-d2)+sub_seq_end+1]} if sub_seq_end > 0 else {} else: return {} - + if strand == '+': read = helper.reverse_complement(read[-num_nt:]) adpt_ends = self.find_adapter_5_prime( @@ -70,22 +70,22 @@ def find_adapter_5_prime(self, read=None, strand=None, adapter_seq=ADPT_SEQ, return {'+':adpt_ends} if len(adpt_ends) else {} else: - + fwd_strand = self.find_adapter_5_prime( - strand='-', adapter_seq=adapter_seq, + strand='-', adapter_seq=adapter_seq, num_nt=num_nt, tso_seq=tso_seq) rev_strand = self.find_adapter_5_prime( - strand='+', adapter_seq=adapter_seq, + strand='+', adapter_seq=adapter_seq, num_nt=num_nt, tso_seq=tso_seq) rst = {**{k:v for k,v in fwd_strand.items() if len(v)}, **{k:v for k,v in rev_strand.items() if len(v)}} return rst - def find_adaptor_3_prime(self, read=None, strand=None, adaptor_seq=ADPT_SEQ, + def find_adaptor_3_prime(self, read=None, strand=None, adaptor_seq=ADPT_SEQ, num_nt=ADPT_WIN, max_ed=ADPT_MAC_MATCH_ED): - - def find_poly_T(seq, poly_T_len=PLY_T_LEN, + + def find_poly_T(seq, poly_T_len=PLY_T_LEN, min_match_prop=SEQ_SUFFIX_MIN_MATCH_PROP): '''Find poly T in seq Parameters @@ -110,7 +110,7 @@ def find_poly_T(seq, poly_T_len=PLY_T_LEN, # convert to np_array T -> 1 and ACG -> 0 read_code = np.array([int(x == 'T') for x in seq]) T_prop = helper.sliding_window_mean(read_code, poly_T_len) - return np.where(T_prop >= min_match_prop)[0] + return np.where(T_prop >= min_match_prop)[0] strand = self._strand if not strand else strand read = self.seq if not read else read @@ -130,7 +130,7 @@ def find_poly_T(seq, poly_T_len=PLY_T_LEN, if idx-d2 > win_end: searching_win.append((win_start, win_end)) win_start, win_end = idx-d2, idx-d1 - else: + else: win_end = idx-d1 searching_win.append((win_start, win_end)) @@ -141,34 +141,34 @@ def find_poly_T(seq, poly_T_len=PLY_T_LEN, if sub_seq_end > 0: adpt_ends.append(max(start,0)+sub_seq_end+1) - return {'-':list(set(adpt_ends))} if len(adpt_ends) else {} + return {'-':list(set(adpt_ends))} if len(adpt_ends) else {} + - # take reverse complement if read is coming from transcript strand (with ployA instead ployT) if strand == '+': read = helper.reverse_complement(read[-num_nt:]) adpt_ends = self.find_adaptor_3_prime( - read, strand='-', adaptor_seq=adaptor_seq, + read, 
strand='-', adaptor_seq=adaptor_seq, num_nt=num_nt).get('-',[]) return {'+':adpt_ends} if len(adpt_ends) else {} - + else: T_strand = self.find_adaptor_3_prime( - strand='-', adaptor_seq=adaptor_seq, + strand='-', adaptor_seq=adaptor_seq, num_nt=num_nt) A_strand = self.find_adaptor_3_prime( - strand='+', adaptor_seq=adaptor_seq, + strand='+', adaptor_seq=adaptor_seq, num_nt=num_nt) rst = {**{k:v for k,v in T_strand.items() if len(v)}, **{k:v for k,v in A_strand.items() if len(v)}} return rst - - def find_adaptor(self,read=None, strand=None, adaptor_seq=ADPT_SEQ, + + def find_adaptor(self,read=None, strand=None, adaptor_seq=ADPT_SEQ, num_nt=ADPT_WIN, max_ed=ADPT_MAC_MATCH_ED, tso_seq=TSO_SEQ): ''' find adaptor from a read - + Parameters ---------- read : STR @@ -178,7 +178,7 @@ def find_adaptor(self,read=None, strand=None, adaptor_seq=ADPT_SEQ, Transcript strand, this function will find adaptor in '-' poly T strand Will do a reverse complement if strand == '-' adaptor_seq : STR - 10X adaptor sequence, the full-length adaptor sequence is + 10X adaptor sequence, the full-length adaptor sequence is 'CTACACGACGCTCTTCCGATCT' (22nt). Last 12nt is used by default. max_ed : float max edit distance in the adaptor alignment. Default: 2 @@ -210,7 +210,7 @@ def get_strand_and_raw_bc(self): 1: poly T and adaptor not found in any strand 2: poly T and adaptor found in both strand 10: multiple adaptor found in 20~50 nt of poly T upstream - + Returns ------- None. @@ -227,7 +227,7 @@ def get_strand_and_raw_bc(self): if num_of_strand_find == 0: self.adaptor_polyT_pass += 1 elif num_of_strand_find == 1: - # check single location + # check single location adptors = set(list(adapt_dict.values())[0]) num_of_adptor_find = len(adptors) if num_of_adptor_find == 0: @@ -259,12 +259,12 @@ def get_strand_and_raw_bc(self): self.raw_bc_start = list(adapt_dict.values())[0][0] self.strand = list(adapt_dict.keys())[0] - + if self.strand == '+': self.raw_bc = \ helper.reverse_complement( self.seq)[self.raw_bc_start: self.raw_bc_start+16] - else: + else: self.raw_bc = self.seq[self.raw_bc_start: self.raw_bc_start+16] if self.phred_score is not None: @@ -273,15 +273,15 @@ def get_strand_and_raw_bc(self): else: phred_score = self.phred_score - bc_phred_score = phred_score[self.raw_bc_start: self.raw_bc_start+16] + bc_phred_score = phred_score[self.raw_bc_start: self.raw_bc_start+16] self.raw_bc_min_q = min([ord(x) for x in bc_phred_score]) - 33 else: self.raw_bc_min_q = None - + # get poly_T_start_raw and adaptor_end_raw - + @property def adator_trimming_idx(self): """sequencing after trimming the adaptor and UMI @@ -294,7 +294,7 @@ def adator_trimming_idx(self): return None elif self._strand == '+': return int(-self.raw_bc_start-16-DEFAULT_UMI_SIZE) - else: + else: return int(self.raw_bc_start+16+DEFAULT_UMI_SIZE) @property @@ -306,10 +306,10 @@ def putative_UMI(self): elif self._strand == '+': return helper.reverse_complement( self.seq)[self.raw_bc_start+16: self.raw_bc_start+16+DEFAULT_UMI_SIZE] - else: + else: return self.seq[ self.raw_bc_start+16: self.raw_bc_start+16+DEFAULT_UMI_SIZE] - + @property def pre_bc_flanking(self): if not self._get_strand_and_raw_bc_flag: @@ -319,9 +319,9 @@ def pre_bc_flanking(self): elif self._strand == '+': return helper.reverse_complement( self.seq)[self.raw_bc_start-DEFAULT_GRB_FLANKING_SIZE: self.raw_bc_start] - else: + else: return self.seq[self.raw_bc_start-DEFAULT_GRB_FLANKING_SIZE: self.raw_bc_start] - + @property def post_umi_flanking(self): """sequencing after trimming the adaptor 
and UMI @@ -336,30 +336,30 @@ def post_umi_flanking(self): return helper.reverse_complement( self.seq)[self.raw_bc_start+16+DEFAULT_UMI_SIZE: \ self.raw_bc_start+16+DEFAULT_UMI_SIZE+ DEFAULT_GRB_FLANKING_SIZE] - else: + else: return self.seq[ self.raw_bc_start+16+DEFAULT_UMI_SIZE: \ self.raw_bc_start+16+DEFAULT_UMI_SIZE+DEFAULT_GRB_FLANKING_SIZE] - + @property def polyT_trimming_idx(self): """sequencing after trimming the adaptor and UMI Returns: index of the end of the polyT, negative if it's polyA strand """ - - + + umi_end_idx = self.adator_trimming_idx - + if umi_end_idx is None: return None - + # take reverse complement if read is coming from transcript strand (with ployA instead ployT) if umi_end_idx < 0: reversed = True seq = helper.reverse_complement(self.seq) umi_end_idx = abs(umi_end_idx) - else: + else: reversed = False seq = self.seq @@ -367,15 +367,15 @@ def polyT_trimming_idx(self): polyT_start = seq.find('TTTT', umi_end_idx) if polyT_start == -1: return umi_end_idx - + read_code = np.array([int(x == 'T') for x in seq]) for idx, nt in enumerate(read_code[polyT_start:]): if nt == 1: polyT_start += 1 - elif sum(read_code[polyT_start:polyT_start+10]) >= 7: # hard code definition of polyT. plotT end when >3 of 10 consecutive letter are not T + elif sum(read_code[polyT_start:polyT_start+10]) >= 7: # hard code definition of polyT. plotT end when >3 of 10 consecutive letter are not T polyT_start += 1 else: - break + break return int(polyT_start) if not reversed else int(-polyT_start) @@ -385,12 +385,12 @@ def strand(self): # set new strand helper.warning_msg( """ - No strand information is recorded, try 'get_strand_and_raw_bc' + No strand information is recorded, try 'get_strand_and_raw_bc' first. """) return None return self._strand - + @strand.setter def strand(self, v): if v == None: @@ -423,5 +423,4 @@ def main(): if __name__ == '__main__': main() - - + diff --git a/bin/polyT_trimmer.py b/bin/polyT_trimmer.py index 06c5785..8f60aa2 100644 --- a/bin/polyT_trimmer.py +++ b/bin/polyT_trimmer.py @@ -51,15 +51,15 @@ def polyT_trimming_idx(seq, reverse=False, umi_end_idx=0, polyT_end_minT=7, poly polyT_start = seq.find('TTTT', umi_end_idx) if polyT_start == -1: return umi_end_idx - + read_code = np.array([int(x == 'T') for x in seq]) for idx, nt in enumerate(read_code[polyT_start:]): if nt == 1: polyT_start += 1 - elif sum(read_code[polyT_start:polyT_start+polyT_end_check_win]) >= polyT_end_minT: + elif sum(read_code[polyT_start:polyT_start+polyT_end_check_win]) >= polyT_end_minT: polyT_start += 1 else: - break + break return int(polyT_start) if not reverse else int(-polyT_start) @@ -91,7 +91,7 @@ def _read_batch_generator(fastq_fns, batch_size): batch_iter = helper.batch_iterator(fastq, batch_size=batch_size) for batch in batch_iter: yield batch - + else: with open(fn) as handle: fastq =\ @@ -114,13 +114,13 @@ def _proc_read_batches(read_batch, gz): else: seq = r.seq[int(polyT_end):] qscore = r.qscore[int(polyT_end):] - + out_buffer += '@' + r.id + '\n' out_buffer += str(seq) + '\n' out_buffer += '+\n' out_buffer += qscore + '\n' - + if gz: b_out_buffer = gzip.compress(out_buffer.encode('utf-8')) else: @@ -152,13 +152,13 @@ def polyT_trimmer(fastq_fns, fastq_out, n_process, gz, batchsize): # multi-thread version else: - rst_futures = helper.multiprocessing_submit(_proc_read_batches, - r_batches, + rst_futures = helper.multiprocessing_submit(_proc_read_batches, + r_batches, n_process=n_process, schduler = "process", pbar_func=len, gz = gz) - + # collect results with open(fastq_out, 
'wb') as output_handle: for f in rst_futures: @@ -169,5 +169,5 @@ def polyT_trimmer(fastq_fns, fastq_out, n_process, gz, batchsize): if __name__ == '__main__': fastq_fns, fastq_out, n_process, gzip_out = parse_command_line() polyT_trimmer(fastq_fns, fastq_out, n_process, gz=gzip_out, batchsize=4000) - + diff --git a/bin/read_assignment.py b/bin/read_assignment.py index 2b3d25d..6bd7bf5 100644 --- a/bin/read_assignment.py +++ b/bin/read_assignment.py @@ -43,7 +43,7 @@ def _match_bc_row(row, whitelist, max_ed): '': if no barcode was found in the whitelist 'ambiguous': if multiple barcode was found in the whitelist 2. adjusted putative_umi - 3. strand '+' for read with positive strand (start with BC,umi,polyT...), + 3. strand '+' for read with positive strand (start with BC,umi,polyT...), '-' for read with negative strand """ @@ -51,21 +51,21 @@ def _match_bc_row(row, whitelist, max_ed): strand = '' elif row.polyT_end > 0: strand = '+' - else: + else: strand = '-' if not row.putative_bc or row.putative_bc in whitelist: return [row.putative_bc, row.putative_umi, strand] else: bc = row.putative_bc - + best_ed = max_ed bc_hit = '' bc_hit_end_idx = -1 # extending the putative barcode from both sides for potential indels bc = row.pre_bc_flanking[-DEFAULT_ED_FLANKING:] + bc + row.putative_umi[:DEFAULT_ED_FLANKING] for i in whitelist: - ed, end_idx = sub_edit_distance(i, bc, best_ed) + ed, end_idx = sub_edit_distance(i, bc, best_ed) if ed < best_ed: best_ed = ed bc_hit = i @@ -74,12 +74,12 @@ def _match_bc_row(row, whitelist, max_ed): if not bc_hit: bc_hit = i bc_hit_end_idx = end_idx - else: + else: bc_hit = 'ambiguous' best_ed -= 1 if best_ed < 0: return ['', row.putative_umi, strand] - + if bc_hit == 'ambiguous' or bc_hit == '': return ['', row.putative_umi, strand] else: @@ -93,7 +93,7 @@ def _match_bc_row(row, whitelist, max_ed): elif umi_adj < 0: out_umi = row.putative_bc[umi_adj:] + row.putative_umi[:umi_adj] return [bc_hit, out_umi, strand] - + def batch_barcode_to_fastq(r_batches_with_idx, assignment_df ,gz = True): """Take a read batch, write fastq/fastq.gz to a tmp file """ @@ -105,9 +105,9 @@ def batch_barcode_to_fastq(r_batches_with_idx, assignment_df ,gz = True): read_idx = 0 for r in read_batch: row = assignment_df.iloc[start_df_idx+read_idx]#the row in putative bc table - read_idx += 1 + read_idx += 1 try: - assert row.read_id == r.id + assert row.read_id == r.id except AssertionError: helper.err_msg("Different order in putative bc file and input fastq!") sys.exit() @@ -121,12 +121,12 @@ def batch_barcode_to_fastq(r_batches_with_idx, assignment_df ,gz = True): else: seq = r.seq[int(row.polyT_end):] qscore = r.qscore[int(row.polyT_end):] - + out_buffer += f"@{row.BC_corrected}_{row.putative_umi}#{row.read_id}_{row.strand}\n" out_buffer += str(seq) + '\n' out_buffer += '+\n' out_buffer += qscore + '\n' - + output_handle.write(out_buffer) output_handle.close() return temp_file.name @@ -150,7 +150,7 @@ def _read_and_bc_batch_generator_with_idx(fastq_fns, putative_bc_csv, batch_size (read_fastq(title, sequence, qscore) for title, sequence, qscore in helper.fastq_parser(handle)) batch_iter = helper.batch_iterator(fastq, batch_size=batch_size) - + for batch in batch_iter: batch_len = len(batch) batch_bc_df = pd.read_csv( @@ -176,16 +176,16 @@ def _read_and_bc_batch_generator_with_idx(fastq_fns, putative_bc_csv, batch_size read_idx += batch_len putative_bc_f.close() - + def _assign_read_batches(r_batch, whitelist, max_ed, gz): """Single-thread function: Assign all putative barcode to 
whiteliested barcode Args: - r_batch (tuple): yield from read_and_bc_batch_generator_with_idx + r_batch (tuple): yield from read_and_bc_batch_generator_with_idx (read_batch, start_idx, batch_bc_df) - whitelist (list): list of barcode - n_process (int): number of process + whitelist (list): list of barcode + n_process (int): number of process max_ed (int): maximum edit distance allowed for a assignment Returns: @@ -199,7 +199,7 @@ def _assign_read_batches(r_batch, whitelist, max_ed, gz): df = df.fillna('') whitelist = set(whitelist) out_buffer = '' - + new_cols = [] for row in df.itertuples(): new_cols.append(_match_bc_row(row, whitelist, max_ed)) @@ -211,7 +211,7 @@ def _assign_read_batches(r_batch, whitelist, max_ed, gz): for r, bc in zip(read_batch, df.itertuples()): try: - assert bc.read_id == r.id + assert bc.read_id == r.id except AssertionError: helper.err_msg("Different order in putative bc file and input fastq!") sys.exit() @@ -224,13 +224,13 @@ def _assign_read_batches(r_batch, whitelist, max_ed, gz): else: seq = r.seq[int(bc.polyT_end):] qscore = r.qscore[int(bc.polyT_end):] - + out_buffer += f"@{bc.BC_corrected}_{bc.putative_umi}#{bc.read_id}_{bc.strand}\n" out_buffer += str(seq) + '\n' out_buffer += '+\n' out_buffer += qscore + '\n' - + if gz: b_out_buffer = gzip.compress(out_buffer.encode('utf-8')) else: @@ -240,20 +240,20 @@ def _assign_read_batches(r_batch, whitelist, max_ed, gz): # logger.info(helper.green_msg(f"Demultiplexing finshied: ", printit = False)) # logger.info(helper.green_msg(f"Successfully demultiplexed reads / Total reads: {sum(df.BC_corrected!='')} / {len(df.BC_corrected)}. ", printit = False)) -def assign_read(fastq_fns, fastq_out, putative_bc_csv, +def assign_read(fastq_fns, fastq_out, putative_bc_csv, whitelsit_csv, max_ed, n_process, gz, batchsize): """Main function: Demultiplex fastq files using putative barcode csv and whitelist csv """ # greating generator for read and putative barcode batches r_batches = \ _read_and_bc_batch_generator_with_idx(fastq_fns, putative_bc_csv, batchsize) - + # read the whitelist - whitelist = [] + whitelist = [] with open(whitelsit_csv, 'r') as f: for line in f: whitelist.append(line.split('-')[0].strip()) - + # assign putative barcode to whitelist # single thread version if n_process == 1: @@ -272,15 +272,15 @@ def assign_read(fastq_fns, fastq_out, putative_bc_csv, # multi-thread version else: - rst_futures = helper.multiprocessing_submit(_assign_read_batches, - r_batches, + rst_futures = helper.multiprocessing_submit(_assign_read_batches, + r_batches, n_process=n_process, schduler = "process", pbar_func=lambda x: len(x[0]), whitelist = whitelist, max_ed = max_ed, gz = gz) - + # collect results demul_count_tot = 0 count_tot = 0 diff --git a/bin/update_whitelist.py b/bin/update_whitelist.py index 830a5c2..1e8b8b2 100644 --- a/bin/update_whitelist.py +++ b/bin/update_whitelist.py @@ -18,11 +18,11 @@ def parse_arg(): description=textwrap.dedent( ''' This script can be used to generate a new whitelist from the putative bc table - output from 'blaze.py'. Users may specify different argment used in + output from 'blaze.py'. Users may specify different argment used in 'blaze.py' to obtain a different whitelist. 
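# _match_bc_row above corrects a putative barcode by finding the single closest whitelist
# barcode within max_ed edits, returning '' when nothing is close enough or when two
# whitelist entries tie at the best distance ('ambiguous'). The real code uses a windowed
# edit distance over a flanking-extended sequence so indels can shift the barcode, and it
# also adjusts the UMI for the resulting offset; the sketch below uses plain Levenshtein
# distance purely to illustrate the accept / ambiguous / reject decision:
def levenshtein(a, b):
    """Classic dynamic-programming edit distance."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def match_barcode(putative_bc, whitelist, max_ed=2):
    """Return the corrected barcode, or '' if no hit within max_ed or the best hit is ambiguous."""
    if putative_bc in whitelist:
        return putative_bc
    best_ed, best_hit, ambiguous = max_ed + 1, '', False
    for wl_bc in whitelist:
        ed = levenshtein(wl_bc, putative_bc)
        if ed < best_ed:
            best_ed, best_hit, ambiguous = ed, wl_bc, False
        elif ed == best_ed:
            ambiguous = True                   # a second candidate at the same (best) distance
    return best_hit if best_hit and not ambiguous else ''

# match_barcode('ACGTACGTACGTACGA', {'ACGTACGTACGTACGT', 'TTTTACGTACGTACGT'}) -> 'ACGTACGTACGTACGT'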
'''), formatter_class=argparse.ArgumentDefaultsHelpFormatter) - + # Required positional argument parser.add_argument('putative_bc_csv', type=str, help='Filename of the putative_bc csv file output from blaze.py') @@ -40,13 +40,13 @@ def parse_arg(): parser.add_argument('--kit-version', type=str, default='3v3', help= textwrap.dedent( ''' - Choose from v2 and v3 (for 10X Single Cell 3ʹ gene expression v2 or v3). + Choose from v2 and v3 (for 10X Single Cell 3ʹ gene expression v2 or v3). ''')) parser.add_argument('--minQ', type=int, default=15, help= textwrap.dedent( ''' - : Minimum phred score for all bases in a putative BC. - Reads whose putative BC contains one or more bases with + : Minimum phred score for all bases in a putative BC. + Reads whose putative BC contains one or more bases with Q: .txt file containing all the possible BCs. Users may provide @@ -64,7 +64,7 @@ def parse_arg(): detections but potentially increase the number false/uninformative BC in the whitelist.''') parser.add_argument('--emptydrop', action='store_true', - help='''Output list of BCs corresponding to empty droplets (filename: {DEFAULT_EMPTY_DROP_FN}), + help='''Output list of BCs corresponding to empty droplets (filename: {DEFAULT_EMPTY_DROP_FN}), which could be used to estimate ambiant RNA expressionprofile.''') parser.add_argument('--emptydrop-max-count', type=float, default=np.inf, help=textwrap.dedent( @@ -76,20 +76,20 @@ def parse_arg(): args = parser.parse_args() if not args.expect_cells and not args.count_threshold: - helper.err_msg("Missing argument --expect-cells or --count-threshold.") + helper.err_msg("Missing argument --expect-cells or --count-threshold.") sys.exit(1) if (args.expect_cells or args.high_sensitivity_mode) and args.count_threshold: helper.warning_msg(textwrap.dedent( f''' Warning: You have specified'--count-threshold'. Options "--high_sensitivity_mode" and "--expect-cells" would be ignored if - specified. + specified. 
''')) args.high_sensitivity_mode = False - + args.kit_version = args.kit_version.lower() if args.kit_version not in ['3v2', '3v3', '5v2']: - helper.err_msg("Error: Invalid value of --kit-version, please choose from 3v3 or 3v2 or 5v2") + helper.err_msg("Error: Invalid value of --kit-version, please choose from 3v3 or 3v2 or 5v2") sys.exit() if args.full_bc_whitelist: @@ -102,7 +102,7 @@ def parse_arg(): elif args.kit_version == '3v2' or args.kit_version == '5v2': args.full_bc_whitelist = DEFAULT_GRB_WHITELIST_V2 - # check file + # check file helper.check_exist([args.full_bc_whitelist, args.putative_bc_csv]) return args @@ -111,9 +111,9 @@ def parse_arg(): def main(args): # read table dfs = pd.read_csv(args.putative_bc_csv, chunksize=args.chunk_size) - + # get bc count dict (filtered by minQ) - + raw_bc_count = Counter() for df in tqdm(dfs, desc = 'Counting high-quality putative BC'): raw_bc_count += Counter(df[ @@ -121,10 +121,10 @@ def main(args): if args.high_sensitivity_mode: print('Preparing whitelist...(high-sensitivity-mode)') - else: + else: print('Preparing whitelist...') bc_whitelist, ept_bc = get_bc_whitelist(raw_bc_count, - args.full_bc_whitelist, + args.full_bc_whitelist, args.expect_cells, args.count_threshold, args.high_sensitivity_mode, diff --git a/conf/base.config b/conf/base.config index 904e6a3..ba3d12f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -57,7 +57,4 @@ process { errorStrategy = 'retry' maxRetries = 2 } - withName:CUSTOM_DUMPSOFTWAREVERSIONS { - cache = false - } } diff --git a/conf/modules.config b/conf/modules.config index f2e8319..d0dca0d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -25,7 +25,6 @@ process { pattern: '*_versions.yml' ] } - } ///////////// @@ -193,7 +192,7 @@ if (!params.skip_qc){ ] } } - + process { withName:'.*:BAM_SORT_STATS_SAMTOOLS_DEDUP:BAM_STATS_SAMTOOLS:.*' { ext.prefix = { "${meta.id}.dedup.sorted" } @@ -230,6 +229,22 @@ if (!params.skip_qc) { } +// READ COUNTS +if (!params.skip_qc) { + + process { + withName:'.*:READ_COUNTS' { + publishDir = [ + path: { "${params.outdir}/batch_qcs/read_counts" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + } + +} + ///////////////////// // REFERENCE FILES // ///////////////////// @@ -307,7 +322,7 @@ process { // GUNZIP process { - withName: '.*:GUNZIP' { + withName: '.*:GUNZIP.*' { publishDir = [ enabled: false ] @@ -507,6 +522,15 @@ process { } } +process { + withName:'.*:BAM_SORT_STATS_SAMTOOLS_TAGGED:.*' { + ext.prefix = { "${meta.id}.sorted" } + publishDir = [ + enabled: false + ] + } +} + process { withName:'.*:BAM_SORT_STATS_SAMTOOLS_DEDUP:SAMTOOLS_SORT' { ext.prefix = { "${meta.id}.dedup.sorted" } diff --git a/conf/test_full.config b/conf/test_full.config index fab1e8d..b714c41 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -24,4 +24,6 @@ params { // Barcode options barcode_format = "10X_3v3" + split_amount = 500000 + } diff --git a/docs/images/nf-core-scnanoseq_logo_dark.png b/docs/images/nf-core-scnanoseq_logo_dark.png index d186ab3..fa6be46 100644 Binary files a/docs/images/nf-core-scnanoseq_logo_dark.png and b/docs/images/nf-core-scnanoseq_logo_dark.png differ diff --git a/docs/images/nf-core-scnanoseq_logo_light.png b/docs/images/nf-core-scnanoseq_logo_light.png index 3a65f7f..3b26fcd 100644 Binary files a/docs/images/nf-core-scnanoseq_logo_light.png and b/docs/images/nf-core-scnanoseq_logo_light.png differ diff --git a/docs/usage.md b/docs/usage.md index 007bc76..0146bf0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -161,6 +161,8 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) - `apptainer` - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) +- `wave` + - A generic configuration profile to enable [Wave](https://seqera.io/wave/) containers. Use together with one of the above (requires Nextflow ` 24.03.0-edge` or later). - `conda` - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. 
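Stepping back to the update_whitelist.py hunks above: the script streams the putative-barcode table in pandas chunks, keeps only barcodes whose every base meets the --minQ Phred cutoff, and tallies the survivors with a Counter before the whitelist threshold is applied. A condensed sketch of that counting step, assuming illustrative column names (putative_bc, putative_bc_min_q) and chunk size rather than the script's exact ones:

from collections import Counter

import pandas as pd

def count_high_quality_barcodes(putative_bc_csv, min_q=15, chunk_size=1_000_000):
    """Stream the putative barcode table and count barcodes whose worst base quality >= min_q."""
    counts = Counter()
    for chunk in pd.read_csv(putative_bc_csv, chunksize=chunk_size):
        keep = chunk['putative_bc'].notna() & (chunk['putative_bc_min_q'] >= min_q)
        counts += Counter(chunk.loc[keep, 'putative_bc'])
    return counts

# counts.most_common(10) then shows the strongest candidate cell barcodes for the whitelist.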
diff --git a/modules.json b/modules.json index 9cf09a5..6f0b053 100644 --- a/modules.json +++ b/modules.json @@ -8,137 +8,99 @@ "bamtools/split": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "cat/cat": { "branch": "master", "git_sha": "9437e6053dccf4aafa022bfd6e7e9de67e625af8", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "cat/fastq": { "branch": "master", "git_sha": "0997b47c93c06b49aa7b3fefda87e728312cf2ca", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "de45447d060b8c8b98575bc637a4a575fd0638e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fastqc": { "branch": "master", "git_sha": "f4ae1d942bd50c5c0b9bd2de1393ce38315ba57c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gunzip": { "branch": "master", "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "nanoplot": { "branch": "master", "git_sha": "a31407dfaf0cb0d04768d5cb439fc6f4523a6981", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/nanoplot/nanoplot.diff" }, "rseqc/readdistribution": { "branch": "master", "git_sha": "6c7d8f1d6247655e4bc4d97f37b68b2461f645f6", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", "git_sha": "aeb02a39d4c463598bfdcb2d964dbb7acbcf1298", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/flagstat": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": [ - "bam_stats_samtools" - ] + "installed_by": ["bam_stats_samtools"] }, "samtools/idxstats": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": [ - "bam_stats_samtools" - ] + "installed_by": ["bam_stats_samtools"] }, "samtools/index": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": [ - "bam_sort_stats_samtools" - ] + "installed_by": ["bam_sort_stats_samtools"] }, "samtools/merge": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/sort": { "branch": "master", "git_sha": "4352dbdb09ec40db71e9b172b97a01dcf5622c26", - "installed_by": [ - "bam_sort_stats_samtools" - ] + "installed_by": ["bam_sort_stats_samtools"] }, "samtools/stats": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": [ - "bam_stats_samtools" - ] + "installed_by": ["bam_stats_samtools"] }, "samtools/view": { "branch": "master", "git_sha": "0bd7d2333a88483aa0476acea172e9f5f6dd83bb", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "toulligqc": { "branch": "master", "git_sha": "061a322293b3487e53f044304710e54cbf657717", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/toulligqc/toulligqc.diff" }, "umitools/dedup": { "branch": "master", "git_sha": "3bd4f34e3093c2a16e6a8eefc22242b9b94641db", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -147,40 +109,30 @@ "bam_sort_stats_samtools": { "branch": "master", "git_sha": 
"4352dbdb09ec40db71e9b172b97a01dcf5622c26", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "bam_stats_samtools": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": [ - "bam_sort_stats_samtools" - ] + "installed_by": ["bam_sort_stats_samtools"] }, "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} diff --git a/modules/local/read_counts.nf b/modules/local/read_counts.nf index bcb9d51..b3fdd36 100644 --- a/modules/local/read_counts.nf +++ b/modules/local/read_counts.nf @@ -1,17 +1,11 @@ process READ_COUNTS { label 'process_low' - conda "conda-forge::perl=5.26.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/perl:5.26.2' : 'biocontainers/perl:5.26.2' }" - //conda "conda-forge::sed=4.7" - //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - // 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - // 'ubuntu:20.04' }" - input: path raw_fastqc path trim_fastqc diff --git a/nextflow.config b/nextflow.config index 2b3c5ca..104b73f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -62,15 +62,16 @@ params { multiqc_methods_description = null // Boilerplate options - outdir = null - publish_dir_mode = 'copy' - email = null - email_on_fail = null - plaintext_email = false - monochrome_logs = false - hook_url = null - help = false - version = false + outdir = null + publish_dir_mode = 'copy' + email = null + email_on_fail = null + plaintext_email = false + monochrome_logs = false + hook_url = null + help = false + version = false + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' // Config options config_profile_name = null @@ -106,103 +107,109 @@ try { } // Load nf-core/scnanoseq custom profiles from different institutions. -// Warning: Uncomment only if a pipeline-specific institutional config already exists on nf-core/configs! 
-// try { -// includeConfig "${params.custom_config_base}/pipeline/scnanoseq.config" -// } catch (Exception e) { -// System.err.println("WARNING: Could not load nf-core/config/scnanoseq profiles: ${params.custom_config_base}/pipeline/scnanoseq.config") -// } +try { + includeConfig "${params.custom_config_base}/pipeline/scnanoseq.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config/scnanoseq profiles: ${params.custom_config_base}/pipeline/scnanoseq.config") +} profiles { debug { - dumpHashes = true - process.beforeScript = 'echo $HOSTNAME' - cleanup = false + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false nextflow.enable.configProcessNamesValidation = true } conda { - conda.enabled = true - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - channels = ['conda-forge', 'bioconda', 'defaults'] - apptainer.enabled = false + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + conda.channels = ['conda-forge', 'bioconda', 'defaults'] + apptainer.enabled = false } mamba { - conda.enabled = true - conda.useMamba = true - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } docker { - docker.enabled = true - conda.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - docker.runOptions = '-u $(id -u):$(id -g)' + docker.enabled = true + conda.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + docker.runOptions = '-u $(id -u):$(id -g)' } arm { - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { - singularity.enabled = true - singularity.autoMounts = true - conda.enabled = false - docker.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + singularity.enabled = true + singularity.autoMounts = true + conda.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } podman { - podman.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + podman.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } shifter { - shifter.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - podman.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + shifter.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } charliecloud { - 
charliecloud.enabled = true
-        conda.enabled         = false
-        docker.enabled        = false
-        singularity.enabled   = false
-        podman.enabled        = false
-        shifter.enabled       = false
-        apptainer.enabled     = false
+        charliecloud.enabled    = true
+        conda.enabled           = false
+        docker.enabled          = false
+        singularity.enabled     = false
+        podman.enabled          = false
+        shifter.enabled         = false
+        apptainer.enabled       = false
     }
     apptainer {
-        apptainer.enabled     = true
-        apptainer.autoMounts  = true
-        conda.enabled         = false
-        docker.enabled        = false
-        singularity.enabled   = false
-        podman.enabled        = false
-        shifter.enabled       = false
-        charliecloud.enabled  = false
+        apptainer.enabled       = true
+        apptainer.autoMounts    = true
+        conda.enabled           = false
+        docker.enabled          = false
+        singularity.enabled     = false
+        podman.enabled          = false
+        shifter.enabled         = false
+        charliecloud.enabled    = false
+    }
+    wave {
+        apptainer.ociAutoPull   = true
+        singularity.ociAutoPull = true
+        wave.enabled            = true
+        wave.freeze             = true
+        wave.strategy           = 'conda,container'
     }
     gitpod {
-        executor.name      = 'local'
-        executor.cpus      = 4
-        executor.memory    = 8.GB
+        executor.name           = 'local'
+        executor.cpus           = 4
+        executor.memory         = 8.GB
     }
     test      { includeConfig 'conf/test.config'      }
     test_full { includeConfig 'conf/test_full.config' }
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 9418c53..f9489a5 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -453,6 +453,13 @@
             "description": "Validation of parameters in lenient more.",
             "hidden": true,
             "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)."
+        },
+        "pipelines_testdata_base_path": {
+            "type": "string",
+            "fa_icon": "far fa-check-circle",
+            "description": "Base URL or local path to location of pipeline test dataset files",
+            "default": "https://raw.githubusercontent.com/nf-core/test-datasets/",
+            "hidden": true
         }
     }
 }
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 5611062..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,15 +0,0 @@
-# Config file for Python. Mostly used to configure linting of bin/*.py with Ruff.
-# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation.
-[tool.ruff]
-line-length = 120
-target-version = "py38"
-cache-dir = "~/.cache/ruff"
-
-[tool.ruff.lint]
-select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"]
-
-[tool.ruff.lint.isort]
-known-first-party = ["nf_core"]
-
-[tool.ruff.lint.per-file-ignores]
-"__init__.py" = ["E402", "F401"]
diff --git a/subworkflows/local/utils_nfcore_scnanoseq_pipeline/main.nf b/subworkflows/local/utils_nfcore_scnanoseq_pipeline/main.nf
index a1ecce9..b7625a3 100644
--- a/subworkflows/local/utils_nfcore_scnanoseq_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_scnanoseq_pipeline/main.nf
@@ -137,6 +137,10 @@ workflow PIPELINE_COMPLETION {
            imNotification(summary_params, hook_url)
        }
    }
+
+    workflow.onError {
+        log.error "Pipeline failed. Please refer to troubleshooting docs: https://nf-co.re/docs/usage/troubleshooting"
+    }
 }

 /*
@@ -227,8 +231,16 @@ def methodsDescriptionText(mqc_methods_yaml) {
     meta["manifest_map"] = workflow.manifest.toMap()

     // Pipeline DOI
-    meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : ""
-    meta["nodoi_text"] = meta.manifest_map.doi ? "": "<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used. </li>"
+    if (meta.manifest_map.doi) {
+        // Using a loop to handle multiple DOIs
+        // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers
+        // Removing ` ` since the manifest.doi is a string and not a proper list
+        def temp_doi_ref = ""
+        String[] manifest_doi = meta.manifest_map.doi.tokenize(",")
+        for (String doi_ref: manifest_doi) temp_doi_ref += "(doi: ${doi_ref.replace("https://doi.org/", "").replace(" ", "")}), "
+        meta["doi_text"] = temp_doi_ref.substring(0, temp_doi_ref.length() - 2)
+    } else meta["doi_text"] = ""
+    meta["nodoi_text"] = meta.manifest_map.doi ? "" : "<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used. </li>"

     // Tool references
     meta["tool_citations"] = ""