From a2e8c5d12d55e072f829bc184be03e21893cb524 Mon Sep 17 00:00:00 2001
From: Toni Verbeiren
Date: Wed, 24 Jul 2024 10:31:36 +0200
Subject: [PATCH] Add concat_text component (#4)

* Add concat_text component

* Minor cleanup

* Update src/concat_text/config.vsh.yaml

* Update in line with most of Dries' comments

* Don't claim to cleanup if it ain't happening...

* Add authorship

* Avoid using deprecated Viash variable

---------

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
---
 src/_authors/dries_schaumont.yaml | 11 +++++
 src/_authors/toni_verbeiren.yaml  |  9 ++++
 src/concat_text/config.vsh.yaml   | 56 +++++++++++++++++++++
 src/concat_text/script.sh         | 59 ++++++++++++++++++++++
 src/concat_text/test.sh           | 81 +++++++++++++++++++++++++++++++
 5 files changed, 216 insertions(+)
 create mode 100644 src/_authors/dries_schaumont.yaml
 create mode 100644 src/_authors/toni_verbeiren.yaml
 create mode 100644 src/concat_text/config.vsh.yaml
 create mode 100644 src/concat_text/script.sh
 create mode 100644 src/concat_text/test.sh

diff --git a/src/_authors/dries_schaumont.yaml b/src/_authors/dries_schaumont.yaml
new file mode 100644
index 0000000..b267808
--- /dev/null
+++ b/src/_authors/dries_schaumont.yaml
@@ -0,0 +1,11 @@
+name: Dries Schaumont
+info:
+  links:
+    email: dries@data-intuitive.com
+    github: DriesSchaumont
+    orcid: "0000-0002-4389-0440"
+    linkedin: dries-schaumont
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Scientist
diff --git a/src/_authors/toni_verbeiren.yaml b/src/_authors/toni_verbeiren.yaml
new file mode 100644
index 0000000..2f2f851
--- /dev/null
+++ b/src/_authors/toni_verbeiren.yaml
@@ -0,0 +1,9 @@
+name: Toni Verbeiren
+info:
+  links:
+    github: tverbeiren
+    linkedin: verbeiren
+  organizations:
+    - name: Data Intuitive
+      href: https://www.data-intuitive.com
+      role: Data Scientist and CEO
diff --git a/src/concat_text/config.vsh.yaml b/src/concat_text/config.vsh.yaml
new file mode 100644
--- /dev/null
+++ b/src/concat_text/config.vsh.yaml
@@ -0,0 +1,56 @@
+name: concat_text
+description: |
+  Concatenate a number of text files, handle gzipped text files gracefully and
+  optionally gzip the output text file.
+
+  This component is useful for concatenating fastq files from different lanes, for instance.
+authors:
+  - __merge__: /src/_authors/toni_verbeiren.yaml
+    roles: [ author, maintainer ]
+  - __merge__: /src/_authors/dries_schaumont.yaml
+    roles: [ reviewer ]
+info:
+  improvements: |
+    This component could be improved in 2 ways:
+    1. Allow for a mix of zipped and plain input files
+    2. Allow to specify a compression algorithm for the output
+argument_groups:
+  - name: Input arguments
+    arguments:
+      - name: --input
+        description: A list of (gzipped) text files.
+        type: file
+        multiple: true
+        required: true
+        example: input?.txt.gz
+  - name: Output arguments
+    arguments:
+      - name: "--gzip_output"
+        type: boolean_true
+        description: Should the output be zipped?
+      - name: --output
+        description: File to write the output to, optionally gzipped.
+        type: file
+        direction: output
+        example: output.txt
+
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+
+engines:
+  - type: docker
+    image: alpine:latest
+    setup:
+      - type: apk
+        packages:
+          - bash
+          - procps
+          - file
+
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/concat_text/script.sh b/src/concat_text/script.sh
new file mode 100644
--- /dev/null
+++ b/src/concat_text/script.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+
+# Concatenate the text files listed in $par_input into $par_output.
+#
+# Variables read:
+#   par_input        ';'-separated list of input files, all plain or all gzipped
+#   par_output       path of the file to write
+#   par_gzip_output  "true" to gzip the output, anything else leaves it plain
+#   meta_temp_dir    directory in which the scratch directory is created
+
+set -euo pipefail
+
+# Deliberately not named TMPDIR: mktemp and other tools read that variable,
+# so overwriting it could redirect their temporary files.
+work_dir=$(mktemp -d "$meta_temp_dir/concat_text-XXXXXX")
+clean_up() {
+  rm -rf -- "${work_dir:?}"
+}
+trap clean_up EXIT
+
+# Split on ';' into an array so that paths containing spaces or glob
+# characters stay intact. Empty entries (e.g. from "a;;b") are dropped.
+IFS=';' read -r -a raw_inputs <<< "$par_input"
+input_files=()
+for entry in "${raw_inputs[@]}"; do
+  if [[ -n "$entry" ]]; then
+    input_files+=("$entry")
+  fi
+done
+if (( ${#input_files[@]} == 0 )); then
+  echo "No input files given!" >&2
+  exit 1
+fi
+
+printf '%s' ">> Check if input is gzipped... "
+# -b leaves the file names out of the report, so a name containing "gzip"
+# cannot cause a false positive; -L inspects a symlink's target, not the link.
+if ! mime_types=$(file -bL --mime-type -- "${input_files[@]}"); then
+  echo "Could not determine the type of the input files!" >&2
+  exit 1
+fi
+
+if [[ "$mime_types" == *gzip* ]]; then
+  echo "yes"
+  echo ">> zcat gzipped files"
+  zcat -- "${input_files[@]}" > "$work_dir/contents"
+else
+  echo "no"
+  echo ">> cat plain files"
+  cat -- "${input_files[@]}" > "$work_dir/contents"
+fi
+
+if [[ "${par_gzip_output:-false}" == true ]]; then
+  echo ">> Zip output file"
+  gzip -- "$work_dir/contents"
+  mv -- "$work_dir/contents.gz" "$par_output"
+else
+  mv -- "$work_dir/contents" "$par_output"
+fi
diff --git a/src/concat_text/test.sh b/src/concat_text/test.sh
new file mode 100644
--- /dev/null
+++ b/src/concat_text/test.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+# Tests for the concat_text component: runs $meta_executable on plain and
+# gzipped inputs, with and without --gzip_output, and checks each result
+# against the expected concatenation.
+
+set -euo pipefail
+
+# Print the message in "$*" to stderr and abort the test run.
+fail() {
+  printf '%s\n' "$*" >&2
+  exit 1
+}
+
+# Abort unless output file $1 exists and is byte-identical to $2.
+assert_plain_output() {
+  [[ -f "$1" ]] || fail "Output file $1 not found!"
+  cmp -s "$1" "$2" || fail "Output file $1 is not as expected!"
+}
+
+# Abort unless output file $1 exists, can be decompressed by gzip, and its
+# decompressed content equals $2. The decompressed content is compared because
+# the raw gzip bytes may embed a file name and timestamp that differ per run.
+assert_gzipped_output() {
+  [[ -f "$1" ]] || fail "Output file $1 not found!"
+  gzip -dc "$1" > "$1.plain" || fail "Output file $1 is not a valid gzip file!"
+  cmp -s "$1.plain" "$2" || fail "Output file $1 is not as expected!"
+}
+
+echo ">> Creating test input files file[1-3].txt"
+INPUT_FILE_1="file1.txt"
+INPUT_FILE_2="file2.txt"
+INPUT_FILE_3="file3.txt"
+echo "one" > "$INPUT_FILE_1"
+echo "two" > "$INPUT_FILE_2"
+echo "three" > "$INPUT_FILE_3"
+echo ">> Created input files"
+
+echo ">> Creating zipped versions at file[1-3].txt.gz"
+gzip -k "$INPUT_FILE_1"
+gzip -k "$INPUT_FILE_2"
+gzip -k "$INPUT_FILE_3"
+
+echo ">> Creating expected output file expected_output.txt"
+cat > "expected_output.txt" <<EOF
+one
+two
+three
+EOF
+
+echo ">> Run component on 3 plain input files, plain output"
+"$meta_executable" \
+  --input "$INPUT_FILE_1;$INPUT_FILE_2;$INPUT_FILE_3" \
+  --output "output1.txt"
+
+assert_plain_output "output1.txt" "expected_output.txt"
+
+echo ">> Run component on 3 zipped input files, plain output"
+"$meta_executable" \
+  --input "$INPUT_FILE_1.gz;$INPUT_FILE_2.gz;$INPUT_FILE_3.gz" \
+  --output "output2.txt"
+
+assert_plain_output "output2.txt" "expected_output.txt"
+
+echo ">> Run component on 3 plain input files, zipped output"
+"$meta_executable" \
+  --input "$INPUT_FILE_1;$INPUT_FILE_2;$INPUT_FILE_3" \
+  --output "output3.txt.gz" \
+  --gzip_output
+
+assert_gzipped_output "output3.txt.gz" "expected_output.txt"
+
+echo ">> Run component on 3 zipped input files, zipped output"
+"$meta_executable" \
+  --input "$INPUT_FILE_1.gz;$INPUT_FILE_2.gz;$INPUT_FILE_3.gz" \
+  --output "output4.txt.gz" \
+  --gzip_output
+
+assert_gzipped_output "output4.txt.gz" "expected_output.txt"
+
+echo ">> Tests done"