Update bioawk: add suffix and zip_bool inputs, make gzip optional, add stub and tsv test

DLBPointon · DLBPointon · commit 73a15fef58a0 · 2026-03-25T13:22:43.000Z
diff --git a/modules/nf-core/bioawk/main.nf b/modules/nf-core/bioawk/main.nf
@@ -1,5 +1,5 @@
 process BIOAWK {
-    tag "$meta.id"
+    tag "${meta.id}"
     label 'process_single'
 
     conda "${moduleDir}/environment.yml"
@@ -9,26 +9,46 @@ process BIOAWK {
 
     input:
     tuple val(meta), path(input)
+    val suffix
+    val zip_bool
 
     output:
-    tuple val(meta), path("*.gz"), emit: output
+    tuple val(meta), path("${file_output}"),    optional: true, emit: output
+    tuple val(meta), path("*.gz"),              optional: true, emit: gz_output
     tuple val("${task.process}"), val('bioawk'), val("1.0"), emit: versions_bioawk, topic: versions
     // WARN: Version information not provided by tool on CLI. Please update version string above when bumping container versions.
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
-    def args  = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    if ("${input}" == "${prefix}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate."
+    def args        = task.ext.args ?: ''
+    def prefix      = task.ext.prefix ?: "${meta.id}"
+    // Declared without `def` so the `output:` block can resolve "${file_output}".
+    file_output     = "${prefix}.${suffix}"
+    if ("${input}" == "${file_output}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate."
 
     """
     bioawk \
         $args \
         $input \
-        > ${prefix}
+        > ${file_output}
 
-    gzip ${prefix}
+    # gzip replaces the plain file, so only gz_output is emitted when zip_bool is true.
+    if [ "${zip_bool}" = "true" ]; then
+        gzip "${file_output}"
+    fi
+    """
+
+    stub:
+    // The script block is not evaluated in stub mode, so the output name is rebuilt here.
+    def prefix      = task.ext.prefix ?: "${meta.id}"
+    file_output     = "${prefix}.${suffix}"
+    """
+    if [ "${zip_bool}" = "true" ]; then
+        echo "" | gzip > "${file_output}.gz"
+    else
+        touch "${file_output}"
+    fi
     """
 }
diff --git a/modules/nf-core/bioawk/meta.yml b/modules/nf-core/bioawk/meta.yml
@@ -1,25 +1,23 @@
-schema_version: 1.1.0
 name: "bioawk"
-description: Bioawk is an extension to Brian Kernighan's awk, adding the support of several common biological data formats.
-
+description: Bioawk is an extension to Brian Kernighan's awk, adding the support
+  of several common biological data formats.
 keywords:
   - bioawk
   - fastq
   - fasta
   - sam
   - file manipulation
   - awk
-
 tools:
   - bioawk:
       description: BWK awk modified for biological data
       homepage: https://github.com/lh3/bioawk
       documentation: https://github.com/lh3/bioawk
       tool_dev_url: https://github.com/lh3/bioawk
       licence:
-        - Free software license (https://github.com/lh3/bioawk/blob/master/README.awk#L1)
+        - Free software license
+          (https://github.com/lh3/bioawk/blob/master/README.awk#L1)
       identifier: ""
-
 input:
   - - meta:
         type: map
@@ -31,9 +29,28 @@ input:
         description: |
           Input biological sequence file (optionally gzipped) to be manipulated via the program specified in `$args`.
         pattern: "*.{bed,gff,sam,vcf,fastq,fasta,tab,bed.gz,gff.gz,sam.gz,vcf.gz,fastq.gz,fasta.gz,tab.gz}"
-
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+          - edam: http://edamontology.org/format_3475
+  - suffix:
+      type: string
+      description: "File extension, without the leading dot, appended to the output file name (`${prefix}.${suffix}`)."
+  - zip_bool:
+      type: boolean
+      description: Whether to gzip the output file. When true, the compressed file is emitted on gz_output instead of output.
+      pattern: "true|false"
 output:
   output:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false ]
+      - ${file_output}:
+          type: file
+          description: Manipulated version of the input sequence file.
+          ontologies: []
+  gz_output:
     - - meta:
           type: map
           description: |
@@ -43,32 +60,31 @@ output:
           type: file
           description: Manipulated and gzipped version of the input sequence file.
           pattern: "*.gz"
-
+          ontologies:
+            - edam: http://edamontology.org/format_3989
   versions_bioawk:
-    - - "${task.process}":
+    - - ${task.process}:
           type: string
           description: The name of the process
-      - "bioawk":
+      - bioawk:
           type: string
           description: The name of the tool
       - "1.0":
           type: string
-          description: The version of the tool
-
+          description: The expression to obtain the version of the tool
 topics:
   versions:
-    - - "${task.process}":
+    - - ${task.process}:
           type: string
           description: The name of the process
-      - "bioawk":
+      - bioawk:
           type: string
           description: The name of the tool
       - "1.0":
           type: string
-          description: The version of the tool
-
+          description: The expression to obtain the version of the tool
 authors:
   - "@jfy133"
-
 maintainers:
   - "@jfy133"
+schema_version: 1.1.0
diff --git a/modules/nf-core/bioawk/tests/main.nf.test b/modules/nf-core/bioawk/tests/main.nf.test
@@ -4,13 +4,13 @@ nextflow_process {
     name "Test Process BIOAWK"
     script "../main.nf"
     process "BIOAWK"
-    config "./nextflow.config"
 
     tag "modules"
     tag "modules_nfcore"
     tag "bioawk"
 
-    test("test-bioawk") {
+    test("fasta bioawk fasta.gz") {
+        config "./nextflow.config"
 
         when {
             process {
@@ -19,6 +19,33 @@ nextflow_process {
 				    [ id:'test', single_end:false ], // meta map
 				    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
 				]
+				input[1] = "fa"
+				input[2] = true
+
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("fasta bioawk tsv") {
+        config "./nextflow_telomere_check.config"
+
+        when {
+            process {
+                """
+                input[0] = [
+				    [ id:'test', single_end:false ], // meta map
+				    file('/lustre/scratch124/tol/teams/tola/users/dp24/nf-modules/modules/nf-core/bioawk/tests/telomere.fasta', checkIfExists: true) // FIXME(review): absolute path on a local filesystem; presumably unreachable from CI, so host the file under params.modules_testdata_base_path
+				]
+				input[1] = "tsv"
+				input[2] = false
 
                 """
             }
diff --git a/modules/nf-core/bioawk/tests/main.nf.test.snap b/modules/nf-core/bioawk/tests/main.nf.test.snap
@@ -1,5 +1,5 @@
 {
-    "test-bioawk": {
+    "fasta bioawk fasta.gz": {
         "content": [
             {
                 "0": [
@@ -12,7 +12,17 @@
                     ]
                 ],
                 "1": [
-                    "versions.yml:md5,5fe88e58a71f10551df56518c35ba91a"
+                    
+                ],
+                "2": [
+                    [
+                        "BIOAWK",
+                        "bioawk",
+                        "1.0"
+                    ]
+                ],
+                "gz_output": [
+                    
                 ],
                 "output": [
                     [
@@ -23,15 +33,68 @@
                         "sample_1.fa.gz:md5,b558dd15d8940373a032a827d490e693"
                     ]
                 ],
-                "versions": [
-                    "versions.yml:md5,5fe88e58a71f10551df56518c35ba91a"
+                "versions_bioawk": [
+                    [
+                        "BIOAWK",
+                        "bioawk",
+                        "1.0"
+                    ]
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.2"
+        },
+        "timestamp": "2026-03-25T12:35:30.509942773"
+    },
+    "fasta bioawk tsv": {
+        "content": [
+            {
+                "0": [
+                    
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "telomere_summary.tsv:md5,20facddd524fd8f6c0c03505f0be3e7a"
+                    ]
+                ],
+                "2": [
+                    [
+                        "BIOAWK",
+                        "bioawk",
+                        "1.0"
+                    ]
+                ],
+                "gz_output": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
+                        "telomere_summary.tsv:md5,20facddd524fd8f6c0c03505f0be3e7a"
+                    ]
+                ],
+                "output": [
+                    
+                ],
+                "versions_bioawk": [
+                    [
+                        "BIOAWK",
+                        "bioawk",
+                        "1.0"
+                    ]
                 ]
             }
         ],
         "meta": {
-            "nf-test": "0.8.4",
-            "nextflow": "24.04.4"
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.2"
         },
-        "timestamp": "2024-08-28T10:24:46.397249"
+        "timestamp": "2026-03-25T12:39:43.207782527"
     }
 }
diff --git a/modules/nf-core/bioawk/tests/nextflow.config b/modules/nf-core/bioawk/tests/nextflow.config
@@ -1,6 +1,6 @@
 process {
     withName: BIOAWK {
         ext.args = "-c fastx \'{print \">\" \$name ORS length(\$seq)}\'"
-        ext.prefix = "sample_1.fa"
+        ext.prefix = "sample_1"
     }
 }
diff --git a/modules/nf-core/bioawk/tests/nextflow_telomere_check.config b/modules/nf-core/bioawk/tests/nextflow_telomere_check.config
@@ -0,0 +1,6 @@
+process {
+    withName: BIOAWK {
+        ext.args = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct < 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\t%d\t%.2f\t%s\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s}\'"
+        ext.prefix = "telomere_summary"
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`process {`
`2`	`2`	`withName: BIOAWK {`
`3`	`3`	`ext.args = "-c fastx \'{print \">\" \$name ORS length(\$seq)}\'"`
`4`		`- ext.prefix = "sample_1.fa"`
	`4`	`+ ext.prefix = "sample_1"`
`5`	`5`	`}`
`6`	`6`	`}`