diff --git a/tools/sklearn/main_macros.xml b/tools/sklearn/main_macros.xml index 6617910c80..c1cf57e48a 100644 --- a/tools/sklearn/main_macros.xml +++ b/tools/sklearn/main_macros.xml @@ -1,5 +1,5 @@ - 1.0.11.1 + 1.0.11.2 24.2 diff --git a/tools/sklearn/test-data/dna_kmer_sanitized_input.fasta b/tools/sklearn/test-data/dna_kmer_sanitized_input.fasta new file mode 100644 index 0000000000..fe5eb849aa --- /dev/null +++ b/tools/sklearn/test-data/dna_kmer_sanitized_input.fasta @@ -0,0 +1,6 @@ +>1 +sequences +>2 + ACGT +>3 +TGCAA diff --git a/tools/sklearn/test-data/dna_ohe_input.fasta b/tools/sklearn/test-data/dna_ohe_input.fasta new file mode 100644 index 0000000000..72b12a656b --- /dev/null +++ b/tools/sklearn/test-data/dna_ohe_input.fasta @@ -0,0 +1,4 @@ +>seq1 +ACGT +>seq2 +TGCAA diff --git a/tools/sklearn/test-data/dna_ohe_input_same_seq_len.fasta b/tools/sklearn/test-data/dna_ohe_input_same_seq_len.fasta new file mode 100644 index 0000000000..17eda9f83c --- /dev/null +++ b/tools/sklearn/test-data/dna_ohe_input_same_seq_len.fasta @@ -0,0 +1,4 @@ +>seq1 +ACGT +>seq2 +TGCA diff --git a/tools/sklearn/test-data/ohe_out_4.tabular b/tools/sklearn/test-data/ohe_out_4.tabular deleted file mode 100644 index ae661a51fb..0000000000 --- a/tools/sklearn/test-data/ohe_out_4.tabular +++ /dev/null @@ -1,8 +0,0 @@ -1 0 0 0 -0 1 0 0 -0 0 1 0 -0 0 0 1 -0 0 0 1 -0 0 1 0 -0 1 0 0 -1 0 0 0 diff --git a/tools/sklearn/test-data/ohe_out_5.tabular b/tools/sklearn/test-data/ohe_out_5.tabular deleted file mode 100644 index 0ffcd771b2..0000000000 --- a/tools/sklearn/test-data/ohe_out_5.tabular +++ /dev/null @@ -1,8 +0,0 @@ -1 0 0 0 0 -0 1 0 0 0 -0 0 1 0 0 -0 0 0 1 0 -0 0 0 1 0 -0 0 1 0 0 -0 1 0 0 0 -1 0 0 0 0 diff --git a/tools/sklearn/to_categorical.py b/tools/sklearn/to_categorical.py index 46627f75ef..89283dbb22 100644 --- a/tools/sklearn/to_categorical.py +++ b/tools/sklearn/to_categorical.py @@ -1,52 +1,176 @@ import argparse import json +import re import warnings +import h5py import numpy 
import numpy as np
import pandas as pd

warnings.simplefilter("ignore")


def _get_longest_sequence_length(fasta_file):
    """Return ``(length, record_id)`` of the longest sequence in *fasta_file*.

    Ties keep the first record encountered; an empty file yields ``(0, None)``.
    """
    max_len = 0
    max_id = None
    for name in fasta_file.keys():
        seq_len = len(fasta_file[name])
        if seq_len > max_len:
            max_len = seq_len
            max_id = name

    return max_len, max_id


def encode_dna_sequences(fasta_path, padding, outfile, outfile_matrix):
    """One-hot encode every DNA sequence in the FASTA at *fasta_path*.

    Writes a flattened integer matrix (one row per sequence) to *outfile*
    (tabular) and the full 3-D encoding to *outfile_matrix* (HDF5 dataset
    named ``data``, gzip-compressed).  When *padding* is true, shorter
    sequences are padded up to the longest sequence's length.
    """
    # local imports: only needed for this task type
    from galaxy_ml.preprocessors import GenomeOneHotEncoder
    import pyfaidx

    seq_length = None
    fasta_file = pyfaidx.Fasta(fasta_path)
    if padding:
        seq_length, max_id = _get_longest_sequence_length(fasta_file)
        print("Longest sequence is %s with length %d" % (max_id, seq_length))
    print("Padding: {}".format(padding))
    # the encoder is index-driven: X holds one row index per FASTA record
    X = np.arange(len(fasta_file.keys())).reshape(-1, 1)
    genome_encoder = GenomeOneHotEncoder(
        fasta_path=fasta_path, seq_length=seq_length, padding=padding
    )
    genome_encoder.fit(X)
    encoded_dna_sequences = genome_encoder.transform(X)
    # collapse (n_seqs, seq_len, 4) to (n_seqs, seq_len * 4) for the tabular file
    flatted_enc_seqs = encoded_dna_sequences.flatten().reshape(
        encoded_dna_sequences.shape[0], -1
    )
    np.savetxt(
        outfile, np.asarray(flatted_enc_seqs, dtype=int), fmt="%d", delimiter="\t"
    )
    with h5py.File(outfile_matrix, "w") as handle:
        handle.create_dataset("data", data=encoded_dna_sequences, compression="gzip")


def seq_to_kmers(sequence, k=3):
    """Return all overlapping k-mers of *sequence* (empty list when len < k)."""
    return [sequence[idx: idx + k] for idx in range(len(sequence) - k + 1)]
def normalize_dna_sequence(sequence):
    """Upper-case *sequence* and strip all whitespace (spaces, tabs, newlines)."""
    return re.sub(r"\s+", "", sequence.upper())


def is_valid_dna_kmer(kmer):
    """Return True if *kmer* uses only IUPAC DNA codes (ACGT plus ambiguity codes)."""
    valid_dna_chars = set("ACGTRYSWKMBDHVN")
    return set(kmer).issubset(valid_dna_chars)


def build_kmer_vocabulary(sequences, k):
    """Build a k-mer -> token-id vocabulary shared by all *sequences*.

    Ids 0 and 1 are reserved for the "<pad>" and "<unk>" sentinel tokens,
    matching the tool help.  (The previous literal used two empty-string
    keys, which collapse to a single dict entry and conflate padding with
    unknown k-mers.)

    Raises
    ------
    ValueError
        If no valid DNA k-mer is produced from any sequence.
    """
    vocabulary = {"<pad>": 0, "<unk>": 1}
    for sequence in sequences:
        # slide a window of width k over the sequence
        for idx in range(len(sequence) - k + 1):
            kmer = sequence[idx: idx + k]
            if is_valid_dna_kmer(kmer) and kmer not in vocabulary:
                vocabulary[kmer] = len(vocabulary)

    if len(vocabulary) == 2:
        raise ValueError(
            "No DNA k-mers were generated. Check that k is not longer than all sequences."
        )

    return vocabulary


def encode_sequence_kmers(sequence, vocabulary, k):
    """Encode *sequence* as token ids; k-mers missing from *vocabulary* map to "<unk>"."""
    unknown_id = vocabulary["<unk>"]
    return [
        vocabulary.get(sequence[idx: idx + k], unknown_id)
        for idx in range(len(sequence) - k + 1)
        if is_valid_dna_kmer(sequence[idx: idx + k])
    ]


def pad_encoded_sequences(encoded_sequences, pad_value=0):
    """Right-pad every encoded row with *pad_value* to the longest row's length."""
    max_len = max(len(sequence) for sequence in encoded_sequences)
    return [
        sequence + [pad_value] * (max_len - len(sequence))
        for sequence in encoded_sequences
    ]


def encode_dna_kmers(fasta_path, k, outfile, outfile_vocab):
    """Tokenize the FASTA at *fasta_path* into padded k-mer token-id rows.

    Writes the padded integer matrix to *outfile* (tabular) and the
    vocabulary to *outfile_vocab* (JSON).

    Raises
    ------
    ValueError
        If ``k < 1``.
    """
    import pyfaidx

    if k < 1:
        raise ValueError("k-mer size must be at least 1.")

    fasta_file = pyfaidx.Fasta(fasta_path)
    sequences = [
        normalize_dna_sequence(str(fasta_file[name])) for name in fasta_file.keys()
    ]
    vocabulary = build_kmer_vocabulary(sequences, k)
    encoded_sequences = [
        encode_sequence_kmers(sequence, vocabulary, k) for sequence in sequences
    ]
    padded_sequences = np.asarray(
        pad_encoded_sequences(encoded_sequences, pad_value=vocabulary["<pad>"]),
        dtype=int,
    )
    np.savetxt(outfile, padded_sequences, fmt="%d", delimiter="\t")
    with open(outfile_vocab, "w") as handle:
        json.dump(vocabulary, handle, indent=4, sort_keys=False)
        handle.write("\n")


def encode_labels(infile, input_header, outfile, num_classes=None):
    """One-hot encode an integer label vector read from *infile* (tabular).

    *input_header*, when truthy, makes pandas infer a header row.
    *num_classes* caps the matrix width; None infers (max label) + 1.
    """
    from keras.utils import to_categorical

    header = "infer" if input_header else None
    input_vector = pd.read_csv(infile, sep="\t", header=header)
    output_matrix = to_categorical(input_vector, num_classes=num_classes)
    np.savetxt(outfile, np.asarray(output_matrix, dtype=int), fmt="%d", delimiter="\t")


def main(args):
    """Dispatch on --encoder_task_type: label one-hot, DNA one-hot, or DNA k-mer.

    Raises ValueError for unknown task types or DNA encoding methods.
    """
    task_type = args.encoder_task_type
    num_classes = args.num_classes
    # Galaxy passes booleans as the strings "booltrue"/"boolfalse"
    header = "infer" if args.labels_header == "booltrue" else None
    padding = args.padding == "booltrue"
    sequence_encoding = args.sequence_encoding
    kmer_size = args.kmer_size

    if task_type == "label_encoder":
        encode_labels(args.labels_path, header, args.outfile, num_classes=num_classes)
    elif task_type == "dna_encoder":
        if sequence_encoding == "one_hot":
            encode_dna_sequences(
                args.fasta_path, padding, args.outfile, args.outfile_matrix
            )
        elif sequence_encoding == "kmer":
            encode_dna_kmers(
                args.fasta_path, kmer_size, args.outfile, args.outfile_vocab
            )
        else:
            raise ValueError(
                "Unsupported DNA sequence encoding: %s" % sequence_encoding
            )
    else:
        raise ValueError("Unsupported encoder type: %s" % task_type)


if __name__ == "__main__":
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-l", "--labels_path", dest="labels_path")
    aparser.add_argument("-d", "--labels_header", dest="labels_header", default=False)
    aparser.add_argument(
        "-t", "--encoder_task_type", dest="encoder_task_type", required=True
    )
    aparser.add_argument(
        "-y", "--num_classes", dest="num_classes", type=int, default=None
    )
    aparser.add_argument("-p", "--padding", dest="padding", default="boolfalse")
    aparser.add_argument(
        "-s", "--sequence_encoding", dest="sequence_encoding", default="one_hot"
    )
    aparser.add_argument("-k", "--kmer_size", dest="kmer_size", type=int, default=3)
aparser.add_argument("-f", "--fasta_path", dest="fasta_path") + aparser.add_argument("-o", "--outfile", dest="outfile", required=True) + aparser.add_argument("-m", "--outfile_matrix", dest="outfile_matrix") + aparser.add_argument("-v", "--outfile_vocab", dest="outfile_vocab") args = aparser.parse_args() - main(args.inputs, args.infile, args.outfile, args.num_classes) + main(args) diff --git a/tools/sklearn/to_categorical.xml b/tools/sklearn/to_categorical.xml index f98af50f89..a8c5bf7f05 100644 --- a/tools/sklearn/to_categorical.xml +++ b/tools/sklearn/to_categorical.xml @@ -1,5 +1,5 @@ - Converts a class vector (integers) to binary class matrix + Encodes labels to a one-hot encoded matrix and DNA sequences to one-hot and k-mer representations main_macros.xml @@ -7,86 +7,334 @@ echo "@VERSION@" - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + encoder_type['task_type'] == "dna_encoder" and encoder_type['sequence_encoding']['encoding_method'] == "one_hot" + + + encoder_type['task_type'] == "dna_encoder" and encoder_type['sequence_encoding']['encoding_method'] == "kmer" + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + + + + + + + + + + - - - - - + + + + + + + + + + + + + + - - - - + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + + + + + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ``[1, 0, 0, 0]`` +- ``C`` -> ``[0, 1, 0, 0]`` +- ``G`` -> ``[0, 0, 1, 0]`` +- ``T`` -> ``[0, 0, 0, 1]`` + +For example, ``ACGCTG`` is encoded in the H5 output as:: + + [[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0]] + +The same sequence is flattened in the tabular output as:: -tf.keras.utils.to_categorical( -y, num_classes=None, dtype='float32' -) + 
1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0 -E.g. for use with categorical_crossentropy. +3. DNA k-mer token encoding -Arguments +This mode: -y: a vector of numbers to be converted into a matrix of one-hot encoded values. -num_classes: total number of classes. If None, this would be inferred as the (largest number in y) + 1. -dtype: The data type expected by the input. Default: 'float32'. +- builds a shared k-mer vocabulary from all input sequences +- starts the vocabulary with ``<pad> = 0`` and ``<unk> = 1`` +- encodes each sequence as token ids +- pads all encoded rows to the same length using ``<pad>`` +- also outputs the vocabulary as a JSON output -Returns +For example, with sequences ``ACGT`` and ``TGCAA`` using ``k=3``, the vocabulary becomes:: -A binary matrix representation of the input. The classes axis is placed last. + {'<pad>': 0, '<unk>': 1, 'ACG': 2, 'CGT': 3, 'TGC': 4, 'GCA': 5, 'CAA': 6} -Raises +The encoded rows become:: -Value Error: If input contains string value + [2, 3, 0] + [4, 5, 6] ]]>