diff --git a/tools/sklearn/main_macros.xml b/tools/sklearn/main_macros.xml index 6617910c80..c1cf57e48a 100644 --- a/tools/sklearn/main_macros.xml +++ b/tools/sklearn/main_macros.xml @@ -1,5 +1,5 @@ - 1.0.11.1 + 1.0.11.2 24.2 diff --git a/tools/sklearn/test-data/dna_kmer_sanitized_input.fasta b/tools/sklearn/test-data/dna_kmer_sanitized_input.fasta new file mode 100644 index 0000000000..fe5eb849aa --- /dev/null +++ b/tools/sklearn/test-data/dna_kmer_sanitized_input.fasta @@ -0,0 +1,6 @@ +>1 +sequences +>2 + ACGT +>3 +TGCAA diff --git a/tools/sklearn/test-data/dna_ohe_input.fasta b/tools/sklearn/test-data/dna_ohe_input.fasta new file mode 100644 index 0000000000..72b12a656b --- /dev/null +++ b/tools/sklearn/test-data/dna_ohe_input.fasta @@ -0,0 +1,4 @@ +>seq1 +ACGT +>seq2 +TGCAA diff --git a/tools/sklearn/test-data/dna_ohe_input_same_seq_len.fasta b/tools/sklearn/test-data/dna_ohe_input_same_seq_len.fasta new file mode 100644 index 0000000000..17eda9f83c --- /dev/null +++ b/tools/sklearn/test-data/dna_ohe_input_same_seq_len.fasta @@ -0,0 +1,4 @@ +>seq1 +ACGT +>seq2 +TGCA diff --git a/tools/sklearn/test-data/ohe_out_4.tabular b/tools/sklearn/test-data/ohe_out_4.tabular deleted file mode 100644 index ae661a51fb..0000000000 --- a/tools/sklearn/test-data/ohe_out_4.tabular +++ /dev/null @@ -1,8 +0,0 @@ -1 0 0 0 -0 1 0 0 -0 0 1 0 -0 0 0 1 -0 0 0 1 -0 0 1 0 -0 1 0 0 -1 0 0 0 diff --git a/tools/sklearn/test-data/ohe_out_5.tabular b/tools/sklearn/test-data/ohe_out_5.tabular deleted file mode 100644 index 0ffcd771b2..0000000000 --- a/tools/sklearn/test-data/ohe_out_5.tabular +++ /dev/null @@ -1,8 +0,0 @@ -1 0 0 0 0 -0 1 0 0 0 -0 0 1 0 0 -0 0 0 1 0 -0 0 0 1 0 -0 0 1 0 0 -0 1 0 0 0 -1 0 0 0 0 diff --git a/tools/sklearn/to_categorical.py b/tools/sklearn/to_categorical.py index 46627f75ef..89283dbb22 100644 --- a/tools/sklearn/to_categorical.py +++ b/tools/sklearn/to_categorical.py @@ -1,52 +1,176 @@ import argparse import json +import re import warnings +import h5py import numpy 
import numpy as np
import pandas as pd

warnings.simplefilter("ignore")


def _get_longest_sequence_length(fasta_file):
    """Return ``(length, record_id)`` of the longest sequence in *fasta_file*.

    Ties keep the first record encountered; an empty file yields ``(0, None)``.
    """
    max_len = 0
    max_id = None
    for name in fasta_file.keys():
        seq_len = len(fasta_file[name])
        if seq_len > max_len:
            max_len = seq_len
            max_id = name

    return max_len, max_id


def encode_dna_sequences(fasta_path, padding, outfile, outfile_matrix):
    """One-hot encode every DNA sequence in the FASTA at *fasta_path*.

    Writes a flattened integer matrix (one row per sequence) to *outfile*
    (tabular) and the full 3-D encoding to *outfile_matrix* (HDF5 dataset
    named ``data``, gzip-compressed).  When *padding* is true, shorter
    sequences are padded up to the longest sequence's length.
    """
    # local imports: only needed for this task type
    from galaxy_ml.preprocessors import GenomeOneHotEncoder
    import pyfaidx

    seq_length = None
    fasta_file = pyfaidx.Fasta(fasta_path)
    if padding:
        seq_length, max_id = _get_longest_sequence_length(fasta_file)
        print("Longest sequence is %s with length %d" % (max_id, seq_length))
    print("Padding: {}".format(padding))
    # the encoder is index-driven: X holds one row index per FASTA record
    X = np.arange(len(fasta_file.keys())).reshape(-1, 1)
    genome_encoder = GenomeOneHotEncoder(
        fasta_path=fasta_path, seq_length=seq_length, padding=padding
    )
    genome_encoder.fit(X)
    encoded_dna_sequences = genome_encoder.transform(X)
    # collapse (n_seqs, seq_len, 4) to (n_seqs, seq_len * 4) for the tabular file
    flatted_enc_seqs = encoded_dna_sequences.flatten().reshape(
        encoded_dna_sequences.shape[0], -1
    )
    np.savetxt(
        outfile, np.asarray(flatted_enc_seqs, dtype=int), fmt="%d", delimiter="\t"
    )
    with h5py.File(outfile_matrix, "w") as handle:
        handle.create_dataset("data", data=encoded_dna_sequences, compression="gzip")


def seq_to_kmers(sequence, k=3):
    """Return all overlapping k-mers of *sequence* (empty list when len < k)."""
    return [sequence[idx: idx + k] for idx in range(len(sequence) - k + 1)]
def normalize_dna_sequence(sequence):
    """Upper-case *sequence* and strip all whitespace (spaces, tabs, newlines)."""
    return re.sub(r"\s+", "", sequence.upper())


def is_valid_dna_kmer(kmer):
    """Return True if *kmer* uses only IUPAC DNA codes (ACGT plus ambiguity codes)."""
    valid_dna_chars = set("ACGTRYSWKMBDHVN")
    return set(kmer).issubset(valid_dna_chars)


def build_kmer_vocabulary(sequences, k):
    """Build a k-mer -> token-id vocabulary shared by all *sequences*.

    Ids 0 and 1 are reserved for the "<pad>" and "<unk>" sentinel tokens,
    matching the tool help.  (The previous literal used two empty-string
    keys, which collapse to a single dict entry and conflate padding with
    unknown k-mers.)

    Raises
    ------
    ValueError
        If no valid DNA k-mer is produced from any sequence.
    """
    vocabulary = {"<pad>": 0, "<unk>": 1}
    for sequence in sequences:
        # slide a window of width k over the sequence
        for idx in range(len(sequence) - k + 1):
            kmer = sequence[idx: idx + k]
            if is_valid_dna_kmer(kmer) and kmer not in vocabulary:
                vocabulary[kmer] = len(vocabulary)

    if len(vocabulary) == 2:
        raise ValueError(
            "No DNA k-mers were generated. Check that k is not longer than all sequences."
        )

    return vocabulary


def encode_sequence_kmers(sequence, vocabulary, k):
    """Encode *sequence* as token ids; k-mers missing from *vocabulary* map to "<unk>"."""
    unknown_id = vocabulary["<unk>"]
    return [
        vocabulary.get(sequence[idx: idx + k], unknown_id)
        for idx in range(len(sequence) - k + 1)
        if is_valid_dna_kmer(sequence[idx: idx + k])
    ]


def pad_encoded_sequences(encoded_sequences, pad_value=0):
    """Right-pad every encoded row with *pad_value* to the longest row's length."""
    max_len = max(len(sequence) for sequence in encoded_sequences)
    return [
        sequence + [pad_value] * (max_len - len(sequence))
        for sequence in encoded_sequences
    ]


def encode_dna_kmers(fasta_path, k, outfile, outfile_vocab):
    """Tokenize the FASTA at *fasta_path* into padded k-mer token-id rows.

    Writes the padded integer matrix to *outfile* (tabular) and the
    vocabulary to *outfile_vocab* (JSON).

    Raises
    ------
    ValueError
        If ``k < 1``.
    """
    import pyfaidx

    if k < 1:
        raise ValueError("k-mer size must be at least 1.")

    fasta_file = pyfaidx.Fasta(fasta_path)
    sequences = [
        normalize_dna_sequence(str(fasta_file[name])) for name in fasta_file.keys()
    ]
    vocabulary = build_kmer_vocabulary(sequences, k)
    encoded_sequences = [
        encode_sequence_kmers(sequence, vocabulary, k) for sequence in sequences
    ]
    padded_sequences = np.asarray(
        pad_encoded_sequences(encoded_sequences, pad_value=vocabulary["<pad>"]),
        dtype=int,
    )
    np.savetxt(outfile, padded_sequences, fmt="%d", delimiter="\t")
    with open(outfile_vocab, "w") as handle:
        json.dump(vocabulary, handle, indent=4, sort_keys=False)
        handle.write("\n")


def encode_labels(infile, input_header, outfile, num_classes=None):
    """One-hot encode an integer label vector read from *infile* (tabular).

    *input_header*, when truthy, makes pandas infer a header row.
    *num_classes* caps the matrix width; None infers (max label) + 1.
    """
    from keras.utils import to_categorical

    header = "infer" if input_header else None
    input_vector = pd.read_csv(infile, sep="\t", header=header)
    output_matrix = to_categorical(input_vector, num_classes=num_classes)
    np.savetxt(outfile, np.asarray(output_matrix, dtype=int), fmt="%d", delimiter="\t")


def main(args):
    """Dispatch on --encoder_task_type: label one-hot, DNA one-hot, or DNA k-mer.

    Raises ValueError for unknown task types or DNA encoding methods.
    """
    task_type = args.encoder_task_type
    num_classes = args.num_classes
    # Galaxy passes booleans as the strings "booltrue"/"boolfalse"
    header = "infer" if args.labels_header == "booltrue" else None
    padding = args.padding == "booltrue"
    sequence_encoding = args.sequence_encoding
    kmer_size = args.kmer_size

    if task_type == "label_encoder":
        encode_labels(args.labels_path, header, args.outfile, num_classes=num_classes)
    elif task_type == "dna_encoder":
        if sequence_encoding == "one_hot":
            encode_dna_sequences(
                args.fasta_path, padding, args.outfile, args.outfile_matrix
            )
        elif sequence_encoding == "kmer":
            encode_dna_kmers(
                args.fasta_path, kmer_size, args.outfile, args.outfile_vocab
            )
        else:
            raise ValueError(
                "Unsupported DNA sequence encoding: %s" % sequence_encoding
            )
    else:
        raise ValueError("Unsupported encoder type: %s" % task_type)


if __name__ == "__main__":
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-l", "--labels_path", dest="labels_path")
    aparser.add_argument("-d", "--labels_header", dest="labels_header", default=False)
    aparser.add_argument(
        "-t", "--encoder_task_type", dest="encoder_task_type", required=True
    )
    aparser.add_argument(
        "-y", "--num_classes", dest="num_classes", type=int, default=None
    )
    aparser.add_argument("-p", "--padding", dest="padding", default="boolfalse")
    aparser.add_argument(
        "-s", "--sequence_encoding", dest="sequence_encoding", default="one_hot"
    )
    aparser.add_argument("-k", "--kmer_size", dest="kmer_size", type=int, default=3)
aparser.add_argument("-f", "--fasta_path", dest="fasta_path") + aparser.add_argument("-o", "--outfile", dest="outfile", required=True) + aparser.add_argument("-m", "--outfile_matrix", dest="outfile_matrix") + aparser.add_argument("-v", "--outfile_vocab", dest="outfile_vocab") args = aparser.parse_args() - main(args.inputs, args.infile, args.outfile, args.num_classes) + main(args) diff --git a/tools/sklearn/to_categorical.xml b/tools/sklearn/to_categorical.xml index f98af50f89..a8c5bf7f05 100644 --- a/tools/sklearn/to_categorical.xml +++ b/tools/sklearn/to_categorical.xml @@ -1,5 +1,5 @@ - Converts a class vector (integers) to binary class matrix + Encodes labels to a one-hot encoded matrix and DNA sequences to one-hot and k-mer representations main_macros.xml @@ -7,86 +7,334 @@ echo "@VERSION@" - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + encoder_type['task_type'] == "dna_encoder" and encoder_type['sequence_encoding']['encoding_method'] == "one_hot" + + + encoder_type['task_type'] == "dna_encoder" and encoder_type['sequence_encoding']['encoding_method'] == "kmer" + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + + + + + + + + + + - - - - - + + + + + + + + + + + + + + - - - - + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + + + + + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ``[1, 0, 0, 0]`` +- ``C`` -> ``[0, 1, 0, 0]`` +- ``G`` -> ``[0, 0, 1, 0]`` +- ``T`` -> ``[0, 0, 0, 1]`` + +For example, ``ACGCTG`` is encoded in the H5 output as:: + + [[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 1, 0, 0], + [0, 0, 0, 1], + [0, 0, 1, 0]] + +The same sequence is flattened in the tabular output as:: -tf.keras.utils.to_categorical( -y, num_classes=None, dtype='float32' -) + 
1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0 -E.g. for use with categorical_crossentropy. +3. DNA k-mer token encoding -Arguments +This mode: -y: a vector of numbers to be converted into a matrix of one-hot encoded values. -num_classes: total number of classes. If None, this would be inferred as the (largest number in y) + 1. -dtype: The data type expected by the input. Default: 'float32'. +- builds a shared k-mer vocabulary from all input sequences +- starts the vocabulary with ``<pad> = 0`` and ``<unk> = 1`` +- encodes each sequence as token ids +- pads all encoded rows to the same length using ``<pad>`` +- also outputs the vocabulary as a JSON output -Returns +For example, with sequences ``ACGT`` and ``TGCAA`` using ``k=3``, the vocabulary becomes:: -A binary matrix representation of the input. The classes axis is placed last. + {'<pad>': 0, '<unk>': 1, 'ACG': 2, 'CGT': 3, 'TGC': 4, 'GCA': 5, 'CAA': 6} -Raises +The encoded rows become:: -Value Error: If input contains string value + [2, 3, 0] + [4, 5, 6] ]]>