Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tools/sklearn/main_macros.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<macros>
<token name="@VERSION@">1.0.11.1</token>
<token name="@VERSION@">1.0.11.2</token>
<token name="@PROFILE@">24.2</token>

<xml name="python_requirements">
Expand Down
6 changes: 6 additions & 0 deletions tools/sklearn/test-data/dna_kmer_sanitized_input.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>1
sequences
>2
ACGT
>3
TGCAA
4 changes: 4 additions & 0 deletions tools/sklearn/test-data/dna_ohe_input.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>seq1
ACGT
>seq2
TGCAA
4 changes: 4 additions & 0 deletions tools/sklearn/test-data/dna_ohe_input_same_seq_len.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>seq1
ACGT
>seq2
TGCA
8 changes: 0 additions & 8 deletions tools/sklearn/test-data/ohe_out_4.tabular

This file was deleted.

8 changes: 0 additions & 8 deletions tools/sklearn/test-data/ohe_out_5.tabular

This file was deleted.

176 changes: 150 additions & 26 deletions tools/sklearn/to_categorical.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,176 @@
import argparse
import json
import re
import warnings

import h5py
import numpy as np
import pandas as pd
from keras.utils import to_categorical

warnings.simplefilter("ignore")

def main(inputs, infile, outfile, num_classes=None):
"""
Parameter
---------
input : str
File path to galaxy tool parameter

infile : str
File paths of input vector
def _get_longest_sequence_length(fasta_file):
max_len = 0
max_id = None
for name in fasta_file.keys():
seq_len = len(fasta_file[name])
if seq_len > max_len:
max_len = seq_len
max_id = name

outfile : str
File path to output matrix
return max_len, max_id

num_classes : str
Total number of classes. If None, this would be inferred as the (largest number in y) + 1

"""
warnings.simplefilter("ignore")
def encode_dna_sequences(fasta_path, padding, outfile, outfile_matrix):
    """One-hot encode every DNA sequence in a FASTA file.

    Writes one flattened integer row per sequence to *outfile*
    (tab-separated) and the full encoded array to *outfile_matrix*
    (HDF5, dataset name ``"data"``, gzip-compressed).

    Parameters
    ----------
    fasta_path : str
        Path to the input FASTA file.
    padding : bool
        When True, all sequences are encoded at the length of the
        longest sequence in the file.
    outfile : str
        Path for the flattened tabular output.
    outfile_matrix : str
        Path for the HDF5 output holding the unflattened encoding.
    """
    # Deferred imports: these are heavyweight, tool-specific dependencies.
    from galaxy_ml.preprocessors import GenomeOneHotEncoder
    import pyfaidx

    seq_length = None
    fasta_file = pyfaidx.Fasta(fasta_path)
    if padding:
        seq_length, max_id = _get_longest_sequence_length(fasta_file)
        print("Longest sequence is %s with length %d" % (max_id, seq_length))
    print("Padding: {}".format(padding))
    # The encoder indexes records by position, so X is just 0..n-1.
    X = np.arange(len(fasta_file.keys())).reshape(-1, 1)
    genome_encoder = GenomeOneHotEncoder(
        fasta_path=fasta_path, seq_length=seq_length, padding=padding
    )
    genome_encoder.fit(X)
    encoded_dna_sequences = genome_encoder.transform(X)
    # Collapse each per-sequence one-hot matrix into a single flat row.
    flatted_enc_seqs = encoded_dna_sequences.flatten().reshape(
        encoded_dna_sequences.shape[0], -1
    )
    np.savetxt(
        outfile, np.asarray(flatted_enc_seqs, dtype=int), fmt="%d", delimiter="\t"
    )
    with h5py.File(outfile_matrix, "w") as handle:
        handle.create_dataset("data", data=encoded_dna_sequences, compression="gzip")

input_header = params["header0"]
header = "infer" if input_header else None

input_vector = pd.read_csv(infile, sep="\t", header=header)
def seq_to_kmers(sequence, k=3):
    """Return the list of overlapping k-mers (length *k* substrings) of *sequence*."""
    n_windows = len(sequence) - k + 1
    return [sequence[start:start + k] for start in range(n_windows)]


def normalize_dna_sequence(sequence):
    """Upper-case *sequence* and strip all whitespace (spaces, tabs, newlines)."""
    return re.sub(r"\s+", "", sequence).upper()


def is_valid_dna_kmer(kmer):
    """Return True when every character of *kmer* is an IUPAC nucleotide code."""
    return all(base in "ACGTRYSWKMBDHVN" for base in kmer)


def build_kmer_vocabulary(sequences, k):
    """Assign a unique integer id to each valid DNA k-mer found in *sequences*.

    Ids 0 and 1 are reserved for the ``<PAD>`` and ``<UNK>`` tokens;
    real k-mers are numbered in first-seen order from 2 upward.
    Raises ValueError when no valid k-mer is produced at all.
    """
    vocabulary = {"<PAD>": 0, "<UNK>": 1}
    for sequence in sequences:
        for kmer in seq_to_kmers(sequence, k):
            if not is_valid_dna_kmer(kmer):
                continue
            if kmer not in vocabulary:
                vocabulary[kmer] = len(vocabulary)

    if len(vocabulary) == 2:
        raise ValueError(
            "No DNA k-mers were generated. Check that k is not longer than all sequences."
        )

    return vocabulary


def encode_sequence_kmers(sequence, vocabulary, k):
    """Encode *sequence* as a list of vocabulary ids, one per valid k-mer.

    K-mers containing non-IUPAC characters are skipped; k-mers missing
    from *vocabulary* map to the ``<UNK>`` id.
    """
    unknown_id = vocabulary["<UNK>"]
    encoded = []
    for kmer in seq_to_kmers(sequence, k):
        if is_valid_dna_kmer(kmer):
            encoded.append(vocabulary.get(kmer, unknown_id))
    return encoded


def pad_encoded_sequences(encoded_sequences, pad_value=0):
    """Right-pad every encoded sequence with *pad_value* to a common length.

    Parameters
    ----------
    encoded_sequences : list of list of int
        Encoded rows of possibly differing lengths.
    pad_value : int
        Fill value appended to shorter rows (the ``<PAD>`` id).

    Returns
    -------
    list of list of int
        Rectangular list of lists; ``[]`` when the input is empty
        (previously this raised a bare ``max()`` ValueError).
    """
    # default=0 makes the empty-input case well-defined instead of crashing.
    max_len = max((len(sequence) for sequence in encoded_sequences), default=0)
    return [
        sequence + [pad_value] * (max_len - len(sequence))
        for sequence in encoded_sequences
    ]


def encode_dna_kmers(fasta_path, k, outfile, outfile_vocab):
    """Encode each FASTA sequence as a row of integer k-mer ids.

    Builds a k-mer vocabulary over all sequences, encodes every
    sequence, right-pads rows to the longest row with the ``<PAD>`` id,
    and writes the matrix to *outfile* (tab-separated ints) plus the
    vocabulary to *outfile_vocab* (JSON).

    Parameters
    ----------
    fasta_path : str
        Path to the input FASTA file.
    k : int
        K-mer length; must be at least 1.
    outfile : str
        Path for the tab-separated encoded matrix.
    outfile_vocab : str
        Path for the JSON k-mer -> id mapping.

    Raises
    ------
    ValueError
        If ``k < 1``, or (from build_kmer_vocabulary) when no valid
        k-mer can be produced from the sequences.
    """
    import pyfaidx

    if k < 1:
        raise ValueError("k-mer size must be at least 1.")

    fasta_file = pyfaidx.Fasta(fasta_path)
    # Normalize: upper-case and strip whitespace before k-mer extraction.
    sequences = [
        normalize_dna_sequence(str(fasta_file[name])) for name in fasta_file.keys()
    ]
    vocabulary = build_kmer_vocabulary(sequences, k)
    encoded_sequences = [
        encode_sequence_kmers(sequence, vocabulary, k) for sequence in sequences
    ]
    # Pad with the <PAD> id so rows form a rectangular integer matrix.
    padded_sequences = np.asarray(
        pad_encoded_sequences(encoded_sequences, pad_value=vocabulary["<PAD>"]),
        dtype=int,
    )
    np.savetxt(outfile, padded_sequences, fmt="%d", delimiter="\t")
    with open(outfile_vocab, "w") as handle:
        json.dump(vocabulary, handle, indent=4, sort_keys=False)
        handle.write("\n")


def encode_labels(infile, input_header, outfile, num_classes=None):
    """One-hot encode a tab-separated label vector via keras ``to_categorical``.

    Parameters
    ----------
    infile : str
        Path to the tabular file of integer class labels.
    input_header : object
        Truthy when the file's first row is a header ("infer" is passed
        to pandas); falsy/None means no header row.
    outfile : str
        Path for the tab-separated one-hot output matrix.
    num_classes : int or None
        Total number of classes; when None, keras infers it as
        (largest label) + 1.
    """
    # Deferred import: keras is heavy and only needed for this task type.
    from keras.utils import to_categorical

    header = "infer" if input_header else None
    input_vector = pd.read_csv(infile, sep="\t", header=header)
    output_matrix = to_categorical(input_vector, num_classes=num_classes)
    np.savetxt(outfile, np.asarray(output_matrix, dtype=int), fmt="%d", delimiter="\t")


def main(args):
    """Dispatch to the requested encoder based on parsed CLI arguments.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments (see the argparse setup in
        ``__main__``).

    Raises
    ------
    ValueError
        For an unknown ``encoder_task_type`` or, within the DNA
        encoder, an unsupported ``sequence_encoding``.
    """
    task_type = args.encoder_task_type
    num_classes = args.num_classes
    # Galaxy passes booleans as the strings "booltrue"/"boolfalse".
    header = "infer" if args.labels_header == "booltrue" else None
    padding = args.padding == "booltrue"
    sequence_encoding = args.sequence_encoding
    kmer_size = args.kmer_size

    if task_type == "label_encoder":
        encode_labels(args.labels_path, header, args.outfile, num_classes=num_classes)
    elif task_type == "dna_encoder":
        if sequence_encoding == "one_hot":
            encode_dna_sequences(
                args.fasta_path, padding, args.outfile, args.outfile_matrix
            )
        elif sequence_encoding == "kmer":
            encode_dna_kmers(
                args.fasta_path, kmer_size, args.outfile, args.outfile_vocab
            )
        else:
            raise ValueError(
                "Unsupported DNA sequence encoding: %s" % sequence_encoding
            )
    else:
        raise ValueError("Unsupported encoder type: %s" % task_type)


if __name__ == "__main__":
    # CLI entry point: Galaxy invokes this script with flags matching
    # the tool XML; boolean-ish flags arrive as "booltrue"/"boolfalse".
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-l", "--labels_path", dest="labels_path")
    aparser.add_argument("-d", "--labels_header", dest="labels_header", default=False)
    aparser.add_argument(
        "-t", "--encoder_task_type", dest="encoder_task_type", required=True
    )
    aparser.add_argument(
        "-y", "--num_classes", dest="num_classes", type=int, default=None
    )
    aparser.add_argument("-p", "--padding", dest="padding", default="boolfalse")
    aparser.add_argument(
        "-s", "--sequence_encoding", dest="sequence_encoding", default="one_hot"
    )
    aparser.add_argument("-k", "--kmer_size", dest="kmer_size", type=int, default=3)
    aparser.add_argument("-f", "--fasta_path", dest="fasta_path")
    aparser.add_argument("-o", "--outfile", dest="outfile", required=True)
    aparser.add_argument("-m", "--outfile_matrix", dest="outfile_matrix")
    aparser.add_argument("-v", "--outfile_vocab", dest="outfile_vocab")
    args = aparser.parse_args()

    main(args)
Loading
Loading