diff --git a/common/evaluators/qa_evaluator.py b/common/evaluators/qa_evaluator.py index f5cd1864..f4c801ac 100644 --- a/common/evaluators/qa_evaluator.py +++ b/common/evaluators/qa_evaluator.py @@ -16,9 +16,13 @@ def get_scores(self): for batch in self.data_loader: qids.extend(batch.id.detach().cpu().numpy()) # Select embedding - sent1, sent2 = self.get_sentence_embeddings(batch) - output = self.model(sent1, sent2, batch.ext_feats, batch.dataset.word_to_doc_cnt, batch.sentence_1_raw, batch.sentence_2_raw) + if hasattr(self.model, 'skip_embedding_lookup') and self.model.skip_embedding_lookup: + output = self.model(batch.sentence_1, batch.sentence_2, batch.ext_feats, batch.dataset.word_to_doc_cnt, batch.sentence_1_raw, batch.sentence_2_raw) + else: + sent1, sent2 = self.get_sentence_embeddings(batch) + output = self.model(sent1, sent2, batch.ext_feats, batch.dataset.word_to_doc_cnt, batch.sentence_1_raw, batch.sentence_2_raw) + test_cross_entropy_loss += F.cross_entropy(output, batch.label, size_average=False).item() true_labels.extend(batch.label.detach().cpu().numpy()) diff --git a/common/trainers/qa_trainer.py b/common/trainers/qa_trainer.py index 7fc130c6..c420c66d 100644 --- a/common/trainers/qa_trainer.py +++ b/common/trainers/qa_trainer.py @@ -15,10 +15,13 @@ def train_epoch(self, epoch): for batch_idx, batch in enumerate(self.train_loader): self.optimizer.zero_grad() - # Select embedding - sent1, sent2 = self.get_sentence_embeddings(batch) + if hasattr(self.model, 'skip_embedding_lookup') and self.model.skip_embedding_lookup: + output = self.model(batch.sentence_1, batch.sentence_2, batch.ext_feats, batch.dataset.word_to_doc_cnt, batch.sentence_1_raw, batch.sentence_2_raw) + else: + # Select embedding + sent1, sent2 = self.get_sentence_embeddings(batch) + output = self.model(sent1, sent2, batch.ext_feats, batch.dataset.word_to_doc_cnt, batch.sentence_1_raw, batch.sentence_2_raw) - output = self.model(sent1, sent2, batch.ext_feats, batch.dataset.word_to_doc_cnt, batch.sentence_1_raw, batch.sentence_2_raw) loss = F.nll_loss(output, batch.label, size_average=False) total_loss += loss.item() loss.backward() diff --git a/sm_cnn/.gitignore b/sm_cnn/.gitignore deleted file mode 100644 index edd68842..00000000 --- a/sm_cnn/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*pyc -*.pt -text/ -trained_models/ -data/ diff --git a/sm_cnn/README.md b/sm_cnn/README.md index 1acf41a3..b4c46050 100644 --- a/sm_cnn/README.md +++ b/sm_cnn/README.md @@ -12,12 +12,6 @@ Your repository root should be in your `PYTHONPATH` environment variable: export PYTHONPATH=$(pwd) ``` -To create the dataset: -```bash -cd Castor/sm_cnn/ -./create_dataset.sh -``` - We use `trec_eval` for evaluation: ```bash @@ -39,7 +33,7 @@ You can train the SM model for the 4 following configurations: To train on GPU 0 with static configuration: ```bash -python train.py --mode static --gpu 0 +python train.py --mode static --device 0 ``` NB: pass `--no_cuda` to use CPU diff --git a/sm_cnn/__init.py__.py b/sm_cnn/__init__.py similarity index 100% rename from sm_cnn/__init.py__.py rename to sm_cnn/__init__.py diff --git a/sm_cnn/__main__.py b/sm_cnn/__main__.py new file mode 100644 index 00000000..ca3ca3c0 --- /dev/null +++ b/sm_cnn/__main__.py @@ -0,0 +1,127 @@ +import argparse +import logging +from copy import deepcopy + +import os +import pprint +import random + +import numpy as np +import torch + +from common.dataset import DatasetFactory +from common.evaluation import EvaluatorFactory +from common.train import TrainerFactory +from 
utils.serialization import load_checkpoint +from sm_cnn.model import SMCNN +from sm_cnn.args import get_args + + +def get_logger(): + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + + ch = logging.StreamHandler() + ch.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(levelname)s - %(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) + + return logger + +def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_size, device, keep_results=False): + saved_model_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, loader, batch_size, device, + keep_results=keep_results) + scores, metric_names = saved_model_evaluator.get_scores() + logger.info('Evaluation metrics for {}'.format(split_name)) + logger.info('\t'.join([' '] + metric_names)) + logger.info('\t'.join([split_name] + list(map(str, scores)))) + +if __name__ == '__main__': + # Getting args + args = get_args() + config = deepcopy(args) + + # Getting logger + logger = get_logger() + logger.info(pprint.pformat(vars(args))) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if args.device != -1: + torch.cuda.manual_seed(args.seed) + + # Dealing with device + if not args.cuda: + args.gpu = -1 + if torch.cuda.is_available() and args.cuda: + logger.info("Note: You are using GPU for training") + torch.cuda.set_device(args.gpu) + torch.cuda.manual_seed(args.seed) + if torch.cuda.is_available() and not args.cuda: + logger.info("Warning: You have Cuda but do not use it. You are using CPU for training") + device = torch.device(f'cuda:{args.device}' if torch.cuda.is_available() and args.device >= 0 else 'cpu') + + if args.dataset not in ('trecqa', 'wikiqa'): + raise ValueError('Unrecognized dataset') + + dataset_cls, embedding, train_loader, test_loader, dev_loader \ + = DatasetFactory.get_dataset(args.dataset, args.word_vectors_dir, args.word_vectors_file, args.batch_size, + args.device) + + config.questions_num = dataset_cls.VOCAB_SIZE + config.answers_num = dataset_cls.VOCAB_SIZE + config.target_class = dataset_cls.NUM_CLASSES + model = SMCNN(config) + + + model = model.to(device) + embedding = embedding.to(device) + + + optimizer = torch.optim.Adadelta(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + + train_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, train_loader, args.batch_size, + args.device) + test_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, test_loader, args.batch_size, + args.device) + dev_evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, dev_loader, args.batch_size, + args.device) + + + trainer_config = { + 'optimizer': optimizer, + 'batch_size': args.batch_size, + 'log_interval': args.log_interval, + 'model_outfile': args.model_outfile, + 'patience': args.patience, + 'tensorboard': args.tensorboard, + 'run_label': args.run_label, + 'logger': logger + } + + trainer = TrainerFactory.get_trainer(args.dataset, model, embedding, train_loader, trainer_config, train_evaluator, test_evaluator, dev_evaluator) + + if not args.skip_training: + total_params = 0 + for param in model.parameters(): + size = [s for s in param.size()] + total_params += np.prod(size) + logger.info('Total number of parameters: %s', total_params) + model.static_question_embed.weight.data.copy_(embedding.weight) + model.nonstatic_question_embed.weight.data.copy_(embedding.weight) + model.static_answer_embed.weight.data.copy_(embedding.weight) + 
model.nonstatic_answer_embed.weight.data.copy_(embedding.weight) + + trainer.train(args.epochs) + + _, _, state_dict, _, _ = load_checkpoint(args.model_outfile) + + for k, tensor in state_dict.items(): + state_dict[k] = tensor.to(device) + + model.load_state_dict(state_dict) + if dev_loader: + evaluate_dataset('dev', dataset_cls, model, embedding, dev_loader, args.batch_size, args.device) + evaluate_dataset('test', dataset_cls, model, embedding, test_loader, args.batch_size, args.device, args.keep_results) diff --git a/sm_cnn/args.py b/sm_cnn/args.py index c746e190..b88dcf5f 100644 --- a/sm_cnn/args.py +++ b/sm_cnn/args.py @@ -1,29 +1,44 @@ from argparse import ArgumentParser +import os def get_args(): parser = ArgumentParser(description="SM CNN") + + parser.add_argument('model_outfile', help='file to save final model') + parser.add_argument('--dataset', type=str, help='trecqa|wikiqa', default='trecqa') parser.add_argument('--no_cuda', action='store_false', help='do not use cuda', dest='cuda') - parser.add_argument('--gpu', type=int, default=0) # Use -1 for CPU + parser.add_argument('--word-vectors-dir', help='word vectors directory', + default=os.path.join(os.pardir, 'Castor-data', 'embeddings', 'word2vec')) + parser.add_argument('--word-vectors-file', help='word vectors filename', default='aquaint+wiki.txt.gz.ndim=50.txt') + parser.add_argument('--word-vectors-dim', type=int, default=50, + help='number of dimensions of word vectors (default: 50)') + parser.add_argument('--skip-training', help='will load pre-trained model', action='store_true') + parser.add_argument('--device', type=int, default=0, help='GPU device, -1 for CPU (default: 0)') + parser.add_argument('--batch-size', type=int, default=64, help='input batch size for training (default: 64)') parser.add_argument('--epochs', type=int, default=30) - parser.add_argument('--batch_size', type=int, default=64) - parser.add_argument('--mode', type=str, default='static') - parser.add_argument('--lr', type=float, default=1.0) + parser.add_argument('--lr', type=float, default=0.001, help='learning rate (default: 0.001)') parser.add_argument('--seed', type=int, default=3435) - parser.add_argument('--dataset', type=str, help='TREC|wiki', default='TREC') parser.add_argument('--resume_snapshot', type=str, default=None) parser.add_argument('--dev_every', type=int, default=30) parser.add_argument('--log_every', type=int, default=10) parser.add_argument('--patience', type=int, default=50) - parser.add_argument('--save_path', type=str, default='saves') parser.add_argument('--output_channel', type=int, default=100) parser.add_argument('--filter_width', type=int, default=5) - parser.add_argument('--words_dim', type=int, default=50) parser.add_argument('--dropout', type=float, default=0.5) parser.add_argument('--epoch_decay', type=int, default=15) parser.add_argument('--vector_cache', type=str, default='data/word2vec.trecqa.pt') parser.add_argument('--trained_model', type=str, default="") - parser.add_argument('--weight_decay',type=float, default=1e-5) + parser.add_argument('--weight_decay', type=float, default=1e-5) parser.add_argument('--onnx', action='store_true', help='export model to onnx') + parser.add_argument('--mode', type=str, default='rand') + parser.add_argument('--keep-results', action='store_true', + help='store the output score and qrel files into disk for the test set') + parser.add_argument('--log-interval', type=int, default=10, + help='how many batches to wait before logging training status (default: 10)') + 
parser.add_argument('--tensorboard', action='store_true', default=False, + help='use TensorBoard to visualize training (default: false)') + parser.add_argument('--run-label', type=str, help='label to describe run') + args = parser.parse_args() return args diff --git a/sm_cnn/bridge.py b/sm_cnn/bridge.py deleted file mode 100644 index 3b0e3a19..00000000 --- a/sm_cnn/bridge.py +++ /dev/null @@ -1,170 +0,0 @@ -import json -import os -import sys -from collections import Counter -import argparse -import random - -import numpy as np -import torch -from nltk.tokenize import TreebankWordTokenizer -from torchtext import data - -from sm_cnn.external_features import compute_overlap, compute_idf_weighted_overlap, stopped -from sm_cnn.trec_dataset import TrecDataset -from sm_cnn.wiki_dataset import WikiDataset -from anserini_dependency.RetrieveSentences import RetrieveSentences -from sm_cnn import model - -sys.modules['model'] = model - -class SMModelBridge(object): - - def __init__(self, args): - if not args.cuda: - args.gpu = -1 - if torch.cuda.is_available() and args.cuda: - print("Note: You are using GPU for training") - torch.cuda.set_device(args.gpu) - torch.cuda.manual_seed(args.seed) - if torch.cuda.is_available() and not args.cuda: - print("Warning: You have Cuda but do not use it. You are using CPU for training") - - torch.manual_seed(args.seed) - np.random.seed(args.seed) - random.seed(args.seed) - - self.QID = data.Field(sequential=False) - self.QUESTION = data.Field(batch_first=True) - self.ANSWER = data.Field(batch_first=True) - self.LABEL = data.Field(sequential=False) - self.EXTERNAL = data.Field(sequential=True, tensor_type=torch.FloatTensor, batch_first=True, use_vocab=False, - postprocessing=data.Pipeline(lambda arr, _, train: [float(y) for y in arr])) - - if 'TrecQA' in args.dataset: - train, dev, test = TrecDataset.splits(self.QID, self.QUESTION, self.ANSWER, self.EXTERNAL, self.LABEL) - elif 'WikiQA' in args.dataset: - train, dev, test = WikiDataset.splits(self.QID, self.QUESTION, self.ANSWER, self.EXTERNAL, self.LABEL) - else: - print("Unsupported dataset") - exit() - - self.QID.build_vocab(train, dev, test) - self.QUESTION.build_vocab(train, dev, test) - self.ANSWER.build_vocab(train, dev, test) - self.LABEL.build_vocab(train, dev, test) - - if args.cuda: - self.model = torch.load(args.model, map_location=lambda storage, location: storage.cuda(args.gpu)) - else: - self.model = torch.load(args.model, map_location=lambda storage, location: storage) - - self.gpu = args.gpu - - def parse(self, sentence): - s_toks = TreebankWordTokenizer().tokenize(sentence) - sentence = ' '.join(s_toks).lower() - return sentence - - def rerank_candidate_answers(self, question, answers, idf_json): - # run through the model - scores_sentences = [] - question = self.parse(question) - term_idfs = json.loads(idf_json) - term_idfs = dict((k, float(v)) for k, v in term_idfs.items()) - - for term in question.split(): - if term not in term_idfs: - term_idfs[term] = 0.0 - - for answer in answers: - answer = answer.split('\t')[0] - answer = self.parse(answer) - for term in answer.split(): - if term not in term_idfs: - term_idfs[term] = 0.0 - - overlap = compute_overlap([question], [answer]) - idf_weighted_overlap = compute_idf_weighted_overlap([question], [answer], term_idfs) - overlap_no_stopwords =\ - compute_overlap(stopped([question]), stopped([answer])) - idf_weighted_overlap_no_stopwords =\ - compute_idf_weighted_overlap(stopped([question]), stopped([answer]), term_idfs) - ext_feats = str(overlap[0]) + " 
" + str(idf_weighted_overlap[0]) + " " + \ - str(overlap_no_stopwords[0]) + " " + str(idf_weighted_overlap_no_stopwords[0]) - - - fields = [('question', self.QUESTION), ('answer', self.ANSWER), ('ext_feat', self.EXTERNAL)] - example = data.Example.fromlist([question, answer, ext_feats], fields) - this_question = self.QUESTION.numericalize(self.QUESTION.pad([example.question]), self.gpu) - this_answer = self.ANSWER.numericalize(self.ANSWER.pad([example.answer]), self.gpu) - this_external = self.EXTERNAL.numericalize(self.EXTERNAL.pad([example.ext_feat]), self.gpu) - self.model.eval() - scores = self.model(this_question, this_answer, this_external) - scores_sentences.append((scores[:, 2].cpu().data.numpy()[0].tolist(), answer)) - - return scores_sentences - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Bridge Demo. Produces scores in trec_eval format", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--model', help="the path to the saved model file") - parser.add_argument('--dataset', help="the QA dataset folder {TrecQA|WikiQA}", default='../../Castor-data/TrecQA/') - parser.add_argument("--index", help="Lucene index", required=True) - parser.add_argument("--embeddings", help="Path of the word2vec index", default="") - parser.add_argument("--topics", help="topics file", default="") - parser.add_argument("--query", help="a single query", default="where was newton born ?") - parser.add_argument("--hits", help="max number of hits to return", default=100) - parser.add_argument("--scorer", help="passage scores", default="Idf") - parser.add_argument("--k", help="top-k passages to be retrieved", default=1) - parser.add_argument('--no_cuda', action='store_false', help='do not use cuda', dest='cuda') - parser.add_argument('--gpu', type=int, default=0) # Use -1 for CPU - parser.add_argument('--seed', type=int, default=3435) - - args = parser.parse_args() - - if not args.cuda: - args.gpu = -1 - - retrieveSentencesObj = RetrieveSentences(args) - idf_json = retrieveSentencesObj.getTermIdfJSON() - smmodel = SMModelBridge(args) - - train_set, dev_set, test_set = 'train', 'dev', 'test' - if 'TrecQA' in args.dataset: - train_set, dev_set, test_set = 'train-all', 'raw-dev', 'raw-test' - - for split in [dev_set, test_set]: - outfile = open('bridge.{}.scores'.format(split), 'w') - - questions = [q.strip() for q in open(os.path.join(args.dataset, split, 'a.toks')).readlines()] - answers = [q.strip() for q in open(os.path.join(args.dataset, split, 'b.toks')).readlines()] - labels = [q.strip() for q in open(os.path.join(args.dataset, split, 'sim.txt')).readlines()] - qids = [q.strip() for q in open(os.path.join(args.dataset, split, 'id.txt')).readlines()] - - qid_question = dict(zip(qids, questions)) - q_counts = Counter(questions) - - answers_offset = 0 - docid_counter = 0 - - all_questions_answers = questions + answers - for qid, question in sorted(qid_question.items(), key=lambda x: float(x[0])): - num_answers = q_counts[question] - q_answers = answers[answers_offset: answers_offset + num_answers] - answers_offset += num_answers - sentence_scores = smmodel.rerank_candidate_answers(question, q_answers, idf_json) - - for score, sentence in sentence_scores: - print('{} Q0 {} 0 {} sm_cnn_bridge.{}.run'.format( - qid, - docid_counter, - score, - os.path.basename(args.dataset) - ), file=outfile) - docid_counter += 1 - if 'WikiQA' in args.dataset: - docid_counter = 0 - - outfile.close() diff --git a/sm_cnn/create_dataset.sh b/sm_cnn/create_dataset.sh 
deleted file mode 100755 index f786fed0..00000000 --- a/sm_cnn/create_dataset.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh -mkdir -p data -python overlap_features.py --dir ../../Castor-data/TrecQA/ - -CURRENT_DIR=$(pwd) -cd ../../Castor-data/TrecQA -cd raw-dev/; paste id.txt sim.txt a.toks b.toks overlap_feats.txt > $CURRENT_DIR/data/trecqa.dev.tsv; cd .. -cd raw-test/; paste id.txt sim.txt a.toks b.toks overlap_feats.txt > $CURRENT_DIR/data/trecqa.test.tsv; cd .. -cd train-all/; paste id.txt sim.txt a.toks b.toks overlap_feats.txt > $CURRENT_DIR/data/trecqa.train.tsv; cd .. -cd $CURRENT_DIR - -python overlap_features.py --dir ../../Castor-data/WikiQA/ -cd ../../Castor-data/WikiQA -cd dev/; paste id.txt sim.txt a.toks b.toks overlap_feats.txt > $CURRENT_DIR/data/wikiqa.dev.tsv; cd .. -cd test/; paste id.txt sim.txt a.toks b.toks overlap_feats.txt > $CURRENT_DIR/data/wikiqa.test.tsv; cd .. -cd train/; paste id.txt sim.txt a.toks b.toks overlap_feats.txt > $CURRENT_DIR/data/wikiqa.train.tsv; cd .. -cd $CURRENT_DIR \ No newline at end of file diff --git a/sm_cnn/external_features.py b/sm_cnn/external_features.py deleted file mode 100644 index 83e51917..00000000 --- a/sm_cnn/external_features.py +++ /dev/null @@ -1,216 +0,0 @@ -# module to compute various external features for the sm cnn model. -# TODO: add more external features like: -# word mover distance, cosine sim in tf.idf space, cosine sim in word embedding space -# overlap based on parts of speech: noun, verb, adj (POS tag) -# word embedding cosine sim based on part of speech: noun, verb, adj -import sys -import os -import shlex -import subprocess -import string -from collections import defaultdict - -import numpy as np - -import nltk -nltk.download('stopwords', quiet=True) - -from nltk.stem.porter import PorterStemmer -from nltk.corpus import stopwords - -def stopped(sentences): - """ - remove stop words from given sentences (questions|answers) - """ - stoplist = set(stopwords.words('english')) - #stoplist.update(set(string.punctuation)) - def stop(sentence): - return ' '.join([word for word in sentence.split() if word not in stoplist]) - return [stop(sentence) for sentence in sentences] - -def stemmed(sentences): - """ - reduce sentence terms to stemmed representations - """ - stemmer = PorterStemmer() - def stem(sentence): - return ' '.join([stemmer.stem(word) for word in sentence.split()]) - return [stem(sentence) for sentence in sentences] - -def get_qadata_only_idf(all_data): - """ - returns idf weights computed over all question answer pairs in the dataset - """ - if not type(all_data) is list: - all_data = list(all_data) - term_idfs = defaultdict(float) - for doc in all_data: - for term in list(set(doc.split())): - term_idfs[term] += 1.0 - N = len(all_data) - for term, n_t in term_idfs.items(): - term_idfs[term] = np.log(N/(1+n_t)) - return term_idfs - -def get_source_corpus_idf(all_data, path_to_index): - """ - fetches idf weights from source corpus (disks1-5+aquaint|wikipedia) index, for all the qa pairs - """ - # first run maven to build ../idf_baseline/FetchTermIDF - maven_cmd = "mvn -f ../idf_baseline/pom.xml clean package appassembler:assemble" - pargs = shlex.split(maven_cmd) - p = subprocess.Popen(pargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE, \ - bufsize=1, universal_newlines=True) - pout, perr = p.communicate() - # if build failure, exit with message - if "BUILD FAILURE" in pout or "BUILD FAILURE" in perr: - print("\nERROR: Could not build ../idf_baseline/FetchTermIDF. 
Fix build errors before proceeding") - print("$ cd ../idf_baseline") - print("$ mvn clean package appassembler:assemble") - sys.exit(0) - - if not type(all_data) is list: - all_data = list(all_data) - term_idfs = defaultdict(float) - all_terms = set([term for doc in all_data for term in doc.split()]) - with open('dataset.vocab', 'w') as vf: - for term in list(all_terms): - print(term, file=vf) - - fetchIDF_cmd = \ - "sh ../idf_baseline/target/appassembler/bin/FetchTermIDF -index {} -vocabFile {}".\ - format(path_to_index, 'dataset.vocab') - pargs = shlex.split(fetchIDF_cmd) - p = subprocess.Popen(pargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE, \ - bufsize=1, universal_newlines=True) - pout, perr = p.communicate() - - lines = str(pout).split('\n') - for line in lines: - if not line: - continue - fields = line.strip().split("\t") - term, weight = fields[0], fields[-1] - term_idfs[term] = float(weight) - - for line in str(perr).split('\n'): - print('Warning: '+line) - return term_idfs - -def compute_overlap(questions, answers): - """ - returns simple overlap between document pairs - """ - overlap_scores = [] - for q, a in zip(questions, answers): - q_terms = set(q.split()) - a_terms = set(a.split()) - common_terms = q_terms.intersection(a_terms) - overlap = float(len(common_terms))/(len(q_terms) + len(a_terms)) - overlap_scores.append(overlap) - return np.array(overlap_scores) - -def compute_idf_weighted_overlap(questions, answers, idf_weights): - """ - returns idf weighted overlap - """ - overlap_scores = [] - for q, a in zip(questions, answers): - q_terms = set(q.split()) - a_terms = set(a.split()) - common_terms = q_terms.intersection(a_terms) - idf_weighted_overlap = np.sum([idf_weights[term] for term in list(common_terms)]) - idf_weighted_overlap /= (len(q_terms) + len(a_terms)) - overlap_scores.append(idf_weighted_overlap) - return np.array(overlap_scores) - - -def set_external_features_as_per_paper(trainer, corpus_index=None): - """ - computes external features as per the paper AND saves them into trainer - """ - all_questions, all_answers = [], [] - for split in trainer.data_splits.keys(): - questions, answers, labels, max_q_len, max_a_len, default_ext_feats = \ - trainer.data_splits[split] - all_questions.extend(questions) - all_answers.extend(answers) - - all_data = set(all_questions + all_answers) - print('corpus_index', corpus_index) - idf_weights = get_qadata_only_idf(list(all_data)) if not corpus_index else \ - get_source_corpus_idf(list(all_data), corpus_index) - - external_features = {} - - # NOTE: expected external features as per paper are - # 1. overlap(q, a), - # 2. idf_overlap(q, a), - # 3. overlap(stopped(q), stopped(a)), - # 4. 
idf_over(stopped(q), stopped(a)) - - for split in trainer.data_splits.keys(): - questions, answers, labels, max_q_len, max_a_len, default_ext_feats = \ - trainer.data_splits[split] - - overlap = compute_overlap(questions, answers) - idf_weighted_overlap = compute_idf_weighted_overlap(questions, answers, idf_weights) - overlap_no_stopwords =\ - compute_overlap(stopped(questions), stopped(answers)) - idf_weighted_overlap_no_stopwords =\ - compute_idf_weighted_overlap(stopped(questions), stopped(answers), idf_weights) - ext_feats = [np.array(feats) for feats in zip(overlap, idf_weighted_overlap,\ - overlap_no_stopwords, idf_weighted_overlap_no_stopwords)] - trainer.data_splits[split][-1] = ext_feats - external_features[split] = ext_feats - return external_features - - -def set_external_features_as_per_paper_and_stem(trainer, corpus_index=None): - """ - computes external features as per the paper but performs stemming before computing IDF. - features are saved into the trainer.data_splits - """ - all_questions, all_answers = [], [] - for split in trainer.data_splits.keys(): - questions, answers, labels, max_q_len, max_a_len, default_ext_feats = \ - trainer.data_splits[split] - all_questions.extend(questions) - all_answers.extend(answers) - - all_data = set(all_questions + all_answers) - - # stem all words except stopwords to compute idf (required for feature number 2.) - stoplist = set(stopwords.words('english')) - stemmer = PorterStemmer() - def stem_non_stop_words(sentence): - return ' '.join([stemmer.stem(word) if word not in stoplist else word \ - for word in sentence.split()]) - all_but_stopwords_stemmed = [stem_non_stop_words(sentence) for sentence in list(all_data)] - idf_weights = get_qadata_only_idf(all_but_stopwords_stemmed) if not corpus_index else \ - get_source_corpus_idf(all_but_stopwords_stemmed, corpus_index) - - external_features = {} - - for split in trainer.data_splits.keys(): - questions, answers, labels, max_q_len, max_a_len, default_ext_feats = \ - trainer.data_splits[split] - - que_stem_all_but_stopwords = [stem_non_stop_words(que) for que in questions] - ans_stem_all_but_stopwords = [stem_non_stop_words(ans) for ans in answers] - - overlap = compute_overlap(que_stem_all_but_stopwords, ans_stem_all_but_stopwords) - idf_weighted_overlap = compute_idf_weighted_overlap(que_stem_all_but_stopwords,\ - ans_stem_all_but_stopwords, idf_weights) - - que_stopped_stemmed = stemmed(stopped(questions)) - ans_stopped_stemmed = stemmed(stopped(answers)) - - overlap_no_stopwords = compute_overlap(que_stopped_stemmed, ans_stopped_stemmed) - idf_weighted_overlap_no_stopwords =\ - compute_idf_weighted_overlap(que_stopped_stemmed, ans_stopped_stemmed, idf_weights) - ext_feats = [np.array(feats) for feats in zip(overlap, idf_weighted_overlap,\ - overlap_no_stopwords, idf_weighted_overlap_no_stopwords)] - trainer.data_splits[split][-1] = ext_feats - external_features[split] = ext_feats - return external_features \ No newline at end of file diff --git a/sm_cnn/main.py b/sm_cnn/main.py deleted file mode 100644 index 8267260c..00000000 --- a/sm_cnn/main.py +++ /dev/null @@ -1,110 +0,0 @@ -import numpy as np -import random -import logging - -import torch -from torchtext import data - -from args import get_args -from utils.relevancy_metrics import get_map_mrr -from trec_dataset import TrecDataset -from wiki_dataset import WikiDataset - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -ch = logging.StreamHandler() -ch.setLevel(logging.DEBUG) -formatter = 
logging.Formatter('%(levelname)s - %(message)s') -ch.setFormatter(formatter) -logger.addHandler(ch) - -args = get_args() -config = args - -torch.manual_seed(args.seed) - -if not args.cuda: - args.gpu = -1 -if torch.cuda.is_available() and args.cuda: - logger.info("Note: You are using GPU for training") - torch.cuda.set_device(args.gpu) - torch.cuda.manual_seed(args.seed) -if torch.cuda.is_available() and not args.cuda: - logger.info("Warning: You have Cuda but do not use it. You are using CPU for training") -np.random.seed(args.seed) -random.seed(args.seed) - -QID = data.Field(sequential=False) -QUESTION = data.Field(batch_first=True) -ANSWER = data.Field(batch_first=True) -LABEL = data.Field(sequential=False) -EXTERNAL = data.Field(sequential=True, tensor_type=torch.FloatTensor, batch_first=True, use_vocab=False, - postprocessing=data.Pipeline(lambda arr, _, train: [float(y) for y in arr])) - -if config.dataset == 'TREC': - train, dev, test = TrecDataset.splits(QID, QUESTION, ANSWER, EXTERNAL, LABEL) -elif config.dataset == 'wiki': - train, dev, test = WikiDataset.splits(QID, QUESTION, ANSWER, EXTERNAL, LABEL) -else: - print("Unsupported dataset") - exit() - -QID.build_vocab(train, dev, test) -QUESTION.build_vocab(train, dev, test) -ANSWER.build_vocab(train, dev, test) -LABEL.build_vocab(train, dev, test) - -train_iter = data.Iterator(train, batch_size=args.batch_size, device=args.gpu, train=True, repeat=False, - sort=False, shuffle=True) -dev_iter = data.Iterator(dev, batch_size=args.batch_size, device=args.gpu, train=False, repeat=False, - sort=False, shuffle=False) -test_iter = data.Iterator(test, batch_size=args.batch_size, device=args.gpu, train=False, repeat=False, - sort=False, shuffle=False) - -config.target_class = len(LABEL.vocab) -config.questions_num = len(QUESTION.vocab) -config.answers_num = len(ANSWER.vocab) -print("Label dict:", LABEL.vocab.itos) - -if args.cuda: - model = torch.load(args.trained_model, map_location=lambda storage, location: storage.cuda(args.gpu)) -else: - model = torch.load(args.trained_model, map_location=lambda storage,location: storage) - -index2label = np.array(LABEL.vocab.itos) -index2qid = np.array(QID.vocab.itos) - - -def predict(dataset, test_mode, dataset_iter): - model.eval() - dataset_iter.init_epoch() - - qids = [] - predictions = [] - labels = [] - for dev_batch_idx, dev_batch in enumerate(dataset_iter): - qid_array = index2qid[np.transpose(dev_batch.qid.cpu().data.numpy())] - true_label_array = index2label[np.transpose(dev_batch.label.cpu().data.numpy())] - - scores = model(dev_batch.question, dev_batch.answer, dev_batch.ext_feat) - score_array = scores[:, 2].cpu().data.numpy() - - qids.extend(qid_array.tolist()) - predictions.extend(score_array.tolist()) - labels.extend(true_label_array.tolist()) - - dev_map, dev_mrr = get_map_mrr(qids, predictions, labels) - logger.info("{} {}".format(dev_map, dev_mrr)) - -# Run the model on the dev set -predict(config.dataset, 'dev', dataset_iter=dev_iter) - -# Run the model on the test set -predict(config.dataset, 'test', dataset_iter=test_iter) - -if args.onnx: - print("Saving model to ONNX...") - dummy_batch = next(iter(dev_iter)) - dummy_input = (dummy_batch.question, dummy_batch.answer, dummy_batch.ext_feat) - torch.onnx.export(model, dummy_input, "sm_model.proto", verbose=True) diff --git a/sm_cnn/model.py b/sm_cnn/model.py index 20867461..02189cc1 100644 --- a/sm_cnn/model.py +++ b/sm_cnn/model.py @@ -1,17 +1,20 @@ +import numpy as np import torch import torch.nn as nn - import torch.nn.functional 
as F +import torch -class SmPlusPlus(nn.Module): +class SMCNN(nn.Module): def __init__(self, config): - super(SmPlusPlus, self).__init__() + super(SMCNN, self).__init__() output_channel = config.output_channel questions_num = config.questions_num answers_num = config.answers_num - words_dim = config.words_dim + words_dim = config.word_vectors_dim filter_width = config.filter_width self.mode = config.mode + self.arch = 'smcnn' + self.skip_embedding_lookup = True n_classes = config.target_class ext_feats_size = 4 @@ -43,7 +46,7 @@ def _unsqueeze(self, tensor): dim = tensor.size() return tensor.view(dim[0], 1, dim[1], dim[2]) - def forward(self, x_question, x_answer, x_ext): + def forward(self, x_question, x_answer, x_ext, word_to_doc_count=None, raw_sent1=None, raw_sent2=None): if self.mode == 'rand': question = self._unsqueeze(self.question_embed(x_question)) answer = self._unsqueeze(self.answer_embed(x_answer)) # (batch, 1, sent_len, embed_dim) diff --git a/sm_cnn/overlap_features.py b/sm_cnn/overlap_features.py deleted file mode 100644 index 59371577..00000000 --- a/sm_cnn/overlap_features.py +++ /dev/null @@ -1,154 +0,0 @@ -import numpy as np -import string -import pickle -from collections import defaultdict -from argparse import ArgumentParser - -from nltk.stem.porter import PorterStemmer - -def load_data(dname): - stemmer = PorterStemmer() - qids, questions, answers, labels = [], [], [], [] - print('Load folder ' + dname) - with open(dname+'a.toks', encoding='utf-8') as f: - for line in f: - question = line.strip().split() - question = [stemmer.stem(word) for word in question] - questions.append(question) - with open(dname+'b.toks', encoding='utf-8') as f: - for line in f: - answer = line.strip().split() - answer_list = [] - for word in answer: - try: - answer_list.append(stemmer.stem(word)) - except Exception as e: - print("couldn't stem the word:" + word) - answers.append(answer_list) - with open(dname+'id.txt', encoding='utf-8') as f: - for line in f: - qids.append(line.strip()) - with open(dname+'sim.txt', encoding='utf-8') as f: - for line in f: - labels.append(int(line.strip())) - return qids, questions, answers, labels - -def compute_overlap_features(questions, answers, word2df=None, stoplist=None): - word2df = word2df if word2df else {} - stoplist = stoplist if stoplist else set() - feats_overlap = [] - for question, answer in zip(questions, answers): - q_set = set([q for q in question if q not in stoplist]) - a_set = set([a for a in answer if a not in stoplist]) - word_overlap = q_set.intersection(a_set) - if len(q_set) == 0 and len(a_set) == 0: - overlap = 0 - else: - overlap = float(len(word_overlap)) / (len(q_set) + len(a_set)) - - word_overlap = q_set.intersection(a_set) - df_overlap = 0.0 - for w in word_overlap: - df_overlap += word2df[w] - - if len(q_set) == 0 and len(a_set) == 0: - df_overlap = 0 - else: - df_overlap /= (len(q_set) + len(a_set)) - - feats_overlap.append(np.array([overlap, df_overlap])) - return np.array(feats_overlap) - -def compute_overlap_idx(questions, answers, stoplist, q_max_sent_length, a_max_sent_length): - stoplist = stoplist if stoplist else [] - q_indices, a_indices = [], [] - for question, answer in zip(questions, answers): - q_set = set([q for q in question if q not in stoplist]) - a_set = set([a for a in answer if a not in stoplist]) - word_overlap = q_set.intersection(a_set) - - q_idx = np.ones(q_max_sent_length) * 2 - for i, q in enumerate(question): - value = 0 - if q in word_overlap: - value = 1 - q_idx[i] = value - 
q_indices.append(q_idx) - - a_idx = np.ones(a_max_sent_length) * 2 - for i, a in enumerate(answer): - value = 0 - if a in word_overlap: - value = 1 - a_idx[i] = value - a_indices.append(a_idx) - - q_indices = np.vstack(q_indices).astype('int32') - a_indices = np.vstack(a_indices).astype('int32') - - return q_indices, a_indices - -def compute_dfs(docs): - word2df = defaultdict(float) - for doc in docs: - for w in set(doc): - word2df[w] += 1.0 - num_docs = len(docs) - - for w, value in word2df.items(): - word2df[w] = np.math.log(num_docs / value) # bug feats fixed - - return word2df - -if __name__ == '__main__': - parser = ArgumentParser(description='create TrecQA/WikiQA dataset') - parser.add_argument('--dir', help='path to the TrecQA|WikiQA data directory', default="../../Castor-data/TrecQA") - args = parser.parse_args() - - stoplist = set([line.strip() for line in open('../../Castor-data/TrecQA/stopwords.txt', encoding='utf-8')]) - punct = set(string.punctuation) - stoplist.update(punct) - - all_questions, all_answers, all_qids = [], [], [] - base_dir = args.dir - - if 'TrecQA' in base_dir: - sub_dirs = ['train/', 'train-all/', 'raw-dev/', 'raw-test/'] - elif 'WikiQA' in base_dir: - sub_dirs = ['train/', 'dev/', 'test/'] - else: - print('Unsupported dataset') - exit() - - for sub in sub_dirs: - qids, questions, answers, labels = load_data(base_dir + sub) - all_questions.extend(questions) - all_answers.extend(answers) - all_qids.extend(qids) - - seen = set() - unique_questions = [] - for q, qid in zip(all_questions, all_qids): - if qid not in seen: - seen.add(qid) - unique_questions.append(q) - - docs = all_answers + unique_questions - word2dfs = compute_dfs(docs) - pickle.dump(word2dfs, open("word2dfs.p", "wb")) - - q_max_sent_length = max(map(lambda x: len(x), all_questions)) - a_max_sent_length = max(map(lambda x: len(x), all_answers)) - - for sub in sub_dirs: - qids, questions, answers, labels = load_data(base_dir + sub) - - overlap_feats = compute_overlap_features(questions, answers, stoplist=None, word2df=word2dfs) - overlap_feats_stoplist = compute_overlap_features(questions, answers, stoplist=stoplist, word2df=word2dfs) - overlap_feats = np.hstack([overlap_feats, overlap_feats_stoplist]) - - with open(base_dir + sub + 'overlap_feats.txt', 'w') as f: - for i in range(overlap_feats.shape[0]): - for j in range(4): - f.write(str(overlap_feats[i][j]) + ' ') - f.write('\n') diff --git a/sm_cnn/train.py b/sm_cnn/train.py deleted file mode 100644 index 79159947..00000000 --- a/sm_cnn/train.py +++ /dev/null @@ -1,210 +0,0 @@ -import time -import os -import numpy as np -import random - -import torch -import torch.nn as nn -import torch.onnx -from torchtext import data - -from args import get_args -from model import SmPlusPlus -from utils.relevancy_metrics import get_map_mrr -from trec_dataset import TrecDataset -from wiki_dataset import WikiDataset - -args = get_args() -config = args - -torch.manual_seed(args.seed) - -def set_vectors(field, vector_path): - if os.path.isfile(vector_path): - stoi, vectors, dim = torch.load(vector_path) - field.vocab.vectors = torch.Tensor(len(field.vocab), dim) - - for i, token in enumerate(field.vocab.itos): - wv_index = stoi.get(token, None) - if wv_index is not None: - field.vocab.vectors[i] = vectors[wv_index] - else: - # initialize with U(-0.25, 0.25) vectors - field.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25) - else: - print("Error: Need word embedding pt file") - exit(1) - return field - -# Set default configuration in : args.py 
-args = get_args() -config = args - -# Set random seed for reproducibility -torch.manual_seed(args.seed) -torch.backends.cudnn.deterministic = True -if not args.cuda: - args.gpu = -1 -if torch.cuda.is_available() and args.cuda: - print("Note: You are using GPU for training") - torch.cuda.set_device(args.gpu) - torch.cuda.manual_seed(args.seed) -if torch.cuda.is_available() and not args.cuda: - print("You have Cuda but you're using CPU for training.") -np.random.seed(args.seed) -random.seed(args.seed) - -QID = data.Field(sequential=False) -QUESTION = data.Field(batch_first=True) -ANSWER = data.Field(batch_first=True) -LABEL = data.Field(sequential=False) -EXTERNAL = data.Field(sequential=True, tensor_type=torch.FloatTensor, batch_first=True, use_vocab=False, - postprocessing=data.Pipeline(lambda arr, _, train: [float(y) for y in arr])) - -if config.dataset == 'TREC': - train, dev, test = TrecDataset.splits(QID, QUESTION, ANSWER, EXTERNAL, LABEL) -elif config.dataset == 'wiki': - train, dev, test = WikiDataset.splits(QID, QUESTION, ANSWER, EXTERNAL, LABEL) -else: - print("Unsupported dataset") - exit() - -QID.build_vocab(train, dev, test) -QUESTION.build_vocab(train, dev, test) -ANSWER.build_vocab(train, dev, test) -LABEL.build_vocab(train, dev, test) - - -QUESTION = set_vectors(QUESTION, args.vector_cache) -ANSWER = set_vectors(ANSWER, args.vector_cache) - -train_iter = data.Iterator(train, batch_size=args.batch_size, device=args.gpu, train=True, repeat=False, - sort=False, shuffle=True) -dev_iter = data.Iterator(dev, batch_size=args.batch_size, device=args.gpu, train=False, repeat=False, - sort=False, shuffle=False) -test_iter = data.Iterator(test, batch_size=args.batch_size, device=args.gpu, train=False, repeat=False, - sort=False, shuffle=False) - -config.target_class = len(LABEL.vocab) -config.questions_num = len(QUESTION.vocab) -config.answers_num = len(ANSWER.vocab) - -print("Dataset {} Mode {}".format(args.dataset, args.mode)) -print("VOCAB num", len(QUESTION.vocab)) -print("LABEL.target_class:", len(LABEL.vocab)) -print("LABELS:", LABEL.vocab.itos) -print("Train instance", len(train)) -print("Dev instance", len(dev)) -print("Test instance", len(test)) - -if args.resume_snapshot: - if args.cuda: - model = torch.load(args.resume_snapshot, map_location=lambda storage, location: storage.cuda(args.gpu)) - else: - model = torch.load(args.resume_snapshot, map_location=lambda storage, location: storage) -else: - model = SmPlusPlus(config) - model.static_question_embed.weight.data.copy_(QUESTION.vocab.vectors) - model.nonstatic_question_embed.weight.data.copy_(QUESTION.vocab.vectors) - model.static_answer_embed.weight.data.copy_(ANSWER.vocab.vectors) - model.nonstatic_answer_embed.weight.data.copy_(ANSWER.vocab.vectors) - - if args.cuda: - model.cuda() - print("Shift model to GPU") - - -parameter = filter(lambda p: p.requires_grad, model.parameters()) - -# the SM model originally follows SGD but Adadelta is used here -optimizer = torch.optim.Adadelta(parameter, lr=args.lr, weight_decay=args.weight_decay) -criterion = nn.CrossEntropyLoss() -early_stop = False -best_dev_map = 0 -iterations = 0 -iters_not_improved = 0 -epoch = 0 -start = time.time() -header = ' Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss Accuracy Dev/Accuracy' -dev_log_template = ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f},{:12.4f},{:12.4f}'.split(',')) -log_template = ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{},{:12.4f},{}'.split(',')) 
-os.makedirs(args.save_path, exist_ok=True) -os.makedirs(os.path.join(args.save_path, args.dataset), exist_ok=True) -print(header) - -index2label = np.array(LABEL.vocab.itos) -index2qid = np.array(QID.vocab.itos) -index2question = np.array(ANSWER.vocab.itos) - -while True: - if early_stop: - print("Early Stopping. Epoch: {}, Best Dev Acc: {}".format(epoch, best_dev_map)) - break - epoch += 1 - train_iter.init_epoch() - n_correct, n_total = 0, 0 - - for batch_idx, batch in enumerate(train_iter): - iterations += 1 - model.train(); optimizer.zero_grad() - scores = model(batch.question, batch.answer, batch.ext_feat) - n_correct += (torch.max(scores, 1)[1].view(batch.label.size()).data == batch.label.data).sum() - n_total += batch.batch_size - train_acc = 100. * n_correct / n_total - - loss = criterion(scores, batch.label) - loss.backward() - optimizer.step() - - # Evaluate performance on validation set - if iterations % args.dev_every == 1: - # switch model into evaluation mode - model.eval() - dev_iter.init_epoch() - n_dev_correct = 0 - dev_losses = [] - - qids = [] - predictions = [] - labels = [] - for dev_batch_idx, dev_batch in enumerate(dev_iter): - qid_array = index2qid[np.transpose(dev_batch.qid.cpu().data.numpy())] - true_label_array = index2label[np.transpose(dev_batch.label.cpu().data.numpy())] - - scores = model(dev_batch.question, dev_batch.answer, dev_batch.ext_feat) - n_dev_correct += (torch.max(scores, 1)[1].view(dev_batch.label.size()).data == dev_batch.label.data).sum() - dev_loss = criterion(scores, dev_batch.label) - dev_losses.append(dev_loss.data[0]) - index_label = np.transpose(torch.max(scores, 1)[1].view(dev_batch.label.size()).cpu().data.numpy()) - label_array = index2label[index_label] - # get the relevance scores - score_array = scores[:, 2].cpu().data.numpy() - - qids.extend(qid_array.tolist()) - predictions.extend(score_array.tolist()) - labels.extend(true_label_array.tolist()) - - dev_map, dev_mrr = get_map_mrr(qids, predictions, labels) - print(dev_log_template.format(time.time() - start, - epoch, iterations, 1 + batch_idx, len(train_iter), - 100. * (1 + batch_idx) / len(train_iter), loss.data[0], - sum(dev_losses) / len(dev_losses), train_acc, dev_map)) - - # Update validation results - if dev_map > best_dev_map: - iters_not_improved = 0 - best_dev_map = dev_map - snapshot_path = os.path.join(args.save_path, args.dataset, args.mode+'_best_model.pt') - torch.save(model, snapshot_path) - else: - iters_not_improved += 1 - if iters_not_improved >= args.patience: - early_stop = True - break - - if iterations % args.log_every == 1: - # print progress message - print(log_template.format(time.time() - start, - epoch, iterations, 1 + batch_idx, len(train_iter), - 100. 
* (1 + batch_idx) / len(train_iter), loss.data[0], ' ' * 8, - n_correct / n_total * 100, ' ' * 12)) diff --git a/sm_cnn/trec_dataset.py b/sm_cnn/trec_dataset.py deleted file mode 100644 index 92984758..00000000 --- a/sm_cnn/trec_dataset.py +++ /dev/null @@ -1,14 +0,0 @@ -from torchtext import data - -class TrecDataset(data.TabularDataset): - dirname = 'data' - @classmethod - - def splits(cls, question_id, question_field, answer_field, external_field, label_field, root='.data', - train='trecqa.train.tsv', validation='trecqa.dev.tsv', test='trecqa.test.tsv'): - path = './data' - return super(TrecDataset, cls).splits( - path, root, train, validation, test, - format='TSV', fields=[('qid', question_id), ('label', label_field), ('question', question_field), - ('answer', answer_field), ('ext_feat', external_field)] - ) diff --git a/sm_cnn/wiki_dataset.py b/sm_cnn/wiki_dataset.py deleted file mode 100644 index 8c231100..00000000 --- a/sm_cnn/wiki_dataset.py +++ /dev/null @@ -1,14 +0,0 @@ -from torchtext import data - -class WikiDataset(data.TabularDataset): - dirname = 'data' - @classmethod - - def splits(cls, question_id, question_field, answer_field, external_field, label_field, root='.data', - train='wikiqa.train.tsv', validation='wikiqa.dev.tsv', test='wikiqa.test.tsv'): - path = './data' - return super(WikiDataset, cls).splits( - path, root, train, validation, test, - format='TSV', fields=[('qid', question_id), ('label', label_field), ('question', question_field), - ('answer', answer_field), ('ext_feat', external_field)] - )
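
---

Reviewer note (not part of the diff): the behavioural core of this change is the new dispatch in `common/evaluators/qa_evaluator.py` and `common/trainers/qa_trainer.py` — models that advertise `skip_embedding_lookup = True` (such as the rewritten `SMCNN`) now receive raw token indices and do their own embedding lookup, while older models still receive pre-looked-up embeddings. The sketch below is a minimal, self-contained illustration of that pattern only; `IndexModel` and `EmbeddingModel` are toy stand-ins rather than Castor classes, and the extra arguments passed in the real code (`ext_feats`, `word_to_doc_cnt`, raw sentences) are omitted.

```python
import torch
import torch.nn as nn


class IndexModel(nn.Module):
    """Toy model that performs its own embedding lookup (analogous to SMCNN here)."""
    skip_embedding_lookup = True

    def __init__(self, vocab_size=100, dim=8):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)

    def forward(self, sent1, sent2):
        # sent1, sent2 are LongTensors of token indices
        return self.embed(sent1).mean(dim=1) + self.embed(sent2).mean(dim=1)


class EmbeddingModel(nn.Module):
    """Toy model that expects pre-computed sentence embeddings."""

    def forward(self, sent1, sent2):
        return sent1.mean(dim=1) + sent2.mean(dim=1)


def run(model, sent1_idx, sent2_idx, embedding):
    # Mirrors the branch added to qa_evaluator.py / qa_trainer.py (extra features omitted).
    if hasattr(model, 'skip_embedding_lookup') and model.skip_embedding_lookup:
        return model(sent1_idx, sent2_idx)
    sent1, sent2 = embedding(sent1_idx), embedding(sent2_idx)
    return model(sent1, sent2)


if __name__ == '__main__':
    embedding = nn.Embedding(100, 8)
    s1 = torch.randint(0, 100, (2, 5))
    s2 = torch.randint(0, 100, (2, 7))
    print(run(IndexModel(), s1, s2, embedding).shape)      # torch.Size([2, 8])
    print(run(EmbeddingModel(), s1, s2, embedding).shape)  # torch.Size([2, 8])
```

With `sm_cnn/train.py` and `sm_cnn/main.py` removed and `sm_cnn/__main__.py` added, training is presumably driven through the module entry point, along the lines of `python -m sm_cnn smcnn.trecqa.model --dataset trecqa --device 0` from the repository root (the model output filename here is only an example; the positional `model_outfile` argument and the `--dataset`/`--device` flags come from the updated `sm_cnn/args.py`).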