', '', '', '', '- ']
for item in unproc_data:
# For the NQ dataset, contexts containing markup tags are excluded from training.
context = item["context"]
skip_flag = False
for tag in skip_tags:
if tag in context:
skip_flag = True
break
if skip_flag and is_training:
continue
doc_tokens = []
for token in item["context_tokens"]:
if token[0] in ['[TLE]', '[PAR]', '[DOC]']:
token[0] = '[SEP]'
doc_tokens.append(token[0])
# 2. qas
for qa in item['qas']:
qas_id = qa['qid']
question_text = qa['question']
# Only take the first answer
answer = qa['detected_answers'][0]
orig_answer_text = answer['text']
# Only take the first span
start_position = answer['token_spans'][0][0]
end_position = answer['token_spans'][0][1]
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position)
examples.append(example)
return examples
def read_squad_examples(input_file, is_training, version_2_with_negative=False,
debug=False, reduce_size=False, ratio=1.0):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
if debug:
input_data = input_data[:5]
for entry in input_data:
paragraphs = entry["paragraphs"]
for paragraph in paragraphs:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If it can't be, that's likely due to weird Unicode
# stuff, so we just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(
doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
# Question: What country is the top exporter of electronics?
# Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
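# Editor's sketch: a minimal check of _improve_answer_span, using a
# whitespace tokenizer stand-in (the real pipeline passes a WordPiece
# tokenizer).
def _improve_answer_span_example():
    class _WhitespaceTok:
        def tokenize(self, text):
            return text.split()
    doc_tokens = ["(", "1895", "-", "1943", ")", "."]
    # The annotated span covers the whole parenthetical (0, 5); the exact
    # answer "1895" is recovered at the tightened span (1, 1).
    assert _improve_answer_span(doc_tokens, 0, 5, _WhitespaceTok(), "1895") == (1, 1)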
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple document spans. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + \
0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
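# Editor's sketch: the 'bought' example from the comment above, written
# out with a local stand-in for the doc-span namedtuple.
def _check_is_max_context_example():
    DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
    spans = [DocSpan(start=3, length=5),   # "to the store and bought"
             DocSpan(start=6, length=5)]   # "and bought a gallon of"
    # 'bought' is token 7: span B gives it 0 right context, span C gives
    # it min(1 left, 3 right) = 1, so span C is the max-context span.
    assert not _check_is_max_context(spans, 0, position=7)
    assert _check_is_max_context(spans, 1, position=7)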
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
verbose_logging, version_2_with_negative, null_score_diff_threshold,
noq_position=False):
"""Write final predictions to the json file and log-odds of null if needed."""
# logger.info("Writing predictions to: %s" % (output_prediction_file))
# logger.info("Writing nbest to: %s" % (output_nbest_file))
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction",
["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + \
result.end_logits[0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
if noq_position:
# start and end are computed without question tokens:
# add the question length, minus 1 for the [CLS] included in q_ids
q_ids = feature.q_ids
q_len = np.sum(np.sign(q_ids))
noq_start_index = start_index
noq_end_index = end_index
start_index = start_index + q_len - 1
end_index = end_index + q_len - 1
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
if noq_position:
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[noq_start_index],
end_logit=result.end_logits[noq_end_index]))
else:
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(
pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(
orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(
tok_text, orig_text, do_lower_case, verbose_logging)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have only a single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if len(nbest) == 1:
nbest.insert(0, _NbestPrediction(text="empty",
start_logit=0.0,
end_logit=0.0))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(_NbestPrediction(text="empty",
start_logit=0.0,
end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
# if verbose_logging:
# logger.info(
# "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
# if verbose_logging:
# logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
# orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in tok_ns_to_s_map.items():
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
# if verbose_logging:
# logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
# if verbose_logging:
#     logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
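# Editor's sketch: the "Steve Smith" case from the comment above.
# BasicTokenizer lower-cases and splits punctuation, so the alignment
# drops the trailing "'s" while restoring the original casing.
def _get_final_text_example():
    assert get_final_text("steve smith", "Steve Smith's",
                          do_lower_case=True) == "Steve Smith"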
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
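# Editor's sketch: _get_best_indexes returns the positions of the largest
# logits, ordered by descending score (toy values).
def _get_best_indexes_example():
    assert _get_best_indexes([0.1, 3.2, 0.5, 2.8], n_best_size=2) == [1, 3]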
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
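# Editor's sketch: subtracting the max score makes the softmax numerically
# stable and shift-invariant (toy logits, not model outputs).
def _compute_softmax_example():
    probs = _compute_softmax([1.0, 2.0, 3.0])
    assert abs(sum(probs) - 1.0) < 1e-6
    shifted = _compute_softmax([101.0, 102.0, 103.0])
    assert all(abs(p - q) < 1e-6 for p, q in zip(probs, shifted))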
def write_answer_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file, verbose_logging,
version_2_with_negative, null_score_diff_threshold):
"""Write final predictions to the json file and log-odds of null if needed."""
# logger.info("Writing predictions to: %s" % (output_prediction_file))
# logger.info("Writing nbest to: %s" % (output_nbest_file))
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction",
["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + \
result.end_logits[0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
# start and end indexes are relative to [CLS] [UNK] [SEP] C [SEP];
# shift each by the question-length offset and subtract 1 for [UNK]
offset = len(feature.q_tokens) - 2 # -2 for [CLS] and [SEP]
for start_index in start_indexes:
for end_index in end_indexes:
start_index = offset + start_index - 1 # -1 for [UNK]
end_index = offset + end_index - 1 # -1 for [UNK]
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(
pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(
orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(
tok_text, orig_text, do_lower_case, verbose_logging)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have only a single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if len(nbest) == 1:
nbest.insert(0,
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
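# Editor's sketch: normalization lower-cases, strips punctuation and
# articles, and collapses whitespace before token comparison.
def _normalize_answer_example():
    assert normalize_answer("The  Empire State Building!") == "empire state building"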
def f1_score(prediction, ground_truth):
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = collections.Counter(
prediction_tokens) & collections.Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
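# Editor's sketch: token-level F1 on a partial overlap. The prediction
# shares 1 of its 2 tokens with the 1-token ground truth, so
# precision = 1/2, recall = 1, and F1 = 2/3.
def _f1_score_example():
    assert abs(f1_score("Smith born", "Smith") - 2.0 / 3.0) < 1e-9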
def exact_match_score(prediction, ground_truth):
return (normalize_answer(prediction) == normalize_answer(ground_truth))
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def evaluate(dataset, predictions):
f1 = exact_match = total = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
total += 1
if qa['id'] not in predictions:
message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.'
print(message, file=sys.stderr)
continue
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = predictions[qa['id']]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(
f1_score, prediction, ground_truths)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {'exact_match': exact_match, 'f1': f1}
def read_predictions(prediction_file):
with open(prediction_file) as f:
predictions = json.load(f)
return predictions
def read_answers(gold_file):
answers = {}
with gzip.open(gold_file, 'rb') as f:
for i, line in enumerate(f):
example = json.loads(line)
if i == 0 and 'header' in example:
continue
for qa in example['qas']:
answers[qa['qid']] = qa['answers']
return answers
def evaluate_mrqa(answers, predictions, skip_no_answer=False):
f1 = exact_match = total = 0
for qid, ground_truths in answers.items():
if qid not in predictions:
if not skip_no_answer:
message = 'Unanswered question %s will receive score 0.' % qid
print(message)
total += 1
continue
total += 1
prediction = predictions[qid]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(
f1_score, prediction, ground_truths)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {'exact_match': exact_match, 'f1': f1}
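# Editor's sketch: evaluate_mrqa on toy inputs in the MRQA format
# (qid -> list of gold strings, qid -> predicted string).
def _evaluate_mrqa_example():
    answers = {"q1": ["Seattle"], "q2": ["New York City"]}
    predictions = {"q1": "Seattle", "q2": "New York"}
    scores = evaluate_mrqa(answers, predictions)
    # q2 matches 2 of 3 gold tokens: F1 = 0.8, EM = 0; averaged with q1.
    assert scores["exact_match"] == 50.0
    assert abs(scores["f1"] - 90.0) < 1e-6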
================================================
FILE: qa-eval/trainer.py
================================================
import collections
import math
import os
import time
import json
import socket
from tqdm import tqdm
import numpy as np
import pickle
import random
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim as optim
from transformers import BertForQuestionAnswering, BertTokenizer, AdamW
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from torch.optim.lr_scheduler import LambdaLR
from torch.nn.utils import clip_grad_norm_
from squad_utils import read_squad_examples, read_examples, \
convert_examples_to_features, \
write_predictions, read_answers, \
evaluate, evaluate_mrqa
from utils import eta, progress_bar, user_friendly_time, time_since
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
""" Create a schedule with a learning rate that decreases linearly after
linearly increasing during a warmup period.
"""
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
return LambdaLR(optimizer, lr_lambda, last_epoch)
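# Editor's sketch (assumes torch >= 1.4 for get_last_lr): the multiplier
# ramps linearly to the peak LR over the warmup steps, then decays
# linearly to zero at num_training_steps.
def _warmup_schedule_example():
    param = torch.nn.Parameter(torch.zeros(1))
    opt = optim.SGD([param], lr=1.0)
    sched = get_linear_schedule_with_warmup(opt, num_warmup_steps=10,
                                            num_training_steps=100)
    lrs = []
    for _ in range(100):
        opt.step()
        sched.step()
        lrs.append(sched.get_last_lr()[0])
    assert abs(max(lrs) - 1.0) < 1e-6  # peak at the end of warmup
    assert abs(lrs[-1]) < 1e-6         # decayed to zero at the last step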
class Trainer(object):
def __init__(self, args):
self.args = args
self.set_random_seed(random_seed=args.random_seed)
self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
def make_model_env(self, gpu, ngpus_per_node):
if gpu is not None:
self.args.gpu = self.args.devices[gpu]
if self.args.use_cuda and self.args.distributed:
if self.args.multiprocessing_distributed:
# For multiprocessing distributed training, rank needs to be the
# global rank among all the processes
self.args.rank = self.args.rank * ngpus_per_node + gpu
dist.init_process_group(backend=self.args.dist_backend, init_method=self.args.dist_url,
world_size=self.args.world_size, rank=self.args.rank)
if self.args.rank == 0:
if self.args.debug:
print("debugging mode on.")
self.model = BertForQuestionAnswering.from_pretrained(self.args.bert_model)
if self.args.rank == 0:
self.get_dev_loader()
self.get_test_loader()
self.args.batch_size = int(self.args.batch_size / ngpus_per_node)
self.get_pretrain_loader()
self.pretrain_t_total = len(self.pretrain_loader) * self.args.pretrain_epochs
no_decay = ['bias', 'LayerNorm.weight']
self.optimizer_grouped_parameters = [
{'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
'weight_decay': self.args.weight_decay},
{'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
self.optimizer = AdamW(self.optimizer_grouped_parameters,
lr=self.args.pretrain_lr, eps=self.args.adam_epsilon)
self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
num_warmup_steps=self.args.warmup_steps, num_training_steps=self.pretrain_t_total)
if self.args.use_cuda:
torch.cuda.set_device(self.args.gpu)
self.model.cuda(self.args.gpu)
self.args.workers = int((self.args.workers + ngpus_per_node - 1) / ngpus_per_node)
self.model = DistributedDataParallel(self.model, device_ids=[self.args.gpu],
find_unused_parameters=True)
cudnn.benchmark = True
def get_pretrain_loader(self):
data = self.args.pretrain_dataset
self.pretrain_sampler = DistributedSampler(data)
self.pretrain_loader = DataLoader(data, num_workers=self.args.workers, pin_memory=True,
sampler=self.pretrain_sampler, batch_size=self.args.batch_size)
def get_dev_loader(self):
data = self.args.dev_dataset
self.dev_loader = DataLoader(data, shuffle=False, batch_size=self.args.batch_size)
self.dev_examples = self.args.dev_examples
self.dev_features = self.args.dev_features
def get_test_loader(self):
data = self.args.test_dataset
self.test_loader = DataLoader(data, shuffle=False, batch_size=self.args.batch_size)
self.test_examples = self.args.test_examples
self.test_features = self.args.test_features
def train(self):
self.model.zero_grad()
for epoch in range(0, self.args.pretrain_epochs):
num_batches = len(self.pretrain_loader)
self.pretrain_sampler.set_epoch(epoch)
start = time.time()
# pretrain with unsupervised dataset
for step, batch in enumerate(self.pretrain_loader, start=1):
self.model.train()
input_ids, input_mask, seg_ids, start_positions, end_positions = batch
seq_len = torch.sum(torch.sign(input_ids), 1)
max_len = torch.max(seq_len)
input_ids = input_ids[:, :max_len].clone().cuda(self.args.gpu, non_blocking=True)
input_mask = input_mask[:, :max_len].clone().cuda(self.args.gpu, non_blocking=True)
seg_ids = seg_ids[:, :max_len].clone().cuda(self.args.gpu, non_blocking=True)
start_positions = start_positions.clone().cuda(self.args.gpu, non_blocking=True)
end_positions = end_positions.clone().cuda(self.args.gpu, non_blocking=True)
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": seg_ids,
"start_positions": start_positions,
"end_positions": end_positions
}
loss = self.model(**inputs)[0]
loss.backward()
clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
self.optimizer.step()
self.scheduler.step()
self.model.zero_grad()
if self.args.rank == 0:
msg = "PRETRAIN {}/{} {} - ETA : {} - LOSS : {:.4f}".format(step,
num_batches, progress_bar(step, num_batches),
eta(start, step, num_batches),
float(loss.item()))
print(msg, end="\r")
if self.args.debug:
break
# save model
if self.args.rank == 0:
result_dict = self.evaluate_model(msg)
em = result_dict["exact_match"]
f1 = result_dict["f1"]
print("\nPRETRAIN took {} DEV - F1: {:.4f}, EM: {:.4f}\n"
.format(user_friendly_time(time_since(start)), f1, em))
if self.args.rank == 0:
result_dict = self.evaluate_model("TEST", False)
em = result_dict["exact_match"]
f1 = result_dict["f1"]
print("\nFINAL TEST - F1: {:.4f}, EM: {:.4f}\n"
.format(f1, em))
def evaluate_model(self, msg, dev=True):
if dev:
eval_examples = self.dev_examples
eval_features = self.dev_features
eval_loader = self.dev_loader
eval_file = self.args.dev_json_file
else:
eval_examples = self.test_examples
eval_features = self.test_features
eval_loader = self.test_loader
eval_file = self.args.test_json_file
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
all_results = []
example_index = -1
self.model.eval()
num_val_batches = len(eval_loader)
for i, batch in enumerate(eval_loader):
input_ids, input_mask, seg_ids = batch
seq_len = torch.sum(torch.sign(input_ids), 1)
max_len = torch.max(seq_len)
input_ids = input_ids[:, :max_len].clone().cuda(self.args.gpu, non_blocking=True)
input_mask = input_mask[:, :max_len].clone().cuda(self.args.gpu, non_blocking=True)
seg_ids = seg_ids[:, :max_len].clone().cuda(self.args.gpu, non_blocking=True)
with torch.no_grad():
inputs = {
"input_ids": input_ids,
"attention_mask": input_mask,
"token_type_ids": seg_ids
}
if hasattr(self.model, 'module'):
outputs = self.model.module(**inputs)
else:
outputs = self.model(**inputs)
batch_start_logits, batch_end_logits = outputs[0], outputs[1]
batch_size = batch_start_logits.size(0)
for j in range(batch_size):
example_index += 1
start_logits = batch_start_logits[j].detach().cpu().tolist()
end_logits = batch_end_logits[j].detach().cpu().tolist()
eval_feature = eval_features[example_index]
unique_id = int(eval_feature.unique_id)
all_results.append(RawResult(unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
msg2 = "{} => Evaluating: {}/{}".format(msg, i, num_val_batches)
print(msg2, end="\r")
os.makedirs("./save/", exist_ok=True)
output_prediction_file = os.path.join("./save/", "prediction.json")
write_predictions(eval_examples, eval_features, all_results,
n_best_size=20, max_answer_length=30, do_lower_case=True,
output_prediction_file=output_prediction_file,
verbose_logging=False,
version_2_with_negative=False,
null_score_diff_threshold=0,
noq_position=False)
with open(output_prediction_file) as prediction_file:
predictions = json.load(prediction_file)
with open(eval_file) as f:
data_json = json.load(f)
dataset = data_json["data"]
result_dict = evaluate(dataset, predictions)
return result_dict
def set_random_seed(self, random_seed=2019):
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
np.random.seed(random_seed)
================================================
FILE: qa-eval/utils.py
================================================
import time
import math
import torch
def time_since(t):
""" Function for time. """
return time.time() - t
def progress_bar(completed, total, step=5):
""" Function returning a string progress bar. """
percent = int((completed / total) * 100)
bar = '[='
arrow_reached = False
for t in range(step, 101, step):
if arrow_reached:
bar += ' '
else:
if percent // t != 0:
bar += '='
else:
bar = bar[:-1]
bar += '>'
arrow_reached = True
if percent == 100:
bar = bar[:-1]
bar += '='
bar += ']'
return bar
def user_friendly_time(s):
""" Display a user friendly time from number of second. """
s = int(s)
if s < 60:
return "{}s".format(s)
m = s // 60
s = s % 60
if m < 60:
return "{}m {}s".format(m, s)
h = m // 60
m = m % 60
if h < 24:
return "{}h {}m {}s".format(h, m, s)
d = h // 24
h = h % 24
return "{}d {}h {}m {}s".format(d, h, m, s)
def eta(start, completed, total):
""" Function returning an ETA. """
# Computation
took = time_since(start)
time_per_step = took / completed
remaining_steps = total - completed
remaining_time = time_per_step * remaining_steps
return user_friendly_time(remaining_time)
def cal_running_avg_loss(loss, running_avg_loss, decay=0.99):
if running_avg_loss == 0:
return loss
else:
running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
return running_avg_loss
def kl_coef(i):
# coef for KL annealing
# reaches 1 at i = 22000
# https://github.com/kefirski/pytorch_RVAE/blob/master/utils/functional.py
return (math.tanh((i - 3500) / 1000) + 1) / 2
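# Editor's sketch: the annealing coefficient is ~0 early, crosses 0.5 at
# step 3500, and saturates near 1 well before step 22000.
def _kl_coef_example():
    assert kl_coef(0) < 0.01
    assert abs(kl_coef(3500) - 0.5) < 1e-9
    assert kl_coef(22000) > 0.999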
def compute_kernel(x, y):
x_size = x.size(0)
y_size = y.size(0)
dim = x.size(1)
x = x.unsqueeze(1) # (x_size, 1, dim)
y = y.unsqueeze(0) # (1, y_size, dim)
tiled_x = x.expand(x_size, y_size, dim)
tiled_y = y.expand(x_size, y_size, dim)
kernel_input = (tiled_x - tiled_y).pow(2).mean(2) / float(dim)
return torch.exp(-kernel_input) # (x_size, y_size)
def compute_mmd(x, y):
x_kernel = compute_kernel(x, x)
y_kernel = compute_kernel(y, y)
xy_kernel = compute_kernel(x, y)
mmd = x_kernel.mean() + y_kernel.mean() - 2 * xy_kernel.mean()
return mmd
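# Editor's sketch: with the RBF kernel above, MMD is near zero for two
# samples of the same Gaussian and grows once the means are shifted apart.
def _compute_mmd_example():
    torch.manual_seed(0)
    x = torch.randn(256, 8)
    y = torch.randn(256, 8)
    shifted = torch.randn(256, 8) + 3.0
    assert compute_mmd(x, y) < compute_mmd(x, shifted)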
class EMA(object):
"""Exponential moving average of model parameters.
Args:
model (torch.nn.Module): Model with parameters whose EMA will be kept.
decay (float): Decay rate for exponential moving average.
"""
def __init__(self, model, decay):
self.decay = decay
self.shadow = {}
self.original = {}
# Register model parameters
for name, param in model.named_parameters():
if param.requires_grad:
self.shadow[name] = param.data.clone()
def __call__(self, model, num_updates):
decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
for name, param in model.named_parameters():
if param.requires_grad:
assert name in self.shadow
new_average = \
(1.0 - decay) * param.data + decay * self.shadow[name]
self.shadow[name] = new_average.clone()
def assign(self, model):
"""Assign exponential moving average of parameter values to the
respective parameters.
Args:
model (torch.nn.Module): Model to assign parameter values.
"""
for name, param in model.named_parameters():
if param.requires_grad:
assert name in self.shadow
self.original[name] = param.data.clone()
param.data = self.shadow[name]
def resume(self, model):
"""Restore original parameters to a model. That is, put back
the values that were in each parameter at the last call to `assign`.
Args:
model (torch.nn.Module): Model to assign parameter values.
"""
for name, param in model.named_parameters():
if param.requires_grad:
assert name in self.shadow
param.data = self.original[name]
================================================
FILE: vae/eval.py
================================================
import collections
import json
import os
import torch
from transformers import BertTokenizer
from tqdm import tqdm
from qgevalcap.eval import eval_qg
from squad_utils import evaluate, write_predictions
from utils import batch_to_device
def to_string(index, tokenizer):
tok_tokens = tokenizer.convert_ids_to_tokens(index)
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace("[PAD]", "")
tok_text = tok_text.replace("[SEP]", "")
tok_text = tok_text.replace("[CLS]", "")
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
return tok_text
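# Editor's sketch: to_string joins WordPieces and strips special tokens;
# the fake tokenizer below is a stand-in with hypothetical ids.
def _to_string_example():
    class _FakeTok:
        def convert_ids_to_tokens(self, ids):
            return ["[CLS]", "play", "##ing", "[SEP]", "[PAD]"]
    assert to_string([0, 1, 2, 3, 4], _FakeTok()) == "playing"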
class Result(object):
def __init__(self,
context,
real_question,
posterior_question,
prior_question,
real_answer,
posterior_answer,
prior_answer,
posterior_z_prob,
prior_z_prob):
self.context = context
self.real_question = real_question
self.posterior_question = posterior_question
self.prior_question = prior_question
self.real_answer = real_answer
self.posterior_answer = posterior_answer
self.prior_answer = prior_answer
self.posterior_z_prob = posterior_z_prob
self.prior_z_prob = prior_z_prob
def eval_vae(epoch, args, trainer, eval_data):
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
eval_loader, eval_examples, eval_features = eval_data
all_results = []
qa_results = []
qg_results = {}
res_dict = {}
example_index = -1
for batch in tqdm(eval_loader, desc="Eval iter", leave=False, position=4):
c_ids, q_ids, a_ids, start, end = batch_to_device(batch, args.device)
batch_size = c_ids.size(0)
batch_c_ids = c_ids.cpu().tolist()
batch_q_ids = q_ids.cpu().tolist()
batch_start = start.cpu().tolist()
batch_end = end.cpu().tolist()
batch_posterior_q_ids, \
batch_posterior_start, batch_posterior_end, \
posterior_z_prob = trainer.generate_posterior(c_ids, q_ids, a_ids)
batch_start_logits, batch_end_logits \
= trainer.generate_answer_logits(c_ids, q_ids, a_ids)
batch_posterior_q_ids, \
batch_posterior_start, batch_posterior_end = \
batch_posterior_q_ids.cpu().tolist(), \
batch_posterior_start.cpu().tolist(), batch_posterior_end.cpu().tolist()
posterior_z_prob = posterior_z_prob.cpu()
batch_prior_q_ids, \
batch_prior_start, batch_prior_end, \
prior_z_prob = trainer.generate_prior(c_ids)
batch_prior_q_ids, \
batch_prior_start, batch_prior_end = \
batch_prior_q_ids.cpu().tolist(), \
batch_prior_start.cpu().tolist(), batch_prior_end.cpu().tolist()
prior_z_prob = prior_z_prob.cpu()
for i in range(batch_size):
example_index += 1
start_logits = batch_start_logits[i].detach().cpu().tolist()
end_logits = batch_end_logits[i].detach().cpu().tolist()
eval_feature = eval_features[example_index]
unique_id = int(eval_feature.unique_id)
context = to_string(batch_c_ids[i], tokenizer)
real_question = to_string(batch_q_ids[i], tokenizer)
posterior_question = to_string(batch_posterior_q_ids[i], tokenizer)
prior_question = to_string(batch_prior_q_ids[i], tokenizer)
real_answer = to_string(batch_c_ids[i][batch_start[i]:(batch_end[i] + 1)], tokenizer)
posterior_answer = to_string(batch_c_ids[i][batch_posterior_start[i]:(batch_posterior_end[i] + 1)], tokenizer)
prior_answer = to_string(batch_c_ids[i][batch_prior_start[i]:(batch_prior_end[i] + 1)], tokenizer)
all_results.append(Result(context=context,
real_question=real_question,
posterior_question=posterior_question,
prior_question=prior_question,
real_answer=real_answer,
posterior_answer=posterior_answer,
prior_answer=prior_answer,
posterior_z_prob=posterior_z_prob[i],
prior_z_prob=prior_z_prob[i]))
qg_results[unique_id] = posterior_question
res_dict[unique_id] = real_question
qa_results.append(RawResult(unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
output_prediction_file = os.path.join(args.model_dir, "pred.json")
write_predictions(eval_examples, eval_features, qa_results, n_best_size=20,
max_answer_length=30, do_lower_case=True,
output_prediction_file=output_prediction_file,
verbose_logging=False,
version_2_with_negative=False,
null_score_diff_threshold=0,
noq_position=True)
with open(args.dev_dir) as f:
dataset_json = json.load(f)
dataset = dataset_json["data"]
with open(os.path.join(args.model_dir, "pred.json")) as prediction_file:
predictions = json.load(prediction_file)
ret = evaluate(dataset, predictions)
bleu = eval_qg(res_dict, qg_results)
return ret, bleu, all_results
================================================
FILE: vae/generate_qa.py
================================================
import argparse
import json
import os
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from tqdm import tqdm
from transformers import BertTokenizer
from models import DiscreteVAE
class CustomDataset(Dataset):
def __init__(self, tokenizer, input_file, max_length=512):
self.lines = open(input_file, "r").readlines()
self.tokenizer = tokenizer
self.max_length = max_length
self.num_total = len(self.lines)
def __getitem__(self, idx):
context = self.lines[idx].strip()
tokens = self.tokenizer.tokenize(context)[:self.max_length]
ids = self.tokenizer.convert_tokens_to_ids(tokens)
# padding up to the maximum length
while len(ids) < self.max_length:
ids.append(0)
ids = torch.tensor(ids, dtype=torch.long)
return ids
def __len__(self):
return self.num_total
def main(args):
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
data = CustomDataset(tokenizer, args.data_file, args.max_length)
data_loader = DataLoader(data, shuffle=False, batch_size=args.batch_size)
device = torch.cuda.current_device()
checkpoint = torch.load(args.checkpoint, map_location="cpu")
vae = DiscreteVAE(checkpoint["args"])
vae.load_state_dict(checkpoint["state_dict"])
vae.eval()
vae = vae.to(device)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
output_file = os.path.join(args.output_dir, "synthetic_qa.jsonl")
fw = open(output_file, "w")
for batch in tqdm(data_loader, total=len(data_loader)):
c_ids = batch
c_len = torch.sum(torch.sign(c_ids), 1)
max_c_len = torch.max(c_len)
c_ids = c_ids[:, :max_c_len].to(device)
# sample latent variable K times
for _ in range(args.k):
with torch.no_grad():
_, _, zq, _, za = vae.prior_encoder(c_ids)
batch_q_ids, batch_start, batch_end = vae.generate(
zq, za, c_ids)
for i in range(c_ids.size(0)):
_c_ids = c_ids[i].cpu().tolist()
q_ids = batch_q_ids[i].cpu().tolist()
start_pos = batch_start[i].item()
end_pos = batch_end[i].item()
a_ids = _c_ids[start_pos: end_pos+1]
c_text = tokenizer.decode(_c_ids, skip_special_tokens=True)
q_text = tokenizer.decode(q_ids, skip_special_tokens=True)
a_text = tokenizer.decode(a_ids, skip_special_tokens=True)
json_dict = {
"context":c_text,
"question": q_text,
"answer": a_text
}
fw.write(json.dumps(json_dict) + "\n")
fw.flush()
fw.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--seed", default=1004, type=int)
parser.add_argument("--bert_model", default='bert-base-uncased', type=str)
parser.add_argument("--max_length", default=384,
type=int, help="max context length")
parser.add_argument("--batch_size", default=64,
type=int, help="batch_size")
parser.add_argument("--data_file", type=str,
required=True, help="text file of paragraphs")
parser.add_argument("--checkpoint", default="../save/vae-checkpoint/best_f1_model.pt",
type=str, help="checkpoint for vae model")
parser.add_argument("--output_dir", default="../data/synthetic_data/", type=str)
parser.add_argument("--ratio", default=1.0, type=float)
parser.add_argument("--k", default=1, type=int,
help="the number of QA pairs for each paragraph")
args = parser.parse_args()
main(args)
================================================
FILE: vae/main.py
================================================
import argparse
import os
import random
import numpy as np
import torch
from tqdm import tqdm, trange
from transformers import BertTokenizer
from eval import eval_vae
from trainer import VAETrainer
from utils import batch_to_device, get_harv_data_loader, get_squad_data_loader
def main(args):
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
train_loader, _, _ = get_squad_data_loader(tokenizer, args.train_dir,
shuffle=True, args=args)
eval_data = get_squad_data_loader(tokenizer, args.dev_dir,
shuffle=False, args=args)
args.device = torch.cuda.current_device()
trainer = VAETrainer(args)
loss_log1 = tqdm(total=0, bar_format='{desc}', position=2)
loss_log2 = tqdm(total=0, bar_format='{desc}', position=3)
eval_log = tqdm(total=0, bar_format='{desc}', position=5)
best_eval_log = tqdm(total=0, bar_format='{desc}', position=6)
print("MODEL DIR: " + args.model_dir)
best_bleu, best_em, best_f1 = 0.0, 0.0, 0.0
for epoch in trange(int(args.epochs), desc="Epoch", position=0):
for batch in tqdm(train_loader, desc="Train iter", leave=False, position=1):
c_ids, q_ids, a_ids, start_positions, end_positions \
= batch_to_device(batch, args.device)
trainer.train(c_ids, q_ids, a_ids, start_positions, end_positions)
str1 = 'Q REC : {:06.4f} A REC : {:06.4f}'
str2 = 'ZQ KL : {:06.4f} ZA KL : {:06.4f} INFO : {:06.4f}'
str1 = str1.format(float(trainer.loss_q_rec), float(trainer.loss_a_rec))
str2 = str2.format(float(trainer.loss_zq_kl), float(trainer.loss_za_kl), float(trainer.loss_info))
loss_log1.set_description_str(str1)
loss_log2.set_description_str(str2)
if epoch > 10:
metric_dict, bleu, _ = eval_vae(epoch, args, trainer, eval_data)
f1 = metric_dict["f1"]
em = metric_dict["exact_match"]
bleu = bleu * 100
_str = '{}-th Epochs BLEU : {:02.2f} EM : {:02.2f} F1 : {:02.2f}'
_str = _str.format(epoch, bleu, em, f1)
eval_log.set_description_str(_str)
if em > best_em:
best_em = em
if f1 > best_f1:
best_f1 = f1
trainer.save(os.path.join(args.model_dir, "best_f1_model.pt"))
if bleu > best_bleu:
best_bleu = bleu
trainer.save(os.path.join(args.model_dir, "best_bleu_model.pt"))
_str = 'BEST BLEU : {:02.2f} EM : {:02.2f} F1 : {:02.2f}'
_str = _str.format(best_bleu, best_em, best_f1)
best_eval_log.set_description_str(_str)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--seed", default=1004, type=int)
parser.add_argument('--debug', dest='debug', action='store_true')
parser.add_argument('--train_dir', default='../data/squad/train-v1.1.json')
parser.add_argument('--dev_dir', default='../data/squad/my_dev.json')
parser.add_argument("--max_c_len", default=384, type=int, help="max context length")
parser.add_argument("--max_q_len", default=64, type=int, help="max query length")
parser.add_argument("--model_dir", default="../save/vae-checkpoint", type=str)
parser.add_argument("--epochs", default=20, type=int)
parser.add_argument("--lr", default=1e-3, type=float, help="lr")
parser.add_argument("--batch_size", default=64, type=int, help="batch_size")
parser.add_argument("--weight_decay", default=0.0, type=float, help="weight decay")
parser.add_argument("--clip", default=5.0, type=float, help="max grad norm")
parser.add_argument("--bert_model", default='bert-base-uncased', type=str)
parser.add_argument('--enc_nhidden', type=int, default=300)
parser.add_argument('--enc_nlayers', type=int, default=1)
parser.add_argument('--enc_dropout', type=float, default=0.2)
parser.add_argument('--dec_a_nhidden', type=int, default=300)
parser.add_argument('--dec_a_nlayers', type=int, default=1)
parser.add_argument('--dec_a_dropout', type=float, default=0.2)
parser.add_argument('--dec_q_nhidden', type=int, default=900)
parser.add_argument('--dec_q_nlayers', type=int, default=2)
parser.add_argument('--dec_q_dropout', type=float, default=0.3)
parser.add_argument('--nzqdim', type=int, default=50)
parser.add_argument('--nza', type=int, default=20)
parser.add_argument('--nzadim', type=int, default=10)
parser.add_argument('--lambda_kl', type=float, default=0.1)
parser.add_argument('--lambda_info', type=float, default=1.0)
args = parser.parse_args()
if args.debug:
args.model_dir = "./dummy"
# set model dir
model_dir = args.model_dir
os.makedirs(model_dir, exist_ok=True)
args.model_dir = os.path.abspath(model_dir)
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
main(args)
================================================
FILE: vae/models.py
================================================
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch_scatter import scatter_max
from transformers import BertModel, BertTokenizer
def return_mask_lengths(ids):
mask = torch.sign(ids).float()
lengths = torch.sum(mask, 1)
return mask, lengths
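# Editor's sketch: id 0 is BERT's [PAD], so sign() doubles as a padding
# mask and its row sum as the sequence length.
def _return_mask_lengths_example():
    ids = torch.tensor([[5, 9, 0, 0]])
    mask, lengths = return_mask_lengths(ids)
    assert mask.tolist() == [[1.0, 1.0, 0.0, 0.0]]
    assert lengths.item() == 2.0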
def cal_attn(query, memories, mask):
mask = (1.0 - mask.float()) * -10000.0
attn_logits = torch.matmul(query, memories.transpose(-1, -2).contiguous())
attn_logits = attn_logits + mask
attn_weights = F.softmax(attn_logits, dim=-1)
attn_outputs = torch.matmul(attn_weights, memories)
return attn_outputs, attn_logits
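# Editor's sketch: masked slots get -10000 added to their logits, so they
# receive essentially zero attention weight (shapes are illustrative).
def _cal_attn_example():
    query = torch.randn(2, 1, 8)
    memories = torch.randn(2, 5, 8)
    mask = torch.ones(2, 1, 5)
    mask[:, :, -1] = 0  # mask out the last memory slot
    attn_outputs, attn_logits = cal_attn(query, memories, mask)
    assert attn_outputs.shape == (2, 1, 8)
    assert F.softmax(attn_logits, dim=-1)[:, :, -1].max() < 1e-4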
def gumbel_softmax(logits, tau=1, hard=False, eps=1e-20, dim=-1):
# type: (Tensor, float, bool, float, int) -> Tensor
gumbels = -(torch.empty_like(logits).exponential_() +
eps).log() # ~Gumbel(0,1)
gumbels = (logits + gumbels) / tau # ~Gumbel(logits,tau)
y_soft = gumbels.softmax(dim)
if hard:
# Straight through.
index = y_soft.max(dim, keepdim=True)[1]
y_hard = torch.zeros_like(logits).scatter_(dim, index, 1.0)
ret = y_hard - y_soft.detach() + y_soft
else:
# Re-parametrization trick.
ret = y_soft
return ret
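# Editor's sketch: with hard=True the sample is exactly one-hot in the
# forward pass while gradients flow through the soft relaxation.
def _gumbel_softmax_example():
    logits = torch.randn(4, 10, requires_grad=True)
    y = gumbel_softmax(logits, tau=1.0, hard=True)
    assert torch.all(y.sum(dim=-1) == 1.0)
    y.sum().backward()  # straight-through estimator
    assert logits.grad is not None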
class CategoricalKLLoss(nn.Module):
def __init__(self):
super(CategoricalKLLoss, self).__init__()
def forward(self, P, Q):
log_P = P.log()
log_Q = Q.log()
kl = (P * (log_P - log_Q)).sum(dim=-1).sum(dim=-1)
return kl.mean(dim=0)
class GaussianKLLoss(nn.Module):
def __init__(self):
super(GaussianKLLoss, self).__init__()
def forward(self, mu1, logvar1, mu2, logvar2):
numerator = logvar1.exp() + torch.pow(mu1 - mu2, 2)
fraction = torch.div(numerator, (logvar2.exp()))
kl = 0.5 * torch.sum(logvar2 - logvar1 + fraction - 1, dim=1)
return kl.mean(dim=0)
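# Editor's sketch: the KL between identical diagonal Gaussians is exactly
# zero and grows as the means move apart.
def _gaussian_kl_example():
    kl = GaussianKLLoss()
    mu, logvar = torch.zeros(2, 5), torch.zeros(2, 5)
    assert kl(mu, logvar, mu, logvar).item() == 0.0
    assert kl(mu + 1.0, logvar, mu, logvar).item() > 0.0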
class Embedding(nn.Module):
def __init__(self, bert_model):
super(Embedding, self).__init__()
bert_embeddings = BertModel.from_pretrained(bert_model).embeddings
self.word_embeddings = bert_embeddings.word_embeddings
self.token_type_embeddings = bert_embeddings.token_type_embeddings
self.position_embeddings = bert_embeddings.position_embeddings
self.LayerNorm = bert_embeddings.LayerNorm
self.dropout = bert_embeddings.dropout
def forward(self, input_ids, token_type_ids=None, position_ids=None):
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
if position_ids is None:
seq_length = input_ids.size(1)
position_ids = torch.arange(
seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
words_embeddings = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
position_embeddings = self.position_embeddings(position_ids)
embeddings = words_embeddings + token_type_embeddings + position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class ContextualizedEmbedding(nn.Module):
def __init__(self, bert_model):
super(ContextualizedEmbedding, self).__init__()
bert = BertModel.from_pretrained(bert_model)
self.embedding = bert.embeddings
self.encoder = bert.encoder
self.num_hidden_layers = bert.config.num_hidden_layers
def forward(self, input_ids, attention_mask, token_type_ids=None):
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
seq_length = input_ids.size(1)
position_ids = torch.arange(
seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
extended_attention_mask = attention_mask.unsqueeze(
1).unsqueeze(2).float()
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
head_mask = [None] * self.num_hidden_layers
embedding_output = self.embedding(
input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
encoder_outputs = self.encoder(embedding_output,
extended_attention_mask,
head_mask=head_mask)
sequence_output = encoder_outputs[0]
return sequence_output
class CustomLSTM(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional=False):
super(CustomLSTM, self).__init__()
self.num_layers = num_layers
self.hidden_size = hidden_size
self.bidirectional = bidirectional
self.dropout = nn.Dropout(dropout)
# nn.LSTM applies dropout only between layers, so zero it for a
# single-layer LSTM to avoid the PyTorch warning; self.dropout above
# still applies to the outputs
if dropout > 0.0 and num_layers == 1:
dropout = 0.0
self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
num_layers=num_layers, dropout=dropout,
bidirectional=bidirectional, batch_first=True)
def forward(self, inputs, input_lengths, state=None):
_, total_length, _ = inputs.size()
input_packed = pack_padded_sequence(inputs, input_lengths,
batch_first=True, enforce_sorted=False)
self.lstm.flatten_parameters()
output_packed, state = self.lstm(input_packed, state)
output = pad_packed_sequence(
output_packed, batch_first=True, total_length=total_length)[0]
output = self.dropout(output)
return output, state
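# Editor's sketch: packing respects per-sequence lengths while
# total_length restores the padded shape, so a bidirectional LSTM returns
# (batch, max_len, 2 * hidden_size).
def _custom_lstm_example():
    lstm = CustomLSTM(input_size=8, hidden_size=16, num_layers=1,
                      dropout=0.0, bidirectional=True)
    inputs = torch.randn(3, 12, 8)
    lengths = torch.tensor([12, 7, 4])
    output, _ = lstm(inputs, lengths)
    assert output.shape == (3, 12, 32)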
class PosteriorEncoder(nn.Module):
def __init__(self, embedding, emsize,
nhidden, nlayers,
nzqdim, nza, nzadim,
dropout=0.0):
super(PosteriorEncoder, self).__init__()
self.embedding = embedding
self.nhidden = nhidden
self.nlayers = nlayers
self.nzqdim = nzqdim
self.nza = nza
self.nzadim = nzadim
self.encoder = CustomLSTM(input_size=emsize,
hidden_size=nhidden,
num_layers=nlayers,
dropout=dropout,
bidirectional=True)
self.question_attention = nn.Linear(2 * nhidden, 2 * nhidden)
self.context_attention = nn.Linear(2 * nhidden, 2 * nhidden)
self.zq_attention = nn.Linear(nzqdim, 2 * nhidden)
self.zq_linear = nn.Linear(4 * 2 * nhidden, 2 * nzqdim)
self.za_linear = nn.Linear(nzqdim + 2 * 2 * nhidden, nza * nzadim)
def forward(self, c_ids, q_ids, a_ids):
c_mask, c_lengths = return_mask_lengths(c_ids)
q_mask, q_lengths = return_mask_lengths(q_ids)
# question enc
q_embeddings = self.embedding(q_ids)
q_hs, q_state = self.encoder(q_embeddings, q_lengths)
q_h = q_state[0].view(self.nlayers, 2, -1, self.nhidden)[-1]
q_h = q_h.transpose(0, 1).contiguous().view(-1, 2 * self.nhidden)
# context enc
c_embeddings = self.embedding(c_ids)
c_hs, c_state = self.encoder(c_embeddings, c_lengths)
c_h = c_state[0].view(self.nlayers, 2, -1, self.nhidden)[-1]
c_h = c_h.transpose(0, 1).contiguous().view(-1, 2 * self.nhidden)
# context and answer enc (a_ids plays the role of token-type ids,
# tagging the answer span within the context)
c_a_embeddings = self.embedding(c_ids, a_ids, None)
c_a_hs, c_a_state = self.encoder(c_a_embeddings, c_lengths)
c_a_h = c_a_state[0].view(self.nlayers, 2, -1, self.nhidden)[-1]
c_a_h = c_a_h.transpose(0, 1).contiguous().view(-1, 2 * self.nhidden)
# attention: context attended by question summary
mask = c_mask.unsqueeze(1)
c_attned_by_q, _ = cal_attn(self.question_attention(q_h).unsqueeze(1),
c_hs,
mask)
c_attned_by_q = c_attned_by_q.squeeze(1)
# attention: question attended by context summary
mask = q_mask.unsqueeze(1)
q_attned_by_c, _ = cal_attn(self.context_attention(c_h).unsqueeze(1),
q_hs,
mask)
q_attned_by_c = q_attned_by_c.squeeze(1)
h = torch.cat([q_h, q_attned_by_c, c_h, c_attned_by_q], dim=-1)
zq_mu, zq_logvar = torch.split(self.zq_linear(h), self.nzqdim, dim=1)
zq = zq_mu + torch.randn_like(zq_mu) * torch.exp(0.5 * zq_logvar)
# attention zq, c_a
mask = c_mask.unsqueeze(1)
c_a_attned_by_zq, _ = cal_attn(self.zq_attention(zq).unsqueeze(1),
c_a_hs,
mask)
c_a_attned_by_zq = c_a_attned_by_zq.squeeze(1)
h = torch.cat([zq, c_a_attned_by_zq, c_a_h], dim=-1)
za_logits = self.za_linear(h).view(-1, self.nza, self.nzadim)
za_prob = F.softmax(za_logits, dim=-1)
za = gumbel_softmax(za_logits, hard=True)
return zq_mu, zq_logvar, zq, za_prob, za
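# Sketch (illustrative only, not part of the model): the posterior above samples
# zq with the Gaussian reparameterization trick, zq = mu + eps * exp(0.5*logvar),
# eps ~ N(0, I), so gradients flow through mu and logvar; za is sampled with the
# straight-through Gumbel-softmax relaxation for the same reason.
def _example_reparameterize():
    import torch
    mu = torch.zeros(4, 64, requires_grad=True)
    logvar = torch.zeros(4, 64, requires_grad=True)
    zq = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)
    zq.sum().backward()                        # gradients reach mu and logvar
    assert mu.grad is not None and logvar.grad is not None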
class PriorEncoder(nn.Module):
def __init__(self, embedding, emsize,
nhidden, nlayers,
nzqdim, nza, nzadim,
dropout=0):
super(PriorEncoder, self).__init__()
self.embedding = embedding
self.nhidden = nhidden
self.nlayers = nlayers
self.nzqdim = nzqdim
self.nza = nza
self.nzadim = nzadim
self.context_encoder = CustomLSTM(input_size=emsize,
hidden_size=nhidden,
num_layers=nlayers,
dropout=dropout,
bidirectional=True)
self.zq_attention = nn.Linear(nzqdim, 2 * nhidden)
self.zq_linear = nn.Linear(2 * nhidden, 2 * nzqdim)
self.za_linear = nn.Linear(nzqdim + 2 * 2 * nhidden, nza * nzadim)
def forward(self, c_ids):
c_mask, c_lengths = return_mask_lengths(c_ids)
c_embeddings = self.embedding(c_ids)
c_hs, c_state = self.context_encoder(c_embeddings, c_lengths)
c_h = c_state[0].view(self.nlayers, 2, -1, self.nhidden)[-1]
c_h = c_h.transpose(0, 1).contiguous().view(-1, 2 * self.nhidden)
zq_mu, zq_logvar = torch.split(self.zq_linear(c_h), self.nzqdim, dim=1)
zq = zq_mu + torch.randn_like(zq_mu)*torch.exp(0.5*zq_logvar)
mask = c_mask.unsqueeze(1)
c_attned_by_zq, _ = cal_attn(self.zq_attention(zq).unsqueeze(1),
c_hs,
mask)
c_attned_by_zq = c_attned_by_zq.squeeze(1)
h = torch.cat([zq, c_attned_by_zq, c_h], dim=-1)
za_logits = self.za_linear(h).view(-1, self.nza, self.nzadim)
za_prob = F.softmax(za_logits, dim=-1)
za = gumbel_softmax(za_logits, hard=True)
return zq_mu, zq_logvar, zq, za_prob, za
def interpolation(self, c_ids, zq):
c_mask, c_lengths = return_mask_lengths(c_ids)
c_embeddings = self.embedding(c_ids)
c_hs, c_state = self.context_encoder(c_embeddings, c_lengths)
c_h = c_state[0].view(self.nlayers, 2, -1, self.nhidden)[-1]
c_h = c_h.transpose(0, 1).contiguous().view(-1, 2 * self.nhidden)
mask = c_mask.unsqueeze(1)
c_attned_by_zq, _ = cal_attn(
self.zq_attention(zq).unsqueeze(1), c_hs, mask)
c_attned_by_zq = c_attned_by_zq.squeeze(1)
h = torch.cat([zq, c_attned_by_zq, c_h], dim=-1)
za_logits = self.za_linear(h).view(-1, self.nza, self.nzadim)
za = gumbel_softmax(za_logits, hard=True)
return za
class AnswerDecoder(nn.Module):
def __init__(self, embedding, emsize,
nhidden, nlayers,
dropout=0.0):
super(AnswerDecoder, self).__init__()
self.embedding = embedding
self.context_lstm = CustomLSTM(input_size=4 * emsize,
hidden_size=nhidden,
num_layers=nlayers,
dropout=dropout,
bidirectional=True)
self.start_linear = nn.Linear(2 * nhidden, 1)
self.end_linear = nn.Linear(2 * nhidden, 1)
self.ls = nn.LogSoftmax(dim=1)
def forward(self, init_state, c_ids):
_, max_c_len = c_ids.size()
c_mask, c_lengths = return_mask_lengths(c_ids)
H = self.embedding(c_ids, c_mask)
U = init_state.unsqueeze(1).repeat(1, max_c_len, 1)
G = torch.cat([H, U, H * U, torch.abs(H - U)], dim=-1)
M, _ = self.context_lstm(G, c_lengths)
start_logits = self.start_linear(M).squeeze(-1)
end_logits = self.end_linear(M).squeeze(-1)
start_end_mask = (c_mask == 0)
masked_start_logits = start_logits.masked_fill(
start_end_mask, -10000.0)
masked_end_logits = end_logits.masked_fill(start_end_mask, -10000.0)
return masked_start_logits, masked_end_logits
def generate(self, init_state, c_ids):
start_logits, end_logits = self.forward(init_state, c_ids)
c_mask, _ = return_mask_lengths(c_ids)
batch_size, max_c_len = c_ids.size()
mask = torch.matmul(c_mask.unsqueeze(2).float(),
c_mask.unsqueeze(1).float())
mask = torch.triu(mask) == 0
score = (self.ls(start_logits).unsqueeze(2)
+ self.ls(end_logits).unsqueeze(1))
score = score.masked_fill(mask, -10000.0)
score, start_positions = score.max(dim=1)
score, end_positions = score.max(dim=1)
start_positions = torch.gather(start_positions,
1,
end_positions.view(-1, 1)).squeeze(1)
idxes = torch.arange(0, max_c_len, dtype=torch.long)
idxes = idxes.unsqueeze(0).to(
start_logits.device).repeat(batch_size, 1)
start_positions = start_positions.unsqueeze(1)
start_mask = (idxes >= start_positions).long()
end_positions = end_positions.unsqueeze(1)
end_mask = (idxes <= end_positions).long()
a_ids = start_mask + end_mask - 1
return a_ids, start_positions.squeeze(1), end_positions.squeeze(1)
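# Sketch (illustrative, not part of the model): generate() above scores every
# (start, end) pair by adding start and end log-probabilities on a grid, and the
# upper-triangular mask guarantees start <= end before taking the argmax.
def _example_span_argmax():
    import torch
    start = torch.log_softmax(torch.randn(1, 6), dim=1)
    end = torch.log_softmax(torch.randn(1, 6), dim=1)
    score = start.unsqueeze(2) + end.unsqueeze(1)          # [1, 6, 6] grid
    score = score.masked_fill(torch.triu(torch.ones(6, 6)) == 0, -10000.0)
    best_start, best_end = divmod(int(score.view(-1).argmax()), 6)
    assert best_start <= best_end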
class ContextEncoderforQG(nn.Module):
def __init__(self, embedding, emsize,
nhidden, nlayers,
dropout=0.0):
super(ContextEncoderforQG, self).__init__()
self.embedding = embedding
self.context_lstm = CustomLSTM(input_size=emsize,
hidden_size=nhidden,
num_layers=nlayers,
dropout=dropout,
bidirectional=True)
self.context_linear = nn.Linear(2 * nhidden, 2 * nhidden)
self.fusion = nn.Linear(4 * nhidden, 2 * nhidden, bias=False)
self.gate = nn.Linear(4 * nhidden, 2 * nhidden, bias=False)
def forward(self, c_ids, a_ids):
c_mask, c_lengths = return_mask_lengths(c_ids)
c_embeddings = self.embedding(c_ids, c_mask, a_ids)
c_outputs, _ = self.context_lstm(c_embeddings, c_lengths)
# attention
mask = torch.matmul(c_mask.unsqueeze(2), c_mask.unsqueeze(1))
c_attned_by_c, _ = cal_attn(self.context_linear(c_outputs),
c_outputs,
mask)
c_concat = torch.cat([c_outputs, c_attned_by_c], dim=2)
c_fused = self.fusion(c_concat).tanh()
c_gate = self.gate(c_concat).sigmoid()
c_outputs = c_gate * c_fused + (1 - c_gate) * c_outputs
return c_outputs
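# Sketch (illustrative shapes): the gated fusion above mixes self-attended
# context back into the LSTM outputs, out = g * tanh(Wf[h; a]) + (1 - g) * h
# with g = sigmoid(Wg[h; a]), so the gate decides per position how much
# attended information to inject.
def _example_gated_fusion():
    import torch
    h = torch.randn(2, 5, 8)                   # LSTM outputs
    a = torch.randn(2, 5, 8)                   # self-attended context
    fusion = torch.nn.Linear(16, 8, bias=False)
    gate = torch.nn.Linear(16, 8, bias=False)
    cat = torch.cat([h, a], dim=2)
    g = gate(cat).sigmoid()
    out = g * fusion(cat).tanh() + (1 - g) * h
    assert out.shape == h.shape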
class QuestionDecoder(nn.Module):
def __init__(self, sos_id, eos_id,
embedding, contextualized_embedding, emsize,
nhidden, ntokens, nlayers,
dropout=0.0,
max_q_len=64):
super(QuestionDecoder, self).__init__()
self.sos_id = sos_id
self.eos_id = eos_id
self.emsize = emsize
self.embedding = embedding
self.nhidden = nhidden
self.ntokens = ntokens
self.nlayers = nlayers
# this max length includes the SOS and EOS tokens
self.max_q_len = max_q_len
self.context_lstm = ContextEncoderforQG(contextualized_embedding, emsize,
nhidden // 2, nlayers, dropout)
self.question_lstm = CustomLSTM(input_size=emsize,
hidden_size=nhidden,
num_layers=nlayers,
dropout=dropout,
bidirectional=False)
self.question_linear = nn.Linear(nhidden, nhidden)
self.concat_linear = nn.Sequential(nn.Linear(2*nhidden, 2*nhidden),
nn.Dropout(dropout),
nn.Linear(2*nhidden, 2*emsize))
self.logit_linear = nn.Linear(emsize, ntokens, bias=False)
# tie the output projection to the word embedding matrix and freeze it
self.logit_linear.weight = embedding.word_embeddings.weight
for param in self.logit_linear.parameters():
param.requires_grad = False
self.discriminator = nn.Bilinear(emsize, nhidden, 1)
def postprocess(self, q_ids):
eos_mask = q_ids == self.eos_id
no_eos_idx_sum = (eos_mask.sum(dim=1) == 0).long() * \
(self.max_q_len - 1)
eos_mask = eos_mask.cpu().numpy()
q_lengths = np.argmax(eos_mask, axis=1) + 1
q_lengths = torch.tensor(q_lengths).to(
q_ids.device).long() + no_eos_idx_sum
batch_size, max_len = q_ids.size()
idxes = torch.arange(0, max_len).to(q_ids.device)
idxes = idxes.unsqueeze(0).repeat(batch_size, 1)
q_mask = (idxes < q_lengths.unsqueeze(1))
q_ids = q_ids.long() * q_mask.long()
return q_ids
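# Worked example (illustrative, not executed): np.argmax over the boolean EOS
# mask returns the index of the FIRST eos per row, e.g. for
#   eos_mask = [[0, 0, 1, 0, 1]]  ->  argmax = 2, so q_lengths = 3.
# If a row contains no EOS at all, argmax returns 0 and no_eos_idx_sum above
# pushes the length to max_q_len instead.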
def forward(self, init_state, c_ids, q_ids, a_ids):
batch_size, max_q_len = q_ids.size()
c_outputs = self.context_lstm(c_ids, a_ids)
c_mask, _ = return_mask_lengths(c_ids)
q_mask, q_lengths = return_mask_lengths(q_ids)
# question dec
q_embeddings = self.embedding(q_ids)
q_outputs, _ = self.question_lstm(q_embeddings, q_lengths, init_state)
# attention
mask = torch.matmul(q_mask.unsqueeze(2), c_mask.unsqueeze(1))
c_attned_by_q, attn_logits = cal_attn(self.question_linear(q_outputs),
c_outputs,
mask)
# gen logits
q_concated = torch.cat([q_outputs, c_attned_by_q], dim=2)
q_concated = self.concat_linear(q_concated)
q_maxouted, _ = q_concated.view(
batch_size, max_q_len, self.emsize, 2).max(dim=-1)
gen_logits = self.logit_linear(q_maxouted)
# copy logits
bq = batch_size * max_q_len
c_ids = c_ids.unsqueeze(1).repeat(
1, max_q_len, 1).view(bq, -1).contiguous()
attn_logits = attn_logits.view(bq, -1).contiguous()
copy_logits = torch.zeros(bq, self.ntokens).to(c_ids.device)
copy_logits = copy_logits - 10000.0
copy_logits, _ = scatter_max(attn_logits, c_ids, out=copy_logits)
copy_logits = copy_logits.masked_fill(copy_logits == -10000.0, 0)
copy_logits = copy_logits.view(batch_size, max_q_len, -1).contiguous()
logits = gen_logits + copy_logits
# mutual information between answer and question (discriminator-based estimate)
a_emb = c_outputs * a_ids.float().unsqueeze(2)
a_mean_emb = torch.sum(a_emb, 1) / a_ids.sum(1).unsqueeze(1).float()
fake_a_mean_emb = torch.cat([a_mean_emb[-1].unsqueeze(0),
a_mean_emb[:-1]], dim=0)
q_emb = q_maxouted * q_mask.unsqueeze(2)
q_mean_emb = torch.sum(q_emb, 1) / q_lengths.unsqueeze(1).float()
fake_q_mean_emb = torch.cat([q_mean_emb[-1].unsqueeze(0),
q_mean_emb[:-1]], dim=0)
bce_loss = nn.BCEWithLogitsLoss()
true_logits = self.discriminator(q_mean_emb, a_mean_emb)
true_labels = torch.ones_like(true_logits)
fake_a_logits = self.discriminator(q_mean_emb, fake_a_mean_emb)
fake_q_logits = self.discriminator(fake_q_mean_emb, a_mean_emb)
fake_logits = torch.cat([fake_a_logits, fake_q_logits], dim=0)
fake_labels = torch.zeros_like(fake_logits)
true_loss = bce_loss(true_logits, true_labels)
fake_loss = 0.5 * bce_loss(fake_logits, fake_labels)
loss_info = 0.5 * (true_loss + fake_loss)
return logits, loss_info
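# Worked example (illustrative, not executed) of the copy mechanism above:
# scatter_max from the torch_scatter package projects attention logits over
# context positions onto vocabulary ids, keeping the max per id, e.g.
#   attn_logits = [2.0, 5.0, 3.0], c_ids = [7, 4, 7]
#   -> copy_logits[7] = max(2.0, 3.0) = 3.0, copy_logits[4] = 5.0,
# and ids never seen in the context keep the -10000 fill, which is then
# zeroed by masked_fill so they contribute nothing to `logits`.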
def generate(self, init_state, c_ids, a_ids):
c_mask, _ = return_mask_lengths(c_ids)
c_outputs = self.context_lstm(c_ids, a_ids)
batch_size = c_ids.size(0)
q_ids = torch.LongTensor([self.sos_id] * batch_size).unsqueeze(1)
q_ids = q_ids.to(c_ids.device)
token_type_ids = torch.zeros_like(q_ids)
position_ids = torch.zeros_like(q_ids)
q_embeddings = self.embedding(q_ids, token_type_ids, position_ids)
state = init_state
# unroll
all_q_ids = list()
all_q_ids.append(q_ids)
for _ in range(self.max_q_len - 1):
position_ids = position_ids + 1
q_outputs, state = self.question_lstm.lstm(q_embeddings, state)
# attention
mask = c_mask.unsqueeze(1)
c_attned_by_q, attn_logits = cal_attn(self.question_linear(q_outputs),
c_outputs,
mask)
# gen logits
q_concated = torch.cat([q_outputs, c_attned_by_q], dim=2)
q_concated = self.concat_linear(q_concated)
q_maxouted, _ = q_concated.view(
batch_size, 1, self.emsize, 2).max(dim=-1)
gen_logits = self.logit_linear(q_maxouted)
# copy logits
attn_logits = attn_logits.squeeze(1)
copy_logits = torch.zeros(
batch_size, self.ntokens).to(c_ids.device)
copy_logits = copy_logits - 10000.0
copy_logits, _ = scatter_max(attn_logits, c_ids, out=copy_logits)
copy_logits = copy_logits.masked_fill(copy_logits == -10000.0, 0)
logits = gen_logits + copy_logits.unsqueeze(1)
q_ids = torch.argmax(logits, 2)
all_q_ids.append(q_ids)
q_embeddings = self.embedding(q_ids, token_type_ids, position_ids)
q_ids = torch.cat(all_q_ids, 1)
q_ids = self.postprocess(q_ids)
return q_ids
def sample(self, init_state, c_ids, a_ids):
c_mask, c_lengths = return_mask_lengths(c_ids)
c_outputs = self.context_lstm(c_ids, a_ids)
batch_size = c_ids.size(0)
q_ids = torch.LongTensor([self.sos_id] * batch_size).unsqueeze(1)
q_ids = q_ids.to(c_ids.device)
token_type_ids = torch.zeros_like(q_ids)
position_ids = torch.zeros_like(q_ids)
q_embeddings = self.embedding(q_ids, token_type_ids, position_ids)
state = init_state
# unroll
all_q_ids = list()
all_q_ids.append(q_ids)
for _ in range(self.max_q_len - 1):
position_ids = position_ids + 1
q_outputs, state = self.question_lstm.lstm(q_embeddings, state)
# attention
mask = c_mask.unsqueeze(1)
c_attned_by_q, attn_logits = cal_attn(self.question_linear(q_outputs),
c_outputs,
mask)
# gen logits
q_concated = torch.cat([q_outputs, c_attned_by_q], dim=2)
q_concated = self.concat_linear(q_concated)
q_maxouted, _ = q_concated.view(batch_size, 1, self.emsize, 2).max(dim=-1)
gen_logits = self.logit_linear(q_maxouted)
# copy logits
attn_logits = attn_logits.squeeze(1)
copy_logits = torch.zeros(batch_size, self.ntokens).to(c_ids.device)
copy_logits = copy_logits - 10000.0
copy_logits, _ = scatter_max(attn_logits, c_ids, out=copy_logits)
copy_logits = copy_logits.masked_fill(copy_logits == -10000.0, 0)
logits = gen_logits + copy_logits.unsqueeze(1)
logits = logits.squeeze(1)
logits = self.top_k_top_p_filtering(logits, 2, top_p=0.8)
probs = F.softmax(logits, dim=-1)
q_ids = torch.multinomial(probs, num_samples=1) # [b,1]
all_q_ids.append(q_ids)
q_embeddings = self.embedding(q_ids, token_type_ids, position_ids)
q_ids = torch.cat(all_q_ids, 1)
q_ids = self.postprocess(q_ids)
return q_ids
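# `sample` above calls self.top_k_top_p_filtering, which is not defined
# elsewhere in this file; the sketch below follows the widely used
# top-k / nucleus (top-p) filtering recipe and is an assumption about the
# intended behaviour, not the authors' original implementation.
def top_k_top_p_filtering(self, logits, top_k=0, top_p=0.0,
                          filter_value=-10000.0):
    if top_k > 0:
        # drop everything outside the k highest-scoring tokens
        kth = torch.topk(logits, top_k)[0][..., -1, None]
        logits = logits.masked_fill(logits < kth, filter_value)
    if top_p > 0.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # drop tokens once cumulative probability exceeds top_p, shifted
        # right so the single most likely token is always kept
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()
        remove[..., 0] = False
        remove = torch.zeros_like(remove).scatter(1, sorted_idx, remove)
        logits = logits.masked_fill(remove, filter_value)
    return logits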
class DiscreteVAE(nn.Module):
def __init__(self, args):
super(DiscreteVAE, self).__init__()
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
padding_idx = tokenizer.vocab['[PAD]']
sos_id = tokenizer.vocab['[CLS]']
eos_id = tokenizer.vocab['[SEP]']
ntokens = len(tokenizer.vocab)
bert_model = args.bert_model
if "large" in bert_model:
emsize = 1024
else:
emsize = 768
enc_nhidden = args.enc_nhidden
enc_nlayers = args.enc_nlayers
enc_dropout = args.enc_dropout
dec_a_nhidden = args.dec_a_nhidden
dec_a_nlayers = args.dec_a_nlayers
dec_a_dropout = args.dec_a_dropout
self.dec_q_nhidden = dec_q_nhidden = args.dec_q_nhidden
self.dec_q_nlayers = dec_q_nlayers = args.dec_q_nlayers
dec_q_dropout = args.dec_q_dropout
self.nzqdim = nzqdim = args.nzqdim
self.nza = nza = args.nza
self.nzadim = nzadim = args.nzadim
self.lambda_kl = args.lambda_kl
self.lambda_info = args.lambda_info
max_q_len = args.max_q_len
embedding = Embedding(bert_model)
contextualized_embedding = ContextualizedEmbedding(bert_model)
# freeze embedding
for param in embedding.parameters():
param.requires_grad = False
for param in contextualized_embedding.parameters():
param.requires_grad = False
self.posterior_encoder = PosteriorEncoder(embedding, emsize,
enc_nhidden, enc_nlayers,
nzqdim, nza, nzadim,
enc_dropout)
self.prior_encoder = PriorEncoder(embedding, emsize,
enc_nhidden, enc_nlayers,
nzqdim, nza, nzadim, enc_dropout)
self.answer_decoder = AnswerDecoder(contextualized_embedding, emsize,
dec_a_nhidden, dec_a_nlayers,
dec_a_dropout)
self.question_decoder = QuestionDecoder(sos_id, eos_id,
embedding, contextualized_embedding, emsize,
dec_q_nhidden, ntokens, dec_q_nlayers,
dec_q_dropout,
max_q_len)
self.q_h_linear = nn.Linear(nzqdim, dec_q_nlayers * dec_q_nhidden)
self.q_c_linear = nn.Linear(nzqdim, dec_q_nlayers * dec_q_nhidden)
self.a_linear = nn.Linear(nza * nzadim, emsize, False)
self.q_rec_criterion = nn.CrossEntropyLoss(ignore_index=padding_idx)
self.gaussian_kl_criterion = GaussianKLLoss()
self.categorical_kl_criterion = CategoricalKLLoss()
def return_init_state(self, zq, za):
q_init_h = self.q_h_linear(zq)
q_init_c = self.q_c_linear(zq)
q_init_h = q_init_h.view(-1, self.dec_q_nlayers,
self.dec_q_nhidden).transpose(0, 1).contiguous()
q_init_c = q_init_c.view(-1, self.dec_q_nlayers,
self.dec_q_nhidden).transpose(0, 1).contiguous()
q_init_state = (q_init_h, q_init_c)
za_flatten = za.view(-1, self.nza * self.nzadim)
a_init_state = self.a_linear(za_flatten)
return q_init_state, a_init_state
def forward(self, c_ids, q_ids, a_ids, start_positions, end_positions):
posterior_zq_mu, posterior_zq_logvar, posterior_zq, \
posterior_za_prob, posterior_za \
= self.posterior_encoder(c_ids, q_ids, a_ids)
prior_zq_mu, prior_zq_logvar, _, \
prior_za_prob, _ \
= self.prior_encoder(c_ids)
q_init_state, a_init_state = self.return_init_state(
posterior_zq, posterior_za)
# answer decoding
start_logits, end_logits = self.answer_decoder(a_init_state, c_ids)
# question decoding
q_logits, loss_info = self.question_decoder(
q_init_state, c_ids, q_ids, a_ids)
# q rec loss
loss_q_rec = self.q_rec_criterion(q_logits[:, :-1, :].transpose(1, 2).contiguous(),
q_ids[:, 1:])
# a rec loss
max_c_len = c_ids.size(1)
a_rec_criterion = nn.CrossEntropyLoss(ignore_index=max_c_len)
start_positions.clamp_(0, max_c_len)
end_positions.clamp_(0, max_c_len)
loss_start_a_rec = a_rec_criterion(start_logits, start_positions)
loss_end_a_rec = a_rec_criterion(end_logits, end_positions)
loss_a_rec = 0.5 * (loss_start_a_rec + loss_end_a_rec)
# kl loss
loss_zq_kl = self.gaussian_kl_criterion(posterior_zq_mu,
posterior_zq_logvar,
prior_zq_mu,
prior_zq_logvar)
loss_za_kl = self.categorical_kl_criterion(posterior_za_prob,
prior_za_prob)
loss_kl = self.lambda_kl * (loss_zq_kl + loss_za_kl)
loss_info = self.lambda_info * loss_info
loss = loss_q_rec + loss_a_rec + loss_kl + loss_info
return loss, \
loss_q_rec, loss_a_rec, \
loss_zq_kl, loss_za_kl, \
loss_info
def generate(self, zq, za, c_ids):
q_init_state, a_init_state = self.return_init_state(zq, za)
a_ids, start_positions, end_positions = self.answer_decoder.generate(
a_init_state, c_ids)
q_ids = self.question_decoder.generate(q_init_state, c_ids, a_ids)
return q_ids, start_positions, end_positions
def return_answer_logits(self, zq, za, c_ids):
_, a_init_state = self.return_init_state(zq, za)
start_logits, end_logits = self.answer_decoder(a_init_state, c_ids)
return start_logits, end_logits
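# Usage sketch (hypothetical tensors, not part of the original training code):
# draw zq from a standard normal and za as a random one-hot code, then decode an
# answer span and a question from the same latents.
def _example_generate(model, c_ids):
    import torch
    batch = c_ids.size(0)
    zq = torch.randn(batch, model.nzqdim, device=c_ids.device)
    za_idx = torch.randint(model.nzadim, (batch, model.nza), device=c_ids.device)
    za = torch.nn.functional.one_hot(za_idx, model.nzadim).float()
    q_ids, start_positions, end_positions = model.generate(zq, za, c_ids)
    return q_ids, start_positions, end_positions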
================================================
FILE: vae/qgevalcap/.gitignore
================================================
*.pyc
================================================
FILE: vae/qgevalcap/README.md
================================================
## evaluation scripts
./eval.py --dataset_file ../data/dev-v1.1.json --output_file ../output/2/qg_results.pkl
================================================
FILE: vae/qgevalcap/bleu/.gitignore
================================================
*.pyc
================================================
FILE: vae/qgevalcap/bleu/LICENSE
================================================
Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
================================================
FILE: vae/qgevalcap/bleu/bleu.py
================================================
#!/usr/bin/env python
#
# File Name : bleu.py
#
# Description : Wrapper for BLEU scorer.
#
# Creation Date : 06-01-2015
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang and Tsung-Yi Lin
#!/usr/bin/env python
# bleu_scorer.py
# David Chiang
# Copyright (c) 2004-2006 University of Maryland. All rights
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.
# Modified by:
# Hao Fang
# Tsung-Yi Lin
'''Provides:
cook_refs(refs, eff=None, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, reflen, refmaxcounts, eff=None, n=4): Transform a test sentence as a string (together with the cooked reference statistics) into a form usable by score_cooked().
'''
import copy
import sys, math, re
from collections import defaultdict
def precook(s, n=4, out=False):
"""Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
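# Illustrative usage (not part of the original scorer): precook returns the
# sentence length together with counts of every n-gram up to order n.
def _example_precook():
    length, counts = precook("the cat sat on the mat", n=2)
    assert length == 6
    assert counts[('the',)] == 2 and counts[('the', 'cat')] == 1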
def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
'''Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them.'''
reflen = []
maxcounts = {}
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
for (ngram,count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
# Calculate effective reference sentence length.
if eff == "shortest":
reflen = min(reflen)
elif eff == "average":
reflen = float(sum(reflen))/len(reflen)
## lhuang: N.B.: leave reflen computation to the very end!!
## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)
return (reflen, maxcounts)
def cook_test(test, reflen, refmaxcounts, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''
testlen, counts = precook(test, n, True)
result = {}
# Calculate effective reference sentence length.
if eff == "closest":
result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
else: ## i.e., "average" or "shortest" or None
result["reflen"] = reflen
result["testlen"] = testlen
result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]
result['correct'] = [0]*n
for (ngram, count) in counts.items():
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
return result
class BleuScorer(object):
"""Bleu scorer.
"""
__slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
# special_reflen is used in oracle (proportional effective ref len for a node).
def copy(self):
''' copy the refs.'''
new = BleuScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
new._score = None
return new
def __init__(self, test=None, refs=None, n=4, special_reflen=None):
''' singular instance '''
self.n = n
self.crefs = []
self.ctest = []
self.cook_append(test, refs)
self.special_reflen = special_reflen
def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
cooked_test = cook_test(test, self.crefs[-1][0],self.crefs[-1][1])
self.ctest.append(cooked_test) ## N.B.: -1
else:
self.ctest.append(None) # lens of crefs and ctest have to match
self._score = None ## need to recompute
def ratio(self, option=None):
self.compute_score(option=option)
return self._ratio
def score_ratio(self, option=None):
'''return (bleu, len_ratio) pair'''
return (self.fscore(option=option), self.ratio(option=option))
def score_ratio_str(self, option=None):
return "%.4f (%.2f)" % self.score_ratio(option)
def reflen(self, option=None):
self.compute_score(option=option)
return self._reflen
def testlen(self, option=None):
self.compute_score(option=option)
return self._testlen
def retest(self, new_test):
if type(new_test) is str:
new_test = [new_test]
assert len(new_test) == len(self.crefs), new_test
self.ctest = []
for t, rs in zip(new_test, self.crefs):
self.ctest.append(cook_test(t, rs[0], rs[1]))
self._score = None
return self
def rescore(self, new_test):
''' replace test(s) with new test(s), and returns the new score.'''
return self.retest(new_test).compute_score()
def size(self):
assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
return len(self.crefs)
def __iadd__(self, other):
'''add an instance (e.g., from another sentence).'''
if type(other) is tuple:
## avoid creating new BleuScorer instances
self.cook_append(other[0], other[1])
else:
assert self.compatible(other), "incompatible BLEUs."
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
self._score = None ## need to recompute
return self
def compatible(self, other):
return isinstance(other, BleuScorer) and self.n == other.n
def single_reflen(self, option="average"):
return self._single_reflen(self.crefs[0][0], option)
def _single_reflen(self, reflens, option=None, testlen=None):
if option == "shortest":
reflen = min(reflens)
elif option == "average":
reflen = float(sum(reflens))/len(reflens)
elif option == "closest":
reflen = min((abs(l-testlen), l) for l in reflens)[1]
else:
assert False, "unsupported reflen option %s" % option
return reflen
def recompute_score(self, option=None, verbose=0):
self._score = None
return self.compute_score(option, verbose)
def compute_score(self, option=None, verbose=0):
n = self.n
small = 1e-9
tiny = 1e-15 ## so that if guess is 0 still return 0
bleu_list = [[] for _ in range(n)]
if self._score is not None:
return self._score
if option is None:
option = "average" if len(self.crefs) == 1 else "closest"
self._testlen = 0
self._reflen = 0
totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
# for each sentence
for comps in self.ctest:
testlen = comps['testlen']
self._testlen += testlen
if self.special_reflen is None: ## need computation
reflen = self._single_reflen(comps['reflen'], option, testlen)
else:
reflen = self.special_reflen
self._reflen += reflen
for key in ['guess','correct']:
for k in range(n):
totalcomps[key][k] += comps[key][k]
# append per image bleu score
bleu = 1.
for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)
if verbose > 1:
print (comps, reflen)
totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen
bleus = []
bleu = 1.
for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)
if verbose > 0:
print (totalcomps)
print ("ratio:", ratio)
self._score = bleus
return self._score, bleu_list
class Bleu:
def __init__(self, n=4):
# by default, compute BLEU scores up to 4-grams
self._n = n
self._hypo_for_image = {}
self.ref_for_image = {}
def compute_score(self, gts, res):
assert(gts.keys() == res.keys())
imgIds = gts.keys()
bleu_scorer = BleuScorer(n=self._n)
for id in imgIds:
hypo = res[id]
ref = gts[id]
# Sanity check.
assert(type(hypo) is list)
assert(len(hypo) == 1)
assert(type(ref) is list)
assert(len(ref) >= 1)
bleu_scorer += (hypo[0], ref)
#score, scores = bleu_scorer.compute_score(option='shortest')
score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
#score, scores = bleu_scorer.compute_score(option='average', verbose=1)
# return (bleu, bleu_info)
return score, scores
def method(self):
return "Bleu"
================================================
FILE: vae/qgevalcap/bleu/bleu_scorer.py
================================================
#!/usr/bin/env python
# bleu_scorer.py
# David Chiang
# Copyright (c) 2004-2006 University of Maryland. All rights
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.
# Modified by:
# Hao Fang
# Tsung-Yi Lin
'''Provides:
cook_refs(refs, eff=None, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, eff=None, n=4): Transform a test sentence as a string (together with the cooked reference statistics) into a form usable by score_cooked().
'''
import copy
import sys, math, re
from collections import defaultdict
def precook(s, n=4, out=False):
"""Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
'''Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them.'''
reflen = []
maxcounts = {}
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
for (ngram,count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
# Calculate effective reference sentence length.
if eff == "shortest":
reflen = min(reflen)
elif eff == "average":
reflen = float(sum(reflen))/len(reflen)
## lhuang: N.B.: leave reflen computation to the very end!!
## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)
return (reflen, maxcounts)
def cook_test(test, refs, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''
reflen, refmaxcounts = refs
testlen, counts = precook(test, n, True)
result = {}
# Calculate effective reference sentence length.
if eff == "closest":
result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
else: ## i.e., "average" or "shortest" or None
result["reflen"] = reflen
result["testlen"] = testlen
result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)]
result['correct'] = [0]*n
for (ngram, count) in counts.items():
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
return result
class BleuScorer(object):
"""Bleu scorer.
"""
__slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
# special_reflen is used in oracle (proportional effective ref len for a node).
def copy(self):
''' copy the refs.'''
new = BleuScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
new._score = None
return new
def __init__(self, test=None, refs=None, n=4, special_reflen=None):
''' singular instance '''
self.n = n
self.crefs = []
self.ctest = []
self.cook_append(test, refs)
self.special_reflen = special_reflen
def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
cooked_test = cook_test(test, self.crefs[-1])
self.ctest.append(cooked_test) ## N.B.: -1
else:
self.ctest.append(None) # lens of crefs and ctest have to match
self._score = None ## need to recompute
def ratio(self, option=None):
self.compute_score(option=option)
return self._ratio
def score_ratio(self, option=None):
'''return (bleu, len_ratio) pair'''
return (self.fscore(option=option), self.ratio(option=option))
def score_ratio_str(self, option=None):
return "%.4f (%.2f)" % self.score_ratio(option)
def reflen(self, option=None):
self.compute_score(option=option)
return self._reflen
def testlen(self, option=None):
self.compute_score(option=option)
return self._testlen
def retest(self, new_test):
if type(new_test) is str:
new_test = [new_test]
assert len(new_test) == len(self.crefs), new_test
self.ctest = []
for t, rs in zip(new_test, self.crefs):
self.ctest.append(cook_test(t, rs))
self._score = None
return self
def rescore(self, new_test):
''' replace test(s) with new test(s), and returns the new score.'''
return self.retest(new_test).compute_score()
def size(self):
assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
return len(self.crefs)
def __iadd__(self, other):
'''add an instance (e.g., from another sentence).'''
if type(other) is tuple:
## avoid creating new BleuScorer instances
self.cook_append(other[0], other[1])
else:
assert self.compatible(other), "incompatible BLEUs."
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
self._score = None ## need to recompute
return self
def compatible(self, other):
return isinstance(other, BleuScorer) and self.n == other.n
def single_reflen(self, option="average"):
return self._single_reflen(self.crefs[0][0], option)
def _single_reflen(self, reflens, option=None, testlen=None):
if option == "shortest":
reflen = min(reflens)
elif option == "average":
reflen = float(sum(reflens))/len(reflens)
elif option == "closest":
reflen = min((abs(l-testlen), l) for l in reflens)[1]
else:
assert False, "unsupported reflen option %s" % option
return reflen
def recompute_score(self, option=None, verbose=0):
self._score = None
return self.compute_score(option, verbose)
def compute_score(self, option=None, verbose=0):
n = self.n
small = 1e-9
tiny = 1e-15 ## so that if guess is 0 still return 0
bleu_list = [[] for _ in range(n)]
if self._score is not None:
return self._score
if option is None:
option = "average" if len(self.crefs) == 1 else "closest"
self._testlen = 0
self._reflen = 0
totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
# for each sentence
for comps in self.ctest:
testlen = comps['testlen']
self._testlen += testlen
if self.special_reflen is None: ## need computation
reflen = self._single_reflen(comps['reflen'], option, testlen)
else:
reflen = self.special_reflen
self._reflen += reflen
for key in ['guess','correct']:
for k in range(n):
totalcomps[key][k] += comps[key][k]
# append per image bleu score
bleu = 1.
for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)
if verbose > 1:
print(comps, reflen)
totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen
bleus = []
bleu = 1.
for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)
if verbose > 0:
print(totalcomps)
print("ratio:", ratio)
self._score = bleus
return self._score, bleu_list
================================================
FILE: vae/qgevalcap/cider/__init__.py
================================================
__author__ = 'tylin'
================================================
FILE: vae/qgevalcap/cider/cider.py
================================================
# Filename: cider.py
#
# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
# by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
#
# Creation Date: Sun Feb 8 14:16:54 2015
#
# Authors: Ramakrishna Vedantam and Tsung-Yi Lin
# Tsung-Yi Lin
# Ramakrishna Vedantam
import copy
import math
from collections import defaultdict
import numpy as np
def precook(s, n=4, out=False):
"""
Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well.
:param s: string : sentence to be converted into ngrams
:param n: int : number of ngrams for which representation is calculated
:return: term frequency vector for occurring ngrams
"""
words = s.split()
counts = defaultdict(int)
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return counts
def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
'''Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them.
:param refs: list of string : reference sentences for some image
:param n: int : number of ngrams for which (ngram) representation is calculated
:return: result (list of dict)
'''
return [precook(ref, n) for ref in refs]
def cook_test(test, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.
:param test: list of string : hypothesis sentence for some image
:param n: int : number of ngrams for which (ngram) representation is calculated
:return: result (dict)
'''
return precook(test, n, True)
class CiderScorer(object):
"""CIDEr scorer.
"""
def copy(self):
''' copy the refs.'''
new = CiderScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
return new
def __init__(self, test=None, refs=None, n=4, sigma=6.0):
''' singular instance '''
self.n = n
self.sigma = sigma
self.crefs = []
self.ctest = []
self.document_frequency = defaultdict(float)
self.cook_append(test, refs)
self.ref_len = None
def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
self.ctest.append(cook_test(test)) ## N.B.: -1
else:
self.ctest.append(None) # lens of crefs and ctest have to match
def size(self):
assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
return len(self.crefs)
def __iadd__(self, other):
'''add an instance (e.g., from another sentence).'''
if type(other) is tuple:
## avoid creating new CiderScorer instances
self.cook_append(other[0], other[1])
else:
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
return self
def compute_doc_freq(self):
'''
Compute document frequency for each n-gram in the reference data.
This will be used to compute idf (inverse document frequency) later.
The document frequency is stored in the object.
:return: None
'''
for refs in self.crefs:
# refs, k ref captions of one image
for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]):
self.document_frequency[ngram] += 1
# maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
def compute_cider(self):
def counts2vec(cnts):
"""
Maps n-gram counts to a vector of tf-idf weights.
Returns vec, an array of dictionaries mapping each n-gram to its tf-idf weight;
the n-th entry of the array corresponds to n-grams of length n+1.
:param cnts:
:return: vec (array of dict), norm (array of float), length (int)
"""
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram,term_freq) in cnts.items():
# give word count 1 if it doesn't appear in reference corpus
df = np.log(max(1.0, self.document_frequency[ngram]))
# ngram index
n = len(ngram)-1
# tf (term_freq) * idf (precomputed idf) for n-grams
vec[n][ngram] = float(term_freq)*(self.ref_len - df)
# compute norm for the vector. the norm will be used for computing similarity
norm[n] += pow(vec[n][ngram], 2)
if n == 1:
length += term_freq
norm = [np.sqrt(n) for n in norm]
return vec, norm, length
def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
'''
Compute the cosine similarity of two vectors.
:param vec_hyp: array of dictionary for vector corresponding to hypothesis
:param vec_ref: array of dictionary for vector corresponding to reference
:param norm_hyp: array of float for vector corresponding to hypothesis
:param norm_ref: array of float for vector corresponding to reference
:param length_hyp: int containing length of hypothesis
:param length_ref: int containing length of reference
:return: array of score for each n-grams cosine similarity
'''
delta = float(length_hyp - length_ref)
# measure cosine similarity
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
# ngram
for (ngram,count) in vec_hyp[n].items():
# vrama91 : added clipping
val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
val[n] /= (norm_hyp[n]*norm_ref[n])
assert(not math.isnan(val[n]))
# vrama91: added a length based gaussian penalty
val[n] *= np.e**(-(delta**2)/(2*self.sigma**2))
return val
# compute log reference length
self.ref_len = np.log(float(len(self.crefs)))
scores = []
for test, refs in zip(self.ctest, self.crefs):
# compute vector for test captions
vec, norm, length = counts2vec(test)
# compute vector for ref captions
score = np.array([0.0 for _ in range(self.n)])
for ref in refs:
vec_ref, norm_ref, length_ref = counts2vec(ref)
score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
# change by vrama91 - mean of ngram scores, instead of sum
score_avg = np.mean(score)
# divide by number of references
score_avg /= len(refs)
# multiply score by 10
score_avg *= 10.0
# append score of an image to the score list
scores.append(score_avg)
return scores
def compute_score(self, option=None, verbose=0):
# compute idf
self.compute_doc_freq()
# assert to check document frequency
assert(len(self.ctest) >= max(self.document_frequency.values()))
# compute cider score
score = self.compute_cider()
# debug
# print score
return np.mean(np.array(score)), np.array(score)
class Cider:
"""
Main Class to compute the CIDEr metric
"""
def __init__(self, test=None, refs=None, n=4, sigma=6.0):
# set cider to sum over 1 to 4-grams
self._n = n
# set the standard deviation parameter for gaussian penalty
self._sigma = sigma
def compute_score(self, gts, res):
"""
Main function to compute CIDEr score
:param hypo_for_image (dict) : dictionary with key and value
ref_for_image (dict) : dictionary with key and value
:return: cider (float) : computed CIDEr score for the corpus
"""
assert(gts.keys() == res.keys())
imgIds = gts.keys()
cider_scorer = CiderScorer(n=self._n, sigma=self._sigma)
for id in imgIds:
hypo = res[id]
ref = gts[id]
# Sanity check.
assert(type(hypo) is list)
assert(len(hypo) == 1)
assert(type(ref) is list)
assert(len(ref) > 0)
cider_scorer += (hypo[0], ref)
(score, scores) = cider_scorer.compute_score()
return score, scores
def method(self):
return "CIDEr"
================================================
FILE: vae/qgevalcap/cider/cider_scorer.py
================================================
#!/usr/bin/env python
# Tsung-Yi Lin
# Ramakrishna Vedantam
import copy
from collections import defaultdict
import numpy as np
import pdb
import math
def precook(s, n=4, out=False):
"""
Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well.
:param s: string : sentence to be converted into ngrams
:param n: int : number of ngrams for which representation is calculated
:return: term frequency vector for occurring ngrams
"""
words = s.split()
counts = defaultdict(int)
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return counts
def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
'''Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them.
:param refs: list of string : reference sentences for some image
:param n: int : number of ngrams for which (ngram) representation is calculated
:return: result (list of dict)
'''
return [precook(ref, n) for ref in refs]
def cook_test(test, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.
:param test: list of string : hypothesis sentence for some image
:param n: int : number of ngrams for which (ngram) representation is calculated
:return: result (dict)
'''
return precook(test, n, True)
class CiderScorer(object):
"""CIDEr scorer.
"""
def copy(self):
''' copy the refs.'''
new = CiderScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
return new
def __init__(self, test=None, refs=None, n=4, sigma=6.0):
''' singular instance '''
self.n = n
self.sigma = sigma
self.crefs = []
self.ctest = []
self.document_frequency = defaultdict(float)
self.cook_append(test, refs)
self.ref_len = None
def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
self.ctest.append(cook_test(test)) ## N.B.: -1
else:
self.ctest.append(None) # lens of crefs and ctest have to match
def size(self):
assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
return len(self.crefs)
def __iadd__(self, other):
'''add an instance (e.g., from another sentence).'''
if type(other) is tuple:
## avoid creating new CiderScorer instances
self.cook_append(other[0], other[1])
else:
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
return self
def compute_doc_freq(self):
'''
Compute document frequency for each n-gram in the reference data.
This will be used to compute idf (inverse document frequency) later.
The document frequency is stored in the object.
:return: None
'''
for refs in self.crefs:
# refs, k ref captions of one image
for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]):
self.document_frequency[ngram] += 1
# maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
def compute_cider(self):
def counts2vec(cnts):
"""
Maps n-gram counts to a vector of tf-idf weights.
Returns vec, an array of dictionaries mapping each n-gram to its tf-idf weight;
the n-th entry of the array corresponds to n-grams of length n+1.
:param cnts:
:return: vec (array of dict), norm (array of float), length (int)
"""
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram,term_freq) in cnts.items():
# give word count 1 if it doesn't appear in reference corpus
df = np.log(max(1.0, self.document_frequency[ngram]))
# ngram index
n = len(ngram)-1
# tf (term_freq) * idf (precomputed idf) for n-grams
vec[n][ngram] = float(term_freq)*(self.ref_len - df)
# compute norm for the vector. the norm will be used for computing similarity
norm[n] += pow(vec[n][ngram], 2)
if n == 1:
length += term_freq
norm = [np.sqrt(n) for n in norm]
return vec, norm, length
def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
'''
Compute the cosine similarity of two vectors.
:param vec_hyp: array of dictionary for vector corresponding to hypothesis
:param vec_ref: array of dictionary for vector corresponding to reference
:param norm_hyp: array of float for vector corresponding to hypothesis
:param norm_ref: array of float for vector corresponding to reference
:param length_hyp: int containing length of hypothesis
:param length_ref: int containing length of reference
:return: array of score for each n-grams cosine similarity
'''
delta = float(length_hyp - length_ref)
# measure cosine similarity
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
# ngram
for (ngram,count) in vec_hyp[n].items():
# vrama91 : added clipping
val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
val[n] /= (norm_hyp[n]*norm_ref[n])
assert(not math.isnan(val[n]))
# vrama91: added a length based gaussian penalty
val[n] *= np.e**(-(delta**2)/(2*self.sigma**2))
return val
# compute log reference length
self.ref_len = np.log(float(len(self.crefs)))
scores = []
for test, refs in zip(self.ctest, self.crefs):
# compute vector for test captions
vec, norm, length = counts2vec(test)
# compute vector for ref captions
score = np.array([0.0 for _ in range(self.n)])
for ref in refs:
vec_ref, norm_ref, length_ref = counts2vec(ref)
score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
# change by vrama91 - mean of ngram scores, instead of sum
score_avg = np.mean(score)
# divide by number of references
score_avg /= len(refs)
# multiply score by 10
score_avg *= 10.0
# append score of an image to the score list
scores.append(score_avg)
return scores
def compute_score(self, option=None, verbose=0):
# compute idf
self.compute_doc_freq()
# assert to check document frequency
assert(len(self.ctest) >= max(self.document_frequency.values()))
# compute cider score
score = self.compute_cider()
# debug
# print score
return np.mean(np.array(score)), np.array(score)
================================================
FILE: vae/qgevalcap/eval.py
================================================
import os
import copy
from collections import defaultdict
from argparse import ArgumentParser
import pickle
import json
from json import encoder
import math
def precook(s, n=4, out=False):
"""Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
for k in range(1, n + 1):
for i in range(len(words) - k + 1):
ngram = tuple(words[i:i + k])
counts[ngram] += 1
return (len(words), counts)
def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
'''Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them.'''
reflen = []
maxcounts = {}
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
for (ngram, count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
# Calculate effective reference sentence length.
if eff == "shortest":
reflen = min(reflen)
elif eff == "average":
reflen = float(sum(reflen)) / len(reflen)
## lhuang: N.B.: leave reflen computation to the very end!!
## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)
return (reflen, maxcounts)
def cook_test(test, reflen, refmaxcounts, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''
testlen, counts = precook(test, n, True)
result = {}
# Calculate effective reference sentence length.
if eff == "closest":
result["reflen"] = min((abs(l - testlen), l) for l in reflen)[1]
else: ## i.e., "average" or "shortest" or None
result["reflen"] = reflen
result["testlen"] = testlen
result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)]
result['correct'] = [0] * n
for (ngram, count) in counts.items():
result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)
return result
class BleuScorer(object):
"""Bleu scorer.
"""
__slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
# special_reflen is used in oracle (proportional effective ref len for a node).
def copy(self):
''' copy the refs.'''
new = BleuScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
new._score = None
return new
def __init__(self, test=None, refs=None, n=4, special_reflen=None):
''' singular instance '''
self.n = n
self.crefs = []
self.ctest = []
self.cook_append(test, refs)
self.special_reflen = special_reflen
def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
cooked_test = cook_test(test, self.crefs[-1][0], self.crefs[-1][1])
self.ctest.append(cooked_test) ## N.B.: -1
else:
self.ctest.append(None) # lens of crefs and ctest have to match
self._score = None ## need to recompute
def ratio(self, option=None):
self.compute_score(option=option)
return self._ratio
def score_ratio(self, option=None):
'''return (bleu, len_ratio) pair'''
return (self.fscore(option=option), self.ratio(option=option))
def score_ratio_str(self, option=None):
return "%.4f (%.2f)" % self.score_ratio(option)
def reflen(self, option=None):
self.compute_score(option=option)
return self._reflen
def testlen(self, option=None):
self.compute_score(option=option)
return self._testlen
def retest(self, new_test):
if type(new_test) is str:
new_test = [new_test]
assert len(new_test) == len(self.crefs), new_test
self.ctest = []
for t, rs in zip(new_test, self.crefs):
self.ctest.append(cook_test(t, rs[0], rs[1]))
self._score = None
return self
def rescore(self, new_test):
''' replace test(s) with new test(s), and returns the new score.'''
return self.retest(new_test).compute_score()
def size(self):
assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
return len(self.crefs)
def __iadd__(self, other):
'''add an instance (e.g., from another sentence).'''
if type(other) is tuple:
## avoid creating new BleuScorer instances
self.cook_append(other[0], other[1])
else:
assert self.compatible(other), "incompatible BLEUs."
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
self._score = None ## need to recompute
return self
def compatible(self, other):
return isinstance(other, BleuScorer) and self.n == other.n
def single_reflen(self, option="average"):
return self._single_reflen(self.crefs[0][0], option)
def _single_reflen(self, reflens, option=None, testlen=None):
if option == "shortest":
reflen = min(reflens)
elif option == "average":
reflen = float(sum(reflens)) / len(reflens)
elif option == "closest":
reflen = min((abs(l - testlen), l) for l in reflens)[1]
else:
assert False, "unsupported reflen option %s" % option
return reflen
def recompute_score(self, option=None, verbose=0):
self._score = None
return self.compute_score(option, verbose)
def compute_score(self, option=None, verbose=0):
n = self.n
small = 1e-9
tiny = 1e-15 ## so that if guess is 0 still return 0
bleu_list = [[] for _ in range(n)]
if self._score is not None:
return self._score
if option is None:
option = "average" if len(self.crefs) == 1 else "closest"
self._testlen = 0
self._reflen = 0
totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0] * n, 'correct': [0] * n}
# for each sentence
for comps in self.ctest:
testlen = comps['testlen']
self._testlen += testlen
if self.special_reflen is None: ## need computation
reflen = self._single_reflen(comps['reflen'], option, testlen)
else:
reflen = self.special_reflen
self._reflen += reflen
for key in ['guess', 'correct']:
for k in range(n):
totalcomps[key][k] += comps[key][k]
# append per image bleu score
bleu = 1.
for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/ (float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1. / (k + 1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1 / ratio)
if verbose > 1:
print(comps, reflen)
totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen
bleus = []
bleu = 1.
for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1. / (k + 1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleus[k] *= math.exp(1 - 1 / ratio)
if verbose > 0:
print(totalcomps)
print("ratio:", ratio)
self._score = bleus
return self._score, bleu_list
class Bleu:
def __init__(self, n=4):
# by default, compute BLEU scores up to 4-grams
self._n = n
self._hypo_for_image = {}
self.ref_for_image = {}
def compute_score(self, gts, res):
assert (gts.keys() == res.keys())
imgIds = gts.keys()
bleu_scorer = BleuScorer(n=self._n)
for id in imgIds:
hypo = res[id]
ref = gts[id]
# Sanity check.
assert (type(hypo) is list)
assert (len(hypo) == 1)
assert (type(ref) is list)
assert (len(ref) >= 1)
bleu_scorer += (hypo[0], ref)
# score, scores = bleu_scorer.compute_score(option='shortest')
score, scores = bleu_scorer.compute_score(option='closest', verbose=0)
# score, scores = bleu_scorer.compute_score(option='average', verbose=1)
# return (bleu, bleu_info)
return score, scores
def method(self):
return "Bleu"
class QGEvalCap:
def __init__(self, gts, res):
self.gts = gts
self.res = res
def evaluate(self, not_print=True):
output = []
scorers = [
(Bleu(4), "Bleu_4"),
# (meteor.Meteor(),"METEOR"),
# (rouge.Rouge(), "ROUGE_L"),
# (cider.Cider(), "CIDEr")
]
# =================================================
# Compute scores
# =================================================
for scorer, method in scorers:
# print 'computing %s score...'%(scorer.method())
score, scores = scorer.compute_score(self.gts, self.res)
if type(method) == list:
for sc, scs, m in zip(score, scores, method):
if not not_print:
print("%s: %0.5f" % (m, sc))
output.append(sc)
else:
if not not_print:
print("%s: %0.5f" % (method, score))
output.append(score)
return output
def eval_qg(res_dict, gts_dict, not_print=True):
encoder.FLOAT_REPR = lambda o: format(o, '.4f')
res = defaultdict(lambda: [])
gts = defaultdict(lambda: [])
for key in gts_dict.keys():
res[key] = [res_dict[key]]
gts[key].append(gts_dict[key])
QGEval = QGEvalCap(gts, res)
return QGEval.evaluate(not_print)[0][-1]
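# Illustrative usage (hypothetical ids): eval_qg takes two {id: question} dicts
# and returns the corpus BLEU-4 of the hypotheses in `res_dict` against the
# references in `gts_dict`.
def _example_eval_qg():
    res = {"q1": "what year was john smith born ?"}
    gts = {"q1": "when was john smith born ?"}
    print("BLEU-4:", eval_qg(res, gts, not_print=True))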
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("-out", "--dataset_file", dest="dataset_file", default="../data/dev-v1.1.json",
help="output file directory")
parser.add_argument("-out", "--output_file", dest="output_file", default="../output/2/qg_results.pkl",
help="output file directory")
args = parser.parse_args()
with open(args.dataset_file) as dataset_file:
dataset_json = json.load(dataset_file)
dataset = dataset_json['data']
res_dict = {}
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
res_dict[qa['id']] = qa['question']
with open(args.output_file, "rb") as f:
gts_dict = pickle.load(f)
print("scores: \n")
eval_qg(res_dict, gts_dict, not_print=False)
================================================
FILE: vae/qgevalcap/meteor/__init__.py
================================================
__author__ = 'tylin'
================================================
FILE: vae/qgevalcap/meteor/meteor.py
================================================
#!/usr/bin/env python
# Python wrapper for METEOR implementation, by Xinlei Chen
# Acknowledge Michael Denkowski for the generous discussion and help
import os
import sys
import subprocess
import threading
# Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed.
METEOR_JAR = 'meteor-1.5.jar'
# print METEOR_JAR
class Meteor:
def __init__(self):
self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \
'-', '-', '-stdio', '-l', 'en',
'-norm',
# '-t', 'adq'
# '-p', '0.85 0.2 0.6 0.75' # alpha beta gamma delta'',
# '-a', 'data/paraphrase-en.gz', '-m', 'exact stem paraphrase']
]
self.meteor_p = subprocess.Popen(self.meteor_cmd, \
cwd=os.path.dirname(os.path.abspath(__file__)), \
stdin=subprocess.PIPE, \
stdout=subprocess.PIPE, \
stderr=subprocess.PIPE)
# Used to guarantee thread safety
self.lock = threading.Lock()
def compute_score(self, gts, res):
assert(gts.keys() == res.keys())
imgIds = gts.keys()
scores = []
eval_line = 'EVAL'
self.lock.acquire()
for i in imgIds:
assert(len(res[i]) == 1)
stat = self._stat(res[i][0], gts[i])
eval_line += ' ||| {}'.format(stat)
        # the subprocess pipe is binary in Python 3: encode on write, flush so
        # the JVM sees the line, and decode on read
        self.meteor_p.stdin.write('{}\n'.format(eval_line).encode('utf-8'))
        self.meteor_p.stdin.flush()
        for i in range(0, len(imgIds)):
            scores.append(float(self.meteor_p.stdout.readline().decode('utf-8').strip()))
        score = float(self.meteor_p.stdout.readline().decode('utf-8').strip())
self.lock.release()
return score, scores
def method(self):
return "METEOR"
    def _stat(self, hypothesis_str, reference_list):
        # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
        hypothesis_str = hypothesis_str.replace('|||', '').replace('  ', ' ')
        score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
        # encode the whole line only at the binary pipe boundary; joining a
        # bytes hypothesis into a str line fails under Python 3
        self.meteor_p.stdin.write('{}\n'.format(score_line).encode('utf-8'))
        self.meteor_p.stdin.flush()
        return self.meteor_p.stdout.readline().decode('utf-8').strip()
    def _score(self, hypothesis_str, reference_list):
        self.lock.acquire()
        # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
        hypothesis_str = hypothesis_str.replace('|||', '').replace('  ', ' ')
        score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
        self.meteor_p.stdin.write('{}\n'.format(score_line).encode('utf-8'))
        self.meteor_p.stdin.flush()
        stats = self.meteor_p.stdout.readline().decode('utf-8').strip()
        # EVAL ||| stats
        eval_line = 'EVAL ||| {}'.format(stats)
        self.meteor_p.stdin.write('{}\n'.format(eval_line).encode('utf-8'))
        self.meteor_p.stdin.flush()
        # bug fix: the jar returns two values (a segment score and an aggregate
        # score), so read twice and keep the second -- thanks to Andrej for
        # pointing this out
        score = float(self.meteor_p.stdout.readline().decode('utf-8').strip())
        score = float(self.meteor_p.stdout.readline().decode('utf-8').strip())
        self.lock.release()
        return score
def __del__(self):
self.lock.acquire()
self.meteor_p.stdin.close()
self.meteor_p.kill()
self.meteor_p.wait()
self.lock.release()
================================================
FILE: vae/qgevalcap/rouge/__init__.py
================================================
__author__ = 'vrama91'
================================================
FILE: vae/qgevalcap/rouge/rouge.py
================================================
#!/usr/bin/env python
#
# File Name : rouge.py
#
# Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)
#
# Creation Date : 2015-01-07 06:03
# Author : Ramakrishna Vedantam
import numpy as np
import pdb
def my_lcs(string, sub):
"""
Calculates longest common subsequence for a pair of tokenized strings
:param string : list of str : tokens from a string split using whitespace
:param sub : list of str : shorter string, also split using whitespace
    :returns: length (int): length of the longest common subsequence between the two strings
Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
"""
if(len(string)< len(sub)):
sub, string = string, sub
lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)]
for j in range(1,len(sub)+1):
for i in range(1,len(string)+1):
if(string[i-1] == sub[j-1]):
lengths[i][j] = lengths[i-1][j-1] + 1
else:
lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1])
return lengths[len(string)][len(sub)]
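# --- Illustrative sketch (not part of the original file) ---
# my_lcs expects token lists, not raw strings; "the", "sat" and "mat" form
# the longest common subsequence here, so the result is 3.
def _example_my_lcs():
    string = "the cat sat on the mat".split(" ")
    sub = "the sat mat".split(" ")
    return my_lcs(string, sub)  # -> 3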
class Rouge():
'''
Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
'''
def __init__(self):
        # vrama91: updated the value below based on discussion with Hovy
self.beta = 1.2
def calc_score(self, candidate, refs):
"""
Compute ROUGE-L score given one candidate and references for an image
:param candidate: str : candidate sentence to be evaluated
:param refs: list of str : COCO reference sentences for the particular image to be evaluated
        :returns score: float (ROUGE-L score for the candidate evaluated against references)
"""
assert(len(candidate)==1)
assert(len(refs)>0)
prec = []
rec = []
# split into tokens
token_c = candidate[0].split(" ")
for reference in refs:
# split into tokens
token_r = reference.split(" ")
# compute the longest common subsequence
lcs = my_lcs(token_r, token_c)
prec.append(lcs/float(len(token_c)))
rec.append(lcs/float(len(token_r)))
prec_max = max(prec)
rec_max = max(rec)
if(prec_max!=0 and rec_max !=0):
score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
else:
score = 0.0
return score
def compute_score(self, gts, res):
"""
Computes Rouge-L score given a set of reference and candidate sentences for the dataset
Invoked by evaluate_captions.py
:param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
:param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
:returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
"""
assert(gts.keys() == res.keys())
imgIds = gts.keys()
score = []
for id in imgIds:
hypo = res[id]
ref = gts[id]
score.append(self.calc_score(hypo, ref))
# Sanity check.
assert(type(hypo) is list)
assert(len(hypo) == 1)
assert(type(ref) is list)
assert(len(ref) > 0)
average_score = np.mean(np.array(score))
return average_score, np.array(score)
def method(self):
return "Rouge"
================================================
FILE: vae/squad_utils.py
================================================
import collections
import gzip
import json
import math
import re
import string
import sys
from copy import deepcopy
import json_lines
import numpy as np
from transformers.models.bert import BasicTokenizer
from transformers.data.processors.squad import whitespace_tokenize
from tqdm import tqdm
class SquadExample(object):
"""
A single training/test example for the Squad dataset.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % self.qas_id
s += ", question_text: %s" % self.question_text
s += ", doc_tokens: [%s]" % " ".join(self.doc_tokens)
        # positions may legitimately be 0, so test against None explicitly
        if self.start_position is not None:
            s += ", start_position: %d" % self.start_position
        if self.end_position is not None:
            s += ", end_position: %d" % self.end_position
if self.is_impossible:
s += ", is_impossible: %r" % self.is_impossible
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
c_ids,
context_tokens,
q_ids,
q_tokens,
answer_text,
tag_ids,
input_mask,
segment_ids,
context_segment_ids=None,
noq_start_position=None,
noq_end_position=None,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.c_ids = c_ids
self.context_tokens = context_tokens
self.q_ids = q_ids
self.q_tokens = q_tokens
self.answer_text = answer_text
self.tag_ids = tag_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.context_segment_ids = context_segment_ids
self.noq_start_position = noq_start_position
self.noq_end_position = noq_end_position
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def convert_examples_to_features(examples, tokenizer, max_seq_length,
doc_stride, max_query_length, is_training):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
features = []
for (example_index, example) in tqdm(enumerate(examples), total=len(examples)):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
context_tokens = list()
context_tokens.append("[CLS]")
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(
tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
context_tokens.append(all_doc_tokens[split_token_index])
tokens.append("[SEP]")
segment_ids.append(1)
context_tokens.append("[SEP]")
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
input_ids = np.asarray(input_ids, dtype=np.int32)
input_mask = np.asarray(input_mask, dtype=np.uint8)
segment_ids = np.asarray(segment_ids, dtype=np.uint8)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if out_of_span:
continue
if is_training and example.is_impossible:
start_position = 0
end_position = 0
c_ids = tokenizer.convert_tokens_to_ids(context_tokens)
while len(c_ids) < max_seq_length:
c_ids.append(0)
c_ids = np.asarray(c_ids, dtype=np.int32)
features.append(
InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
c_ids=c_ids,
context_tokens=None,
q_ids=None,
q_tokens=None,
answer_text=example.orig_answer_text,
tag_ids=None,
segment_ids=segment_ids,
noq_start_position=None,
noq_end_position=None,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible))
unique_id += 1
return features
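# --- Illustrative sketch (not part of the original file) ---
# The sliding-window chunking used above, isolated on toy numbers: 10 subword
# tokens, room for 4 per chunk, stride 2. Spans are (start, length) pairs and
# the last span always reaches the end of the document.
def _example_doc_spans(num_doc_tokens=10, max_tokens_for_doc=4, doc_stride=2):
    _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < num_doc_tokens:
        length = min(num_doc_tokens - start_offset, max_tokens_for_doc)
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == num_doc_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans  # -> [(0, 4), (2, 4), (4, 4), (6, 4)]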
def convert_examples_to_harv_features(examples, tokenizer, max_seq_length,
doc_stride, max_query_length, is_training):
"""Loads a data file into a list of `InputBatch`s.
each example only contains a sequence of ids for context(paragraph)
"""
unique_id = 1000000000
features = []
for example in tqdm(examples, total=len(examples)):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
context_tokens = list()
context_tokens.append("[CLS]")
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(
tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
context_tokens.append(all_doc_tokens[split_token_index])
tokens.append("[SEP]")
context_tokens.append("[SEP]")
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
continue
c_ids = tokenizer.convert_tokens_to_ids(context_tokens)
while len(c_ids) < max_seq_length:
c_ids.append(0)
features.append(
InputFeatures(
unique_id=None,
example_index=None,
doc_span_index=None,
tokens=None,
token_to_orig_map=None,
token_is_max_context=None,
input_ids=None,
input_mask=None,
c_ids=c_ids,
context_tokens=None,
q_ids=None,
q_tokens=None,
answer_text=None,
tag_ids=None,
segment_ids=None,
noq_start_position=None,
noq_end_position=None,
start_position=None,
end_position=None,
is_impossible=None))
unique_id += 1
return features
def convert_examples_to_features_answer_id(examples, tokenizer, max_seq_length,
doc_stride, max_query_length, max_ans_length, is_training):
"""Loads a data file into a list of `InputBatch`s.
In addition to the original InputFeature class, it contains
c_ids: ids for context
tag ids: indicate the answer span of context,
noq_start_position: start position of answer in context without concatenation of question
noq_end_position: end position of answer in context without concatenation of question
"""
unique_id = 1000000000
features = []
for (example_index, example) in tqdm(enumerate(examples), total=len(examples)):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
context_tokens = list()
context_tokens.append("[CLS]")
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(
tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
context_tokens.append(all_doc_tokens[split_token_index])
tokens.append("[SEP]")
segment_ids.append(1)
context_tokens.append("[SEP]")
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
start_position = None
end_position = None
noq_start_position = None
noq_end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
noq_start_position = 0
noq_end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
# plus one for [CLS] token
noq_start_position = tok_start_position - doc_start + 1
noq_end_position = tok_end_position - doc_start + 1
# skip the context that does not contain any answer span
if out_of_span:
continue
if is_training and example.is_impossible:
start_position = 0
end_position = 0
noq_start_position = 0
noq_end_position = 0
q_tokens = deepcopy(query_tokens)[:max_query_length - 2]
q_tokens.insert(0, "[CLS]")
q_tokens.append("[SEP]")
q_ids = tokenizer.convert_tokens_to_ids(q_tokens)
c_ids = tokenizer.convert_tokens_to_ids(context_tokens)
# pad up to maximum length
while len(q_ids) < max_query_length:
q_ids.append(0)
while len(c_ids) < max_seq_length:
c_ids.append(0)
# answer_text = example.orig_answer_text
# answer_tokens = tokenizer.tokenize(answer_text)[:max_ans_length - 2]
# answer_tokens.insert(0, "[CLS]")
# answer_tokens.append("[SEP]")
# answer_ids = tokenizer.convert_tokens_to_ids(answer_tokens)
# while len(answer_ids) < max_ans_length:
# answer_ids.append(0)
            context_segment_ids = [0] * len(c_ids)
            # mark the answer tokens in the context-only segment ids; guard
            # against None positions (e.g. when is_training is False)
            if noq_start_position is not None and noq_end_position is not None:
                for answer_idx in range(noq_start_position, noq_end_position + 1):
                    context_segment_ids[answer_idx] = 1
# BIO tagging scheme
tag_ids = [0] * len(c_ids) # Outside
if noq_start_position is not None and noq_end_position is not None:
tag_ids[noq_start_position] = 1 # Begin
# Inside tag
for idx in range(noq_start_position + 1, noq_end_position + 1):
tag_ids[idx] = 2
assert len(tag_ids) == len(c_ids), "length of tag :{}, length of c :{}".format(
len(tag_ids), len(c_ids))
features.append(
InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
c_ids=c_ids,
context_tokens=context_tokens,
q_ids=q_ids,
q_tokens=q_tokens,
answer_text=example.orig_answer_text,
tag_ids=tag_ids,
segment_ids=segment_ids,
context_segment_ids=context_segment_ids,
noq_start_position=noq_start_position,
noq_end_position=noq_end_position,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible))
unique_id += 1
return features
def read_examples(input_file, debug=False, is_training=False):
# Read data
unproc_data = []
    with gzip.open(input_file, 'rt', encoding='utf-8') as f:  # gzipped JSON Lines file, opened in text ('rt') mode
for item in json_lines.reader(f):
# print(item) #or use print(item['X']) for printing specific data
unproc_data.append(item)
# Delete header
unproc_data = unproc_data[1:]
if debug:
unproc_data = unproc_data[:100]
examples = []
    # NOTE: assumed tag list -- the original literal tag strings are not
    # recoverable from this dump (the markup got stripped); NQ contexts embed
    # HTML-style markup of this kind
    skip_tags = ['<Table>', '<Tr>', '<Td>', '<Ol>', '<Ul>', '<Li>', '- ']
for item in unproc_data:
# in case of NQ dataset, context containing tags is excluded for training
context = item["context"]
skip_flag = False
for tag in skip_tags:
if tag in context:
skip_flag = True
break
if skip_flag and is_training:
continue
doc_tokens = []
for token in item["context_tokens"]:
if token[0] in ['[TLE]', '[PAR]', '[DOC]']:
token[0] = '[SEP]'
doc_tokens.append(token[0])
        # iterate over the question-answer pairs
for qa in item['qas']:
qas_id = qa['qid']
question_text = qa['question']
# Only take the first answer
answer = qa['detected_answers'][0]
orig_answer_text = answer['text']
# Only take the first span
start_position = answer['token_spans'][0][0]
end_position = answer['token_spans'][0][1]
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position)
examples.append(example)
return examples
def read_squad_examples(input_file, is_training, version_2_with_negative=False,
debug=False, reduce_size=False, ratio=1.0):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
if debug:
input_data = input_data[:5]
for entry in tqdm(input_data, total=len(input_data)):
paragraphs = entry["paragraphs"]
for paragraph in paragraphs:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(
doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
    # Question: What country is the top exporter of electronics?
    # Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
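# --- Illustrative sketch (not part of the original file) ---
# The "(1895-1943)" case from the comment above, using a stub tokenizer (an
# assumption; the real caller passes a BERT WordPiece tokenizer) whose
# tokenize() simply splits on whitespace.
class _StubTokenizer(object):
    def tokenize(self, text):
        return text.split()
def _example_improve_answer_span():
    # the document, already in subword-sized pieces
    doc_tokens = ["the", "leader", "was", "john", "smith",
                  "(", "1895", "-", "1943", ")", "."]
    # whitespace tokenization left the answer "1895" inside span [5, 10]
    start, end = _improve_answer_span(doc_tokens, 5, 10, _StubTokenizer(), "1895")
    return start, end  # -> (6, 6), the exact "1895" token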
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + \
0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
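# --- Illustrative sketch (not part of the original file) ---
# The 'bought' example from the comment above, on concrete spans: with
# span B = tokens 3..7 ("to the store and bought") and span C = tokens 6..10
# ("and bought a gallon of"), token 7 ('bought') has min(left, right) context
# 0 in B but 1 in C, so C is the max-context span.
def _example_check_is_max_context():
    _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
    doc_spans = [_DocSpan(start=3, length=5),   # span B
                 _DocSpan(start=6, length=5)]   # span C
    position = 7  # 'bought'
    return (_check_is_max_context(doc_spans, 0, position),
            _check_is_max_context(doc_spans, 1, position))  # -> (False, True)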
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
verbose_logging, version_2_with_negative, null_score_diff_threshold,
noq_position=False):
"""Write final predictions to the json file and log-odds of null if needed."""
# logger.info("Writing predictions to: %s" % (output_prediction_file))
# logger.info("Writing nbest to: %s" % (output_nbest_file))
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction",
["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + \
result.end_logits[0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
            for orig_start_index in start_indexes:
                for end_index in end_indexes:
                    # reset from the loop variable on every iteration so the
                    # shift below is not applied repeatedly to the same index
                    start_index = orig_start_index
                    if noq_position:
                        # start and end are predicted without question tokens:
                        # add the length of q and -1 for the [CLS] in q_ids
                        q_len = np.sum(np.sign(feature.q_ids))
                        noq_start_index = start_index
                        noq_end_index = end_index
                        start_index = start_index + q_len - 1
                        end_index = end_index + q_len - 1
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
if noq_position:
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[noq_start_index],
end_logit=result.end_logits[noq_end_index]))
else:
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(
pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(
orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(
tok_text, orig_text, do_lower_case, verbose_logging)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if len(nbest) == 1:
nbest.insert(0, _NbestPrediction(text="empty",
start_logit=0.0,
end_logit=0.0))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(_NbestPrediction(text="empty",
start_logit=0.0,
end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
# if verbose_logging:
# logger.info(
# "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
# if verbose_logging:
# logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
# orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in tok_ns_to_s_map.items():
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
# if verbose_logging:
# logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
# if verbose_logging:
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
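# --- Illustrative sketch (not part of the original file) ---
# _compute_softmax subtracts the max logit before exponentiating, which keeps
# the arithmetic stable; e.g. [1.0, 2.0, 3.0] -> ~[0.090, 0.245, 0.665].
def _example_compute_softmax():
    probs = _compute_softmax([1.0, 2.0, 3.0])
    assert abs(sum(probs) - 1.0) < 1e-9
    return probs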
def write_answer_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file, verbose_logging,
version_2_with_negative, null_score_diff_threshold):
"""Write final predictions to the json file and log-odds of null if needed."""
# logger.info("Writing predictions to: %s" % (output_prediction_file))
# logger.info("Writing nbest to: %s" % (output_nbest_file))
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction",
["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0 # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + \
result.end_logits[0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
            # start and end indexes are predicted over [CLS] [UNK] [SEP] C [SEP];
            # each must be shifted by the question length and reduced by one
            offset = len(feature.q_tokens) - 2  # -2 for [CLS] and [SEP]
            for orig_start_index in start_indexes:
                for end_index in end_indexes:
                    # shift from the loop variable so the offset is not
                    # compounded across inner iterations
                    start_index = offset + orig_start_index - 1  # -1 for [UNK]
                    end_index = offset + end_index - 1  # -1 for [UNK]
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(
pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(
orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(
tok_text, orig_text, do_lower_case, verbose_logging)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if len(nbest) == 1:
nbest.insert(0,
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score(prediction, ground_truth):
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = collections.Counter(
prediction_tokens) & collections.Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
def exact_match_score(prediction, ground_truth):
return (normalize_answer(prediction) == normalize_answer(ground_truth))
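# --- Illustrative sketch (not part of the original file) ---
# normalize_answer strips articles, punctuation and case before token overlap
# is counted, so "The Eiffel Tower!" matches "eiffel tower" exactly, while a
# partial overlap earns a fractional F1 (here P = 2/4, R = 2/2, F1 = 2/3).
def _example_squad_metrics():
    em = exact_match_score("The Eiffel Tower!", "eiffel tower")   # -> True
    f1 = f1_score("eiffel tower in paris", "the eiffel tower")    # -> 0.666...
    return em, f1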
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def evaluate(dataset, predictions):
f1 = exact_match = total = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
total += 1
if qa['id'] not in predictions:
message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.'
print(message, file=sys.stderr)
continue
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = predictions[qa['id']]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(
f1_score, prediction, ground_truths)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {'exact_match': exact_match, 'f1': f1}
def read_predictions(prediction_file):
with open(prediction_file) as f:
predictions = json.load(f)
return predictions
def read_answers(gold_file):
answers = {}
with gzip.open(gold_file, 'rb') as f:
for i, line in enumerate(f):
example = json.loads(line)
if i == 0 and 'header' in example:
continue
for qa in example['qas']:
answers[qa['qid']] = qa['answers']
return answers
def evaluate_mrqa(answers, predictions, skip_no_answer=False):
f1 = exact_match = total = 0
for qid, ground_truths in answers.items():
if qid not in predictions:
if not skip_no_answer:
message = 'Unanswered question %s will receive score 0.' % qid
print(message)
total += 1
continue
total += 1
prediction = predictions[qid]
exact_match += metric_max_over_ground_truths(
exact_match_score, prediction, ground_truths)
f1 += metric_max_over_ground_truths(
f1_score, prediction, ground_truths)
exact_match = 100.0 * exact_match / total
f1 = 100.0 * f1 / total
return {'exact_match': exact_match, 'f1': f1}
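# --- Illustrative sketch (not part of the original file) ---
# evaluate_mrqa takes gold answers as {qid: [answer strings]} (the format
# produced by read_answers above) and predictions as {qid: string}; the
# qids below are arbitrary placeholders.
def _example_evaluate_mrqa():
    answers = {"q1": ["1895"], "q2": ["John Smith", "Smith"]}
    predictions = {"q1": "1895", "q2": "john smith"}
    return evaluate_mrqa(answers, predictions)  # -> both metrics 100.0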
================================================
FILE: vae/trainer.py
================================================
import torch
import torch.nn as nn
from models import DiscreteVAE, return_mask_lengths
class VAETrainer(object):
def __init__(self, args):
self.args = args
self.clip = args.clip
self.device = args.device
self.vae = DiscreteVAE(args).to(self.device)
params = filter(lambda p: p.requires_grad, self.vae.parameters())
self.optimizer = torch.optim.Adam(params, lr=args.lr)
self.loss_q_rec = 0
self.loss_a_rec = 0
self.loss_zq_kl = 0
self.loss_za_kl = 0
self.loss_info = 0
def train(self, c_ids, q_ids, a_ids, start_positions, end_positions):
self.vae = self.vae.train()
# Forward
loss, \
loss_q_rec, loss_a_rec, \
loss_zq_kl, loss_za_kl, \
loss_info \
= self.vae(c_ids, q_ids, a_ids, start_positions, end_positions)
# Backward
self.optimizer.zero_grad()
loss.backward()
# Step
self.optimizer.step()
self.loss_q_rec = loss_q_rec.item()
self.loss_a_rec = loss_a_rec.item()
self.loss_zq_kl = loss_zq_kl.item()
self.loss_za_kl = loss_za_kl.item()
self.loss_info = loss_info.item()
def generate_posterior(self, c_ids, q_ids, a_ids):
self.vae = self.vae.eval()
with torch.no_grad():
_, _, zq, _, za = self.vae.posterior_encoder(c_ids, q_ids, a_ids)
q_ids, start_positions, end_positions = self.vae.generate(zq, za, c_ids)
return q_ids, start_positions, end_positions, zq
def generate_answer_logits(self, c_ids, q_ids, a_ids):
self.vae = self.vae.eval()
with torch.no_grad():
_, _, zq, _, za = self.vae.posterior_encoder(c_ids, q_ids, a_ids)
start_logits, end_logits = self.vae.return_answer_logits(zq, za, c_ids)
return start_logits, end_logits
def generate_prior(self, c_ids):
self.vae = self.vae.eval()
with torch.no_grad():
_, _, zq, _, za = self.vae.prior_encoder(c_ids)
q_ids, start_positions, end_positions = self.vae.generate(zq, za, c_ids)
return q_ids, start_positions, end_positions, zq
def save(self, filename):
params = {
'state_dict': self.vae.state_dict(),
'args': self.args
}
torch.save(params, filename)
================================================
FILE: vae/translate.py
================================================
import argparse
import pickle
import torch
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import os
from models import DiscreteVAE
from squad_utils import (InputFeatures, convert_examples_to_harv_features,
read_examples, read_squad_examples)
def return_mask_lengths(ids):
mask = torch.sign(ids).float()
lengths = torch.sum(mask, 1).long()
return mask, lengths
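# --- Illustrative sketch (not part of the original file) ---
# return_mask_lengths treats id 0 as padding: the mask is 1 over real tokens
# and the length is the per-row count of non-zero ids.
def _example_return_mask_lengths():
    ids = torch.tensor([[101, 2054, 102, 0, 0],
                        [101, 102, 0, 0, 0]])
    mask, lengths = return_mask_lengths(ids)
    return mask, lengths  # lengths -> tensor([3, 2])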
def post_process(q_ids, start_positions, end_positions, c_ids, total_max_len=384):
"""
concatenate question and context for BERT QA model:
[CLS] Question [SEP] Context [SEP]
"""
batch_size = q_ids.size(0)
# exclude CLS token in c_ids
c_ids = c_ids[:, 1:]
start_positions = start_positions - 1
end_positions = end_positions - 1
_, q_lengths = return_mask_lengths(q_ids)
_, c_lengths = return_mask_lengths(c_ids)
all_input_ids = []
all_seg_ids = []
for i in range(batch_size):
q_length = q_lengths[i]
c_length = c_lengths[i]
q = q_ids[i, :q_length] # exclude pad tokens
c = c_ids[i, :c_length] # exclude pad tokens
# input ids
pads = torch.zeros((total_max_len - q_length - c_length), device=q_ids.device, dtype=torch.long)
input_ids = torch.cat([q, c, pads], dim=0)
all_input_ids.append(input_ids)
# segment ids
zeros = torch.zeros_like(q)
ones = torch.ones_like(c)
seg_ids = torch.cat([zeros, ones, pads], dim=0)
all_seg_ids.append(seg_ids)
start_positions[i] = start_positions[i] + q_length
end_positions[i] = end_positions[i] + q_length
all_input_ids = torch.stack(all_input_ids, dim=0)
all_seg_ids = torch.stack(all_seg_ids, dim=0)
all_input_mask = (all_input_ids != 0).byte()
return all_input_ids, all_seg_ids, all_input_mask, start_positions, end_positions
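# --- Illustrative sketch (not part of the original file) ---
# post_process on a one-element toy batch, assuming id 0 is the pad token and
# the other ids stand in for a BERT vocabulary. The answer starts at context
# position 2 ([CLS] counted) and comes back shifted past the question.
def _example_post_process():
    q_ids = torch.tensor([[101, 2054, 102]])        # [CLS] q [SEP]
    c_ids = torch.tensor([[101, 7592, 2088, 102]])  # [CLS] c1 c2 [SEP]
    start = torch.tensor([2])
    end = torch.tensor([2])
    input_ids, seg_ids, mask, s, e = post_process(
        q_ids, start, end, c_ids, total_max_len=8)
    # input_ids -> [[101, 2054, 102, 7592, 2088, 102, 0, 0]], s == e == 4
    return input_ids, seg_ids, mask, s, e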
def main(args):
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
args.tokenizer = tokenizer
device = torch.cuda.current_device()
checkpoint = torch.load(args.checkpoint, map_location="cpu")
vae = DiscreteVAE(checkpoint["args"])
vae.load_state_dict(checkpoint["state_dict"])
vae.eval()
vae = vae.to(device)
if args.squad:
examples = read_squad_examples(args.data_file, is_training=True, debug=args.debug)
features = convert_examples_to_harv_features(examples,
tokenizer=tokenizer,
max_seq_length=args.max_c_len,
max_query_length=args.max_q_len,
doc_stride=128,
is_training=True)
else:
examples = read_examples(args.data_file, is_training=True, debug=args.debug)
features = convert_examples_to_harv_features(examples,
tokenizer=tokenizer,
max_seq_length=args.max_c_len,
max_query_length=args.max_q_len,
doc_stride=128,
is_training=True)
features = features[:int(len(features) * args.ratio)]
all_c_ids = torch.tensor([f.c_ids for f in features], dtype=torch.long)
data = TensorDataset(all_c_ids)
data_loader = DataLoader(data, shuffle=False, batch_size=args.batch_size)
new_features = []
for batch in tqdm(data_loader, total=len(data_loader)):
c_ids = batch[0]
_, c_len = return_mask_lengths(c_ids)
max_c_len = torch.max(c_len)
c_ids = c_ids[:, :max_c_len].to(device)
# sample latent variable K times
for _ in range(args.k):
with torch.no_grad():
_, _, zq, _, za = vae.prior_encoder(c_ids)
batch_q_ids, batch_start, batch_end = vae.generate(zq, za, c_ids)
all_input_ids, all_seg_ids, \
all_input_mask, all_start, all_end = post_process(batch_q_ids, batch_start, batch_end, c_ids)
for i in range(c_ids.size(0)):
new_features.append(
InputFeatures(
unique_id=None,
example_index=None,
doc_span_index=None,
tokens=None,
token_to_orig_map=None,
token_is_max_context=None,
input_ids=all_input_ids[i].cpu().tolist(),
input_mask=all_input_mask[i].cpu().tolist(),
c_ids=None,
context_tokens=None,
q_ids=None,
q_tokens=None,
answer_text=None,
tag_ids=None,
segment_ids=all_seg_ids[i].cpu().tolist(),
noq_start_position=None,
noq_end_position=None,
start_position=all_start[i].cpu().tolist(),
end_position=all_end[i].cpu().tolist(),
is_impossible=None))
    dir_name = os.path.dirname(args.output_file)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
with open(args.output_file, "wb") as f:
pickle.dump(new_features, f)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--debug', dest='debug', action='store_true')
parser.add_argument('--squad', dest='squad', action='store_true', help="whether to generate QA from SQuAD context")
parser.add_argument("--seed", default=1004, type=int)
parser.add_argument("--bert_model", default='bert-base-uncased', type=str)
parser.add_argument("--max_c_len", default=384 - 64, type=int, help="max context length")
parser.add_argument("--max_q_len", default=0, type=int, help="max query length")
parser.add_argument("--batch_size", default=64, type=int, help="batch_size")
parser.add_argument("--data_file", default="../data/squad/train-v1.1.json", type=str)
parser.add_argument("--checkpoint", default="../save/vae-checkpoint/best_f1_model.pt", type=str, help="checkpoint for vae model")
parser.add_argument("--output_file", default="../data/synthetic_data/1.0_squad_10x_features.pkl", type=str)
parser.add_argument("--ratio", default=1.0, type=float)
parser.add_argument("--k", default=10, type=int, help="the number of QA pairs for each paragraph")
args = parser.parse_args()
main(args)
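# Example invocation (script name is illustrative; paths shown are the actual
# argparse defaults above):
#   python generate_synthetic_qa.py --squad \
#       --data_file ../data/squad/train-v1.1.json \
#       --checkpoint ../save/vae-checkpoint/best_f1_model.pt \
#       --output_file ../data/synthetic_data/1.0_squad_10x_features.pkl \
#       --k 10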
================================================
FILE: vae/utils.py
================================================
import random
import torch
from torch.utils.data import DataLoader, TensorDataset
from squad_utils import (convert_examples_to_features_answer_id,
convert_examples_to_harv_features,
read_squad_examples)
def get_squad_data_loader(tokenizer, file, shuffle, args):
examples = read_squad_examples(file, is_training=True, debug=args.debug)
features = convert_examples_to_features_answer_id(examples,
tokenizer=tokenizer,
max_seq_length=args.max_c_len,
max_query_length=args.max_q_len,
max_ans_length=args.max_q_len,
doc_stride=128,
is_training=True)
all_c_ids = torch.tensor([f.c_ids for f in features], dtype=torch.long)
all_q_ids = torch.tensor([f.q_ids for f in features], dtype=torch.long)
all_tag_ids = torch.tensor([f.tag_ids for f in features], dtype=torch.long)
all_a_ids = (all_tag_ids != 0).long()
all_start_positions = torch.tensor([f.noq_start_position for f in features], dtype=torch.long)
all_end_positions = torch.tensor([f.noq_end_position for f in features], dtype=torch.long)
all_data = TensorDataset(all_c_ids, all_q_ids, all_a_ids, all_start_positions, all_end_positions)
data_loader = DataLoader(all_data, args.batch_size, shuffle=shuffle)
return data_loader, examples, features
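# Sketch of intended use (args is any namespace exposing debug, max_c_len,
# max_q_len, and batch_size; the field names are taken from the code above):
#   loader, examples, features = get_squad_data_loader(
#       tokenizer, "../data/squad/train-v1.1.json", shuffle=True, args=args)
#   c_ids, q_ids, a_ids, start, end = next(iter(loader))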
def get_harv_data_loader(tokenizer, file, shuffle, ratio, args):
examples = read_squad_examples(file, is_training=True, debug=args.debug)
random.shuffle(examples)
num_ex = int(len(examples) * ratio)
examples = examples[:num_ex]
features = convert_examples_to_harv_features(examples,
tokenizer=tokenizer,
max_seq_length=args.max_c_len,
max_query_length=args.max_q_len,
doc_stride=128,
is_training=True)
all_c_ids = torch.tensor([f.c_ids for f in features], dtype=torch.long)
dataset = TensorDataset(all_c_ids)
dataloader = DataLoader(dataset, shuffle=shuffle, batch_size=args.batch_size)
return features, dataloader
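# Note: unlike get_squad_data_loader above, this returns (features, dataloader)
# with features first, so callers must unpack in that order.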
def batch_to_device(batch, device):
    # Move every tensor to the target device before unpacking.
    c_ids, q_ids, a_ids, start_positions, end_positions = (b.to(device) for b in batch)
c_len = torch.sum(torch.sign(c_ids), 1)
max_c_len = torch.max(c_len)
c_ids = c_ids[:, :max_c_len]
a_ids = a_ids[:, :max_c_len]
q_len = torch.sum(torch.sign(q_ids), 1)
max_q_len = torch.max(q_len)
q_ids = q_ids[:, :max_q_len]
return c_ids, q_ids, a_ids, start_positions, end_positions
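# Illustration: trimming only removes trailing pad columns, so a (B, 384)
# c_ids batch whose longest context has 127 real tokens comes back as
# (B, 127); start/end positions remain valid because only right-padding
# is cut. Typical call:
#   c_ids, q_ids, a_ids, start, end = batch_to_device(next(iter(loader)), device)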