Source code for aitoolbox.nlp.experiment_evaluation.NLP_metrics

import os
import re
import shutil
import string
from collections import Counter
import numpy as np
from pyrouge import Rouge155
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from torchnlp.metrics import bleu
from transformers import glue_compute_metrics, xnli_compute_metrics

from aitoolbox.experiment.core_metrics.abstract_metric import AbstractBaseMetric


class ROUGEMetric(AbstractBaseMetric):
    def __init__(self, y_true, y_predicted, target_actual_text=False, output_text_dir=None,
                 output_text_cleaning_regex=(r'<.*?>', r'[^a-zA-Z0-9.?! ]+')):
        """ROUGE score calculation

        Uses this package: https://github.com/pltrdy/rouge

        Args:
            y_true (numpy.array or list): ground truth texts
            y_predicted (numpy.array or list): predicted texts
            target_actual_text (bool): if True, y_true already holds the actual (untokenized) target text
            output_text_dir (str): optional directory into which the evaluated texts are dumped for review
            output_text_cleaning_regex (list): regex patterns used to clean the dumped texts
        """
        self.output_text_cleaning_regex = output_text_cleaning_regex
        self.target_actual_text = target_actual_text
        self.output_text_dir = output_text_dir
        AbstractBaseMetric.__init__(self, y_true, y_predicted, metric_name='ROGUE', np_array=False)
    def calculate_metric(self):
        if self.output_text_dir is not None:
            # Not affecting the metric calculation. Just for record keeping: the texts are dropped
            # to disk so they can be reviewed
            self.dump_answer_text_to_disk(self.y_true, self.y_predicted,
                                          self.output_text_dir, self.output_text_cleaning_regex,
                                          self.target_actual_text)

        self.prepare_text()
        rouge_calc = Rouge()
        hypothesis = self.y_predicted
        reference = self.y_true

        # Print the problematic texts before re-raising if the ROUGE calculation fails
        try:
            return rouge_calc.get_scores(hypothesis, reference, avg=True)
        except Exception:
            print('hypothesis')
            print(hypothesis)
            print('reference')
            print(reference)
            raise
    def prepare_text(self):
        if not self.target_actual_text:
            self.y_true = [' '.join(sent) for sent in self.y_true]
        self.y_predicted = [' '.join(sent) if len(sent) > 0 else ' ' for sent in self.y_predicted]
    @staticmethod
    def dump_answer_text_to_disk(true_text, pred_text, output_text_dir, output_text_cleaning_regex,
                                 target_actual_text):
        """Dump the predicted and true answer texts to disk so they can be manually reviewed

        Problems:
            The regex text cleaning is used to deal with the "Illegal division by zero" problem:
            https://ireneli.eu/2018/01/11/working-with-rouge-1-5-5-evaluation-metric-in-python/

        Args:
            true_text (list): ground truth answer texts
            pred_text (list): predicted answer texts
            output_text_dir (str): directory into which the text files are written
            output_text_cleaning_regex (list): regex patterns used to clean the texts
            target_actual_text (bool): if True, true_text already holds the actual (untokenized) text

        Returns:
            None
        """
        if os.path.exists(output_text_dir):
            shutil.rmtree(output_text_dir)
        os.mkdir(output_text_dir)

        for i, (pred_answ, true_answ) in enumerate(zip(pred_text, true_text)):
            with open(os.path.join(output_text_dir, f'answer_pred_true_{i}.txt'), 'w', encoding='utf-8') as f:
                # Default regex cleaners: (r'<.*?>', r'[^a-zA-Z0-9.?! ]+')
                pred_answ_clean = ROUGEPerlMetric.regex_clean_text(pred_answ, output_text_cleaning_regex)
                pred_answ_clean = ' '.join(pred_answ_clean) if len(pred_answ_clean) > 0 else ' '

                if target_actual_text:
                    true_answ_clean = [true_answ]
                else:
                    true_answ_clean = ROUGEPerlMetric.regex_clean_text(true_answ, output_text_cleaning_regex)
                true_answ_clean = ' '.join(true_answ_clean)

                f.write(f'Answer to question {i}:\n')
                f.write(f'Predicted:\t{pred_answ_clean}\n')
                f.write(f'True:\t{true_answ_clean}\n')

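# A minimal usage sketch for ROUGEMetric. It assumes tokenized model outputs given as lists of
# token lists, and that the AbstractBaseMetric base class (not shown in this module) computes the
# result at construction time and exposes it through get_metric_dict(); treat that access pattern
# as an assumption rather than the definitive API.
def _rouge_metric_usage_sketch():
    rouge_metric = ROUGEMetric(y_true=[['the', 'cat', 'sat', 'on', 'the', 'mat']],
                               y_predicted=[['the', 'cat', 'sat', 'on', 'a', 'mat']])
    # Expected shape: {'ROGUE': {'rouge-1': {...}, 'rouge-2': {...}, 'rouge-l': {...}}}
    return rouge_metric.get_metric_dict()
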
class ROUGEPerlMetric(AbstractBaseMetric):
    def __init__(self, y_true, y_predicted, output_text_dir,
                 output_text_cleaning_regex=(r'<.*?>', r'[^a-zA-Z0-9.?! ]+'),
                 target_actual_text=False):
        """ROUGE score calculation using the Perl implementation

        Uses this package:
            https://pypi.org/project/pyrouge/
            https://github.com/bheinzerling/pyrouge

        Problems:
            The regex text cleaning is used to deal with the "Illegal division by zero" problem:
            https://ireneli.eu/2018/01/11/working-with-rouge-1-5-5-evaluation-metric-in-python/

        Args:
            y_true (numpy.array or list): gold standard summaries; in ROUGE terms the 'model' summaries
            y_predicted (numpy.array or list): generated summaries; in ROUGE terms the 'system' summaries
            output_text_dir (str): directory into which the summaries are written as individual text files
            output_text_cleaning_regex (list): regex patterns used to clean the texts
            target_actual_text (bool): if True, y_true already holds the actual (untokenized) text
        """
        self.output_text_dir = output_text_dir
        self.output_text_cleaning_regex = output_text_cleaning_regex
        self.target_actual_text = target_actual_text
        AbstractBaseMetric.__init__(self, y_true, y_predicted, metric_name='ROGUE_Perl', np_array=False)
    def calculate_metric(self):
        self.dump_answer_text_to_disk(self.y_true, self.y_predicted,
                                      self.output_text_dir, self.output_text_cleaning_regex,
                                      self.target_actual_text)

        rouge = Rouge155()
        # In ROUGE, your summaries are the 'system' summaries and the gold standard summaries
        # are the 'model' summaries.
        rouge.system_dir = os.path.join(self.output_text_dir, 'pred_answer')
        rouge.model_dir = os.path.join(self.output_text_dir, 'true_answer')
        rouge.system_filename_pattern = r'pred_answer_text.(\d+).txt'
        rouge.model_filename_pattern = 'true_answer_text.#ID#.txt'

        rouge_output = rouge.convert_and_evaluate()
        output_dict = rouge.output_to_dict(rouge_output)

        return output_dict
    @staticmethod
    def dump_answer_text_to_disk(true_text, pred_text, output_text_dir, output_text_cleaning_regex,
                                 target_actual_text):
        """Dump the true and predicted summaries to disk in the pyrouge directory layout

        Problems:
            The regex text cleaning is used to deal with the "Illegal division by zero" problem:
            https://ireneli.eu/2018/01/11/working-with-rouge-1-5-5-evaluation-metric-in-python/

        Args:
            true_text (list): gold standard summaries
            pred_text (list): generated summaries
            output_text_dir (str): directory into which the text files are written
            output_text_cleaning_regex (list): regex patterns used to clean the texts
            target_actual_text (bool): if True, true_text already holds the actual (untokenized) text

        Returns:
            None
        """
        if os.path.exists(output_text_dir):
            shutil.rmtree(output_text_dir)
        os.mkdir(output_text_dir)
        os.mkdir(os.path.join(output_text_dir, 'true_answer'))
        os.mkdir(os.path.join(output_text_dir, 'pred_answer'))

        for i, text in enumerate(true_text):
            # TODO: Encoding setting not tested yet
            with open(os.path.join(output_text_dir, f'true_answer/true_answer_text.{i}.txt'), 'w',
                      encoding='utf-8') as f:
                # Default regex cleaners: (r'<.*?>', r'[^a-zA-Z0-9.?! ]+')
                if target_actual_text:
                    text_clean = [text]
                else:
                    text_clean = ROUGEPerlMetric.regex_clean_text(text, output_text_cleaning_regex)
                f.write(' '.join(text_clean))

        for i, text in enumerate(pred_text):
            # TODO: Encoding setting not tested yet
            with open(os.path.join(output_text_dir, f'pred_answer/pred_answer_text.{i}.txt'), 'w',
                      encoding='utf-8') as f:
                # Default regex cleaners: (r'<.*?>', r'[^a-zA-Z0-9.?! ]+')
                text_clean = ROUGEPerlMetric.regex_clean_text(text, output_text_cleaning_regex)
                f.write(' '.join(text_clean) if len(text_clean) > 0 else ' ')
    @staticmethod
    def regex_clean_text(text, cleaning_regex_list):
        """Clean a list of text tokens with a sequence of regex substitutions

        Args:
            text (list): text tokens to be cleaned
            cleaning_regex_list (list): regex patterns; every match is replaced with an empty string

        Returns:
            list: cleaned text tokens with empty tokens dropped
        """
        # The default is: (r'<.*?>', r'[^a-zA-Z0-9.?! ]+')
        for cleaning_regex in cleaning_regex_list:
            re_pattern = re.compile(cleaning_regex)
            text = [re_pattern.sub('', t) for t in text if len(re_pattern.sub('', t)) > 0]
        return text

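# A minimal usage sketch for ROUGEPerlMetric. On top of the assumptions above (tokenized inputs,
# result read through get_metric_dict()), it also assumes a working pyrouge / ROUGE-1.5.5 Perl
# installation; '/tmp/rouge_perl_eval' is only an illustrative output directory.
def _rouge_perl_metric_usage_sketch():
    rouge_perl_metric = ROUGEPerlMetric(y_true=[['the', 'cat', 'sat', 'on', 'the', 'mat']],
                                        y_predicted=[['the', 'cat', 'sat', 'on', 'a', 'mat']],
                                        output_text_dir='/tmp/rouge_perl_eval')
    # Returns the dict produced by Rouge155.output_to_dict(), keyed e.g. by 'rouge_1_f_score'
    return rouge_perl_metric.get_metric_dict()
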
class ExactMatchTextMetric(AbstractBaseMetric):
    def __init__(self, y_true, y_predicted, target_actual_text=False, output_text_dir=None):
        """Calculate the exact match score of the answer strings

        Args:
            y_true (numpy.array or list): ground truth answers
            y_predicted (numpy.array or list): predicted answers
            target_actual_text (bool): if True, y_true already holds the actual (untokenized) text
            output_text_dir (str): optional directory into which the evaluated texts are dumped for review
        """
        if len(y_true) != len(y_predicted):
            raise ValueError(f'len(y_true) != len(y_predicted). Got {len(y_true)} != {len(y_predicted)}')

        self.target_actual_text = target_actual_text
        self.output_text_dir = output_text_dir
        AbstractBaseMetric.__init__(self, y_true, y_predicted, metric_name='EM', np_array=False)
    def calculate_metric(self):
        if self.output_text_dir is not None:
            # Not affecting the metric calculation. Just for record keeping: the texts are dropped
            # to disk so they can be reviewed
            ROUGEMetric.dump_answer_text_to_disk(self.y_true, self.y_predicted,
                                                 self.output_text_dir, [], self.target_actual_text)

        if not self.target_actual_text:
            self.y_true = [' '.join(sent) for sent in self.y_true]
        self.y_predicted = [' '.join(sent) for sent in self.y_predicted]

        em = 0
        for pred_answ, true_answ in zip(self.y_predicted, self.y_true):
            em += int(self.normalize_answer(pred_answ) == self.normalize_answer(true_answ))

        return 100. * em / len(self.y_true)
    @staticmethod
    def normalize_answer(text_str):
        """Convert to lowercase and remove punctuation, articles and extra whitespace

        The nested helper functions are taken from the official SQuAD 2.0 eval script:
        https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/

        Args:
            text_str (str): text to be normalized

        Returns:
            str: normalized text
        """
        def remove_articles(text):
            regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
            return re.sub(regex, ' ', text)

        def white_space_fix(text):
            return ' '.join(text.split())

        def remove_punc(text):
            exclude = set(string.punctuation)
            return ''.join(ch for ch in text if ch not in exclude)

        def lower(text):
            return text.lower()

        return white_space_fix(remove_articles(remove_punc(lower(text_str))))

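# A minimal usage sketch for ExactMatchTextMetric. normalize_answer() strips case, punctuation and
# articles, so the two token sequences below count as an exact match. Reading the score through
# get_metric_dict() assumes the AbstractBaseMetric base computes the result at construction time,
# which is not shown in this module.
def _exact_match_metric_usage_sketch():
    em_metric = ExactMatchTextMetric(y_true=[['The', 'cat', 'sat', 'on', 'the', 'mat', '.']],
                                     y_predicted=[['the', 'cat', 'sat', 'on', 'a', 'mat']])
    # Both answers normalize to 'cat sat on mat' --> expected {'EM': 100.0}
    return em_metric.get_metric_dict()
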
class F1TextMetric(AbstractBaseMetric):
    def __init__(self, y_true, y_predicted, target_actual_text=False, output_text_dir=None):
        """Calculate the F1 score of the answer strings

        Args:
            y_true (numpy.array or list): ground truth answers
            y_predicted (numpy.array or list): predicted answers
            target_actual_text (bool): if True, y_true already holds the actual (untokenized) text
            output_text_dir (str): optional directory into which the evaluated texts are dumped for review
        """
        if len(y_true) != len(y_predicted):
            raise ValueError(f'len(y_true) != len(y_predicted). Got {len(y_true)} != {len(y_predicted)}')

        self.target_actual_text = target_actual_text
        self.output_text_dir = output_text_dir
        AbstractBaseMetric.__init__(self, y_true, y_predicted, metric_name='F1', np_array=False)
    def calculate_metric(self):
        if self.output_text_dir is not None:
            # Not affecting the metric calculation. Just for record keeping: the texts are dropped
            # to disk so they can be reviewed
            ROUGEMetric.dump_answer_text_to_disk(self.y_true, self.y_predicted,
                                                 self.output_text_dir, [], self.target_actual_text)

        if not self.target_actual_text:
            self.y_true = [' '.join(sent) for sent in self.y_true]
        self.y_predicted = [' '.join(sent) for sent in self.y_predicted]

        f1 = 0
        for pred_answ, true_answ in zip(self.y_predicted, self.y_true):
            f1 += self.compute_f1(true_answ, pred_answ)

        return 100. * f1 / len(self.y_true)
    @staticmethod
    def compute_f1(a_gold, a_pred):
        """Compute the token-overlap F1 between a gold and a predicted answer (SQuAD 2.0 eval script)"""
        gold_toks = F1TextMetric.get_tokens(a_gold)
        pred_toks = F1TextMetric.get_tokens(a_pred)
        common = Counter(gold_toks) & Counter(pred_toks)
        num_same = sum(common.values())

        if len(gold_toks) == 0 or len(pred_toks) == 0:
            # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
            return int(gold_toks == pred_toks)
        if num_same == 0:
            return 0

        precision = 1.0 * num_same / len(pred_toks)
        recall = 1.0 * num_same / len(gold_toks)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1
    @staticmethod
    def get_tokens(s):
        """Normalize the answer string and split it into tokens (SQuAD 2.0 eval script)"""
        if not s:
            return []
        return ExactMatchTextMetric.normalize_answer(s).split()

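# A minimal sketch of the token-overlap F1 computed by F1TextMetric.compute_f1(). After
# normalization the gold answer tokens are ['cat', 'sat', 'on', 'mat'] and the predicted tokens
# ['cat', 'sat'], giving precision 2/2, recall 2/4 and an F1 of 2/3.
def _f1_text_metric_usage_sketch():
    return F1TextMetric.compute_f1(a_gold='The cat sat on the mat.', a_pred='the cat sat')  # ~0.6667
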
class BLEUSentenceScoreMetric(AbstractBaseMetric):
    def __init__(self, y_true, y_predicted, source_sents=None, output_text_dir=None):
        """Sentence-level BLEU score calculation

        NLTK provides the sentence_bleu() function for evaluating a candidate sentence against
        one or more reference sentences:
        https://machinelearningmastery.com/calculate-bleu-score-for-text-python/

        The reference sentences must be provided as a list of sentences where each reference is a
        list of tokens. The candidate sentence is provided as a list of tokens. For example:

            reference = [['this', 'is', 'a', 'test'], ['this', 'is', 'test']]
            candidate = ['this', 'is', 'a', 'test']
            score = sentence_bleu(reference, candidate)

        Args:
            y_true (list): reference sentences as lists of tokens
            y_predicted (list): candidate sentences as lists of tokens
            source_sents (list or None): source sentences; needed only when output_text_dir is given
            output_text_dir (str or None): optional directory into which the translations are dumped for review
        """
        if output_text_dir is not None and source_sents is None:
            raise ValueError('output_text_dir is given but source_sents is None; '
                             'when output_text_dir is used, source_sents must be provided as well')

        self.output_text_dir = output_text_dir
        self.source_sents = source_sents
        AbstractBaseMetric.__init__(self, y_true, y_predicted, metric_name='BLEU_sentence_score', np_array=False)
    def calculate_metric(self):
        self.check_transl_sent_num_match([self.y_true, self.y_predicted])

        sentence_bleu_results = [sentence_bleu([true_t], pred_t)
                                 for true_t, pred_t in zip(self.y_true, self.y_predicted)]

        if self.output_text_dir is not None:
            self.dump_translation_text_to_disk(self.source_sents,
                                               [' '.join(sent) for sent in self.y_predicted],
                                               [' '.join(sent) for sent in self.y_true],
                                               sentence_bleu_results,
                                               self.output_text_dir)

        return np.mean(sentence_bleu_results)
    @staticmethod
    def dump_translation_text_to_disk(source_sents, pred_translations, true_translations,
                                      sentence_bleu_results, output_text_dir):
        """Dump the source sentences, translations and per-sentence BLEU scores to disk for manual review

        Args:
            source_sents (list): source sentences
            pred_translations (list): predicted translations
            true_translations (list): reference translations
            sentence_bleu_results (list): per-sentence BLEU scores
            output_text_dir (str): directory into which the text files are written

        Returns:
            None
        """
        BLEUSentenceScoreMetric.check_transl_sent_num_match([pred_translations, true_translations,
                                                             source_sents, sentence_bleu_results])

        if os.path.exists(output_text_dir):
            shutil.rmtree(output_text_dir)
        os.mkdir(output_text_dir)

        for i, (source, pred_transl, true_transl, bleu_result) in \
                enumerate(zip(source_sents, pred_translations, true_translations, sentence_bleu_results)):
            with open(os.path.join(output_text_dir, f'transl_{i}.txt'), 'w', encoding='utf-8') as f:
                f.write(f'Source:\t{source}\n')
                f.write(f'Predicted:\t{pred_transl}\n')
                f.write(f'True:\t{true_transl}\n')
                f.write(f'BLEU: {bleu_result}\n')
    @staticmethod
    def check_transl_sent_num_match(sent_types):
        """Check that all the given lists of sentences contain the same number of elements

        Args:
            sent_types (list): list of lists to be compared

        Raises:
            ValueError: if the lists' lengths do not match
        """
        num_sents = len(sent_types[0])

        for sent_t in sent_types:
            if len(sent_t) != num_sents:
                raise ValueError(f"The length of list elements across different text types does not match. "
                                 f"The featured lengths are: {', '.join([str(len(el)) for el in sent_types])}")

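# A minimal usage sketch for BLEUSentenceScoreMetric with a single (reference, candidate) pair.
# NLTK's sentence_bleu() is evaluated per sentence pair and the per-sentence scores are averaged.
# Reading the result through get_metric_dict() assumes the AbstractBaseMetric base computes the
# metric at construction time, which is not shown in this module.
def _bleu_sentence_metric_usage_sketch():
    bleu_metric = BLEUSentenceScoreMetric(
        y_true=[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']],
        y_predicted=[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'dog']]
    )
    return bleu_metric.get_metric_dict()  # e.g. {'BLEU_sentence_score': <float between 0 and 1>}
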
class BLEUCorpusScoreMetric(AbstractBaseMetric):
    def __init__(self, y_true, y_predicted, source_sents=None, output_text_dir=None):
        """Corpus-level BLEU score calculation

        NLTK provides the corpus_bleu() function for calculating the BLEU score over multiple
        sentences such as a paragraph or a document:
        https://machinelearningmastery.com/calculate-bleu-score-for-text-python/

        The references must be specified as a list of documents where each document is a list of
        references and each alternative reference is a list of tokens, e.g. a list of lists of lists
        of tokens. The candidate documents must be specified as a list where each document is a list
        of tokens, e.g. a list of lists of tokens. For example:

            references = [[['this', 'is', 'a', 'test'], ['this', 'is', 'test']]]
            candidates = [['this', 'is', 'a', 'test']]
            score = corpus_bleu(references, candidates)

        Args:
            y_true (list): reference sentences as lists of tokens
            y_predicted (list): candidate sentences as lists of tokens
            source_sents (list or None): source sentences; needed only when output_text_dir is given
            output_text_dir (str or None): optional directory into which the translations are dumped for review
        """
        self.output_text_dir = output_text_dir
        self.source_sents = source_sents
        AbstractBaseMetric.__init__(self, y_true, y_predicted, metric_name='BLEU_corpus_score', np_array=False)
    def calculate_metric(self):
        BLEUSentenceScoreMetric.check_transl_sent_num_match([self.y_true, self.y_predicted])

        if self.output_text_dir is not None:
            BLEUSentenceScoreMetric.dump_translation_text_to_disk(self.source_sents,
                                                                  [' '.join(sent) for sent in self.y_predicted],
                                                                  [' '.join(sent) for sent in self.y_true],
                                                                  ['na'] * len(self.y_predicted),
                                                                  self.output_text_dir)

        return corpus_bleu([[sent] for sent in self.y_true], self.y_predicted)

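# A minimal usage sketch for BLEUCorpusScoreMetric. calculate_metric() wraps each reference in a
# single-element list, so every candidate is scored against exactly one reference via NLTK's
# corpus_bleu(). The get_metric_dict() access is, as above, an assumption about the
# AbstractBaseMetric base class.
def _bleu_corpus_metric_usage_sketch():
    bleu_corpus_metric = BLEUCorpusScoreMetric(
        y_true=[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
                ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']],
        y_predicted=[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'dog'],
                     ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']]
    )
    return bleu_corpus_metric.get_metric_dict()  # e.g. {'BLEU_corpus_score': <float between 0 and 1>}
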
class BLEUScoreStrTorchNLPMetric(AbstractBaseMetric):
    def __init__(self, y_true, y_predicted, lowercase=False, source_sents=None, output_text_dir=None):
        """BLEU score calculation using the TorchNLP implementation

        Example:
            hypotheses = [
                "The brown fox jumps over the dog 笑",
                "The brown fox jumps over the dog 2 笑"
            ]
            references = [
                "The quick brown fox jumps over the lazy dog 笑",
                "The quick brown fox jumps over the lazy dog 笑"
            ]
            get_moses_multi_bleu(hypotheses, references, lowercase=True)
            46.51

        Args:
            y_true (list): reference sentences as lists of tokens
            y_predicted (list): candidate sentences as lists of tokens
            lowercase (bool): if True, lowercase the texts before the BLEU calculation
            source_sents (list or None): source sentences; needed only when output_text_dir is given
            output_text_dir (str or None): optional directory into which the translations are dumped for review
        """
        self.output_text_dir = output_text_dir
        self.source_sents = source_sents
        self.lowercase = lowercase
        AbstractBaseMetric.__init__(self, y_true, y_predicted, metric_name='BLEU_str_torchNLP_score',
                                    np_array=False)
    def calculate_metric(self):
        BLEUSentenceScoreMetric.check_transl_sent_num_match([self.y_true, self.y_predicted])

        sentence_bleu_results = [
            bleu.get_moses_multi_bleu([' '.join(true_t)], [' '.join(pred_t)], lowercase=self.lowercase)
            for true_t, pred_t in zip(self.y_true, self.y_predicted)
        ]

        if self.output_text_dir is not None:
            BLEUSentenceScoreMetric.dump_translation_text_to_disk(self.source_sents,
                                                                  [' '.join(sent) for sent in self.y_predicted],
                                                                  [' '.join(sent) for sent in self.y_true],
                                                                  sentence_bleu_results,
                                                                  self.output_text_dir)

        return float(np.mean(sentence_bleu_results))

class PerplexityMetric(AbstractBaseMetric):
    def __init__(self, batch_losses):
        """Perplexity metric used in machine translation

        Args:
            batch_losses (numpy.array or list): losses of the individual batches
        """
        AbstractBaseMetric.__init__(self, None, batch_losses, metric_name='Perplexity', np_array=False)
    def calculate_metric(self):
        return np.exp(np.mean(self.y_predicted))

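# A minimal usage sketch for PerplexityMetric: perplexity is the exponential of the mean per-batch
# loss, so for cross-entropy losses [2.0, 2.0, 2.0] the expected value is exp(2.0) ~= 7.389.
# Reading the result through get_metric_dict() assumes the AbstractBaseMetric base computes it at
# construction time, which is not shown in this module.
def _perplexity_metric_usage_sketch():
    perplexity_metric = PerplexityMetric(batch_losses=[2.0, 2.0, 2.0])
    return perplexity_metric.get_metric_dict()  # expected {'Perplexity': 7.389...}
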
class GLUEMetric(AbstractBaseMetric):
    def __init__(self, y_true, y_predicted, task_name):
        """GLUE evaluation metrics

        Wrapper around the HF Transformers ``glue_compute_metrics()`` function.

        Args:
            y_true: ground truth labels
            y_predicted: predicted labels
            task_name (str): name of the GLUE task
        """
        self.task_name = task_name
        super().__init__(y_true, y_predicted, metric_name=f'GLUE_{task_name}')
    def calculate_metric(self):
        metric_dict = glue_compute_metrics(task_name=self.task_name, preds=self.y_predicted, labels=self.y_true)
        metric_dict = {k.replace('/', '_'): v for k, v in metric_dict.items()}
        return metric_dict

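# A minimal usage sketch for GLUEMetric on a hypothetical MRPC-style evaluation. It assumes the
# label ids are numpy arrays (the base constructor is called here without np_array=False) and that
# the result is read through get_metric_dict(); for 'mrpc', glue_compute_metrics() returns accuracy
# and F1.
def _glue_metric_usage_sketch():
    glue_metric = GLUEMetric(y_true=np.array([1, 0, 1, 1]),
                             y_predicted=np.array([1, 0, 0, 1]),
                             task_name='mrpc')
    # Expected something like {'GLUE_mrpc': {'acc': 0.75, 'f1': 0.8, 'acc_and_f1': 0.775}}
    return glue_metric.get_metric_dict()
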
class XNLIMetric(AbstractBaseMetric):
    def __init__(self, y_true, y_predicted):
        """XNLI evaluation metrics

        Wrapper around the HF Transformers ``xnli_compute_metrics()`` function.

        Args:
            y_true: ground truth labels
            y_predicted: predicted labels
        """
        super().__init__(y_true, y_predicted, metric_name='xnli_accuracy')
    def calculate_metric(self):
        metric_dict = xnli_compute_metrics(task_name='xnli', preds=self.y_predicted, labels=self.y_true)
        metric_dict = {k.replace('/', '_'): v for k, v in metric_dict.items()}
        return metric_dict