# Source code for aitoolbox.nlp.core.core

import re
import unicodedata


def unicode_to_ascii(text_string):
    """Strip combining marks (accents) from a Unicode string.

    The text is decomposed with NFD normalization so that accented characters
    split into a base character followed by combining marks; every character
    whose Unicode category is 'Mn' (nonspacing mark) is then dropped.
    Characters that have no such decomposition are left untouched, so the
    result is not guaranteed to be pure ASCII.

    Taken from: http://stackoverflow.com/a/518232/2809427

    Args:
        text_string (str): text to strip accents from

    Returns:
        str: the decomposed text with all 'Mn' characters removed
    """
    decomposed = unicodedata.normalize('NFD', text_string)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)
def normalize_string(text_string, unicode_to_ascii_convert=True):
    """Lowercase, trim and reduce a string to letters, digits and ``.!?``.

    After lowercasing and stripping, the text is optionally passed through
    ``unicode_to_ascii``. Each of ``.``, ``!`` and ``?`` then gets a space
    inserted in front of it, every run of characters other than ASCII letters,
    digits and ``.!?`` is replaced with a single space, and whitespace is
    finally collapsed and trimmed.

    Args:
        text_string (str): text to normalize
        unicode_to_ascii_convert (bool): if True, run ``unicode_to_ascii`` on
            the lowercased text before the regex clean-up

    Returns:
        str: the normalized text
    """
    cleaned = text_string.lower().strip()
    if unicode_to_ascii_convert:
        cleaned = unicode_to_ascii(cleaned)

    # Order matters: punctuation is spaced out first, then disallowed
    # characters become spaces, then all whitespace runs are collapsed.
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^0-9a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
def str2bool(w):
    """Convert a textual yes/no flag into a bool.

    Matching is case-insensitive. 'yes', 'true', 't', 'y' and '1' map to True;
    'no', 'false', 'f', 'n' and '0' map to False. A value that is already a
    bool is returned unchanged instead of failing on the ``.lower()`` call.

    Args:
        w (str or bool): value to interpret

    Returns:
        bool: the parsed boolean value

    Raises:
        ValueError: if ``w`` is a string that is not one of the accepted
            spellings
    """
    # Already a bool (e.g. a programmatic default): nothing to parse.
    if isinstance(w, bool):
        return w

    lowered = w.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise ValueError('Boolean value expected.')
def find_sub_list(sub_list, main_list):
    """Find the start and end position of a sublist inside a longer list.

    Only the first occurrence is reported.

    Args:
        sub_list (list): sublist to look for
        main_list (list): main longer list to search in

    Returns:
        (int, int) or None: inclusive start and end index of the first
        occurrence of ``sub_list`` in ``main_list``. Returns None if the
        sublist is not found in the main list or if ``sub_list`` is empty.

    Raises:
        ValueError: if ``sub_list`` is longer than ``main_list``
    """
    if len(sub_list) > len(main_list):
        raise ValueError('len(sub_list) > len(main_list); should be len(sub_list) <= len(main_list)')

    sub_len = len(sub_list)
    # An empty sublist has no first element to anchor on and no meaningful
    # (start, end) span, so report it as not found instead of raising
    # IndexError on sub_list[0].
    if sub_len == 0:
        return None

    first = sub_list[0]
    for ind, element in enumerate(main_list):
        # Cheap first-element check before paying for the slice comparison.
        if element == first and main_list[ind:ind + sub_len] == sub_list:
            return ind, ind + sub_len - 1

    return None