import re
import unicodedata
def unicode_to_ascii(text_string):
    """Strip diacritics from a Unicode string.

    The input is decomposed with NFD normalization and every character whose
    Unicode category is ``Mn`` (nonspacing mark) is dropped, so ``'é'``
    becomes ``'e'``. Characters that have no decomposition are returned
    unchanged.

    Taken from: http://stackoverflow.com/a/518232/2809427

    Args:
        text_string (str): text to process.

    Returns:
        str: ``text_string`` without combining marks.
    """
    # NFD splits an accented character into its base character followed by
    # the combining mark(s), which can then be filtered out individually.
    decomposed = unicodedata.normalize('NFD', text_string)
    base_chars = [
        ch for ch in decomposed
        if unicodedata.category(ch) != 'Mn'
    ]
    return ''.join(base_chars)
def normalize_string(text_string, unicode_to_ascii_convert=True):
    """Lowercase, trim, and reduce a string to a simple token-friendly form.

    After lowercasing and trimming, the text goes through three steps:

    1. a space is inserted in front of every ``.``, ``!`` and ``?``;
    2. every run of characters other than ASCII letters, digits and
       ``.!?`` is replaced by a single space;
    3. runs of whitespace are collapsed to one space and the result is
       trimmed again.

    Args:
        text_string (str): text to normalize.
        unicode_to_ascii_convert (bool): when true, pass the lowercased text
            through ``unicode_to_ascii`` before the substitutions.

    Returns:
        str: the normalized text.
    """
    cleaned = text_string.lower().strip()
    if unicode_to_ascii_convert:
        cleaned = unicode_to_ascii(cleaned)

    # Applied in order; each entry is (pattern, replacement).
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^0-9a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
def str2bool(w):
    """Convert a textual flag to a boolean.

    Matching is case-insensitive. ``'yes'``, ``'true'``, ``'t'``, ``'y'`` and
    ``'1'`` map to ``True``; ``'no'``, ``'false'``, ``'f'``, ``'n'`` and
    ``'0'`` map to ``False``. A value that is already a ``bool`` is returned
    unchanged.

    Args:
        w (str or bool): value to interpret.

    Returns:
        bool: the boolean the value stands for.

    Raises:
        ValueError: if ``w`` is a string that is not one of the recognized
            tokens.
    """
    # A real boolean has no .lower(); pass it through instead of failing
    # with AttributeError.
    if isinstance(w, bool):
        return w

    token = w.lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise ValueError('Boolean value expected.')
def find_sub_list(sub_list, main_list):
    """Find the first occurrence of a sublist inside a longer list.

    Args:
        sub_list (list): sequence to look for.
        main_list (list): sequence to search in.

    Returns:
        (int, int) or None: inclusive start and end index of the first
        occurrence of ``sub_list`` in ``main_list``. ``None`` if
        ``sub_list`` does not occur in ``main_list`` or is empty (an empty
        match has no inclusive end index).

    Raises:
        ValueError: if ``sub_list`` is longer than ``main_list``.
    """
    sub_len = len(sub_list)
    if sub_len > len(main_list):
        raise ValueError('len(sub_list) > len(main_list); should be len(sub_list) <= len(main_list)')
    if sub_len == 0:
        # Nothing to match; also avoids indexing sub_list[0] below.
        return None

    # Compare as lists so that a tuple/list mix still matches element-wise
    # (a list slice is never equal to a tuple, even with equal elements).
    pattern = list(sub_list)
    first = pattern[0]
    # Starts past len(main_list) - sub_len cannot hold a full match.
    for start in range(len(main_list) - sub_len + 1):
        if main_list[start] != first:
            continue
        stop = start + sub_len
        if list(main_list[start:stop]) == pattern:
            return start, stop - 1
    return None