Naturalize function now removes punctuation as well.

This commit is contained in:
Josh Washburne 2018-01-06 12:57:10 -05:00
parent 6ca8b848d2
commit 037dd98b49

View file

@@ -56,7 +56,7 @@ def set_setting(name, value, setting_type=None):
return return
def naturalize(string): def naturalize(text):
""" """
Return a normalized unicode string, with removed starting articles, for use Return a normalized unicode string, with removed starting articles, for use
in natural sorting. in natural sorting.
@@ -67,10 +67,12 @@ def naturalize(string):
def naturalize_int_match(match): def naturalize_int_match(match):
return '%08d' % (int(match.group(0)),) return '%08d' % (int(match.group(0)),)
string = normalize('NFKD', string).encode('ascii', 'ignore').decode('ascii') text = normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
string = string.lower() text = text.lower()
string = string.strip() punc = re.compile('[{}]'.format(re.escape(string.punctuation)))
string = re.sub(r'^(a|an|the)\s+', '', string) text = re.sub(punc, ' ', text)
string = re.sub(r'\d+', naturalize_int_match, string) text = text.strip()
text = re.sub(r'^(a|an|the)\s+', '', text)
text = re.sub(r'\d+', naturalize_int_match, text)
return string return text