Skip to content

Commit bd48adb

Browse files
authored
feat(tn): remove_erhua (#149)
1 parent 52f2504 commit bd48adb

3 files changed

Lines changed: 15 additions & 4 deletions

File tree

tn/chinese/normalizer.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,12 +36,14 @@ def __init__(self,
3636
cache_dir=None,
3737
overwrite_cache=False,
3838
remove_interjections=True,
39+
remove_erhua=True,
3940
traditional_to_simple=True,
4041
remove_puncts=False,
4142
full_to_half=True,
4243
tag_oov=False):
4344
super().__init__(name='normalizer')
4445
self.remove_interjections = remove_interjections
46+
self.remove_erhua = remove_erhua
4547
self.traditional_to_simple = traditional_to_simple
4648
self.remove_puncts = remove_puncts
4749
self.full_to_half = full_to_half
@@ -81,7 +83,7 @@ def build_verbalizer(self):
8183
money = Money().verbalizer
8284
sport = Sport().verbalizer
8385
time = Time().verbalizer
84-
whitelist = Whitelist().verbalizer
86+
whitelist = Whitelist(remove_erhua=self.remove_erhua).verbalizer
8587

8688
verbalizer = (cardinal | char | date | fraction | math | measure
8789
| money | sport | time | whitelist).optimize()

tn/chinese/rules/whitelist.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,9 @@
2020

2121
class Whitelist(Processor):
2222

23-
def __init__(self):
23+
def __init__(self, remove_erhua=True):
2424
super().__init__(name='whitelist')
25+
self.remove_erhua = remove_erhua
2526
self.build_tagger()
2627
self.build_verbalizer()
2728

@@ -35,5 +36,9 @@ def build_tagger(self):
3536

3637
def build_verbalizer(self):
3738
super().build_verbalizer()
38-
verbalizer = self.delete_tokens(delete('erhua: "儿"'))
39+
if self.remove_erhua:
40+
verbalizer = self.delete_tokens(delete('erhua: "儿"'))
41+
else:
42+
verbalizer = self.delete_tokens(delete('erhua: \"') +
43+
accep('儿') + delete('\"'))
3944
self.verbalizer |= verbalizer

tn/main.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,10 @@ def main():
3030
help='rebuild *.fst')
3131
parser.add_argument('--remove_interjections', type=str,
3232
default='True',
33-
help='remove interjections like "啊" and "儿"')
33+
help='remove interjections like "啊"')
34+
parser.add_argument('--remove_erhua', type=str,
35+
default='True',
36+
help='remove "儿"')
3437
parser.add_argument('--traditional_to_simple', type=str,
3538
default='True',
3639
help='i.e., "喆" -> "哲"')
@@ -48,6 +51,7 @@ def main():
4851
normalizer = Normalizer(cache_dir=args.cache_dir,
4952
overwrite_cache=args.overwrite_cache,
5053
remove_interjections=str2bool(args.remove_interjections),
54+
remove_erhua=str2bool(args.remove_erhua),
5155
traditional_to_simple=str2bool(args.traditional_to_simple),
5256
remove_puncts=str2bool(args.remove_puncts),
5357
full_to_half=str2bool(args.full_to_half),

0 commit comments

Comments
 (0)