File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree
Original file line number Diff line number Diff line change
@@ -36,12 +36,14 @@ def __init__(self,
3636 cache_dir = None ,
3737 overwrite_cache = False ,
3838 remove_interjections = True ,
39+ remove_erhua = True ,
3940 traditional_to_simple = True ,
4041 remove_puncts = False ,
4142 full_to_half = True ,
4243 tag_oov = False ):
4344 super ().__init__ (name = 'normalizer' )
4445 self .remove_interjections = remove_interjections
46+ self .remove_erhua = remove_erhua
4547 self .traditional_to_simple = traditional_to_simple
4648 self .remove_puncts = remove_puncts
4749 self .full_to_half = full_to_half
@@ -81,7 +83,7 @@ def build_verbalizer(self):
8183 money = Money ().verbalizer
8284 sport = Sport ().verbalizer
8385 time = Time ().verbalizer
84- whitelist = Whitelist ().verbalizer
86+ whitelist = Whitelist (remove_erhua = self . remove_erhua ).verbalizer
8587
8688 verbalizer = (cardinal | char | date | fraction | math | measure
8789 | money | sport | time | whitelist ).optimize ()
Original file line number Diff line number Diff line change
2020
2121class Whitelist (Processor ):
2222
23- def __init__ (self ):
23+ def __init__ (self , remove_erhua = True ):
2424 super ().__init__ (name = 'whitelist' )
25+ self .remove_erhua = remove_erhua
2526 self .build_tagger ()
2627 self .build_verbalizer ()
2728
@@ -35,5 +36,9 @@ def build_tagger(self):
3536
3637 def build_verbalizer (self ):
3738 super ().build_verbalizer ()
38- verbalizer = self .delete_tokens (delete ('erhua: "儿"' ))
39+ if self .remove_erhua :
40+ verbalizer = self .delete_tokens (delete ('erhua: "儿"' ))
41+ else :
42+ verbalizer = self .delete_tokens (delete ('erhua: \" ' ) +
43+ accep ('儿' ) + delete ('\" ' ))
3944 self .verbalizer |= verbalizer
Original file line number Diff line number Diff line change
@@ -30,7 +30,10 @@ def main():
3030 help = 'rebuild *.fst' )
3131 parser .add_argument ('--remove_interjections' , type = str ,
3232 default = 'True' ,
33- help = 'remove interjections like "啊" and "儿"' )
33+ help = 'remove interjections like "啊"' )
34+ parser .add_argument ('--remove_erhua' , type = str ,
35+ default = 'True' ,
36+ help = 'remove "儿"' )
3437 parser .add_argument ('--traditional_to_simple' , type = str ,
3538 default = 'True' ,
3639 help = 'i.e., "喆" -> "哲"' )
@@ -48,6 +51,7 @@ def main():
4851 normalizer = Normalizer (cache_dir = args .cache_dir ,
4952 overwrite_cache = args .overwrite_cache ,
5053 remove_interjections = str2bool (args .remove_interjections ),
54+ remove_erhua = str2bool (args .remove_erhua ),
5155 traditional_to_simple = str2bool (args .traditional_to_simple ),
5256 remove_puncts = str2bool (args .remove_puncts ),
5357 full_to_half = str2bool (args .full_to_half ),
You can’t perform that action at this time.
0 commit comments