@@ -36,6 +36,10 @@ def __init__(self,
3636 def build_tagger (self ):
3737 zero = string_file ('itn/chinese/data/number/zero.tsv' ) # 0
3838 digit = string_file ('itn/chinese/data/number/digit.tsv' ) # 1 ~ 9
39+ special_tilde = string_file (
40+ 'itn/chinese//data/number/special_tilde.tsv' ) # 七八十->70~80
41+ special_dash = string_file (
42+ 'itn/chinese//data/number/special_dash.tsv' ) # 七八十->70-80
3943 sign = string_file ('itn/chinese/data/number/sign.tsv' ) # + -
4044 dot = string_file ('itn/chinese/data/number/dot.tsv' ) # .
4145
@@ -90,23 +94,17 @@ def build_tagger(self):
9094 (number + accep ('亿' ) + delete ('零' ).ques ).ques + number )
9195 # 负的xxx 1.11, 1.01
9296 number = sign .ques + number + (dot + digits .plus ).ques
93- # 五六万,三五千,六七百,三四十
94- special_2number = digit + insert ("0~" ) + digit + cross ("十" , "0" )
95- special_2number |= digit + insert ("00~" ) + digit + cross ("百" , "00" )
96- special_2number |= digit + insert ("000~" ) + digit + cross ("千" , "000" )
97- special_2number |= digit + insert ("0000~" ) + digit + cross ("万" , "0000" )
98- number |= special_2number
99- # 十七八美元 => $17~18, 四十五六岁 => 45-6岁,
100- # 三百七八公里 => 370-80km, 三百七八十千克 => 370-80kg
101- special_3number = cross ('十' , '1' ) + digit + insert ("~1" ) + digit
102- special_3number |= digit + delete ('十' ) + digit + insert ("-" ) + digit
103- special_3number |= digit + delete ('百' ) + digit + insert ("0-" ) + digit \
104- + (insert ("0" ) | add_weight (cross ("十" , "0" ), - 0.1 ))
105- number |= add_weight (special_3number , - 100.0 )
97+ # 五六万 => 5~6万,三五千 => 3000~5000,六七百 => 600~700,三四十 => 30~40
98+ number |= special_tilde
99+ # 十七八 => 17-8, 四十五六 => 45-6, 三百七八十 => 370-80
100+ _special_dash = cross ('十' , '1' ) + special_dash
101+ _special_dash |= digit + delete ('十' ) + special_dash
102+ _special_dash |= digit + delete ('百' ) + special_dash
103+ number |= add_weight (_special_dash , - 100.0 )
106104
107105 self .number = number .optimize ()
108- self .special_2number = special_2number .optimize ()
109- self .special_3number = special_3number .optimize ()
106+ self .special_tilde = special_tilde .optimize ()
107+ self .special_dash = _special_dash .optimize ()
110108
111109 # 2. 利用基础数字所构建的不包含0~9的完整数字
112110 # 十/百/千/万
@@ -123,8 +121,8 @@ def build_tagger(self):
123121 # 五六万,三五千,六七百,三四十
124122 # 十七八美元 => $17-8, 四十五六岁 => 45-6岁,
125123 # 三百七八公里 => 370-80km, 三百七八十千克 => 370-80kg
126- number_exclude_0_to_9 |= special_2number
127- number_exclude_0_to_9 |= add_weight (special_3number , - 100.0 )
124+ number_exclude_0_to_9 |= special_tilde
125+ number_exclude_0_to_9 |= add_weight (_special_dash , - 100.0 )
128126
129127 self .number_exclude_0_to_9 = (sign .ques +
130128 number_exclude_0_to_9 ).optimize ()
0 commit comments