Skip to content

Commit a12c85f

Browse files
authored
fix(itn): 十七八万=>17~18万,四十五六块=>45-6块 (#123)
* fix(itn): 十七八美元 => $17~18, 四十五六岁 => 45-6岁
* fix(itn): 十七八万=>17~18万,四十五六块=>45-6块
1 parent fc1b2ca commit a12c85f

9 files changed

Lines changed: 47 additions & 20 deletions

File tree

itn/chinese/data/measure/units_en.tsv

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,8 @@
2828
千赫兹 khz
2929
平方千米 km²
3030
公里 km
31+
公里每小时 km/h
32+
公里一小时 km/h
3133
千米 km
3234
千米每小时 km/h
3335
千米一小时 km/h

itn/chinese/data/money/symbol.tsv

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,7 @@
33
乌拉圭比索 $U
44
美元 $
55
英镑 £
6-
英镑
76
¥
8-
97
泰铢 ฿
108
柬埔寨瑞尔
119
哥斯达黎加科隆
@@ -40,7 +38,6 @@
4038
阿尔巴尼亚列克 Lek
4139
洪都拉斯伦皮拉 L
4240
莫桑比克梅蒂卡尔 MT
43-
新台币 NT$
4441
博茨瓦纳普拉 P
4542
危地马拉格查尔 Q
4643
巴西雷亚尔 R$

itn/chinese/rules/cardinal.py

Lines changed: 20 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -73,13 +73,22 @@ def build_tagger(self):
7373
# 负的xxx 1.11, 1.01
7474
number = sign.ques + number + (dot + digits.plus).ques
7575
# 五六万,三五千,六七百,三四十
76-
number |= add_weight(
77-
(digit + insert("0~") + digit + cross("十", "0")) |
78-
(digit + insert("00~") + digit + cross("百", "00")) |
79-
(digit + insert("000~") + digit + cross("千", "000")) |
80-
(digit + insert("0000~") + digit + cross("万", "0000")), -1.0
81-
)
76+
special_2number = digit + insert("0~") + digit + cross("十", "0")
77+
special_2number |= digit + insert("00~") + digit + cross("百", "00")
78+
special_2number |= digit + insert("000~") + digit + cross("千", "000")
79+
special_2number |= digit + insert("0000~") + digit + cross("万", "0000")
80+
number |= special_2number
81+
# 十七八美元 => $17~18, 四十五六岁 => 45-6岁,
82+
# 三百七八公里 => 370-80km, 三百七八十千克 => 370-80kg
83+
special_3number = cross('十', '1') + digit + insert("~1") + digit
84+
special_3number |= digit + delete('十') + digit + insert("-") + digit
85+
special_3number |= digit + delete('百') + digit + insert("0-") + digit \
86+
+ (insert("0") | add_weight(cross("十", "0"), -0.1))
87+
number |= add_weight(special_3number, -100.0)
88+
8289
self.number = number.optimize()
90+
self.special_2number = special_2number.optimize()
91+
self.special_3number = special_3number.optimize()
8392

8493
# 十/百/千/万
8594
number_exclude_0_to_9 = teen | tens | hundred | thousand | ten_thousand
@@ -95,12 +104,11 @@ def build_tagger(self):
95104
(dot + digits.plus).plus
96105
)
97106
# 五六万,三五千,六七百,三四十
98-
number_exclude_0_to_9 |= add_weight(
99-
(digit + insert("0~") + digit + cross("十", "0")) |
100-
(digit + insert("00~") + digit + cross("百", "00")) |
101-
(digit + insert("000~") + digit + cross("千", "000")) |
102-
(digit + insert("0000~") + digit + cross("万", "0000")), -1.0
103-
)
107+
# 十七八美元 => $17~18, 四十五六岁 => 45-6岁,
108+
# 三百七八公里 => 370-80km, 三百七八十千克 => 370-80kg
109+
number_exclude_0_to_9 |= special_2number
110+
number_exclude_0_to_9 |= add_weight(special_3number, -100.0)
111+
104112
self.number_exclude_0_to_9 = (sign.ques + number_exclude_0_to_9).optimize() # noqa
105113

106114
# cardinal string like 127.0.0.1, used in ID, IP, etc.

itn/chinese/rules/measure.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ def build_tagger(self):
3636
to = cross('到', '~') | cross('到百分之', '~')
3737

3838
units = add_weight(units_en, -1.0) | \
39-
((accep('亿') | accep('兆') | accep('万')).ques + units_zh)
39+
add_weight((accep('亿') | accep('兆') | accep('万')), -0.5).ques + units_zh
4040

4141
number = Cardinal().number if self.enable_0_to_9 else \
4242
Cardinal().number_exclude_0_to_9
@@ -50,7 +50,7 @@ def build_tagger(self):
5050
# 十千米每小时 => 10km/h, 十一到一百千米每小时 => 11~100km/h
5151
measure = number + (to + number).ques + units
5252
# 七八块钱
53-
measure |= add_weight(digit + insert("~") + digit + units, -1.0)
53+
measure |= digit + insert("~") + digit + units
5454
tagger = insert('value: "') + (measure | percent) + insert('"')
5555

5656
# 每小时十千米 => 10km/h, 每小时三十到三百一十一千米 => 30~311km/h

itn/chinese/rules/money.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,9 +30,12 @@ def __init__(self, enable_0_to_9=True):
3030
def build_tagger(self):
3131
code = string_file('itn/chinese/data/money/code.tsv')
3232
symbol = string_file('itn/chinese/data/money/symbol.tsv')
33+
digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
3334

3435
number = Cardinal().number if self.enable_0_to_9 else \
3536
Cardinal().number_exclude_0_to_9
37+
# 七八美元 => $7~8
38+
number |= digit + insert("~") + digit
3639
tagger = (insert('value: "') + number + insert('"') +
3740
insert(' currency: "') + (code | symbol) + insert('"'))
3841
self.tagger = self.add_tokens(tagger)

itn/chinese/test/data/cardinal.txt

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,3 +6,9 @@
66
三五万 => 30000~50000
77
三四万 => 30000~40000
88
五六十 => 50~60
9+
十五六 => 15~16
10+
四十五六 => 45-6
11+
七百三四 => 730-40
12+
七百三四十 => 730-40
13+
十七八万 => 17~18万
14+
六十三四万 => 63-4万

itn/chinese/test/data/math.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,3 +3,4 @@
33
一加二加三 => 1+2+3
44
二等于一加一 => 2=1+1
55
二十一到一千零一 => 21~1001
6+
六百三到六百四 => 630~640

itn/chinese/test/data/measure.txt

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,11 @@
3333
十一到一百千米每小时 => 11~100km/h
3434
每小时三十到三百一十一千米 => 30~311km/h
3535
七八公斤 => 7~8kg
36-
五六十块钱 => 50~60块钱
36+
五六十摩尔 => 50~60mol
3737
三五百公里 => 300~500km
38-
八九千美元 => $8000~9000
39-
三四万吨 => 30000~40000吨
38+
三四万吨 => 3~4万吨
39+
三四万伏特 => 30000~40000v
40+
十五六千瓦时 => 15~16kwh
41+
四十五六度 => 45-6°
42+
七百三四秒 => 730-40s
43+
七百三四十分钟 => 730-40min

itn/chinese/test/data/money.txt

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,9 @@
11
一点二五元 => ¥1.25
22
一点二五人民币 => CNY1.25
33
三十四点五二一元 => ¥34.521
4+
八九千美元 => $8000~9000
5+
七八英镑 => £7~8
6+
十五六卢布 => ₽15~16
7+
四十五六新台币 => TWD45-6
8+
七百三四欧元 => €730-40
9+
七百三四十马来西亚令吉 => RM730-40

0 commit comments

Comments
 (0)