Skip to content

Commit fc1b2ca

Browse files
authored
fix(itn): 七八公斤 => 7~8kg, 七八百块 => 700~800块 (#116)
1 parent 75ac443 commit fc1b2ca

5 files changed

Lines changed: 28 additions & 0 deletions

File tree

itn/chinese/data/measure/units_en.tsv

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
千卡 kcal
2525
千克力 kgf
2626
千克 kg
27+
公斤 kg
2728
千赫兹 khz
2829
平方千米 km²
2930
公里 km

itn/chinese/rules/cardinal.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,13 @@ def build_tagger(self):
7272
(number + accep('亿') + delete('零').ques).ques + number)
7373
# 负的xxx 1.11, 1.01
7474
number = sign.ques + number + (dot + digits.plus).ques
75+
# 五六万,三五千,六七百,三四十
76+
number |= add_weight(
77+
(digit + insert("0~") + digit + cross("十", "0")) |
78+
(digit + insert("00~") + digit + cross("百", "00")) |
79+
(digit + insert("000~") + digit + cross("千", "000")) |
80+
(digit + insert("0000~") + digit + cross("万", "0000")), -1.0
81+
)
7582
self.number = number.optimize()
7683

7784
# 十/百/千/万
@@ -87,6 +94,13 @@ def build_tagger(self):
8794
(number_exclude_0_to_9 | digits) +
8895
(dot + digits.plus).plus
8996
)
97+
# 五六万,三五千,六七百,三四十
98+
number_exclude_0_to_9 |= add_weight(
99+
(digit + insert("0~") + digit + cross("十", "0")) |
100+
(digit + insert("00~") + digit + cross("百", "00")) |
101+
(digit + insert("000~") + digit + cross("千", "000")) |
102+
(digit + insert("0000~") + digit + cross("万", "0000")), -1.0
103+
)
90104
self.number_exclude_0_to_9 = (sign.ques + number_exclude_0_to_9).optimize() # noqa
91105

92106
# cardinal string like 127.0.0.1, used in ID, IP, etc.

itn/chinese/rules/measure.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@ def __init__(self, exclude_one=True, enable_0_to_9=True):
3131
def build_tagger(self):
3232
units_en = string_file('itn/chinese/data/measure/units_en.tsv')
3333
units_zh = string_file('itn/chinese/data/measure/units_zh.tsv')
34+
digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
3435
sign = string_file('itn/chinese/data/number/sign.tsv') # + -
3536
to = cross('到', '~') | cross('到百分之', '~')
3637

@@ -48,6 +49,8 @@ def build_tagger(self):
4849

4950
# 十千米每小时 => 10km/h, 十一到一百千米每小时 => 11~100km/h
5051
measure = number + (to + number).ques + units
52+
# 七八块钱
53+
measure |= add_weight(digit + insert("~") + digit + units, -1.0)
5154
tagger = insert('value: "') + (measure | percent) + insert('"')
5255

5356
# 每小时十千米 => 10km/h, 每小时三十到三百一十一千米 => 30~311km/h

itn/chinese/test/data/cardinal.txt

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,8 @@
11
幺幺零 => 110
22
幺二七点零点零点幺 => 127.0.0.1
33
这是手机一八五四四一三九一二一 => 这是手机18544139121
4+
三五百 => 300~500
5+
三五千 => 3000~5000
6+
三五万 => 30000~50000
7+
三四万 => 30000~40000
8+
五六十 => 50~60

itn/chinese/test/data/measure.txt

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,3 +32,8 @@
3232
百分之三十一到百分之百 => 31~100%
3333
十一到一百千米每小时 => 11~100km/h
3434
每小时三十到三百一十一千米 => 30~311km/h
35+
七八公斤 => 7~8kg
36+
五六十块钱 => 50~60块钱
37+
三五百公里 => 300~500km
38+
八九千美元 => $8000~9000
39+
三四万吨 => 30000~40000吨

0 commit comments

Comments
 (0)