Skip to content

Commit 4dc6f87

Browse files
authored
[tn] fix phone and keep space for chinese-english mixture (#80)
1 parent d865aa2 commit 4dc6f87

5 files changed

Lines changed: 13 additions & 12 deletions

File tree

tn/chinese/data/char/fullwidth_to_halfwidth.tsv

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
,
2-
.
3-
.
1+
,
2+
.
3+
.
44
"
55
"
66
!

tn/chinese/rules/cardinal.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from tn.processor import Processor
1616

1717
from pynini import accep, cross, string_file
18-
from pynini.lib.pynutil import delete, insert
18+
from pynini.lib.pynutil import add_weight, delete, insert
1919

2020

2121
class Cardinal(Processor):
@@ -67,10 +67,14 @@ def build_tagger(self):
6767
| cross('二万', '两万'))
6868
self.number = accep('约').ques + number.optimize()
6969

70-
# cardinal string like 110 or 127.0.0.1, used in phone, ID, IP, etc.
70+
# cardinal string like 110 or 127.0.0.1, used in ID, IP, etc.
7171
cardinal = digits.plus + (digits | dot).plus.ques + digits.plus
7272
# xxxx-xxx-xxx
73-
cardinal |= digits.plus + (delete('-') + digits.plus).closure(2)
73+
cardinal |= digits.plus + (delete('-') + digits.plus)**2
74+
# phone numbers that are three, five, or eleven digits long
75+
phone_digits = digits @ self.build_rule(cross('一', '幺'))
76+
phone = phone_digits**3 | phone_digits**5 | phone_digits**11
77+
cardinal |= add_weight(phone, -1.0)
7478

7579
tagger = insert('value: "') + cardinal + insert('"')
7680
self.tagger = self.add_tokens(tagger)

tn/chinese/rules/postprocessor.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,4 @@ def __init__(self, remove_puncts=False, full_to_half=True, tag_oov=False):
4444
oov = difference(self.VCHAR, charset)
4545
processor @= Tagger('oov', oov, self.VSIGMA)._tagger
4646

47-
# remove leading and trailing spaces
48-
processor @= self.build_rule(delete(' ').star, l='[BOS]')
49-
processor @= self.build_rule(delete(' ').star, r='[EOS]')
5047
self.processor = processor.optimize()

tn/chinese/test/data/cardinal.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
110 => 一一零
1+
110 => 幺幺零
22
127.0.0.1 => 一二七点零点零点一

tn/chinese/test/data/normalizer.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
最高气温38°C => 最高气温三十八摄氏度
2929
实际面积120m² => 实际面积一百二十平方米
3030
渲染速度10ms一帧 => 渲染速度十毫秒一帧
31-
可以打我手机13501234567 => 可以打我手机一三五零一二三四五六七
32-
可以拨打12306来咨询 => 可以拨打一二三零六来咨询
31+
可以打我手机13501234567 => 可以打我手机幺三五零幺二三四五六七
32+
可以拨打12306来咨询 => 可以拨打幺二三零六来咨询
3333
这儿有只鸟儿 => 这有只鸟
3434
这事儿好办 => 这事好办
3535
我儿子喜欢这地儿 => 我儿子喜欢这地

0 commit comments

Comments
 (0)