Skip to content

Commit fd99ba6

Browse files
authored
[tn] refine date and time (#65)
* [tn] refine date and time
* [tn] refine measure
* [tn] fix lint
1 parent a379094 commit fd99ba6

6 files changed

Lines changed: 29 additions & 15 deletions

File tree

tn/chinese/normalizer.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -83,10 +83,6 @@ def build_tagger(self):
8383
math = add_weight(Math().tagger, 1.08)
8484
char = add_weight(Char().tagger, 100)
8585

86-
to = (delete('-') | delete('~')) + insert(' char { value: "到" } ')
87-
date = date + (to + date).ques
88-
time = time + (to + time).ques
89-
9086
tagger = (date | whitelist | sport | fraction | measure | money | time
9187
| cardinal | math | char).optimize()
9288
tagger = (processor @ tagger).star

tn/chinese/rules/cardinal.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ class Cardinal(Processor):
2323
def __init__(self):
2424
super().__init__('cardinal')
2525
self.number = None
26+
self.digits = None
2627
self.build_tagger()
2728
self.build_verbalizer()
2829

@@ -36,6 +37,7 @@ def build_tagger(self):
3637
rmzero = delete('0')
3738
rmpunct = delete(',').ques
3839
digits = zero | digit
40+
self.digits = digits
3941

4042
# 11 => 十一
4143
ten = teen + insert('十') + (digit | rmzero)
@@ -69,10 +71,6 @@ def build_tagger(self):
6971
cardinal = digits.plus + (digits | dot).plus.ques + digits.plus
7072
# xxxx-xxx-xxx
7173
cardinal |= digits.plus + (delete('-') + digits.plus).closure(2)
72-
# -xxxx年, -xx年, ~xxxx年, ~xx年
73-
unit = accep('年') | accep('赛季')
74-
rmsign = (delete('-') | delete('~')) + insert('到')
75-
cardinal |= rmsign + (digits**2 | digits**4) + unit
7674

7775
tagger = insert('value: "') + cardinal + insert('"')
7876
self.tagger = self.add_tokens(tagger)

tn/chinese/rules/date.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,10 @@ def build_tagger(self):
4848
| (year + rmsign + mm)
4949
| (mm + rmsign + year)
5050
| (mm + rmsign + day))
51-
self.tagger = self.add_tokens(date)
51+
tagger = self.add_tokens(date)
52+
53+
to = (delete('-') | delete('~')) + insert(' char { value: "到" } ')
54+
self.tagger = tagger + (to + tagger).ques
5255

5356
def build_verbalizer(self):
5457
year = delete('year: "') + self.SIGMA + delete('" ')

tn/chinese/rules/measure.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,31 @@ def build_tagger(self):
3131
units_zh = string_file('tn/chinese/data/measure/units_zh.tsv')
3232
units = units_en | units_zh
3333
rmspace = delete(' ').ques
34+
to = cross('-', '到') | cross('~', '到') | accep('到')
3435

3536
number = Cardinal().number
3637
percent = insert('百分之') + number + delete('%')
3738

3839
number @= self.build_rule(cross('二', '两'), '[BOS]', '[EOS]')
3940
# 1-11个,1个-11个
40-
prefix = (number + (rmspace + units).ques +
41-
(cross('-', '到') | accep('到')))
41+
prefix = number + (rmspace + units).ques + to
4242
measure = prefix.ques + number + rmspace + units
43-
measure @= self.build_rule(cross('两两', '二两'), '[BOS]', '')
44-
tagger = insert('value: "') + (measure | percent) + insert('"')
43+
44+
for unit in ['两', '月', '号']:
45+
measure @= self.build_rule(cross('两' + unit, '二' + unit),
46+
l='[BOS]')
47+
measure @= self.build_rule(cross('到两' + unit, '到二' + unit),
48+
r='[EOS]')
49+
50+
# -xxxx年, -xx年
51+
digits = Cardinal().digits
52+
cardinal = digits**2 | digits**4
53+
unit = accep('年') | accep('年度') | accep('赛季')
54+
prefix = cardinal + (rmspace + unit).ques + to
55+
annual = prefix.ques + cardinal + unit
56+
57+
tagger = insert('value: "') + (measure | percent
58+
| annual) + insert('"')
4559

4660
# 10km/h
4761
rmsign = rmspace + delete('/') + rmspace

tn/chinese/rules/sport.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from tn.chinese.rules.cardinal import Cardinal
1616
from tn.processor import Processor
1717

18-
from pynini import string_file
18+
from pynini import cross, string_file
1919
from pynini.lib.pynutil import delete, insert
2020

2121

tn/chinese/rules/time.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,10 @@ def build_tagger(self):
3636
(delete(':') + insert(' second: "') + s + insert('"')).ques +
3737
delete(' ').ques +
3838
(insert(' noon: "') + noon + insert('"')).ques)
39-
self.tagger = self.add_tokens(tagger)
39+
tagger = self.add_tokens(tagger)
40+
41+
to = (delete('-') | delete('~')) + insert(' char { value: "到" } ')
42+
self.tagger = tagger + (to + tagger).ques
4043

4144
def build_verbalizer(self):
4245
noon = delete('noon: "') + self.SIGMA + delete('" ')

0 commit comments

Comments
 (0)