Skip to content

Commit 06cbf2c

Browse files
authored
[tn] english tn, support time (#210)
1 parent 71a7745 commit 06cbf2c

7 files changed

Lines changed: 248 additions & 2 deletions

File tree

tn/english/data/time/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

tn/english/data/time/suffix.tsv

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
p.m. PM
2+
p.m PM
3+
pm PM
4+
P.M. PM
5+
P.M PM
6+
PM PM
7+
a.m. AM
8+
a.m AM
9+
am AM
10+
A.M. AM
11+
A.M AM
12+
AM AM

tn/english/data/time/zone.tsv

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
cst CST
2+
c.s.t CST
3+
cet CET
4+
c.e.t CET
5+
pst PST
6+
p.s.t PST
7+
est EST
8+
e.s.t EST
9+
pt PT
10+
p.t PT
11+
et ET
12+
e.t ET
13+
gmt GMT
14+
g.m.t GMT

tn/english/normalizer.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from tn.english.rules.fraction import Fraction
2121
from tn.english.rules.word import Word
2222
from tn.english.rules.date import Date
23+
from tn.english.rules.time import Time
2324

2425
from pynini.lib.pynutil import add_weight, delete
2526
from importlib_resources import files
@@ -39,9 +40,11 @@ def build_tagger(self):
3940
decimal = add_weight(Decimal().tagger, 1.0)
4041
fraction = add_weight(Fraction().tagger, 1.0)
4142
date = add_weight(Date().tagger, 0.99)
43+
time = add_weight(Time().tagger, 1.00)
4244
word = add_weight(Word().tagger, 100)
4345
tagger = (cardinal | ordinal | word
44-
| date | decimal | fraction).optimize() + self.DELETE_SPACE
46+
| date | decimal | fraction
47+
| time).optimize() + self.DELETE_SPACE
4548
# delete the last space
4649
self.tagger = tagger.star @ self.build_rule(delete(' '), r='[EOS]')
4750

@@ -52,7 +55,8 @@ def build_verbalizer(self):
5255
fraction = Fraction().verbalizer
5356
word = Word().verbalizer
5457
date = Date().verbalizer
58+
time = Time().verbalizer
5559
verbalizer = (cardinal | ordinal | word
5660
| date | decimal
57-
| fraction).optimize() + self.INSERT_SPACE
61+
| fraction | time).optimize() + self.INSERT_SPACE
5862
self.verbalizer = verbalizer.star

tn/english/rules/time.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from tn.processor import Processor
19+
from tn.utils import get_abs_path, load_labels, augment_labels_with_punct_at_end
20+
from tn.english.rules.cardinal import Cardinal
21+
22+
23+
class Time(Processor):
    """English clock-time rule.

    ``build_tagger`` turns written times ("12:30 a.m. est", "2 pm",
    "10:00:05") into ``hours`` / ``minutes`` / ``seconds`` / ``suffix`` /
    ``zone`` fields, and ``build_verbalizer`` turns those fields back into
    spoken text ("twelve thirty AM EST").
    """

    def __init__(self, deterministic: bool = False):
        """
        Args:
            deterministic: if True will provide a single transduction option,
                if False multiple transductions are generated (used for
                audio-based normalization). The flag is forwarded to the
                Cardinal rule that spells out the numbers.
        """
        super().__init__('time', ordertype="en_tn")
        self.deterministic = deterministic
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """
        Finite state transducer for classifying time, e.g.
            12:30 a.m. est -> time { hours: "twelve" minutes: "thirty" suffix: "AM" zone: "EST" }
            2.30 a.m. -> time { hours: "two" minutes: "thirty" suffix: "AM" }
            02.30 a.m. -> time { hours: "two" minutes: "thirty" suffix: "AM" }
            2.00 a.m. -> time { hours: "two" suffix: "AM" }
            2 a.m. -> time { hours: "two" suffix: "AM" }
            02:00 -> time { hours: "two" }
            2:00 -> time { hours: "two" }
            10:00:05 a.m. -> time { hours: "ten" minutes: "zero" seconds: "o five" suffix: "AM" }

        The suffix and zone values are whatever the right-hand column of
        suffix.tsv / zone.tsv holds ("AM", "PM", "EST", ...).
        """
        suffix_labels = load_labels(
            get_abs_path("english/data/time/suffix.tsv"))
        # NOTE(review): presumably adds variants of each suffix followed by
        # punctuation -- confirm against tn.utils.
        suffix_labels.extend(augment_labels_with_punct_at_end(suffix_labels))
        suffix_graph = pynini.string_map(suffix_labels)

        time_zone_graph = pynini.string_file(
            get_abs_path("english/data/time/zone.tsv"))

        # Only ever composed with the values 0-59 selected below.
        cardinal = Cardinal(self.deterministic).graph

        labels_hour = [str(x) for x in range(0, 24)]
        labels_single_digit = [str(x) for x in range(1, 10)]
        labels_double_digit = [str(x) for x in range(10, 60)]

        # Accept "DD" unchanged or "D" with an optional leading zero removed;
        # the composition with labels_hour then rejects anything that is not
        # one of "0".."23" (so "02" survives only as "2").
        delete_leading_zero_to_double_digit = (self.DIGIT + self.DIGIT) | (
            pynini.closure(pynutil.delete("0"), 0, 1) + self.DIGIT)

        graph_hour = delete_leading_zero_to_double_digit @ pynini.union(
            *labels_hour) @ cardinal

        graph_single_digit = pynini.union(*labels_single_digit) @ cardinal
        graph_double_digit = pynini.union(*labels_double_digit) @ cardinal

        # Two-digit minute/second value: "05" -> "o five", "30" -> "thirty".
        # "00" is deliberately not accepted here; each pattern below decides
        # how to treat it.
        graph_two_digits = (
            pynini.cross("0", "o") + self.INSERT_SPACE + graph_single_digit
            | graph_double_digit)

        final_graph_hour = pynutil.insert(
            "hours: \"") + graph_hour + pynutil.insert("\"")
        final_graph_minute = (pynutil.insert("minutes: \"") +
                              graph_two_digits + pynutil.insert("\""))
        final_graph_second = (pynutil.insert("seconds: \"") +
                              graph_two_digits + pynutil.insert("\""))
        final_suffix = pynutil.insert(
            "suffix: \"") + suffix_graph + pynutil.insert("\"")
        final_suffix_optional = pynini.closure(
            self.DELETE_SPACE + self.INSERT_SPACE + final_suffix, 0, 1)
        final_time_zone_optional = pynini.closure(
            self.DELETE_SPACE + self.INSERT_SPACE +
            pynutil.insert("zone: \"") + time_zone_graph +
            pynutil.insert("\""),
            0,
            1,
        )

        # 2:30 pm, 02:30, 2:00 -- ":00" is dropped, leaving only the hour.
        graph_hm = (
            final_graph_hour + pynutil.delete(":") +
            (pynutil.delete("00") | self.INSERT_SPACE + final_graph_minute) +
            final_suffix_optional + final_time_zone_optional)

        # 10:30:05 pm -- here "00" is kept as an explicit "zero" field so the
        # verbalizer can still say "zero minutes" / "zero seconds".
        graph_hms = (final_graph_hour + pynutil.delete(":") +
                     (pynini.cross("00", " minutes: \"zero\"")
                      | self.INSERT_SPACE + final_graph_minute) +
                     pynutil.delete(":") +
                     (pynini.cross("00", " seconds: \"zero\"")
                      | self.INSERT_SPACE + final_graph_second) +
                     final_suffix_optional + final_time_zone_optional)

        # 2.xx pm/am -- a "." separator is only accepted when an am/pm suffix
        # follows (the suffix is mandatory in this pattern).
        graph_hm2 = (
            final_graph_hour + pynutil.delete(".") +
            (pynutil.delete("00") | self.INSERT_SPACE + final_graph_minute) +
            self.DELETE_SPACE + self.INSERT_SPACE + final_suffix +
            final_time_zone_optional)
        # 2 pm est -- a bare hour likewise requires the suffix.
        graph_h = (final_graph_hour + self.DELETE_SPACE + self.INSERT_SPACE +
                   final_suffix + final_time_zone_optional)
        final_graph = (graph_hm | graph_h | graph_hm2 | graph_hms).optimize()

        final_graph = self.add_tokens(final_graph)
        self.tagger = final_graph.optimize()

    def build_verbalizer(self):
        """
        Finite state transducer for verbalizing time, e.g.
            time { hours: "twelve" minutes: "thirty" suffix: "AM" zone: "EST" } -> twelve thirty AM EST
            time { hours: "twelve" } -> twelve o'clock
            time { hours: "ten" minutes: "zero" seconds: "o five" suffix: "AM" }
                -> ten hours zero minutes and five seconds AM
        """

        def quoted_field(key):
            # `key: "value"` -> `value`
            return (pynutil.delete(key + ":") + self.DELETE_SPACE +
                    pynutil.delete("\"") +
                    pynini.closure(self.NOT_QUOTE, 1) + pynutil.delete("\""))

        hour = quoted_field("hours")
        minute = quoted_field("minutes")
        second = quoted_field("seconds")
        suffix = quoted_field("suffix")
        zone = quoted_field("zone")

        optional_suffix = pynini.closure(
            self.DELETE_SPACE + self.INSERT_SPACE + suffix, 0, 1)
        optional_zone = pynini.closure(
            self.DELETE_SPACE + self.INSERT_SPACE + zone, 0, 1)

        # "<h> hours <m> minutes and <s> seconds [suffix] [zone]"
        graph_hms = (hour + pynutil.insert(" hours ") + self.DELETE_SPACE +
                     minute + pynutil.insert(" minutes and ") +
                     self.DELETE_SPACE + second + pynutil.insert(" seconds") +
                     optional_suffix + optional_zone)

        sigma_star = pynini.closure(self.VCHAR)
        # Drop the "o " that the tagger puts in front of single-digit
        # minutes/seconds ("o five" -> "five"); it only reads naturally in
        # the short "two o five" form, not in "five minutes".
        graph_hms @= pynini.cdrewrite(
            pynutil.delete("o "),
            pynini.union(" ", "[BOS]"),
            "",
            sigma_star,
        )
        # Singularize the unit only when the value is exactly "one". Each
        # left context pins "one" as the complete field value, so that
        # compound numbers such as "twenty one hours" keep their plural.
        graph_hms @= pynini.cdrewrite(
            pynini.cross("one hours", "one hour"),
            "[BOS]",
            "",
            sigma_star,
        )
        graph_hms @= pynini.cdrewrite(
            pynini.cross("one minutes", "one minute"),
            pynini.union("hours ", "hour "),
            "",
            sigma_star,
        )
        graph_hms @= pynini.cdrewrite(
            pynini.cross("one seconds", "one second"),
            "and ",
            "",
            sigma_star,
        )

        # hour + minute, e.g. "two thirty AM"
        graph = (hour + self.DELETE_SPACE + self.INSERT_SPACE + minute +
                 optional_suffix + optional_zone)
        # hour only, no suffix, e.g. "two o'clock"
        graph |= hour + self.INSERT_SPACE + pynutil.insert(
            "o'clock") + optional_zone
        # hour + suffix, e.g. "two AM"
        graph |= (hour + self.DELETE_SPACE + self.INSERT_SPACE + suffix +
                  optional_zone)
        graph |= graph_hms
        delete_tokens = self.delete_tokens(graph)
        self.verbalizer = delete_tokens.optimize()

tn/english/test/data/time.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
12:30 a.m. est => twelve thirty AM EST
2+
2.30 a.m. => two thirty AM
3+
02.30 a.m. => two thirty AM
4+
2.00 a.m. => two AM
5+
2 a.m. => two AM
6+
02:00 => two o'clock
7+
02:30 => two thirty
8+
2:00 => two o'clock
9+
10:00:05 a.m. => ten hours zero minutes and five seconds AM

tn/english/test/time_test.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
from tn.english.rules.time import Time
18+
from tn.english.test.utils import parse_test_case
19+
20+
21+
class Testtime:
    """Data-driven checks for the English time rule."""

    # Both attributes are evaluated once, when the class body runs, and are
    # shared by every parametrized case below.
    time = Time(deterministic=False)
    time_cases = parse_test_case('data/time.txt')

    @pytest.mark.parametrize("written, spoken", time_cases)
    def test_time(self, written, spoken):
        """Normalizing `written` must yield exactly `spoken`."""
        normalized = self.time.normalize(written)
        assert normalized == spoken

0 commit comments

Comments
 (0)