Skip to content

Commit af0d01c

Browse files
authored
[tn] english tn, support ordinal (#204)
1 parent f4554ef commit af0d01c

7 files changed

Lines changed: 152 additions & 6 deletions

File tree

tn/english/data/ordinal/__init__.py

Whitespace-only changes.

tn/english/data/ordinal/digit.tsv

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
first one
2+
second two
3+
third three
4+
fourth four
5+
fifth five
6+
sixth sixth
7+
seventh seven
8+
eighth eight
9+
ninth nine

tn/english/data/ordinal/teen.tsv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
twelfth twelve

tn/english/normalizer.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from tn.processor import Processor
1717
from tn.english.rules.cardinal import Cardinal
18+
from tn.english.rules.ordinal import Ordinal
1819
from tn.english.rules.word import Word
1920

2021
from pynini.lib.pynutil import add_weight, delete
@@ -23,23 +24,23 @@
2324

2425
class Normalizer(Processor):
2526

26-
def __init__(self,
27-
cache_dir=None,
28-
overwrite_cache=False):
27+
def __init__(self, cache_dir=None, overwrite_cache=False):
2928
super().__init__(name='en_normalizer')
3029
if cache_dir is None:
3130
cache_dir = files("tn")
3231
self.build_fst('en_tn', cache_dir, overwrite_cache)
3332

3433
def build_tagger(self):
35-
cardinal = add_weight(Cardinal().tagger, 1.06)
34+
cardinal = add_weight(Cardinal().tagger, 1.0)
35+
ordinal = add_weight(Ordinal().tagger, 1.0)
3636
word = add_weight(Word().tagger, 100)
37-
tagger = (cardinal | word).optimize() + self.DELETE_SPACE
37+
tagger = (cardinal | ordinal | word).optimize() + self.DELETE_SPACE
3838
# delete the last space
3939
self.tagger = tagger.star @ self.build_rule(delete(' '), r='[EOS]')
4040

4141
def build_verbalizer(self):
4242
cardinal = Cardinal().verbalizer
43+
ordinal = Ordinal().verbalizer
4344
word = Word().verbalizer
44-
verbalizer = (cardinal | word).optimize() + self.INSERT_SPACE
45+
verbalizer = (cardinal | ordinal | word).optimize() + self.INSERT_SPACE
4546
self.verbalizer = verbalizer.star

tn/english/rules/ordinal.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2024, WENET COMMUNITY. Xingchen Song (sxc19@tsinghua.org.cn).
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import pynini
17+
from pynini.lib import pynutil
18+
19+
from tn.processor import Processor
20+
from tn.utils import get_abs_path
21+
from tn.english.rules.cardinal import Cardinal
22+
23+
24+
class Ordinal(Processor):
    """Tagging and verbalization rules for English ordinals such as "21st"."""

    def __init__(self, deterministic: bool = False):
        """
        Args:
            deterministic: if True, a single transduction option is provided;
                if False, multiple transductions are generated (used for
                audio-based normalization). The flag is handed to the
                Cardinal rule when the tagger is built.
        """
        super().__init__("ordinal", ordertype="tn")
        self.deterministic = deterministic
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """
        Build the FST that classifies ordinals, e.g.
            13th -> ordinal { integer: "thirteen" }

        The written suffix (st/nd/rd/th) is deleted and the remaining digits
        are composed with the cardinal graph, so the tagged value is the
        cardinal reading of the number.
        """
        digit = self.DIGIT
        not_one = digit - "1"
        th_digit = digit - "1" - "2" - "3"
        # Arbitrary run of digits and comma separators in front of the
        # digits that decide which suffix is legal.
        number_prefix = pynini.closure(digit | pynini.accep(","))

        def with_suffix(last_digit, *spellings):
            # Numbers ending in `last_digit` whose preceding digit, if any,
            # is not "1" (so 11/12/13 never take st/nd/rd).
            optional_head = pynini.closure(number_prefix + not_one, 0, 1)
            return (optional_head + pynini.accep(last_digit) +
                    pynutil.delete(pynini.union(*spellings)))

        st_format = with_suffix("1", "st", "ST", "ˢᵗ")
        nd_format = with_suffix("2", "nd", "ND", "ⁿᵈ")
        rd_format = with_suffix("3", "rd", "RD", "ʳᵈ")

        # "th" numbers: a lone digit other than 1/2/3, anything ending in
        # 1x, or anything ending in a non-1 digit followed by a non-1/2/3
        # digit; one or more such groups may be chained.
        th_digits = pynini.closure(
            th_digit
            | (number_prefix + "1" + digit)
            | (number_prefix + not_one + th_digit),
            1,
        )
        th_format = th_digits + pynutil.delete(
            pynini.union("th", "TH", "ᵗʰ"))

        cardinal = Cardinal(self.deterministic)
        written_forms = st_format | nd_format | rd_format | th_format
        self.graph = written_forms @ cardinal.graph

        tagged = (pynutil.insert("integer: \"") + self.graph +
                  pynutil.insert("\""))
        self.tagger = self.add_tokens(tagged).optimize()

    def build_verbalizer(self):
        """
        Build the FST that verbalizes a tagged ordinal, e.g.
            ordinal { integer: "thirteen" } -> thirteenth

        The quoted cardinal words are extracted from the `integer` field and
        their ending is rewritten into the ordinal form.
        """
        # The TSV files are inverted after loading so that they can be
        # applied to the cardinal words coming out of the tagger.
        digit_endings = pynini.string_file(
            get_abs_path("english/data/ordinal/digit.tsv")).invert()
        teen_endings = pynini.string_file(
            get_abs_path("english/data/ordinal/teen.tsv")).invert()

        # Strip `integer: "` ... `"` and keep only the quoted content.
        field_content = (pynutil.delete("integer:") + self.DELETE_SPACE +
                         pynutil.delete("\"") +
                         pynini.closure(self.NOT_QUOTE, 1) +
                         pynutil.delete("\""))

        # Candidate rewrites applied at the end of the string: table
        # lookups, "ty" -> "tieth", or a plain inserted "th".
        ending_rule = (digit_endings | teen_endings
                       | pynini.cross("ty", "tieth")
                       | pynutil.insert("th"))
        suffix = pynini.cdrewrite(
            ending_rule,
            "",
            "[EOS]",
            pynini.closure(self.VCHAR),
        ).optimize()

        self.graph = pynini.compose(field_content, suffix)
        self.suffix = suffix
        self.verbalizer = self.delete_tokens(self.graph).optimize()

tn/english/test/data/ordinal.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
1st => first
2+
2nd => second
3+
3rd => third
4+
5th => fifth
5+
11th => eleventh
6+
13th => thirteenth
7+
20th => twentieth
8+
21st => twenty first
9+
30th => thirtieth
10+
100th => one hundredth
11+
1000th => one thousandth

tn/english/test/ordinal_test.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
from tn.english.rules.ordinal import Ordinal
18+
from tn.english.test.utils import parse_test_case
19+
20+
21+
class TestOrdinal:
    """Checks the ordinal rule against the cases in data/ordinal.txt."""

    # Built once at class-definition time and shared by every case.
    ordinal = Ordinal(deterministic=False)
    ordinal_cases = parse_test_case('data/ordinal.txt')

    @pytest.mark.parametrize("written, spoken", ordinal_cases)
    def test_ordinal(self, written, spoken):
        """Normalizing the written form must yield the spoken form."""
        normalized = self.ordinal.normalize(written)
        assert normalized == spoken

0 commit comments

Comments
 (0)