Skip to content

Commit f9f3eb0

Browse files
authored
[tn] english, support electronic (#214)
* [tn] english, support electronic * [tn] tn english, support electronic * [tn] tn english, support electronic
1 parent 59dd3f3 commit f9f3eb0

8 files changed

Lines changed: 294 additions & 2 deletions

File tree

tn/english/data/electronic/__init__.py

Whitespace-only changes.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
.com dot com
2+
.org dot org
3+
.gov dot gov
4+
.uk dot UK
5+
.fr dot FR
6+
.net dot net
7+
.br dot BR
8+
.in dot IN
9+
.ru dot RU
10+
.de dot DE
11+
.it dot IT
12+
.jpg dot jpeg
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
. dot
2+
- dash
3+
_ underscore
4+
! exclamation mark
5+
# number sign
6+
$ dollar sign
7+
% percent
8+
& ampersand
9+
' quote
10+
* asterisk
11+
+ plus
12+
/ slash
13+
= equal sign
14+
? question mark
15+
^ circumflex
16+
` right single quote
17+
| vertical bar
18+
~ tilde
19+
, comma
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
drive
2+
sim
3+
early
4+
access
5+
program
6+
rtx RTX
7+
developer
8+
basepod BASEPOD
9+
cuda CUDA
10+
cv
11+
enterprise
12+
services
13+
nvidia NVIDIA
14+
dgx DGX
15+
pro
16+
help

tn/english/normalizer.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from tn.english.rules.measure import Measure
2525
from tn.english.rules.money import Money
2626
from tn.english.rules.telephone import Telephone
27+
from tn.english.rules.electronic import Electronic
2728

2829
from pynini.lib.pynutil import add_weight, delete
2930
from importlib_resources import files
@@ -47,11 +48,12 @@ def build_tagger(self):
4748
measure = add_weight(Measure().tagger, 1.00)
4849
money = add_weight(Money().tagger, 1.00)
4950
telephone = add_weight(Telephone().tagger, 1.00)
51+
electronic = add_weight(Electronic().tagger, 1.00)
5052
word = add_weight(Word().tagger, 100)
5153
tagger = (cardinal | ordinal | word
5254
| date | decimal | fraction
5355
| time | measure | money
54-
| telephone).optimize() + self.DELETE_SPACE
56+
| telephone | electronic).optimize() + self.DELETE_SPACE
5557
# delete the last space
5658
self.tagger = tagger.star @ self.build_rule(delete(' '), r='[EOS]')
5759

@@ -66,9 +68,11 @@ def build_verbalizer(self):
6668
measure = Measure().verbalizer
6769
money = Money().verbalizer
6870
telephone = Telephone().verbalizer
71+
electronic = Electronic().verbalizer
6972
verbalizer = (cardinal | ordinal | word
7073
| date | decimal
7174
| fraction | time
7275
| measure | money
73-
| telephone).optimize() + self.INSERT_SPACE
76+
| telephone
77+
| electronic).optimize() + self.INSERT_SPACE
7478
self.verbalizer = verbalizer.star

tn/english/rules/electronic.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
from pynini.examples import plurals
18+
19+
from tn.processor import Processor
20+
from tn.utils import get_abs_path
21+
from tn.english.rules.cardinal import Cardinal
22+
23+
24+
class Electronic(Processor):
    """Tagger and verbalizer grammars for "electronic" tokens.

    Per the tagger below, this covers e-mail addresses, bare domains and
    protocol-prefixed URLs.  Both grammars are compiled eagerly in
    ``__init__`` and exposed as ``self.tagger`` and ``self.verbalizer``.
    """

    def __init__(self, deterministic: bool = False):
        """
        Args:
            deterministic: if True will provide a single transduction option,
                for False multiple transduction are generated (used for audio-based normalization)
        """
        super().__init__('electronic', ordertype="en_tn")
        self.deterministic = deterministic
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """
        Finite state transducer for classifying electronic: as URLs, email addresses, etc.
            e.g. cdf1@abc.edu -> tokens { electronic { username: "cdf one" domain: "abc.edu" } }

        Three input shapes are unioned into the final graph:
            * ``username@domain`` (e-mail),
            * a bare domain containing one of the suffixes from ``domain.tsv``,
            * a protocol prefix (``http(s)://``, ``file:///``, ``www.``)
              followed by a domain.

        The result is stored in ``self.tagger``; nothing is returned.
        """
        if self.deterministic:
            numbers = self.DIGIT
        else:
            # The cardinal grammar is only needed on this path, so it is
            # built here rather than unconditionally.
            cardinal = Cardinal(self.deterministic)
            numbers = pynutil.insert(
                " ") + cardinal.long_numbers + pynutil.insert(" ")

        # Load symbol.tsv once.  ``pynini.project`` is constructive, so the
        # acceptor is derived *before* the in-place ``optimize()`` below.
        symbol_map = pynini.string_file(
            get_abs_path("english/data/electronic/symbol.tsv"))
        accepted_symbols = pynini.project(symbol_map, "input")
        graph_symbols = symbol_map.optimize()

        accepted_common_domains = pynini.project(
            pynini.string_file(
                get_abs_path("english/data/electronic/domain.tsv")), "input")

        # Dictionary words get a tiny negative weight so they are preferred
        # over the generic per-character paths.
        dict_words = pynutil.add_weight(
            pynini.string_file(
                get_abs_path("english/data/electronic/words.tsv")), -0.0001)

        # Two or more dictionary words written back to back; a space is
        # inserted between consecutive words.
        dict_words_without_delimiter = dict_words + pynini.closure(
            pynutil.add_weight(pynutil.insert(" ") + dict_words, -0.0001), 1)
        dict_words_graph = dict_words_without_delimiter | dict_words

        all_accepted_symbols_start = (dict_words_graph
                                      | pynini.closure(self.ALPHA)
                                      | accepted_symbols).optimize()

        all_accepted_symbols_end = (dict_words_graph | numbers
                                    | pynini.closure(self.ALPHA)
                                    | accepted_symbols).optimize()

        username = (self.ALPHA | dict_words_graph) + pynini.closure(
            self.ALPHA | numbers | accepted_symbols | dict_words_graph)

        # The "@" separator is rewritten to a single space after the tag.
        username = pynutil.insert("username: \"") + username + pynutil.insert(
            "\"") + pynini.cross('@', ' ')

        domain_graph = all_accepted_symbols_start + pynini.closure(
            all_accepted_symbols_end
            | pynutil.add_weight(accepted_common_domains, -0.0001))

        # Symbols appearing inside a protocol prefix are spelled out here
        # (symbol.tsv mapping, plus ":" -> "colon"), each followed by a space.
        protocol_symbols = pynini.closure((graph_symbols
                                           | pynini.cross(":", "colon")) +
                                          pynutil.insert(" "))
        protocol_start = (pynini.cross("https", "HTTPS ") | pynini.cross(
            "http", "HTTP ")) + (pynini.accep("://") @ protocol_symbols)
        protocol_file_start = pynini.accep("file") + self.INSERT_SPACE + (
            pynini.accep(":///") @ protocol_symbols)

        # ``@`` binds tighter than ``+``; the parentheses only make the
        # existing grouping explicit.
        protocol_end = pynutil.add_weight(
            pynini.cross("www", "WWW ") +
            (pynini.accep(".") @ protocol_symbols), -1000)
        protocol = protocol_file_start | protocol_start | protocol_end | (
            protocol_start + protocol_end)

        # Domain wrapped in its class tag: must start with a letter and end
        # with a letter, a digit or "/".
        domain_graph_with_class_tags = (
            pynutil.insert("domain: \"") + pynini.compose(
                self.ALPHA + pynini.closure(self.NOT_SPACE) +
                (self.ALPHA | self.DIGIT | pynini.accep("/")),
                domain_graph,
            ).optimize() + pynutil.insert("\""))

        protocol = pynutil.insert("protocol: \"") + pynutil.add_weight(
            protocol, -0.0001) + pynutil.insert("\"")

        # email: the input must contain "@" followed later by "."
        graph = pynini.compose(
            pynini.closure(self.VCHAR) + pynini.accep("@") +
            pynini.closure(self.VCHAR) + pynini.accep(".") +
            pynini.closure(self.VCHAR),
            username + domain_graph_with_class_tags,
        )

        # abc.com, abc.com/123-sm
        # bare domain: must start with a letter and contain one of the
        # common domain suffixes
        graph |= (pynutil.insert("domain: \"") + pynini.compose(
            self.ALPHA + pynini.closure(self.NOT_SPACE) +
            accepted_common_domains + pynini.closure(self.NOT_SPACE),
            domain_graph,
        ).optimize() + pynutil.insert("\""))
        # www.abc.com/sdafsdf, or https://www.abc.com/asdfad or www.abc.abc/asdfad
        graph |= protocol + pynutil.insert(" ") + domain_graph_with_class_tags

        final_graph = self.add_tokens(graph)

        self.tagger = final_graph.optimize()

    def build_verbalizer(self):
        """
        Finite state transducer for verbalizing electronic
            e.g. electronic { username: "cdf one" domain: "abc.edu" } -> cdf one at abc dot edu

        The result is stored in ``self.verbalizer``; nothing is returned.
        """
        graph_digit_no_zero = pynini.invert(
            pynini.string_file(
                get_abs_path("english/data/number/digit.tsv"))).optimize()
        graph_zero = pynini.cross("0", "zero")
        # "<digit>000" -> "<digit> thousand", slightly preferred over the
        # digit-by-digit reading.
        long_numbers = pynutil.add_weight(
            graph_digit_no_zero + pynini.cross("000", " thousand"), -0.0001)

        if not self.deterministic:
            # Extra spoken variants of "0" in non-deterministic mode.
            graph_zero |= pynini.cross("0", "o") | pynini.cross("0", "oh")

        graph_digit = graph_digit_no_zero | graph_zero
        graph_symbols = pynini.string_file(
            get_abs_path("english/data/electronic/symbol.tsv")).optimize()

        # Any VCHAR except the curly braces.
        not_bracket = pynini.difference(self.VCHAR,
                                        pynini.union("{", "}")).optimize()
        dict_words = pynini.project(
            pynini.string_file(
                get_abs_path("english/data/electronic/words.tsv")), "output")

        # Spell out symbols and digits wherever they occur, padding each
        # replacement with spaces; other characters pass through unchanged.
        default_chars_symbols = pynini.cdrewrite(
            pynutil.insert(" ") +
            (graph_symbols | graph_digit | long_numbers) + pynutil.insert(" "),
            "",
            "",
            pynini.closure(self.VCHAR),
        )
        default_chars_symbols = pynini.compose(
            pynini.closure(not_bracket),
            default_chars_symbols.optimize()).optimize()

        # this is for cases when the user name was split by dictionary words,
        # i.e. "servicepart@ab.com" -> "service part"
        space_separated_dict_words = pynutil.add_weight(
            self.ALPHA + pynini.closure(self.ALPHA | " ") + " " +
            pynini.closure(self.ALPHA | " "),
            -0.0001,
        )

        user_name = (
            pynutil.delete("username:") + self.DELETE_SPACE +
            pynutil.delete("\"") +
            (default_chars_symbols | space_separated_dict_words).optimize() +
            pynutil.delete("\""))

        domain_common = pynini.string_file(
            get_abs_path("english/data/electronic/domain.tsv"))

        # this will be used for a safe fallback
        domain_all = pynini.compose(
            default_chars_symbols,
            pynini.closure(self.ALPHA | " "
                           | pynutil.add_weight(dict_words, -0.0001)),
        )

        # Preferred reading: <name> + known suffix from domain.tsv (falling
        # back to a plain "." -> "dot") + optional trailing part.
        domain = (
            domain_all + self.INSERT_SPACE + plurals._priority_union(
                domain_common,
                pynutil.add_weight(pynini.cross(".", "dot"), weight=0.0001),
                pynini.closure(self.VCHAR)) +
            pynini.closure(self.INSERT_SPACE + default_chars_symbols, 0, 1))

        # ``domain_all`` alone is kept as a heavily penalised fallback.
        domain = (
            pynutil.delete("domain:") + self.DELETE_SPACE +
            pynutil.delete("\"") +
            (domain | pynutil.add_weight(domain_all, weight=100)).optimize() +
            self.DELETE_SPACE + pynutil.delete("\"")).optimize()

        # The protocol text was already spelled out by the tagger, so it is
        # copied through with only the tag and quotes removed.
        protocol = pynutil.delete("protocol: \"") + pynini.closure(
            self.NOT_QUOTE, 1) + pynutil.delete("\"")

        # [protocol] [username " at "] domain, then apply the
        # DELETE_EXTRA_SPACE rewrite over the whole string.
        graph = (pynini.closure(protocol + self.DELETE_SPACE, 0, 1) +
                 pynini.closure(
                     user_name + self.DELETE_SPACE + pynutil.insert(" at ") +
                     self.DELETE_SPACE, 0, 1) + domain +
                 self.DELETE_SPACE).optimize() @ pynini.cdrewrite(
                     self.DELETE_EXTRA_SPACE, "", "", pynini.closure(
                         self.VCHAR))

        delete_tokens = self.delete_tokens(graph)
        self.verbalizer = delete_tokens.optimize()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
cdf1@abc.edu => cdf one at abc dot edu

tn/english/test/electronic_test.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
from tn.english.rules.electronic import Electronic
18+
from tn.english.test.utils import parse_test_case
19+
20+
21+
class TestElectronic:
    """Data-driven checks for the ``Electronic`` rule."""

    # Grammar under test, built once when the class body is executed.
    electronic = Electronic(deterministic=False)
    # Cases loaded from 'data/electronic.txt' via ``parse_test_case``.
    electronic_cases = parse_test_case('data/electronic.txt')

    @pytest.mark.parametrize(("written", "spoken"), electronic_cases)
    def test_electronic(self, written, spoken):
        """Normalizing ``written`` must yield exactly ``spoken``."""
        normalized = self.electronic.normalize(written)
        assert normalized == spoken

0 commit comments

Comments
 (0)