Skip to content

Commit c632358

Browse files
authored
[tn] support punct (#225)
* [tn] support punct * [tn] support punct
1 parent a7d4529 commit c632358

7 files changed

Lines changed: 116 additions & 34 deletions

File tree

tn/english/data/measure/unit.tsv

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ ghz gigahertz
2626
gw gigawatt
2727
gwh gigawatt hour
2828
hz hertz
29-
" inch
3029
kbps kilobit per second
3130
kcal kilo calory
3231
kgf kilogram force
@@ -124,4 +123,4 @@ ps PS
124123
s S
125124
tb TB
126125
yb YB
127-
zb ZB
126+
zb ZB

tn/english/normalizer.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from tn.english.rules.telephone import Telephone
2727
from tn.english.rules.electronic import Electronic
2828
from tn.english.rules.whitelist import WhiteList
29+
from tn.english.rules.punctuation import Punctuation
2930

3031
from pynini.lib.pynutil import add_weight, delete
3132
from importlib_resources import files
@@ -52,12 +53,15 @@ def build_tagger(self):
5253
electronic = add_weight(Electronic().tagger, 1.00)
5354
word = add_weight(Word().tagger, 100)
5455
whitelist = add_weight(WhiteList().tagger, 1.00)
56+
punct = add_weight(Punctuation().tagger, 2.00)
5557
# TODO(xcsong): add roman
56-
tagger = (cardinal | ordinal | word
57-
| date | decimal | fraction
58-
| time | measure | money
59-
| telephone | electronic
60-
| whitelist).optimize() + self.DELETE_SPACE
58+
tagger = punct.star + \
59+
(cardinal | ordinal | word
60+
| date | decimal | fraction
61+
| time | measure | money
62+
| telephone | electronic
63+
| whitelist
64+
| punct).optimize() + (punct.plus | self.DELETE_SPACE)
6165
# delete the last space
6266
self.tagger = tagger.star @ self.build_rule(delete(' '), r='[EOS]')
6367

@@ -74,12 +78,15 @@ def build_verbalizer(self):
7478
telephone = Telephone().verbalizer
7579
electronic = Electronic().verbalizer
7680
whitelist = WhiteList().verbalizer
77-
verbalizer = (cardinal | ordinal | word
78-
| date | decimal
79-
| fraction | time
80-
| measure | money
81-
| telephone
82-
| electronic
83-
| whitelist).optimize() + self.INSERT_SPACE
81+
punct = Punctuation().verbalizer
82+
verbalizer = \
83+
(cardinal | ordinal | word
84+
| date | decimal
85+
| fraction | time
86+
| measure | money
87+
| telephone
88+
| electronic
89+
| whitelist
90+
| punct).optimize() + punct.ques + self.INSERT_SPACE
8491
self.verbalizer = verbalizer.star @ self.build_rule(delete(' '),
8592
r='[EOS]')

tn/english/rules/date.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -282,12 +282,6 @@ def build_tagger(self):
282282

283283
final_graph |= graph_fy
284284

285-
prefix = pynutil.delete(pynini.union("{", "(", "<", "\"",
286-
"'")).ques + self.DELETE_SPACE
287-
suffix = self.DELETE_SPACE + pynutil.delete(
288-
pynini.union("}", ")", ">", "\"", "'")).ques
289-
final_graph = pynutil.add_weight(
290-
prefix, -0.1) + final_graph + pynutil.add_weight(suffix, -0.1)
291285
self.tagger = self.add_tokens(final_graph)
292286

293287
def build_verbalizer(self):

tn/english/rules/punctuation.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import sys
16+
from unicodedata import category
17+
18+
from pynini.examples import plurals
19+
from pynini import cross, union, closure, accep
20+
from pynini.lib.pynutil import delete, insert
21+
22+
from tn.processor import Processor
23+
from tn.utils import get_abs_path, load_labels
24+
25+
26+
class Punctuation(Processor):
    """Tag and verbalize punctuation, e.g. "," -> p { v: "," } -> ","."""

    def __init__(self, deterministic: bool = False):
        """
        Args:
            deterministic: if True will provide a single transduction option,
                for False multiple transduction are generated (used for audio-based normalization)
        """
        super().__init__('p', ordertype="en_tn")
        self.deterministic = deterministic
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """
        Finite state transducer for classifying punctuation
            e.g. a, -> w { v: "a" } p { v: "," }
        """
        # Explicit ASCII punctuation accepted by this rule.
        ascii_punct = "!#%&\'()*+,-./:;<=>?@^_`{|}~"
        # Characters that must not be matched as plain punctuation
        # (brackets/quote/backslash get special treatment below).
        excluded = ["[", "]", "\"", "\\"]
        # Every Unicode code point in a "P*" (punctuation) category,
        # minus the excluded characters.
        unicode_punct = [
            chr(cp) for cp in range(sys.maxunicode)
            if category(chr(cp)).startswith("P")
            and chr(cp) not in excluded
        ]

        # Symbols already handled by the whitelist rule are removed so the
        # two rules do not compete for the same input.
        whitelist = [
            row[0] for row in load_labels(
                get_abs_path("english/data/whitelist/symbol.tsv"))
        ]
        self.punct_marks = [
            mark for mark in unicode_punct + list(ascii_punct)
            if mark not in whitelist
        ]

        self.punct = union(*self.punct_marks)
        # Backslash and double quote are escaped so they survive inside
        # the serialized v: "..." field.
        escaped = closure(
            self.punct | cross('\\', '\\\\\\') | cross('"', '\\"'), 1)

        # Accept <tag>, </tag> or <tag/> markup as one punctuation token.
        tag_body = closure(self.NOT_SPACE - union("<", ">"), 1)
        emphasis = (
            accep("<") +
            ((tag_body + closure(accep("/"), 0, 1))
             | (accep("/") + tag_body)) +
            accep(">"))
        # Markup wins over a plain punctuation reading when both match.
        graph = plurals._priority_union(emphasis, escaped,
                                        closure(self.VCHAR))

        self.graph = graph
        self.tagger = self.add_tokens(
            insert("v: \"") + graph + insert("\""))

    def build_verbalizer(self):
        """
        Finite state transducer for verbalizing punctuation
            e.g. p { v: "," } -> ,
        """
        # Undo the escaping applied by the tagger.
        unescaped = closure(
            self.punct | cross('\\\\\\', '\\') | cross('\\"', '"'), 1)
        verbalizer = delete('v: "') + unescaped + delete('"')
        self.verbalizer = self.delete_tokens(verbalizer)

tn/english/rules/word.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,10 @@
1414
# limitations under the License.
1515

1616
from pynini.lib import pynutil
17-
from pynini import cross, difference, union
17+
from pynini import difference, union
1818

1919
from tn.processor import Processor
20+
from tn.english.rules.punctuation import Punctuation
2021

2122

2223
class Word(Processor):
@@ -27,30 +28,32 @@ def __init__(self, deterministic: bool = False):
2728
deterministic: if True will provide a single transduction option,
2829
for False multiple transduction are generated (used for audio-based normalization)
2930
"""
30-
super().__init__("word", ordertype="en_tn")
31+
super().__init__("w", ordertype="en_tn")
3132
self.deterministic = deterministic
3233
self.build_tagger()
3334
self.build_verbalizer()
3435

3536
def build_tagger(self):
3637
"""
3738
Finite state transducer for classifying word. Considers sentence boundary exceptions.
38-
e.g. sleep -> word { value: "sleep" }
39+
e.g. sleep -> w { v: "sleep" }
3940
"""
40-
self.char = difference(self.VCHAR, union('\\', '"', self.SPACE))
41-
chars = (self.char | cross('\\', '\\\\\\') | cross('"', '\\"')).plus
42-
graph = (pynutil.insert("value: \"") + chars +
41+
punct = Punctuation(self.deterministic).graph
42+
default_graph = difference(self.NOT_SPACE, punct.project("input"))
43+
symbols_to_exclude = union("$", "€", "₩", "£", "¥", "#",
44+
"%") | self.DIGIT
45+
self.char = difference(default_graph, symbols_to_exclude)
46+
graph = (pynutil.insert("v: \"") + self.char.plus +
4347
pynutil.insert("\"")).optimize()
4448
final_graph = self.add_tokens(graph)
4549
self.tagger = final_graph.optimize()
4650

4751
def build_verbalizer(self):
4852
"""
4953
Finite state transducer for verbalizing word
50-
e.g. word { value: "sleep" } -> sleep
54+
e.g. w { v: "sleep" } -> sleep
5155
"""
52-
chars = (self.char | cross('\\\\\\', '\\') | cross('\\"', '"')).plus
53-
graph = pynutil.delete("value: ") + pynutil.delete(
54-
"\"") + chars + pynutil.delete("\"")
56+
graph = pynutil.delete("v: ") + pynutil.delete(
57+
"\"") + self.char.plus + pynutil.delete("\"")
5558
final_graph = self.delete_tokens(graph)
5659
self.verbalizer = final_graph.optimize()
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
this is 12th game, number 256, 2024-05-06, 2021-03-07 31.990 billion. ¾ people like chattts, let's eat at 03:43 p.m. run 10 km, give me $12.345 please, call 123-123-5678-1 Mt Hill "HAHAHA" billion 4 March => this is twelfth game, number 256, the sixth of may twenty twenty four , the seventh of march twenty twenty one thirty one point nine nine oh billion. three quarters people like chattts, let's eat at three forty three PM run ten kilometers, give me twelve point three four five dollars please, call one two three, one two three, five six seven eight, one Mt Hill "HAHAHA" billion the fourth of march
2-
The National Map, accessed April 1, 2011" Site Description of Koppers Co. From the quartet's recording" Jefferson Friedman: Quartets,"" String Quartet no, Riots again broke out, Atassi resigned, and Syrian independence was deferred until after World War II. 1988 (1988) ( 1988) ( 1988). Starling, Arthur E.( 1988 ). this is 12th game, number 256, 2024-05-06, 2021-03-07 31.990 billion. 3/4 people like chattts Retrieved December 2011. Information on Album" Thepodule.com"" Biography by Amy Hanson". => The National Map, accessed the first of april , twenty eleven Site Description of Koppers company From the quartet's recording" Jefferson Friedman: Quartets,"" String Quartet no, Riots again broke out, Atassi resigned, and Syrian independence was deferred until after World War two nineteen eighty eight nineteen eighty eight nineteen eighty eight nineteen eighty eight ). Starling, Arthur E.( nineteen eighty eight ). this is twelfth game, number 256, the sixth of may twenty twenty four , the seventh of march twenty twenty one thirty one point nine nine oh billion. three quarters people like chattts Retrieved december twenty eleven. Information on Album" Thepodule.com"" Biography by Amy Hanson".
1+
this is 12th game, number 256, 2024-05-06, 2021-03-07 31.990 billion. ¾ people like chattts, let's eat at 03:43 p.m. run 10 km, give me $12.345 please, call 123-123-5678-1 Mt Hill "HAHAHA" billion 4 March => this is twelfth game, number two hundred and fifty six, the sixth of may twenty twenty four, the seventh of march twenty twenty one thirty one point nine nine oh billion. three quarters people like chattts, let' s eat at three forty three PM run ten kilometers, give me twelve point three four five dollars please, call one two three, one two three, five six seven eight, one Mt Hill" HAHAHA" billion the fourth of march
2+
The National Map, accessed April 1, 2011" Site Description of Koppers Co. From the quartet's recording" Jefferson Friedman: Quartets,"" String Quartet no, Riots again broke out, Atassi resigned, and Syrian independence was deferred until after World War II. 1988 (1988) ( 1988) ( 1988). Starling, Arthur E.( 1988 ). this is 12th game, number 256, 2024-05-06, 2021-03-07 31.990 billion. 3/4 people like chattts Retrieved December 2011. Information on Album" Thepodule.com"" Biography by Amy Hanson". => The National Map, accessed the first of april , twenty eleven" Site Description of Koppers company From the quartet' s recording" Jefferson Friedman: Quartets,"" String Quartet no, Riots again broke out, Atassi resigned, and Syrian independence was deferred until after World War two nineteen eighty eight( nineteen eighty eight )( nineteen eighty eight )( nineteen eighty eight). Starling, Arthur E.( nineteen eighty eight). this is twelfth game, number two fifty six, the sixth of may twenty twenty four, the seventh of march twenty twenty one thirty one point nine nine oh billion. three quarters people like chattts Retrieved december twenty eleven. Information on Album" Thepodule dot com"" Biography by Amy Hanson".

tn/english/test/data/word.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,3 @@ smile => smile
44
国 => 国
55
A => A
66
a => a
7-
"HAHAH" => "HAHAH"
8-
"HAHAH," => "HAHAH,"

0 commit comments

Comments
 (0)