Skip to content

Commit 28e6dae

Browse files
authored
[tn] english, support money (#212)
* [tn] english, support money * [tn] english, support money * [tn] english, support money
1 parent 8b043aa commit 28e6dae

9 files changed

Lines changed: 347 additions & 2 deletions

File tree

tn/english/data/money/__init__.py

Whitespace-only changes.
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
$ dollar
2+
$ us dollar
3+
US$ us dollar
4+
฿ Thai Baht
5+
£ pound
6+
€ euro
7+
₩ won
8+
nzd new zealand dollar
9+
rs rupee
10+
chf swiss franc
11+
dkk danish kroner
12+
fim finnish markka
13+
aed arab emirates dirham
14+
¥ yen
15+
czk czech koruna
16+
mro mauritanian ouguiya
17+
pkr pakistani rupee
18+
crc costa rican colon
19+
hk$ hong kong dollar
20+
npr nepalese rupee
21+
awg aruban florin
22+
nok norwegian kroner
23+
tzs tanzanian shilling
24+
sek swedish kronor
25+
cyp cypriot pound
26+
r real
27+
sar saudi riyal
28+
cve cape verde escudo
29+
rsd serbian dinar
30+
dm german mark
31+
shp saint helena pounds
32+
php philippine peso
33+
cad canadian dollar
34+
ssp south sudanese pound
35+
scr seychelles rupee
36+
mvr maldivian rufiyaa
37+
DH dirham
38+
Dh dirham
39+
Dhs. dirham
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
$ cents
2+
US$ cents
3+
€ cents
4+
£ pence
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
$ cent
2+
€ cent
3+
£ penny

tn/english/data/money/per_unit.tsv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/ea each
2+
/dozen

tn/english/normalizer.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from tn.english.rules.date import Date
2323
from tn.english.rules.time import Time
2424
from tn.english.rules.measure import Measure
25+
from tn.english.rules.money import Money
2526

2627
from pynini.lib.pynutil import add_weight, delete
2728
from importlib_resources import files
@@ -43,10 +44,11 @@ def build_tagger(self):
4344
date = add_weight(Date().tagger, 0.99)
4445
time = add_weight(Time().tagger, 1.00)
4546
measure = add_weight(Measure().tagger, 1.00)
47+
money = add_weight(Money().tagger, 1.00)
4648
word = add_weight(Word().tagger, 100)
4749
tagger = (cardinal | ordinal | word
4850
| date | decimal | fraction
49-
| time | measure).optimize() + self.DELETE_SPACE
51+
| time | measure | money).optimize() + self.DELETE_SPACE
5052
# delete the last space
5153
self.tagger = tagger.star @ self.build_rule(delete(' '), r='[EOS]')
5254

@@ -59,8 +61,9 @@ def build_verbalizer(self):
5961
date = Date().verbalizer
6062
time = Time().verbalizer
6163
measure = Measure().verbalizer
64+
money = Money().verbalizer
6265
verbalizer = (cardinal | ordinal | word
6366
| date | decimal
6467
| fraction | time
65-
| measure).optimize() + self.INSERT_SPACE
68+
| measure | money).optimize() + self.INSERT_SPACE
6669
self.verbalizer = verbalizer.star

tn/english/rules/money.py

Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from tn.processor import Processor
19+
from tn.utils import get_abs_path, load_labels
20+
from tn.english.rules.cardinal import Cardinal
21+
from tn.english.rules.decimal import Decimal
22+
from tn.english.rules.measure import SINGULAR_TO_PLURAL
23+
24+
# Currency lookup tables, compiled once at import time from the TSV files
# shipped with the package. Each maps a written currency symbol/code to its
# spoken unit name (minor singular, minor plural, major singular).
_minor_singular_tsv = get_abs_path(
    "english/data/money/currency_minor_singular.tsv")
_minor_plural_tsv = get_abs_path(
    "english/data/money/currency_minor_plural.tsv")
_major_tsv = get_abs_path("english/data/money/currency_major.tsv")

min_singular = pynini.string_file(_minor_singular_tsv)
min_plural = pynini.string_file(_minor_plural_tsv)
maj_singular = pynini.string_file(_major_tsv)
30+
31+
32+
class Money(Processor):
    """English money normalizer.

    Builds two FSTs at construction time: ``self.tagger`` turns written
    amounts such as "$12.05" into ``money { ... }`` tokens, and
    ``self.verbalizer`` turns those tokens into spoken text.
    """

    def __init__(self, deterministic: bool = False):
        """
        Args:
            deterministic: if True will provide a single transduction option,
                for False multiple transduction are generated (used for audio-based normalization)
        """
        super().__init__('money', ordertype="en_tn")
        self.deterministic = deterministic
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """
        Finite state transducer for classifying money, suppletive aware, e.g.
            $12.05 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: "true" }
            $12.0500 -> money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: "true" }
            $1 -> money { currency_maj: "dollar" integer_part: "one" }
            $1.00 -> money { currency_maj: "dollar" integer_part: "one" }
            $0.05 -> money { fractional_part: "five" currency_min: "cents" preserve_order: "true" }
            $1 million -> money { currency_maj: "dollars" integer_part: "one" quantity: "million" }
            $1.2 million -> money { currency_maj: "dollars" integer_part: "one" fractional_part: "two" quantity: "million" }
            $1.2320 -> money { currency_maj: "dollars" integer_part: "one" fractional_part: "two three two" }

        The result is stored in ``self.tagger``.
        """
        cardinal = Cardinal(self.deterministic)
        decimal = Decimal(self.deterministic)
        cardinal_graph = cardinal.graph_with_and
        graph_decimal_final = decimal.final_graph_wo_negative_w_abbr
        fractional_cardinal = (
            cardinal.graph_hundred_component_at_least_one_none_zero_digit)

        maj_singular_labels = load_labels(
            get_abs_path("english/data/money/currency_major.tsv"))
        maj_unit_plural = maj_singular @ SINGULAR_TO_PLURAL
        maj_unit_singular = maj_singular

        graph_maj_singular = pynutil.insert(
            "currency_maj: \"") + maj_unit_singular + pynutil.insert("\"")
        graph_maj_plural = pynutil.insert(
            "currency_maj: \"") + maj_unit_plural + pynutil.insert("\"")

        # ".0", ".00", ... after an integer amount carries no information.
        optional_delete_fractional_zeros = pynini.closure(
            pynutil.delete(".") + pynini.closure(pynutil.delete("0"), 1), 0, 1)

        graph_integer_one = pynutil.insert("integer_part: \"") + pynini.cross(
            "1", "one") + pynutil.insert("\"")
        # only for decimals where third decimal after comma is non-zero or with quantity
        decimal_delete_last_zeros = (
            pynini.closure(self.DIGIT | pynutil.delete(",")) +
            pynini.accep(".") + pynini.closure(self.DIGIT, 2) +
            (self.DIGIT - "0") + pynini.closure(pynutil.delete("0")))
        decimal_with_quantity = pynini.closure(self.VCHAR) + self.ALPHA

        # Plain decimal reading ("one point two three two"). Built once and
        # shared by the currency-first graph below and, in non-deterministic
        # mode, by the reordered per-symbol graphs inside the loop.
        default_fraction_graph = (
            (decimal_delete_last_zeros | decimal_with_quantity)
            @ graph_decimal_final)

        graph_decimal = (graph_maj_plural + self.INSERT_SPACE +
                         default_fraction_graph)

        # Any amount other than the literal "1" takes the plural unit.
        graph_integer = (
            pynutil.insert("integer_part: \"") +
            ((pynini.closure(self.VCHAR) - "1") @ cardinal_graph) +
            pynutil.insert("\""))

        graph_integer_only = (graph_maj_singular + self.INSERT_SPACE +
                              graph_integer_one)
        graph_integer_only |= (graph_maj_plural + self.INSERT_SPACE +
                               graph_integer)

        final_graph = (graph_integer_only +
                       optional_delete_fractional_zeros) | graph_decimal

        # remove trailing zeros of non zero number in the first 2 digits and fill up to 2 digits
        # e.g. 2000 -> 20, 0200->02, 01 -> 01, 10 -> 10
        # not accepted: 002, 00, 0,
        two_digits_fractional_part = (
            pynini.closure(self.DIGIT) +
            (self.DIGIT - "0") + pynini.closure(pynutil.delete("0"))) @ (
                (pynutil.delete("0") + (self.DIGIT - "0"))
                | ((self.DIGIT - "0") + pynutil.insert("0"))
                | ((self.DIGIT - "0") + self.DIGIT))

        graph_min_singular = pynutil.insert(
            "currency_min: \"") + min_singular + pynutil.insert("\"")
        graph_min_plural = pynutil.insert(
            "currency_min: \"") + min_plural + pynutil.insert("\"")

        # The graphs below do not depend on the currency symbol, so they are
        # built once here instead of once per loop iteration.
        preserve_order = pynutil.insert(" preserve_order: \"true\"")
        graph_fractional_one = (
            pynutil.insert("fractional_part: \"") +
            (two_digits_fractional_part @ pynini.cross("1", "one")) +
            pynutil.insert("\""))
        graph_fractional = (
            pynutil.insert("fractional_part: \"") +
            (two_digits_fractional_part
             @ (pynini.closure(self.DIGIT, 1, 2) - "1")
             @ fractional_cardinal) + pynutil.insert("\""))
        if not self.deterministic:
            # Fractional part read as a number with no minor-currency name.
            fractional_without_minor = (
                pynutil.insert("fractional_part: \"") +
                (two_digits_fractional_part @ fractional_cardinal) +
                pynutil.insert("\""))

        # format ** dollars ** cent
        decimal_graph_with_minor = None
        integer_graph_reordered = None
        decimal_default_reordered = None
        for curr_symbol, _ in maj_singular_labels:
            # The symbol is deleted from the input and re-inserted after the
            # number so the unit tables can map it to its spoken name there.
            insert_symbol = pynutil.insert(curr_symbol)

            integer_plus_maj = (graph_integer + self.INSERT_SPACE +
                                (insert_symbol @ graph_maj_plural))
            integer_plus_maj |= (graph_integer_one + self.INSERT_SPACE +
                                 (insert_symbol @ graph_maj_singular))

            integer_plus_maj_with_comma = pynini.compose(
                (self.DIGIT - "0") +
                pynini.closure(self.DIGIT | pynutil.delete(",")),
                integer_plus_maj)
            integer_plus_maj = pynini.compose(
                pynini.closure(self.DIGIT) - "0", integer_plus_maj)
            integer_plus_maj |= integer_plus_maj_with_comma

            fractional_plus_min = (graph_fractional + self.INSERT_SPACE +
                                   (insert_symbol @ graph_min_plural))
            fractional_plus_min |= (graph_fractional_one + self.INSERT_SPACE +
                                    (insert_symbol @ graph_min_singular))

            decimal_graph_with_minor_curr = integer_plus_maj + pynini.cross(
                ".", " ") + fractional_plus_min

            if not self.deterministic:
                decimal_graph_with_minor_curr |= pynutil.add_weight(
                    integer_plus_maj + pynini.cross(".", " ") +
                    fractional_without_minor,
                    weight=0.0001,
                )
            # Amounts below one major unit: "$0.05" / "$.05".
            decimal_graph_with_minor_curr |= (
                pynini.closure(pynutil.delete("0"), 0, 1) +
                pynutil.delete(".") + fractional_plus_min)
            decimal_graph_with_minor_curr = (pynutil.delete(curr_symbol) +
                                             decimal_graph_with_minor_curr +
                                             preserve_order)

            decimal_graph_with_minor = (
                decimal_graph_with_minor_curr
                if decimal_graph_with_minor is None else pynini.union(
                    decimal_graph_with_minor,
                    decimal_graph_with_minor_curr).optimize())

            if not self.deterministic:
                integer_graph_reordered_curr = (pynutil.delete(curr_symbol) +
                                                integer_plus_maj +
                                                preserve_order).optimize()

                integer_graph_reordered = (
                    integer_graph_reordered_curr
                    if integer_graph_reordered is None else pynini.union(
                        integer_graph_reordered,
                        integer_graph_reordered_curr).optimize())
                decimal_default_reordered_curr = (
                    pynutil.delete(curr_symbol) + default_fraction_graph +
                    self.INSERT_SPACE + (insert_symbol @ graph_maj_plural))

                decimal_default_reordered = (
                    decimal_default_reordered_curr
                    if decimal_default_reordered is None else pynini.union(
                        decimal_default_reordered,
                        decimal_default_reordered_curr)).optimize()

        # weight for SH
        # NOTE(review): the negative weight presumably makes the
        # major + minor reading win over the plain decimal one - confirm.
        final_graph |= pynutil.add_weight(decimal_graph_with_minor, -0.0001)

        if not self.deterministic:
            final_graph |= pynutil.add_weight(
                integer_graph_reordered | decimal_default_reordered, -0.1)
            # to handle "$2.00" cases
            final_graph |= pynini.compose(
                pynini.closure(self.VCHAR) + pynutil.delete(".") +
                pynini.closure(pynutil.delete("0"), 1),
                integer_graph_reordered)
        final_graph = self.add_tokens(final_graph.optimize())
        self.tagger = final_graph.optimize()

    def build_verbalizer(self):
        """
        Finite state transducer for verbalizing money, e.g.
            money { integer_part: "twelve" currency_maj: "dollars" fractional_part: "five" currency_min: "cents" preserve_order: "true" } -> twelve dollars five cents
            money { integer_part: "one" currency_maj: "dollar" } -> one dollar
            money { fractional_part: "five" currency_min: "cents" preserve_order: "true" } -> five cents

        In non-deterministic mode the minor part may also be introduced by
        "and" ("twelve dollars and five cents"). The result is stored in
        ``self.verbalizer``.
        """
        decimal = Decimal(self.deterministic)
        # Ordering markers carry no spoken content; strip any number of them.
        delete_preserve_order = pynini.closure(
            pynutil.delete(" preserve_order: \"true\"")
            | (pynutil.delete(" field_order: \"") + self.NOT_QUOTE +
               pynutil.delete("\"")))
        keep_space = pynini.accep(" ")
        maj = pynutil.delete("currency_maj: \"") + pynini.closure(
            self.NOT_QUOTE, 1) + pynutil.delete("\"")
        # Named `minor` rather than `min` so the builtin is not shadowed.
        minor = pynutil.delete("currency_min: \"") + pynini.closure(
            self.NOT_QUOTE, 1) + pynutil.delete("\"")

        fractional_part = (pynutil.delete("fractional_part: \"") +
                           pynini.closure(self.NOT_QUOTE, 1) +
                           pynutil.delete("\""))

        integer_part = decimal.integer

        # *** currency_maj
        graph_integer = integer_part + keep_space + maj

        # *** currency_maj + (***) | ((and) *** currency_min)
        fractional = fractional_part + self.DELETE_EXTRA_SPACE + minor

        if not self.deterministic:
            fractional |= pynutil.insert("and ") + fractional

        graph_integer_with_minor = (integer_part + keep_space + maj +
                                    keep_space + fractional +
                                    delete_preserve_order)

        # *** point *** currency_maj
        graph_decimal = decimal.numbers + keep_space + maj

        # *** currency_min
        graph_minor = (fractional_part + self.DELETE_EXTRA_SPACE + minor +
                       delete_preserve_order)

        graph = (graph_integer | graph_integer_with_minor | graph_decimal
                 | graph_minor)

        if not self.deterministic:
            graph |= graph_integer + delete_preserve_order

        delete_tokens = self.delete_tokens(graph)
        self.verbalizer = delete_tokens.optimize()

tn/english/test/data/money.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
$12.05 => twelve dollars five cents
2+
$12.0500 => twelve dollars five cents
3+
$1 => one dollar
4+
$1.00 => one dollar
5+
$0.05 => five cents
6+
$1 million => one million dollars
7+
$1.2 million => one point two million dollars
8+
$1.2320 => one point two three two dollars

tn/english/test/money_test.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
from tn.english.rules.money import Money
18+
from tn.english.test.utils import parse_test_case
19+
20+
21+
class TestMoney:
    """Data-driven checks for the English money normalizer."""

    # Built once at class-definition time and shared by every case.
    money = Money(deterministic=False)
    # Cases loaded from the fixture file; each one is unpacked below as a
    # (written, spoken) pair.
    money_cases = parse_test_case('data/money.txt')

    @pytest.mark.parametrize("written, spoken", money_cases)
    def test_money(self, written, spoken):
        """Normalizing ``written`` must produce exactly ``spoken``."""
        normalized = self.money.normalize(written)
        assert normalized == spoken

0 commit comments

Comments
 (0)