Skip to content

Commit 385ce76

Browse files
authored
[tn] simplify tn (#221)
1 parent 20c2603 commit 385ce76

13 files changed

Lines changed: 172 additions & 420 deletions

File tree

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,36 @@
11
jan january
2+
Jan january
3+
JAN january
24
feb february
5+
Feb february
6+
FEB february
37
mar march
8+
Mar march
9+
MAR march
410
apr april
11+
Apr april
12+
APR april
513
jun june
14+
Jun june
15+
JUN june
616
jul july
17+
Jul july
18+
JUL july
719
aug august
20+
Aug august
21+
AUG august
822
sep september
23+
Sep september
24+
SEP september
925
sept september
26+
Sept september
27+
SEPT september
1028
oct october
29+
Oct october
30+
OCT october
1131
nov november
32+
Nov november
33+
NOV november
1234
dec december
35+
Dec december
36+
DEC december
Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,34 @@
1-
january
2-
february
3-
march
4-
april
5-
may
6-
june
7-
july
8-
august
9-
september
10-
october
11-
november
12-
december
1+
january january
2+
february february
3+
march march
4+
april april
5+
may may
6+
june june
7+
july july
8+
august august
9+
september september
10+
october october
11+
november november
12+
december december
13+
January january
14+
JANUARY january
15+
February february
16+
FEBRUARY february
17+
March march
18+
MARCH march
19+
April april
20+
APRIL april
21+
June june
22+
JUNE june
23+
July july
24+
JULY july
25+
August august
26+
AUGUST august
27+
September september
28+
SEPTEMBER september
29+
October october
30+
OCTOBER october
31+
November november
32+
NOVEMBER november
33+
December december
34+
DECEMBER december

tn/english/normalizer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,4 +81,5 @@ def build_verbalizer(self):
8181
| telephone
8282
| electronic
8383
| whitelist).optimize() + self.INSERT_SPACE
84-
self.verbalizer = verbalizer.star
84+
self.verbalizer = verbalizer.star @ self.build_rule(delete(' '),
85+
r='[EOS]')

tn/english/rules/date.py

Lines changed: 29 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -174,38 +174,26 @@ def __init__(self, deterministic: bool = False):
174174
def build_tagger(self):
175175
"""
176176
Finite state transducer for classifying date, e.g.
177-
jan. 5, 2012 -> date { month: "january" day: "five" year: ", twenty twelve" preserve_order: "true" }
178-
jan. 5 -> date { month: "january" day: "five" preserve_order: "true" }
179-
5 january 2012 -> date { day: "five" month: "january" year: "twenty twelve" preserve_order: "true" }
177+
jan. 5, 2012 -> date { month: "january" day: "five" year: ", twenty twelve" }
178+
jan. 5 -> date { month: "january" day: "five" }
179+
5 january 2012 -> date { day: "five" month: "january" year: "twenty twelve" }
180180
2012-01-05 -> date { year: "twenty twelve" month: "january" day: "five" }
181181
2012.01.05 -> date { year: "twenty twelve" month: "january" day: "five" }
182182
2012/01/05 -> date { year: "twenty twelve" month: "january" day: "five" }
183183
2012 -> date { year: "twenty twelve" }
184184
"""
185185
cardinal = Cardinal(self.deterministic)
186-
# january
186+
# january, January, JANUARY
187187
month_graph = pynini.string_file(
188-
get_abs_path("english/data/date/month_name.tsv")).optimize()
189-
# January, JANUARY
190-
month_graph |= pynini.compose(
191-
self.TO_LOWER + pynini.closure(self.VCHAR),
192-
month_graph) | pynini.compose(self.TO_LOWER**(2, ...), month_graph)
193-
194-
# jan
195-
month_abbr_graph = pynini.string_file(
196-
get_abs_path("english/data/date/month_abbr.tsv")).optimize()
188+
get_abs_path("english/data/date/month_name.tsv"))
197189
# jan, Jan, JAN
198-
month_abbr_graph = (
199-
month_abbr_graph
200-
| pynini.compose(self.TO_LOWER + pynini.closure(self.LOWER, 1),
201-
month_abbr_graph).optimize()
202-
| pynini.compose(self.TO_LOWER**(2, ...),
203-
month_abbr_graph).optimize()) + pynini.closure(
204-
pynutil.delete("."), 0, 1)
205-
month_graph |= month_abbr_graph.optimize()
190+
month_abbr_graph = pynini.string_file(
191+
get_abs_path("english/data/date/month_abbr.tsv"))
192+
month_graph |= month_abbr_graph
193+
month_graph += pynutil.delete(self.PUNCT).ques
206194

207195
month_numbers_labels = pynini.string_file(
208-
get_abs_path("english/data/date/month_number.tsv")).optimize()
196+
get_abs_path("english/data/date/month_number.tsv"))
209197
cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit
210198

211199
year_graph = _get_year_graph(cardinal_graph=cardinal_graph,
@@ -235,13 +223,14 @@ def build_tagger(self):
235223
cardinal_graph=cardinal_graph,
236224
single_digits_graph=cardinal.single_digits_graph)
237225
two_digit_year = pynutil.insert(
238-
"year: \"") + two_digit_year + pynutil.insert("\"")
226+
"year: \"") + two_digit_year + self.PUNCT.ques + pynutil.insert(
227+
"\"")
239228

240229
graph_year = pynutil.insert(" year: \"") + pynutil.delete(
241-
" ") + year_graph + pynutil.insert("\"")
230+
" ") + year_graph + self.PUNCT.ques + pynutil.insert("\"")
242231
graph_year |= (pynutil.insert(" year: \"") + pynini.accep(",") +
243232
pynini.closure(pynini.accep(" "), 0, 1) + year_graph +
244-
pynutil.insert("\""))
233+
self.PUNCT.ques + pynutil.insert("\""))
245234
optional_graph_year = pynini.closure(graph_year, 0, 1)
246235

247236
year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert(
@@ -252,134 +241,53 @@ def build_tagger(self):
252241
| (pynini.accep(" ") + day_graph)
253242
| graph_year
254243
| (self.DELETE_EXTRA_SPACE + day_graph + graph_year))
255-
256244
graph_mdy |= (month_graph + pynini.cross("-", " ") + day_graph +
257245
pynini.closure(
258246
((pynini.cross("-", " ") +
259247
pynini.closure(self.VCHAR)) @ graph_year), 0, 1))
260-
261248
for x in ["-", "/", "."]:
262249
delete_sep = pynutil.delete(x)
263250
graph_mdy |= (month_numbers_graph + delete_sep +
264251
self.INSERT_SPACE +
265252
pynini.closure(pynutil.delete("0"), 0, 1) +
266253
day_graph + delete_sep + self.INSERT_SPACE +
267-
(year_graph | two_digit_year))
254+
(pynutil.add_weight(year_graph, -1.0)))
268255

269-
graph_dmy = day_graph + self.DELETE_EXTRA_SPACE + month_graph + optional_graph_year
256+
graph_dmy = day_graph + self.DELETE_EXTRA_SPACE + self.INSERT_SPACE + month_graph + optional_graph_year
270257
day_ex_month = (self.DIGIT**2 - pynini.project(month_numbers_graph,
271258
"input")) @ day_graph
272259
for x in ["-", "/", "."]:
273260
delete_sep = pynutil.delete(x)
274261
graph_dmy |= (day_ex_month + delete_sep + self.INSERT_SPACE +
275262
month_numbers_graph + delete_sep +
276-
self.INSERT_SPACE + (year_graph | two_digit_year))
263+
self.INSERT_SPACE +
264+
(pynutil.add_weight(year_graph, -1.0)))
277265

278-
graph_ymd = pynini.accep("")
266+
graph_ymd = year_graph + self.DELETE_EXTRA_SPACE + self.INSERT_SPACE + month_graph + self.DELETE_EXTRA_SPACE + self.INSERT_SPACE + day_graph
279267
for x in ["-", "/", "."]:
280268
delete_sep = pynutil.delete(x)
281-
graph_ymd |= ((year_graph | two_digit_year) + delete_sep +
269+
graph_ymd |= ((pynutil.add_weight(year_graph, -1.0)) + delete_sep +
282270
self.INSERT_SPACE + month_numbers_graph +
283271
delete_sep + self.INSERT_SPACE +
284272
pynini.closure(pynutil.delete("0"), 0, 1) +
285273
day_graph)
286274

287-
final_graph = graph_mdy | graph_dmy
288-
289-
if not self.deterministic:
290-
final_graph += pynini.closure(
291-
pynutil.insert(" preserve_order: \"true\""), 0, 1)
292-
m_sep_d = (month_numbers_graph +
293-
pynutil.delete(pynini.union("-", "/")) +
294-
self.INSERT_SPACE +
295-
pynini.closure(pynutil.delete("0"), 0, 1) + day_graph)
296-
final_graph |= m_sep_d
297-
else:
298-
final_graph += pynutil.insert(" preserve_order: \"true\"")
275+
final_graph = pynutil.add_weight(graph_mdy | graph_dmy | graph_ymd,
276+
-0.1) | year_graph
299277

300278
period_fy = pynutil.insert(
301279
"text: \"") + _get_financial_period_graph() + pynutil.insert("\"")
302280
graph_fy = period_fy + self.INSERT_SPACE + two_digit_year
303281

304-
final_graph |= graph_ymd | year_graph | graph_fy
305-
306-
ymd_to_mdy_graph = None
307-
ymd_to_dmy_graph = None
308-
mdy_to_dmy_graph = None
309-
md_to_dm_graph = None
310-
311-
for month in [
312-
x[0] for x in load_labels(
313-
get_abs_path("english/data/date/month_name.tsv"))
314-
]:
315-
for day in [
316-
x[0] for x in load_labels(
317-
get_abs_path("english/data/date/day.tsv"))
318-
]:
319-
ymd_to_mdy_curr = (pynutil.insert("month: \"" + month +
320-
"\" day: \"" + day + "\" ") +
321-
pynini.accep('year:') +
322-
pynini.closure(self.VCHAR) +
323-
pynutil.delete(" month: \"" + month +
324-
"\" day: \"" + day + "\""))
325-
326-
# YY-MM-DD -> MM-DD-YY
327-
ymd_to_mdy_curr = pynini.compose(graph_ymd, ymd_to_mdy_curr)
328-
ymd_to_mdy_graph = (ymd_to_mdy_curr if
329-
ymd_to_mdy_graph is None else pynini.union(
330-
ymd_to_mdy_curr, ymd_to_mdy_graph))
331-
332-
ymd_to_dmy_curr = (
333-
pynutil.insert("day: \"" + day + "\" month: \"" + month +
334-
"\" ") + pynini.accep('year:') +
335-
pynini.closure(self.VCHAR) +
336-
pynutil.delete(" month: \"" + month + "\" day: \"" + day +
337-
"\""))
338-
339-
# YY-MM-DD -> DD-MM-YY
340-
ymd_to_dmy_curr = pynini.compose(graph_ymd,
341-
ymd_to_dmy_curr).optimize()
342-
ymd_to_dmy_graph = (ymd_to_dmy_curr if
343-
ymd_to_dmy_graph is None else pynini.union(
344-
ymd_to_dmy_curr, ymd_to_dmy_graph))
345-
346-
mdy_to_dmy_curr = (
347-
pynutil.insert("day: \"" + day + "\" month: \"" + month +
348-
"\" ") +
349-
pynutil.delete("month: \"" + month + "\" day: \"" + day +
350-
"\" ") + pynini.accep('year:') +
351-
pynini.closure(self.VCHAR)).optimize()
352-
# MM-DD-YY -> verbalize as MM-DD-YY (February fourth 1991) or DD-MM-YY (the fourth of February 1991)
353-
mdy_to_dmy_curr = pynini.compose(graph_mdy,
354-
mdy_to_dmy_curr).optimize()
355-
mdy_to_dmy_graph = (
356-
mdy_to_dmy_curr if mdy_to_dmy_graph is None else
357-
pynini.union(mdy_to_dmy_curr,
358-
mdy_to_dmy_graph).optimize()).optimize()
359-
360-
md_to_dm_curr = pynutil.insert(
361-
"day: \"" + day + "\" month: \"" + month +
362-
"\"") + pynutil.delete("month: \"" + month + "\" day: \"" +
363-
day + "\"")
364-
md_to_dm_curr = pynini.compose(m_sep_d,
365-
md_to_dm_curr).optimize()
366-
367-
md_to_dm_graph = (
368-
md_to_dm_curr if md_to_dm_graph is None else pynini.union(
369-
md_to_dm_curr, md_to_dm_graph).optimize()).optimize()
370-
371-
if not self.deterministic:
372-
final_graph |= pynutil.add_weight(
373-
mdy_to_dmy_graph | md_to_dm_graph | ymd_to_dmy_graph, -0.1)
374-
375-
final_graph = self.add_tokens(final_graph)
376-
self.tagger = final_graph.optimize()
282+
final_graph |= graph_fy
283+
284+
self.tagger = self.add_tokens(final_graph)
377285

378286
def build_verbalizer(self):
379287
"""
380288
Finite state transducer for verbalizing date, e.g.
381-
date { month: "february" day: "five" year: "twenty twelve" preserve_order: "true" } -> february fifth twenty twelve
382-
date { day: "five" month: "february" year: "twenty twelve" preserve_order: "true" } -> the fifth of february twenty twelve
289+
date { month: "february" day: "five" year: "twenty twelve" } -> the fifth of february twenty twelve
290+
date { day: "five" month: "february" year: "twenty twelve" } -> the fifth of february twenty twelve
383291
"""
384292
ordinal = Ordinal(self.deterministic)
385293
phrase = pynini.closure(self.NOT_QUOTE, 1)
@@ -401,31 +309,10 @@ def build_verbalizer(self):
401309
graph_fy = (pynutil.insert("the ") + period + pynutil.insert(" of") +
402310
pynini.closure(self.DELETE_EXTRA_SPACE + year, 0, 1))
403311

404-
# month (day) year
405-
graph_mdy = (month +
406-
pynini.closure(self.DELETE_EXTRA_SPACE + day, 0, 1) +
407-
pynini.closure(self.DELETE_EXTRA_SPACE + year, 0, 1))
408-
# may 5 -> may five
409-
if not self.deterministic:
410-
graph_mdy |= (
411-
month +
412-
pynini.closure(self.DELETE_EXTRA_SPACE + day_cardinal, 0, 1) +
413-
pynini.closure(self.DELETE_EXTRA_SPACE + year, 0, 1))
414-
415312
# day month year
416313
graph_dmy = (pynutil.insert("the ") + day + self.DELETE_EXTRA_SPACE +
417314
pynutil.insert("of ") + month +
418315
pynini.closure(self.DELETE_EXTRA_SPACE + year, 0, 1))
419316

420-
optional_preserve_order = pynini.closure(
421-
pynutil.delete("preserve_order:") + self.DELETE_SPACE +
422-
pynutil.delete("\"true\"") + self.DELETE_SPACE
423-
| pynutil.delete("field_order:") + self.DELETE_SPACE +
424-
pynutil.delete("\"") + self.NOT_QUOTE + pynutil.delete("\"") +
425-
self.DELETE_SPACE)
426-
427-
final_graph = (
428-
(graph_dmy | pynutil.add_weight(graph_mdy, 0.0001) | year
429-
| graph_fy) + self.DELETE_SPACE + optional_preserve_order) # noqa
430-
delete_tokens = self.delete_tokens(final_graph)
431-
self.verbalizer = delete_tokens.optimize()
317+
final_graph = ((graph_dmy | year | graph_fy) + self.DELETE_SPACE)
318+
self.verbalizer = self.delete_tokens(final_graph)

tn/english/rules/decimal.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ def get_quantity(decimal: 'pynini.FstLike',
5555
else:
5656
quantity = quantities
5757
res |= (decimal + pynini.closure(pynutil.delete(" "), 0, 1) +
58-
pynutil.insert(" quantity: \"") + quantity + pynutil.insert("\""))
58+
pynutil.insert(" quantity: \"") + quantity +
59+
Processor("tmp").PUNCT.ques + pynutil.insert("\""))
5960
return res
6061

6162

@@ -94,9 +95,11 @@ def build_tagger(self):
9495
1)
9596

9697
self.graph_fractional = pynutil.insert(
97-
"fractional_part: \"") + self.graph + pynutil.insert("\"")
98+
"fractional_part: \""
99+
) + self.graph + self.PUNCT.ques + pynutil.insert("\"")
98100
self.graph_integer = pynutil.insert(
99-
"integer_part: \"") + cardinal_graph + pynutil.insert("\"")
101+
"integer_part: \""
102+
) + cardinal_graph + self.PUNCT.ques + pynutil.insert("\"")
100103
final_graph_wo_sign = (
101104
pynini.closure(self.graph_integer + pynutil.insert(" "), 0, 1) +
102105
point + pynutil.insert(" ") + self.graph_fractional)

0 commit comments

Comments
 (0)