Skip to content

Commit a7d4529

Browse files
authored
[fix] english tn, fix crash on double quote "" (#224)
1 parent f5965f7 commit a7d4529

6 files changed

Lines changed: 25 additions & 29 deletions

File tree

runtime/processor/wetext_token_parser.cc

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -116,9 +116,10 @@ std::string TokenParser::ParseValue() {
116116
std::string value = "";
117117
while (ch_ != "\"") {
118118
value += ch_;
119-
escape = ch_ == "\\" && !escape;
119+
escape = ch_ == "\\";
120120
Read();
121121
if (escape) {
122+
escape = false;
122123
value += ch_;
123124
Read();
124125
}

tn/english/data/whitelist/tts.tsv

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,10 @@ bldg. building
1818
Bldg. Building
1919
apt. apartment
2020
Apt. Apartment
21-
World War I World War one
21+
World War II. World War two
2222
World War II World War two
23+
World War I. World War one
24+
World War I World War one
2325
etc. etcetera.
2426
SnO2 tin four oxide
2527
dept department

tn/english/rules/date.py

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -223,14 +223,15 @@ def build_tagger(self):
223223
cardinal_graph=cardinal_graph,
224224
single_digits_graph=cardinal.single_digits_graph)
225225
two_digit_year = pynutil.insert(
226-
"year: \"") + two_digit_year + self.PUNCT.ques + pynutil.insert(
227-
"\"")
226+
"year: \"") + two_digit_year + pynini.union(
227+
",", ".").ques + pynutil.insert("\"")
228228

229-
graph_year = pynutil.insert(" year: \"") + pynutil.delete(
230-
" ") + year_graph + self.PUNCT.ques + pynutil.insert("\"")
229+
graph_year = pynutil.insert(
230+
" year: \"") + pynutil.delete(" ") + year_graph + pynini.union(
231+
",", ".").ques + pynutil.insert("\"")
231232
graph_year |= (pynutil.insert(" year: \"") + pynini.accep(",") +
232233
pynini.closure(pynini.accep(" "), 0, 1) + year_graph +
233-
self.PUNCT.ques + pynutil.insert("\""))
234+
pynini.union(",", ".").ques + pynutil.insert("\""))
234235
optional_graph_year = pynini.closure(graph_year, 0, 1)
235236

236237
year_graph = pynutil.insert("year: \"") + year_graph + pynutil.insert(
@@ -281,6 +282,12 @@ def build_tagger(self):
281282

282283
final_graph |= graph_fy
283284

285+
prefix = pynutil.delete(pynini.union("{", "(", "<", "\"",
286+
"'")).ques + self.DELETE_SPACE
287+
suffix = self.DELETE_SPACE + pynutil.delete(
288+
pynini.union("}", ")", ">", "\"", "'")).ques
289+
final_graph = pynutil.add_weight(
290+
prefix, -0.1) + final_graph + pynutil.add_weight(suffix, -0.1)
284291
self.tagger = self.add_tokens(final_graph)
285292

286293
def build_verbalizer(self):
@@ -309,9 +316,9 @@ def build_verbalizer(self):
309316
graph_fy = (pynutil.insert("the ") + period + pynutil.insert(" of") +
310317
pynini.closure(self.DELETE_EXTRA_SPACE + year, 0, 1))
311318

312-
# day month year
313-
graph_dmy = (pynutil.insert("the ") + day + self.DELETE_EXTRA_SPACE +
314-
pynutil.insert("of ") + month +
319+
# day month year, month year
320+
graph_dmy = ((pynutil.insert("the ") + day + self.DELETE_EXTRA_SPACE +
321+
pynutil.insert("of ")).ques + month +
315322
pynini.closure(self.DELETE_EXTRA_SPACE + year, 0, 1))
316323

317324
final_graph = ((graph_dmy | year | graph_fy) + self.DELETE_SPACE)

tn/english/rules/whitelist.py

Lines changed: 1 addition & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,6 @@
2121

2222
from tn.processor import Processor
2323
from tn.utils import get_abs_path, load_labels, augment_labels_with_punct_at_end
24-
from tn.english.rules.measure import SINGULAR_TO_PLURAL
2524
from tn.english.rules.roman import get_names
2625

2726

@@ -93,21 +92,6 @@ def _get_whitelist_graph(input_case,
9392
pynini.closure(pynutil.delete(x) + self.UPPER, 2) +
9493
pynini.closure(pynutil.delete("."), 0, 1))
9594

96-
if not self.deterministic:
97-
multiple_forms_whitelist_graph = get_formats(
98-
get_abs_path(
99-
"english/data/whitelist/alternatives_all_format.tsv"))
100-
graph |= multiple_forms_whitelist_graph
101-
102-
graph_unit = pynini.string_file(
103-
get_abs_path("english/data/measure/unit.tsv")
104-
) | pynini.string_file(
105-
get_abs_path("english/data/measure/unit_alternatives.tsv"))
106-
graph_unit_plural = graph_unit @ SINGULAR_TO_PLURAL
107-
units_graph = pynini.compose(self.VCHAR**(3, ...),
108-
graph_unit | graph_unit_plural)
109-
graph |= units_graph
110-
11195
# convert to states only if comma is present before the abbreviation to avoid converting all caps words,
11296
# e.g. "IN", "OH", "OK"
11397
# TODO or only exclude above?
@@ -133,7 +117,7 @@ def _get_whitelist_graph(input_case,
133117

134118
def build_verbalizer(self):
135119
graph = (pynutil.delete("name:") + self.DELETE_SPACE +
136-
pynutil.delete("\"") + pynini.closure(self.VCHAR - " ", 1) +
120+
pynutil.delete("\"") + pynini.closure(self.NOT_QUOTE, 1) +
137121
pynutil.delete("\""))
138122
final_graph = graph.optimize()
139123
self.verbalizer = self.delete_tokens(final_graph)
Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1,2 @@
1-
this is 12th game, number 256, 2024-05-06, 2021-03-07 31.990 billion. ¾ people like chattts, let's eat at 03:43 p.m. run 10 km, give me $12.345 please, call 123-123-5678-1 Mt Hill "HAHAHA" billion 4 March => this is twelfth game, number 256, the sixth of may twenty twenty four , the seventh of march twenty twenty one thirty one point nine nine oh billion. three quarters people like chattts, let's eat at three forty three PM run ten kilometers, give me twelve point three four five dollars please, call one two three, one two three, five six seven eight, one Mount Hill "HAHAHA" billion the fourth of march
1+
this is 12th game, number 256, 2024-05-06, 2021-03-07 31.990 billion. ¾ people like chattts, let's eat at 03:43 p.m. run 10 km, give me $12.345 please, call 123-123-5678-1 Mt Hill "HAHAHA" billion 4 March => this is twelfth game, number 256, the sixth of may twenty twenty four , the seventh of march twenty twenty one thirty one point nine nine oh billion. three quarters people like chattts, let's eat at three forty three PM run ten kilometers, give me twelve point three four five dollars please, call one two three, one two three, five six seven eight, one Mt Hill "HAHAHA" billion the fourth of march
2+
The National Map, accessed April 1, 2011" Site Description of Koppers Co. From the quartet's recording" Jefferson Friedman: Quartets,"" String Quartet no, Riots again broke out, Atassi resigned, and Syrian independence was deferred until after World War II. 1988 (1988) ( 1988) ( 1988). Starling, Arthur E.( 1988 ). this is 12th game, number 256, 2024-05-06, 2021-03-07 31.990 billion. 3/4 people like chattts Retrieved December 2011. Information on Album" Thepodule.com"" Biography by Amy Hanson". => The National Map, accessed the first of april , twenty eleven Site Description of Koppers company From the quartet's recording" Jefferson Friedman: Quartets,"" String Quartet no, Riots again broke out, Atassi resigned, and Syrian independence was deferred until after World War two nineteen eighty eight nineteen eighty eight nineteen eighty eight nineteen eighty eight ). Starling, Arthur E.( nineteen eighty eight ). this is twelfth game, number 256, the sixth of may twenty twenty four , the seventh of march twenty twenty one thirty one point nine nine oh billion. three quarters people like chattts Retrieved december twenty eleven. Information on Album" Thepodule.com"" Biography by Amy Hanson".

tn/token_parser.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -122,9 +122,10 @@ def parse_value(self):
122122
value = ''
123123
while self.char != '"':
124124
value += self.char
125-
escape = self.char == '\\' and not escape
125+
escape = self.char == '\\'
126126
self.read()
127127
if escape:
128+
escape = False
128129
value += self.char
129130
self.read()
130131
return value

0 commit comments

Comments
 (0)