Skip to content

Commit 8ad5c31

Browse files
authored
fix(tn&itn): fix 语气词 (#104)
* fix(tn&itn): fix a
* fix(tn&itn): fix a
* fix(tn&itn): add unittest
1 parent c439bd5 commit 8ad5c31

12 files changed

Lines changed: 36 additions & 24 deletions

File tree

itn/chinese/inverse_normalizer.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from itn.chinese.rules.money import Money
2323
from itn.chinese.rules.whitelist import Whitelist
2424
from itn.chinese.rules.time import Time
25-
from itn.chinese.rules.preprocessor import PreProcessor
25+
from itn.chinese.rules.postprocessor import PostProcessor
2626

2727
from pynini.lib.pynutil import add_weight, delete
2828
from importlib_resources import files
@@ -45,14 +45,13 @@ def build_tagger(self):
4545
| add_weight(Whitelist().tagger, 1.01)
4646
| add_weight(Fraction().tagger, 1.05)
4747
| add_weight(Measure().tagger, 1.05)
48-
| add_weight(Money().tagger, 1.05)
48+
| add_weight(Money().tagger, 1.04)
4949
| add_weight(Time().tagger, 1.05)
5050
| add_weight(Cardinal(self.convert_number, self.enable_0_to_9).tagger, 1.06)
5151
| add_weight(Math().tagger, 1.10)
5252
| add_weight(Char().tagger, 100)).optimize()
5353

54-
processor = PreProcessor(remove_interjections=True).processor
55-
tagger = (processor @ tagger).star
54+
tagger = tagger.star
5655
# remove the last space
5756
self.tagger = tagger @ self.build_rule(delete(' '), '', '[EOS]')
5857

@@ -65,6 +64,7 @@ def build_verbalizer(self):
6564
| Measure().verbalizer
6665
| Money().verbalizer
6766
| Time().verbalizer
68-
| Whitelist().verbalizer).optimize().star
67+
| Whitelist().verbalizer).optimize()
68+
postprocessor = PostProcessor(remove_interjections=True).processor
6969

70-
self.verbalizer = verbalizer
70+
self.verbalizer = (verbalizer @ postprocessor).star

itn/chinese/rules/measure.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ def build_tagger(self):
3131
units_en = string_file('itn/chinese/data/measure/units_en.tsv')
3232
units_zh = string_file('itn/chinese/data/measure/units_zh.tsv')
3333
sign = string_file('itn/chinese/data/number/sign.tsv') # + -
34-
units = units_en | ((accep('亿') | accep('兆') | accep('万')).ques
35-
+ units_zh)
34+
units = add_weight(units_en, -1.0) | \
35+
((accep('亿') | accep('兆') | accep('万')).ques + units_zh)
3636

3737
number = Cardinal().number
3838
# 百分之三十, 百分三十, 百分之百
itn/chinese/rules/postprocessor.py (renamed from itn/chinese/rules/preprocessor.py)

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2+
# Copyright (c) 2023 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
23
#
34
# Licensed under the Apache License, Version 2.0 (the "License");
45
# you may not use this file except in compliance with the License.
@@ -18,10 +19,10 @@
1819
from pynini.lib.pynutil import delete
1920

2021

21-
class PreProcessor(Processor):
22+
class PostProcessor(Processor):
2223

2324
def __init__(self, remove_interjections=True):
24-
super().__init__(name='preprocessor')
25+
super().__init__(name='postprocessor')
2526
blacklist = string_file('itn/chinese/data/default/blacklist.tsv')
2627

2728
processor = self.VSIGMA

itn/chinese/test/data/measure.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,13 @@
55
一千克 => 1kg
66
百分之二 => 2%
77
百分三十 => 30%
8+
百分之三十 => 30%
89
百分百 => 100%
910
百分之百 => 100%
1011
百分之二点一五 => 2.15%
1112
每小时十公里 => 10km/h
13+
每小时十千米 => 10km/h
14+
十千米每小时 => 10km/h
1215
三百二十四点七五克 => 324.75g
1316
二十点一万 => 20.1万
1417
一百块钱 => 100块钱

itn/chinese/test/data/normalizer.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
一共有多少人 => 一共有多少人
22
呃这个呃啊我不知道 => 这个我不知道
3+
呃呃啊 =>
34
共四百六十五篇,约三百一十五万字 => 共465篇,约315万字
45
共计六点四二万人 => 共计6.42万人
56
同比升高零点六个百分点 => 同比升高0.6个百分点

runtime/processor/processor.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ std::string Processor::Compose(const std::string& input,
5454
}
5555

5656
std::string Processor::Tag(const std::string& input) {
57+
if (input.empty()) {
58+
return "";
59+
}
5760
return Compose(input, tagger_.get());
5861
}
5962

tn/chinese/normalizer.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from tn.chinese.rules.time import Time
2727
from tn.chinese.rules.whitelist import Whitelist
2828

29-
from pynini.lib.pynutil import add_weight, delete, insert
29+
from pynini.lib.pynutil import add_weight, delete
3030
from importlib_resources import files
3131

3232

@@ -52,7 +52,6 @@ def __init__(self,
5252

5353
def build_tagger(self):
5454
processor = PreProcessor(
55-
remove_interjections=self.remove_interjections,
5655
traditional_to_simple=self.traditional_to_simple).processor
5756

5857
date = add_weight(Date().tagger, 1.02)
@@ -87,7 +86,9 @@ def build_verbalizer(self):
8786
verbalizer = (cardinal | char | date | fraction | math | measure
8887
| money | sport | time | whitelist).optimize()
8988

90-
processor = PostProcessor(remove_puncts=self.remove_puncts,
91-
full_to_half=self.full_to_half,
92-
tag_oov=self.tag_oov).processor
89+
processor = PostProcessor(
90+
remove_interjections=self.remove_interjections,
91+
remove_puncts=self.remove_puncts,
92+
full_to_half=self.full_to_half,
93+
tag_oov=self.tag_oov).processor
9394
self.verbalizer = (verbalizer @ processor).star

tn/chinese/rules/postprocessor.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@
2121

2222
class PostProcessor(Processor):
2323

24-
def __init__(self, remove_puncts=False, full_to_half=True, tag_oov=False):
24+
def __init__(self, remove_interjections=True, remove_puncts=False,
25+
full_to_half=True, tag_oov=False):
2526
super().__init__(name='postprocessor')
27+
blacklist = string_file('tn/chinese/data/default/blacklist.tsv')
2628
puncts = string_file('tn/chinese/data/char/punctuations_zh.tsv')
2729
full2half = string_file(
2830
'tn/chinese/data/char/fullwidth_to_halfwidth.tsv')
@@ -32,6 +34,9 @@ def __init__(self, remove_puncts=False, full_to_half=True, tag_oov=False):
3234
'tn/chinese/data/char/charset_extension.tsv')
3335

3436
processor = self.build_rule('')
37+
if remove_interjections:
38+
processor @= self.build_rule(delete(blacklist))
39+
3540
if remove_puncts:
3641
processor @= self.build_rule(delete(puncts | self.PUNCT))
3742

tn/chinese/rules/preprocessor.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,17 @@
1515
from tn.processor import Processor
1616

1717
from pynini import string_file
18-
from pynini.lib.pynutil import delete
1918

2019

2120
class PreProcessor(Processor):
2221

23-
def __init__(self, remove_interjections=True, traditional_to_simple=True):
22+
def __init__(self, traditional_to_simple=True):
2423
super().__init__(name='preprocessor')
25-
blacklist = string_file('tn/chinese/data/default/blacklist.tsv')
2624
traditional2simple = string_file(
2725
'tn/chinese/data/char/traditional_to_simple.tsv')
2826

2927
processor = self.build_rule('')
3028
if traditional_to_simple:
3129
processor @= self.build_rule(traditional2simple)
3230

33-
if remove_interjections:
34-
processor @= self.build_rule(delete(blacklist))
35-
3631
self.processor = processor.optimize()
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
好! => 好!
2+
好啊 => 好
3+
啊呃呃 =>
24
我们안녕 => 我们<oov>안</oov><oov>녕</oov>
35
雪の花 => 雪<oov>の</oov>花

0 commit comments

Comments
 (0)