Skip to content

Commit 0f2c8a2

Browse files
authored
feat(itn): 支持车牌号,皖C九B三四E => 皖C9B34E (#124)
1 parent a12c85f commit 0f2c8a2

5 files changed

Lines changed: 73 additions & 0 deletions

File tree

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
2+
3+
4+
5+
6+
7+
8+
9+
10+
11+
12+
13+
14+
15+
16+
17+
18+
19+
20+
21+
22+
23+
24+
25+
26+
27+
28+
29+
30+
31+

itn/chinese/inverse_normalizer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from itn.chinese.rules.whitelist import Whitelist
2424
from itn.chinese.rules.time import Time
2525
from itn.chinese.rules.postprocessor import PostProcessor
26+
from itn.chinese.rules.license_plate import LicensePlate
2627

2728
from pynini.lib.pynutil import add_weight, delete
2829
from importlib_resources import files
@@ -49,6 +50,7 @@ def build_tagger(self):
4950
| add_weight(Time().tagger, 1.05)
5051
| add_weight(Cardinal(self.convert_number, self.enable_0_to_9).tagger, 1.06) # noqa
5152
| add_weight(Math().tagger, 1.10)
53+
| add_weight(LicensePlate().tagger, 1.0)
5254
| add_weight(Char().tagger, 100)).optimize()
5355

5456
tagger = tagger.star
@@ -64,6 +66,7 @@ def build_verbalizer(self):
6466
| Measure(enable_0_to_9=self.enable_0_to_9).verbalizer
6567
| Money(enable_0_to_9=self.enable_0_to_9).verbalizer
6668
| Time().verbalizer
69+
| LicensePlate().verbalizer
6770
| Whitelist().verbalizer).optimize()
6871
postprocessor = PostProcessor(remove_interjections=True).processor
6972

itn/chinese/rules/license_plate.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Copyright (c) 2023 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from tn.processor import Processor
16+
17+
from pynini import string_file
18+
from pynini.lib.pynutil import insert
19+
20+
21+
class LicensePlate(Processor):
    """Inverse-normalize spoken license plates, e.g. 皖C九B三四E => 皖C9B34E.

    The tagger accepts: one entry from the province table, one ``self.ALPHA``
    symbol, then exactly five symbols that are each ``self.ALPHA`` or a
    spoken digit (which is rewritten to its written form).
    """

    def __init__(self):
        super().__init__(name='licenseplate')
        self.build_tagger()
        # build_verbalizer is not overridden in this class, so this call
        # resolves to the implementation inherited from Processor.
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``, the FST that tags a spoken license plate."""
        digit = string_file('itn/chinese/data/number/digit.tsv')  # 1 ~ 9
        # digit.tsv only covers 1 ~ 9, but plates may contain 0 as well;
        # without the zero table a plate such as 皖C零B三四E never matches.
        # NOTE(review): assumes zero.tsv is shipped next to digit.tsv.
        zero = string_file('itn/chinese/data/number/zero.tsv')  # 0
        province = string_file('itn/chinese/data/license_plate/province.tsv')  # 皖
        # province + letter + five trailing letters/digits.
        license_plate = province + self.ALPHA + (self.ALPHA | digit | zero)**5
        tagger = insert('value: "') + license_plate + insert('"')
        self.tagger = self.add_tokens(tagger)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
鄂a七l六二u => 鄂a7l62u
2+
皖C九B三四E => 皖C9B34E

itn/chinese/test/normalizer_test.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class TestNormalizer:
3838
parse_test_case('data/time.txt'),
3939
parse_test_case('data/whitelist.txt'),
4040
parse_test_case('data/number.txt'),
41+
parse_test_case('data/license_plate.txt'),
4142
parse_test_case('data/normalizer.txt'))
4243

4344
@pytest.mark.parametrize("spoken, written", normalizer_cases)
@@ -61,6 +62,7 @@ class TestNormalizerDisablestandalonenumberEnable0to9:
6162
parse_test_case('data/money.txt'),
6263
parse_test_case('data/time.txt'),
6364
parse_test_case('data/whitelist.txt'),
65+
parse_test_case('data/license_plate.txt'),
6466
parse_test_case('data/normalizer_disable_standalone_number_enable_0_to_9.txt'))
6567

6668
@pytest.mark.parametrize("spoken, written", normalizer_cases)
@@ -83,6 +85,7 @@ class TestNormalizerEnablestandalonenumberDisable0to9:
8385
parse_test_case('data/money.txt'),
8486
parse_test_case('data/time.txt'),
8587
parse_test_case('data/whitelist.txt'),
88+
parse_test_case('data/license_plate.txt'),
8689
parse_test_case('data/normalizer_enable_standalone_number_disable_0_to_9.txt'))
8790

8891
@pytest.mark.parametrize("spoken, written", normalizer_cases)
@@ -105,6 +108,7 @@ class TestNormalizerDisablestandalonenumberDisable0to9:
105108
parse_test_case('data/money.txt'),
106109
parse_test_case('data/time.txt'),
107110
parse_test_case('data/whitelist.txt'),
111+
parse_test_case('data/license_plate.txt'),
108112
parse_test_case('data/normalizer_disable_standalone_number_disable_0_to_9.txt'))
109113

110114
@pytest.mark.parametrize("spoken, written", normalizer_cases)

0 commit comments

Comments
 (0)