|
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| 15 | +import sys |
| 16 | +from unicodedata import category |
| 17 | + |
| 18 | +from pynini.examples import plurals |
| 19 | +from pynini import cross, union, closure, accep |
| 20 | +from pynini.lib.pynutil import delete, insert |
| 21 | + |
| 22 | +from tn.processor import Processor |
| 23 | +from tn.utils import get_abs_path, load_labels |
| 24 | + |
| 25 | + |
class Punctuation(Processor):
    """Tagger/verbalizer for punctuation tokens in English text normalization.

    Classifies runs of punctuation into `p { v: "..." }` tokens (escaping
    backslash and double quote so they survive serialization) and verbalizes
    them back, e.g. ``a,`` -> ``w { v: "a" } p { v: "," }`` -> ``a,``.
    """

    def __init__(self, deterministic: bool = False):
        """
        Args:
            deterministic: if True will provide a single transduction option,
                for False multiple transduction are generated (used for audio-based normalization)
        """
        super().__init__('p', ordertype="en_tn")
        self.deterministic = deterministic
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """
        Finite state transducer for classifying punctuation
        e.g. a, -> w { v: "a" } p { v: "," }
        """
        # ASCII punctuation accepted verbatim (brackets/quote/backslash are
        # handled separately below).
        ascii_punct = "!#%&'()*+,-./:;<=>?@^_`{|}~"

        # These characters carry special meaning in the serialized token
        # format, so they must not be matched as plain punctuation here.
        # Sets give O(1) membership tests inside the comprehensions below.
        excluded = {"[", "]", "\"", "\\"}
        # All Unicode punctuation (general category P*) minus the excluded
        # characters.  The upper bound is sys.maxunicode + 1 so the last
        # code point (U+10FFFF) is inspected as well (range() excludes its
        # stop value).
        punct_unicode = [
            chr(cp) for cp in range(sys.maxunicode + 1)
            if category(chr(cp)).startswith("P") and chr(cp) not in excluded
        ]

        # Symbols on the whitelist are verbalized by other rules; drop them
        # from the generic punctuation set.
        whitelist_labels = load_labels(
            get_abs_path("english/data/whitelist/symbol.tsv"))
        whitelist = {row[0] for row in whitelist_labels}
        self.punct_marks = [
            p for p in punct_unicode + list(ascii_punct) if p not in whitelist
        ]

        self.punct = union(*self.punct_marks)
        # Escape backslash and double quote so the value round-trips through
        # the quoted `v: "..."` serialization.
        punct = closure(self.punct | cross('\\', '\\\\\\') | cross('"', '\\"'),
                        1)

        # Markup-like spans such as <b>, </b> or <br/> are kept intact and
        # take priority over the generic punctuation graph.
        emphasis = (
            accep("<") +
            ((
                closure(self.NOT_SPACE - union("<", ">"), 1) +  # noqa
                closure(accep("/"), 0, 1))  # noqa
             | (accep("/") + closure(self.NOT_SPACE - union("<", ">"), 1))) +
            accep(">"))  # noqa
        punct = plurals._priority_union(emphasis, punct, closure(self.VCHAR))

        self.graph = punct
        final_graph = insert("v: \"") + punct + insert("\"")
        self.tagger = self.add_tokens(final_graph)

    def build_verbalizer(self):
        """
        Finite state transducer for verbalizing punctuation, i.e. stripping
        the `v: "..."` wrapper and un-escaping backslash and double quote,
        e.g. p { v: "," } -> ,
        """
        punct = closure(self.punct | cross('\\\\\\', '\\') | cross('\\"', '"'),
                        1)
        verbalizer = delete('v: "') + punct + delete('"')
        self.verbalizer = self.delete_tokens(verbalizer)