|
| 1 | +""" |
| 2 | +adapted from chemdataextractor.text.normalize |
| 3 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 4 | +Tools for normalizing text. |
| 5 | +https://github.com/mcs07/ChemDataExtractor |
| 6 | +:copyright: Copyright 2016 by Matt Swain. |
| 7 | +:license: MIT |
| 8 | +
|
| 9 | +Permission is hereby granted, free of charge, to any person obtaining |
| 10 | +a copy of this software and associated documentation files (the |
| 11 | +'Software'), to deal in the Software without restriction, including |
| 12 | +without limitation the rights to use, copy, modify, merge, publish, |
| 13 | +distribute, sublicense, and/or sell copies of the Software, and to |
| 14 | +permit persons to whom the Software is furnished to do so, subject to |
| 15 | +the following conditions: |
| 16 | +
|
| 17 | +The above copyright notice and this permission notice shall be |
| 18 | +included in all copies or substantial portions of the Software. |
| 19 | +
|
| 20 | +THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, |
| 21 | +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| 22 | +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| 23 | +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
| 24 | +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
| 25 | +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
| 26 | +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 27 | +""" |
| 28 | + |
#: Control characters that normalize_text strips from the text entirely.
CONTROLS = {
    chr(code_point)
    for code_point in (
        *range(0x01, 0x09),  # U+0001 .. U+0008
        0x0E,
        0x0F,
        *range(0x11, 0x1C),  # U+0011 .. U+001B
    )
}
| 34 | +# There are further control characters, but they are instead replaced with a space by unicode normalization |
| 35 | +# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f' |
| 36 | + |
| 37 | + |
#: Hyphen and dash characters, spelled as escapes because the glyphs are
#: nearly indistinguishable in most fonts.
HYPHENS = {
    '\u002d',  # -  hyphen-minus
    '\u2010',  # ‐  hyphen
    '\u2011',  # ‑  non-breaking hyphen
    '\u2043',  # ⁃  hyphen bullet
    '\u2012',  # ‒  figure dash
    '\u2013',  # –  en dash
    '\u2014',  # —  em dash
    '\u2015',  # ―  horizontal bar
}
| 49 | + |
#: Minus characters.
#: Every member is written as an escape: a literal full-width glyph is easily
#: folded to its ASCII look-alike by editors and text pipelines, which would
#: silently drop it from the set.
MINUSES = {
    '\u002d',  # hyphen-minus
    '\u2212',  # minus sign
    '\uff0d',  # full-width hyphen-minus
    '\u207b',  # superscript minus
}
| 57 | + |
#: Plus characters.
#: Every member is written as an escape: a literal full-width glyph is easily
#: folded to its ASCII look-alike by editors and text pipelines, which would
#: silently drop it from the set.
PLUSES = {
    '\u002b',  # plus sign
    '\uff0b',  # full-width plus sign
    '\u207a',  # superscript plus sign
}
| 64 | + |
#: Slash characters; normalize_text rewrites each of them to an ASCII solidus.
SLASHES = {
    '\u002f',  # /  solidus
    '\u2044',  # ⁄  fraction slash
    '\u2215',  # ∕  division slash
}
| 71 | + |
#: Tilde characters.
#: Every member is written as an escape: a literal full-width glyph is easily
#: folded to its ASCII look-alike by editors and text pipelines, which would
#: silently drop it from the set.
TILDES = {
    '\u007e',  # tilde
    '\u02dc',  # small tilde
    '\u2053',  # swung dash
    '\u223c',  # tilde operator  (in mbert vocab)
    '\u223d',  # reversed tilde
    '\u223f',  # sine wave
    '\u301c',  # wave dash  (in mbert vocab)
    '\uff5e',  # full-width tilde  (in mbert vocab)
}
| 83 | + |
#: Apostrophe characters.
#: Every member is written as an escape. In particular the full-width
#: apostrophe must not appear as a literal glyph: once folded to ASCII it reads
#: as three consecutive quotes, which opens a triple-quoted string and breaks
#: the module.
APOSTROPHES = {
    '\u0027',  # apostrophe
    '\u2019',  # right single quotation mark
    '\u055a',  # Armenian apostrophe
    '\ua78b',  # Latin capital letter saltillo
    '\ua78c',  # Latin small letter saltillo
    '\uff07',  # full-width apostrophe
}
| 93 | + |
#: Single quote characters; normalize_text rewrites each to an ASCII apostrophe.
SINGLE_QUOTES = {
    '\u0027',  # '  apostrophe
    '\u2018',  # ‘  left single quotation mark
    '\u2019',  # ’  right single quotation mark
    '\u201a',  # ‚  single low-9 quotation mark
    '\u201b',  # ‛  single high-reversed-9 quotation mark
}
| 103 | + |
#: Double quote characters; normalize_text rewrites each to an ASCII quotation mark.
DOUBLE_QUOTES = {
    '\u0022',  # "  quotation mark
    '\u201c',  # “  left double quotation mark
    '\u201d',  # ”  right double quotation mark
    '\u201e',  # „  double low-9 quotation mark
    '\u201f',  # ‟  double high-reversed-9 quotation mark
}
| 112 | + |
#: Spacing accent characters; normalize_text rewrites each to an ASCII apostrophe.
ACCENTS = {
    '\u0060',  # `  grave accent
    '\u00b4',  # ´  acute accent
}
| 118 | + |
#: Prime characters, both forward and reversed, single through quadruple.
PRIMES = {
    '\u2032',  # ′  prime
    '\u2033',  # ″  double prime
    '\u2034',  # ‴  triple prime
    '\u2035',  # ‵  reversed prime
    '\u2036',  # ‶  reversed double prime
    '\u2037',  # ‷  reversed triple prime
    '\u2057',  # ⁗  quadruple prime
}
| 129 | + |
#: Every quote-like character: the union of the apostrophe, single-quote,
#: double-quote, accent and prime sets.
QUOTES = set().union(APOSTROPHES, SINGLE_QUOTES, DOUBLE_QUOTES, ACCENTS, PRIMES)
| 132 | + |
def _build_normalization_table():
    """Build the single-pass ``str.translate`` table used by normalize_text."""
    replacements = {}

    def register(chars, target):
        # First registration wins: with sequential ``str.replace`` calls the
        # earliest rule consumes a character before later rules can see it.
        for char in chars:
            replacements.setdefault(char, target)

    register(CONTROLS, '')
    register(('\u000b', '\u000c', '\u0085'), ' ')  # vertical tab, form feed, next line
    register(HYPHENS | MINUSES, '-')
    register(('\u00ad',), '')  # soft hyphen
    register(DOUBLE_QUOTES, '"')
    register(SINGLE_QUOTES | APOSTROPHES | ACCENTS, "'")
    register(('\u2032', '\u2035'), "'")  # prime, reversed prime
    register(('\u2033', '\u2036'), "''")  # double prime, reversed double prime
    register(('\u2034', '\u2037'), "'''")  # triple prime, reversed triple prime
    register(('\u2057',), "''''")  # quadruple prime
    register(('\u2026',), '...')  # horizontal ellipsis
    register(SLASHES, '/')
    return str.maketrans(replacements)


# Built once at import time from the character sets defined above; changes made
# to those sets afterwards are not reflected here.
_NORMALIZATION_TABLE = _build_normalization_table()


def normalize_text(text: str) -> str:
    """Normalize punctuation and control characters in *text*.

    The following rewrites are applied:

    * characters in ``CONTROLS`` and the soft hyphen (U+00AD) are removed;
    * vertical tab, form feed and next-line (U+0085) become a space;
    * every hyphen, dash and minus variant becomes ``-``;
    * double-quote variants become ``"``; single-quote, apostrophe and accent
      variants become ``'``;
    * prime characters become one to four ``'`` according to their multiplicity;
    * the ellipsis character becomes ``...`` and a spaced-out `` . . . ``
      becomes `` ... ``;
    * slash variants become ``/``.

    Tilde variants are left untouched; the entries in ``TILDES`` note that
    several of them are in the mBERT vocabulary, which is presumably why
    tilde folding is not applied here.

    :param text: the string to normalize.
    :returns: the normalized string.
    """
    # All single-character rules are independent of one another (no rule's
    # output is another rule's input), so one translate pass is equivalent to
    # applying them one after another.
    text = text.translate(_NORMALIZATION_TABLE)
    # The only multi-character rule; it has to run after the pass above because
    # removed controls and whitespace rewrites can create the pattern.
    return text.replace(' . . . ', ' ... ')
0 commit comments