
Commit 2bdd0f0

update tutorials
1 parent 1374b98 commit 2bdd0f0

5 files changed: 263 additions & 15 deletions


Tutorials/1_Embedding/1.2.3_BGE_v1&1.5.ipynb

Lines changed: 2 additions & 2 deletions
@@ -71,7 +71,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Run the following cell to check the model of bge-base-en-v1.5. It has the exactly same structure of BERT-base, 12 encoder layers and hidden dimension of 768.\n",
+"Run the following cell to check the model of bge-base-en-v1.5. It uses BERT-base as base model, with 12 encoder layers and hidden dimension of 768.\n",
 "\n",
 "Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure."
 ]
@@ -391,7 +391,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/flag_models.py#L370) for more details."
+"As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/inference/embedder/encoder_only/base.py) for more details."
 ]
 }
],
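
The structure check described in the updated cell can be reproduced with a snippet along these lines (a minimal sketch using transformers, not the notebook's exact cell):

from transformers import AutoModel

# Load bge-base-en-v1.5 and inspect its BERT-base backbone.
model = AutoModel.from_pretrained("BAAI/bge-base-en-v1.5")
print(model.config.num_hidden_layers)  # 12 encoder layers
print(model.config.hidden_size)        # hidden dimension of 768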

Tutorials/4_Evaluation/4.5.1_MIRACL.ipynb

Lines changed: 1 addition & 1 deletion
@@ -568,7 +568,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Evaluate using FlagEmbedding"
+"## 3. Evaluate using FlagEmbedding"
 ]
 },
 {

Tutorials/4_Evaluation/4.5.2_MLDR.ipynb

Lines changed: 3 additions & 12 deletions
@@ -34,7 +34,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"% pip install FlagEmbedding"
+"% pip install FlagEmbedding pytrec_eval"
 ]
 },
 {
@@ -318,7 +318,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"Use the Faiss index to search for each query."
+"Use the Faiss index to search answers for each query."
 ]
 },
 {
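
For reference, the search step this cell describes usually looks like the following minimal sketch (synthetic embeddings for illustration; the notebook builds the index from real corpus and query embeddings):

import faiss
import numpy as np

dim = 768  # embedding dimension (model-dependent)
corpus_emb = np.random.rand(1000, dim).astype("float32")
query_emb = np.random.rand(5, dim).astype("float32")

index = faiss.IndexFlatIP(dim)  # inner product, matching normalized embeddings
index.add(corpus_emb)

scores, ids = index.search(query_emb, 100)  # top-100 doc indices per query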
@@ -456,7 +456,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Evaluate using FlagEmbedding"
+"## 3. Evaluate using FlagEmbedding"
 ]
 },
 {
@@ -496,15 +496,6 @@
 "sys.argv = arguments.split()"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": 3,
-"metadata": {},
-"outputs": [],
-"source": [
-"os.environ[\"SETUPTOOLS_USE_DISTUTILS\"] = \"\""
-]
-},
 {
 "cell_type": "code",
 "execution_count": 4,

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
"""
Ref: https://github.com/facebookresearch/contriever
"""
import regex
import unicodedata
from functools import partial
from typing import List, Union


class SimpleTokenizer:
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )

    def tokenize(self, text, uncased=False):
        matches = [m for m in self._regexp.finditer(text)]
        if uncased:
            tokens = [m.group().lower() for m in matches]
        else:
            tokens = [m.group() for m in matches]
        return tokens


def _normalize(text):
    return unicodedata.normalize('NFD', text)


def has_answer(answers, text, tokenizer) -> bool:
    """Check if a document contains an answer string."""
    text = _normalize(text)
    text = tokenizer.tokenize(text, uncased=True)

    for answer in answers:
        answer = _normalize(answer)
        answer = tokenizer.tokenize(answer, uncased=True)
        for i in range(0, len(text) - len(answer) + 1):
            if answer == text[i: i + len(answer)]:
                return True
    return False


def check_answer(example, tokenizer) -> List[bool]:
    """Search through all the top docs to see if they have any of the answers."""
    answers = example['answers']
    ctxs = example['ctxs']

    hits = []
    for i, text in enumerate(ctxs):
        if text is None:  # cannot find the document for some reason
            hits.append(False)
            continue
        hits.append(has_answer(answers, text, tokenizer))
    return hits


def evaluate_qa_recall(ctxs, answers, k_values: Union[int, List[int]] = 100):
    # compute Recall@k for QA task
    data = []
    assert len(ctxs) == len(answers)
    for i in range(len(ctxs)):
        _ctxs, _answers = ctxs[i], answers[i]
        data.append({
            'answers': _answers,
            'ctxs': _ctxs,
        })
    tokenizer = SimpleTokenizer()
    get_score_partial = partial(check_answer, tokenizer=tokenizer)

    scores = map(get_score_partial, data)

    # top_k_hits[k - 1] counts the questions whose first answer-bearing doc
    # appears within the top k retrieved docs.
    n_docs = len(data[0]['ctxs'])
    top_k_hits = [0] * n_docs
    for question_hits in scores:
        best_hit = next((i for i, x in enumerate(question_hits) if x), None)
        if best_hit is not None:
            top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]]

    if isinstance(k_values, int):
        k = min(k_values, len(top_k_hits))
        return top_k_hits[k - 1] / len(data)
    else:
        scores = []
        for k in k_values:
            k = min(k, len(top_k_hits))
            scores.append(top_k_hits[k - 1] / len(data))
        return scores
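
As a quick sanity check (not part of the commit), the new helper can be exercised on toy data:

# Toy illustration of evaluate_qa_recall; the data below is invented.
ctxs = [
    ["Paris is the capital of France.", "Berlin is in Germany."],
    ["The Nile flows through Egypt.", "The Danube is in Europe."],
]
answers = [["Paris"], ["Mississippi"]]  # second answer never appears in its ctxs

# Query 1 is hit by its rank-1 doc; query 2 is never hit.
print(evaluate_qa_recall(ctxs, answers, k_values=[1, 2]))  # [0.5, 0.5]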
Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
"""
adapted from chemdataextractor.text.normalize
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tools for normalizing text.
https://github.com/mcs07/ChemDataExtractor
:copyright: Copyright 2016 by Matt Swain.
:license: MIT

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
'Software'), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

#: Control characters.
CONTROLS = {
    '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011',
    '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
}
# There are further control characters, but they are instead replaced with a space by unicode normalization
# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f'


#: Hyphen and dash characters.
HYPHENS = {
    '-',  # \u002d Hyphen-minus
    '‐',  # \u2010 Hyphen
    '‑',  # \u2011 Non-breaking hyphen
    '⁃',  # \u2043 Hyphen bullet
    '‒',  # \u2012 figure dash
    '–',  # \u2013 en dash
    '—',  # \u2014 em dash
    '―',  # \u2015 horizontal bar
}

#: Minus characters.
MINUSES = {
    '-',  # \u002d Hyphen-minus
    '−',  # \u2212 Minus
    '－',  # \uff0d Full-width Hyphen-minus
    '⁻',  # \u207b Superscript minus
}

#: Plus characters.
PLUSES = {
    '+',  # \u002b Plus
    '＋',  # \uff0b Full-width Plus
    '⁺',  # \u207a Superscript plus
}

#: Slash characters.
SLASHES = {
    '/',  # \u002f Solidus
    '⁄',  # \u2044 Fraction slash
    '∕',  # \u2215 Division slash
}

#: Tilde characters.
TILDES = {
    '~',  # \u007e Tilde
    '˜',  # \u02dc Small tilde
    '⁓',  # \u2053 Swung dash
    '∼',  # \u223c Tilde operator #in mbert vocab
    '∽',  # \u223d Reversed tilde
    '∿',  # \u223f Sine wave
    '〜',  # \u301c Wave dash #in mbert vocab
    '～',  # \uff5e Full-width tilde #in mbert vocab
}

#: Apostrophe characters.
APOSTROPHES = {
    "'",  # \u0027
    '’',  # \u2019
    '՚',  # \u055a
    'Ꞌ',  # \ua78b
    'ꞌ',  # \ua78c
    '＇',  # \uff07
}

#: Single quote characters.
SINGLE_QUOTES = {
    "'",  # \u0027
    '‘',  # \u2018
    '’',  # \u2019
    '‚',  # \u201a
    '‛',  # \u201b
}

#: Double quote characters.
DOUBLE_QUOTES = {
    '"',  # \u0022
    '“',  # \u201c
    '”',  # \u201d
    '„',  # \u201e
    '‟',  # \u201f
}

#: Accent characters.
ACCENTS = {
    '`',  # \u0060
    '´',  # \u00b4
}

#: Prime characters.
PRIMES = {
    '′',  # \u2032
    '″',  # \u2033
    '‴',  # \u2034
    '‵',  # \u2035
    '‶',  # \u2036
    '‷',  # \u2037
    '⁗',  # \u2057
}

#: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES


def normalize_text(text: str):
    for control in CONTROLS:
        text = text.replace(control, '')
    text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace(u'\u0085', ' ')

    for hyphen in HYPHENS | MINUSES:
        text = text.replace(hyphen, '-')
    text = text.replace('\u00ad', '')

    for double_quote in DOUBLE_QUOTES:
        text = text.replace(double_quote, '"')  # \u0022
    for single_quote in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
        text = text.replace(single_quote, "'")  # \u0027
    text = text.replace('′', "'")     # \u2032 prime
    text = text.replace('‵', "'")     # \u2035 reversed prime
    text = text.replace('″', "''")    # \u2033 double prime
    text = text.replace('‶', "''")    # \u2036 reversed double prime
    text = text.replace('‴', "'''")   # \u2034 triple prime
    text = text.replace('‷', "'''")   # \u2037 reversed triple prime
    text = text.replace('⁗', "''''")  # \u2057 quadruple prime

    text = text.replace('…', '...').replace(' . . . ', ' ... ')  # \u2026

    for slash in SLASHES:
        text = text.replace(slash, '/')

    # for tilde in TILDES:
    #     text = text.replace(tilde, '~')

    return text
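
For illustration (again not part of the commit), the normalizer folds typographic variants into their ASCII counterparts:

# Curly quotes, the en dash, the ellipsis, and the fraction slash all map to ASCII.
print(normalize_text('“smart quotes” – and … a fraction ⁄'))
# "smart quotes" - and ... a fraction /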
