update tutorials

ZiyiXia · ZiyiXia · commit 2bdd0f0542b8 · 2024-12-03T11:49:52.000Z
diff --git a/Tutorials/1_Embedding/1.2.3_BGE_v1&1.5.ipynb b/Tutorials/1_Embedding/1.2.3_BGE_v1&1.5.ipynb
@@ -71,7 +71,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Run the following cell to check the model of bge-base-en-v1.5. It has the exactly same structure of BERT-base, 12 encoder layers and hidden dimension of 768.\n",
+    "Run the following cell to check the model of bge-base-en-v1.5. It uses BERT-base as its base model, with 12 encoder layers and a hidden dimension of 768.\n",
     "\n",
     "Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure."
    ]
@@ -391,7 +391,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/flag_models.py#L370) for more details."
+    "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/inference/embedder/encoder_only/base.py) for more details."
    ]
   }
  ],
diff --git a/Tutorials/4_Evaluation/4.5.1_MIRACL.ipynb b/Tutorials/4_Evaluation/4.5.1_MIRACL.ipynb
@@ -568,7 +568,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Evaluate using FlagEmbedding"
+    "## 3. Evaluate using FlagEmbedding"
    ]
   },
   {
diff --git a/Tutorials/4_Evaluation/4.5.2_MLDR.ipynb b/Tutorials/4_Evaluation/4.5.2_MLDR.ipynb
@@ -34,7 +34,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "% pip install FlagEmbedding"
+    "%pip install FlagEmbedding pytrec_eval"
    ]
   },
   {
@@ -318,7 +318,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Use the Faiss index to search for each query."
+    "Use the Faiss index to search for answers to each query."
    ]
   },
   {
@@ -456,7 +456,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Evaluate using FlagEmbedding"
+    "## 3. Evaluate using FlagEmbedding"
    ]
   },
   {
@@ -496,15 +496,6 @@
     "sys.argv = arguments.split()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "os.environ[\"SETUPTOOLS_USE_DISTUTILS\"] = \"\""
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 4,
diff --git a/Tutorials/4_Evaluation/utils/compute_metrics.py b/Tutorials/4_Evaluation/utils/compute_metrics.py
@@ -0,0 +1,95 @@
+"""
+Ref: https://github.com/facebookresearch/contriever
+"""
+import regex
+import unicodedata
+from functools import partial
+from typing import List, Union
+
+
+class SimpleTokenizer:
+    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
+    NON_WS = r'[^\p{Z}\p{C}]'
+
+    def __init__(self):
+        """
+        Compile one pattern that matches either a run of letters, digits and
+        marks, or any single character outside Unicode categories Z and C.
+        """
+        self._regexp = regex.compile(
+            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
+            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
+        )
+
+    def tokenize(self, text, uncased=False):
+        matches = [m for m in self._regexp.finditer(text)]
+        if uncased:
+            tokens = [m.group().lower() for m in matches]
+        else:
+            tokens = [m.group() for m in matches]
+        return tokens
+
+
+def _normalize(text):
+    return unicodedata.normalize('NFD', text)
+
+
+def has_answer(answers, text, tokenizer) -> bool:
+    """Check if a document contains an answer string."""
+    text = _normalize(text)
+    text = tokenizer.tokenize(text, uncased=True)
+
+    for answer in answers:
+        answer = _normalize(answer)
+        answer = tokenizer.tokenize(answer, uncased=True)
+        for i in range(0, len(text) - len(answer) + 1):
+            if answer == text[i: i + len(answer)]:
+                return True
+    return False
+
+
+def check_answer(example, tokenizer) -> List[bool]:
+    """Search through all the top docs to see if they have any of the answers."""
+    answers = example['answers']
+    ctxs = example['ctxs']
+
+    hits = []
+    for i, text in enumerate(ctxs):
+        if text is None:  # cannot find the document for some reason
+            hits.append(False)
+            continue
+        hits.append(has_answer(answers, text, tokenizer))
+    return hits
+
+
+def evaluate_qa_recall(ctxs, answers, k_values: Union[int, List[int]]=100):
+    # compute Recall@k for QA task
+    data = []
+    assert len(ctxs) == len(answers)
+    for i in range(len(ctxs)):
+        _ctxs, _answers = ctxs[i], answers[i]
+        data.append({
+            'answers': _answers,
+            'ctxs': _ctxs,
+        })
+    tokenizer = SimpleTokenizer()
+    get_score_partial = partial(check_answer, tokenizer=tokenizer)
+
+    scores = map(get_score_partial, data)
+
+    n_docs = len(data[0]['ctxs'])
+    top_k_hits = [0] * n_docs
+    for question_hits in scores:
+        best_hit = next((i for i, x in enumerate(question_hits) if x), None)
+        if best_hit is not None:
+            top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]]
+
+    if isinstance(k_values, int):
+        k = min(k_values, len(top_k_hits))
+        return top_k_hits[k - 1] / len(data)
+    else:
+        scores = []
+        for k in k_values:
+            k = min(k, len(top_k_hits))
+            scores.append(top_k_hits[k - 1] / len(data))
+        return scores
diff --git a/Tutorials/4_Evaluation/utils/normalize_text.py b/Tutorials/4_Evaluation/utils/normalize_text.py
@@ -0,0 +1,162 @@
+"""
+adapted from chemdataextractor.text.normalize
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tools for normalizing text.
+https://github.com/mcs07/ChemDataExtractor
+:copyright: Copyright 2016 by Matt Swain.
+:license: MIT
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+#: Control characters.
+CONTROLS = {
+    '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011',
+    '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
+}
+# There are further control characters, but they are instead replaced with a space by unicode normalization
+# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c',  '\u001d', '\u001e', '\u001f'
+
+
+#: Hyphen and dash characters.
+HYPHENS = {
+    '-',  # \u002d Hyphen-minus
+    '‐',  # \u2010 Hyphen
+    '‑',  # \u2011 Non-breaking hyphen
+    '⁃',  # \u2043 Hyphen bullet
+    '‒',  # \u2012 figure dash
+    '–',  # \u2013 en dash
+    '—',  # \u2014 em dash
+    '―',  # \u2015 horizontal bar
+}
+
+#: Minus characters.
+MINUSES = {
+    '-',  # \u002d Hyphen-minus
+    '−',  # \u2212 Minus
+    '－',  # \uff0d Full-width Hyphen-minus
+    '⁻',  # \u207b Superscript minus
+}
+
+#: Plus characters.
+PLUSES = {
+    '+',  # \u002b Plus
+    '＋',  # \uff0b Full-width Plus
+    '⁺',  # \u207a Superscript plus
+}
+
+#: Slash characters.
+SLASHES = {
+    '/',  # \u002f Solidus
+    '⁄',  # \u2044 Fraction slash
+    '∕',  # \u2215 Division slash
+}
+
+#: Tilde characters.
+TILDES = {
+    '~',  # \u007e Tilde
+    '˜',  # \u02dc Small tilde
+    '⁓',  # \u2053 Swung dash
+    '∼',  # \u223c Tilde operator #in mbert vocab
+    '∽',  # \u223d Reversed tilde
+    '∿',  # \u223f Sine wave
+    '〜',  # \u301c Wave dash #in mbert vocab
+    '～',  # \uff5e Full-width tilde #in mbert vocab
+}
+
+#: Apostrophe characters.
+APOSTROPHES = {
+    "'",  # \u0027
+    '’',  # \u2019
+    '՚',  # \u055a
+    'Ꞌ',  # \ua78b
+    'ꞌ',  # \ua78c
+    '＇',  # \uff07
+}
+
+#: Single quote characters.
+SINGLE_QUOTES = {
+    "'",  # \u0027
+    '‘',  # \u2018
+    '’',  # \u2019
+    '‚',  # \u201a
+    '‛',  # \u201b
+
+}
+
+#: Double quote characters.
+DOUBLE_QUOTES = {
+    '"',  # \u0022
+    '“',  # \u201c
+    '”',  # \u201d
+    '„',  # \u201e
+    '‟',  # \u201f
+}
+
+#: Accent characters.
+ACCENTS = {
+    '`',  # \u0060
+    '´',  # \u00b4
+}
+
+#: Prime characters.
+PRIMES = {
+    '′',  # \u2032
+    '″',  # \u2033
+    '‴',  # \u2034
+    '‵',  # \u2035
+    '‶',  # \u2036
+    '‷',  # \u2037
+    '⁗',  # \u2057
+}
+
+#: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
+QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES
+
+def normalize_text(text: str):
+    for control in CONTROLS:
+        text = text.replace(control, '')
+    text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace(u'\u0085', ' ')
+
+    for hyphen in HYPHENS | MINUSES:
+        text = text.replace(hyphen, '-')
+    text = text.replace('\u00ad', '')
+
+    for double_quote in DOUBLE_QUOTES:
+        text = text.replace(double_quote, '"')  # \u0022
+    for single_quote in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
+        text = text.replace(single_quote, "'")  # \u0027
+    text = text.replace('′', "'")     # \u2032 prime
+    text = text.replace('‵', "'")     # \u2035 reversed prime
+    text = text.replace('″', "''")    # \u2033 double prime
+    text = text.replace('‶', "''")    # \u2036 reversed double prime
+    text = text.replace('‴', "'''")   # \u2034 triple prime
+    text = text.replace('‷', "'''")   # \u2037 reversed triple prime
+    text = text.replace('⁗', "''''")  # \u2057 quadruple prime
+
+    text = text.replace('…', '...').replace(' . . . ', ' ... ')  # \u2026
+
+    for slash in SLASHES:
+        text = text.replace(slash, '/')
+
+    #for tilde in TILDES:
+    #    text = text.replace(tilde, '~')
+
+    return text

Original file line number	Diff line number	Diff line change
`@@ -71,7 +71,7 @@`
`71`	`71`	`"cell_type": "markdown",`
`72`	`72`	`"metadata": {},`
`73`	`73`	`"source": [`
`74`		`- "Run the following cell to check the model of bge-base-en-v1.5. It has the exactly same structure of BERT-base, 12 encoder layers and hidden dimension of 768.\n",`
	`74`	`+ "Run the following cell to check the model of bge-base-en-v1.5. It uses BERT-base as its base model, with 12 encoder layers and a hidden dimension of 768.\n",`
`75`	`75`	`"\n",`
`76`	`76`	`"Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure."`
`77`	`77`	`]`
`@@ -391,7 +391,7 @@`
`391`	`391`	`"cell_type": "markdown",`
`392`	`392`	`"metadata": {},`
`393`	`393`	`"source": [`
`394`		`- "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/flag_models.py#L370) for more details."`
	`394`	`+ "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/inference/embedder/encoder_only/base.py) for more details."`
`395`	`395`	`]`
`396`	`396`	`}`
`397`	`397`	`],`
Original file line number	Diff line number	Diff line change
`@@ -568,7 +568,7 @@`
`568`	`568`	`"cell_type": "markdown",`
`569`	`569`	`"metadata": {},`
`570`	`570`	`"source": [`
`571`		`- "## Evaluate using FlagEmbedding"`
	`571`	`+ "## 3. Evaluate using FlagEmbedding"`
`572`	`572`	`]`
`573`	`573`	`},`
`574`	`574`	`{`
Original file line number	Diff line number	Diff line change
`@@ -34,7 +34,7 @@`
`34`	`34`	`"metadata": {},`
`35`	`35`	`"outputs": [],`
`36`	`36`	`"source": [`
`37`		`- "% pip install FlagEmbedding"`
	`37`	`+ "%pip install FlagEmbedding pytrec_eval"`
`38`	`38`	`]`
`39`	`39`	`},`
`40`	`40`	`{`
`@@ -318,7 +318,7 @@`
`318`	`318`	`"cell_type": "markdown",`
`319`	`319`	`"metadata": {},`
`320`	`320`	`"source": [`
`321`		`- "Use the Faiss index to search for each query."`
	`321`	`+ "Use the Faiss index to search for answers to each query."`
`322`	`322`	`]`
`323`	`323`	`},`
`324`	`324`	`{`
`@@ -456,7 +456,7 @@`
`456`	`456`	`"cell_type": "markdown",`
`457`	`457`	`"metadata": {},`
`458`	`458`	`"source": [`
`459`		`- "## Evaluate using FlagEmbedding"`
	`459`	`+ "## 3. Evaluate using FlagEmbedding"`
`460`	`460`	`]`
`461`	`461`	`},`
`462`	`462`	`{`
`@@ -496,15 +496,6 @@`
`496`	`496`	`"sys.argv = arguments.split()"`
`497`	`497`	`]`
`498`	`498`	`},`
`499`		`- {`
`500`		`- "cell_type": "code",`
`501`		`- "execution_count": 3,`
`502`		`- "metadata": {},`
`503`		`- "outputs": [],`
`504`		`- "source": [`
`505`		`- "os.environ[\"SETUPTOOLS_USE_DISTUTILS\"] = \"\""`
`506`		`- ]`
`507`		`- },`
`508`	`499`	`{`
`509`	`500`	`"cell_type": "code",`
`510`	`501`	`"execution_count": 4,`