From 20ff9274139b100ba8f021438a68b5c6f7e6b03f Mon Sep 17 00:00:00 2001 From: Yeray Diaz <6739793+yeraydiazdiaz@users.noreply.github.com> Date: Sat, 6 Jul 2024 13:33:58 +0100 Subject: [PATCH 1/7] Fix black failing in CI Also move GH Actions to Python 3.11 --- lunr/query_parser.py | 8 +++++--- tox.ini | 12 ++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lunr/query_parser.py b/lunr/query_parser.py index e1cdd0b..ec318bc 100644 --- a/lunr/query_parser.py +++ b/lunr/query_parser.py @@ -52,9 +52,11 @@ def parse_clause(cls, parser): raise QueryParseError( "Expected either a field or a term, found {}{}".format( lexeme["type"], - 'with value "' + lexeme["string"] + '"' - if len(lexeme["string"]) - else "", + ( + 'with value "' + lexeme["string"] + '"' + if len(lexeme["string"]) + else "" + ), ) ) diff --git a/tox.ini b/tox.ini index ea340c7..bb409d8 100644 --- a/tox.ini +++ b/tox.ini @@ -10,24 +10,24 @@ commands = pytest -m "acceptance" [testenv:black] -basepython = python3.10 +basepython = python3.11 deps= black commands={envbindir}/black --check lunr tests [testenv:flake8] -basepython = python3.10 +basepython = python3.11 deps= flake8 commands={envbindir}/flake8 lunr tests [testenv:docs] -basepython = python3.10 +basepython = python3.11 extras = docs commands={envbindir}/sphinx-build docs docs/_build/html [testenv:mypy] -basepython = python3.10 +basepython = python3.11 deps = mypy commands={envbindir}/mypy lunr @@ -45,6 +45,6 @@ python = 3.7: py37 3.8: py38 3.9: py39 - 3.10: py310,flake8,black,docs,mypy - 3.11: py311 + 3.10: py310 + 3.11: py311,flake8,black,docs,mypy pypy3: pypy3 From 64668d28657458502c259660f7916c25ba1ba5eb Mon Sep 17 00:00:00 2001 From: Yeray Diaz <6739793+yeraydiazdiaz@users.noreply.github.com> Date: Sat, 6 Jul 2024 13:44:31 +0100 Subject: [PATCH 2/7] Upgrade Codecov configuration --- .github/workflows/test-suite.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-suite.yml 
b/.github/workflows/test-suite.yml index 54c7f77..9ac1668 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -12,7 +12,7 @@ jobs: name: "Python ${{ matrix.python-version }}" runs-on: "ubuntu-latest" env: - USING_COVERAGE: "3.10" + USING_COVERAGE: "3.11" strategy: matrix: @@ -46,6 +46,7 @@ jobs: - name: "Upload coverage to Codecov" if: "contains(env.USING_COVERAGE, matrix.python-version)" - uses: "codecov/codecov-action@v3" + uses: "codecov/codecov-action@v4.0.1" with: fail_ci_if_error: true + token: ${{ secrets.CODECOV_TOKEN }} From 8b0232d41d7e2f998c4514b658695ace6f8fd382 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 4 Jul 2024 11:23:58 -0400 Subject: [PATCH 3/7] docs: how to skip pipeline steps with language support --- docs/customisation.md | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/customisation.md b/docs/customisation.md index 71987e1..6dfe318 100644 --- a/docs/customisation.md +++ b/docs/customisation.md @@ -43,8 +43,11 @@ token list, and the token list itself. ## Skip a pipeline function for specific field names -The `Pipeline.skip()` method allows you to skip a pipeline function for specific field names. -This example skips the `stop_word_filter` pipeline function for the field `fullName`. +The `Pipeline.skip()` method allows you to skip a pipeline function +for specific field names. It takes the function itself (not its name +or its registered name) and the field name to skip as arguments. This +example skips the `stop_word_filter` pipeline function for the field +`fullName`. 
```python from lunr import lunr, get_default_builder, stop_word_filter @@ -58,6 +61,37 @@ builder.pipeline.skip(stop_word_filter.stop_word_filter, ["fullName"]) idx = lunr(ref="id", fields=("fullName", "body"), documents=documents, builder=builder) ``` +Importantly, if you are using language support, the above code will +not work, since there is a separate builder for each language, and the +pipeline functions are generated by the code and so cannot be +imported. Instead, you can access them by name. For instance, to skip +the stop word filter and stemmer for French for the field `titre`, you +could do this: + +```python +from lunr import lunr, get_default_builder + +documents = [...] + +builder = get_default_builder("fr") + +for funcname in "stopWordFilter-fr", "stemmer-fr": + builder.pipeline.skip( + builder.pipeline.registered_functions[funcname], ["titre"] + ) + +idx = lunr(ref="id", fields=("titre", "texte"), documents=documents, builder=builder) +``` + +The current language support registers the functions +`lunr-multi-trimmer-{lang}`, `stopWordFilter-{lang}` and +`stemmer-{lang}` but these are by convention only. You can access the +full list through the `registered_functions` attribute of the +pipeline, but this is not necessarily the list of actual pipeline +steps, which is contained in a private field (though you can see them +in the string representation of the pipeline). 
+ + ## Token meta-data Lunr.py `Token` instances include meta-data information which can be used in From 98a2e56f7006758489b1d9bbf43616e8df06caac Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 4 Jul 2024 12:17:31 -0400 Subject: [PATCH 4/7] docs: unicode folding for fun and profit --- docs/languages.md | 70 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/docs/languages.md b/docs/languages.md index 15394df..e18ce14 100644 --- a/docs/languages.md +++ b/docs/languages.md @@ -72,6 +72,76 @@ If you have documents in multiple language pass a list of language codes: [{'ref': 'c', 'score': 1.106, 'match_data': }] ``` +## Folding to ASCII + +It is often useful to allow for transliterated or unaccented +characters when indexing and searching. This is not implemented in +the language support but can be done by adding a pipeline stage which +"folds" the tokens to ASCII. There are +[various](https://pypi.org/project/text-unidecode/) +[libraries](https://pypi.org/project/Unidecode/) to do this in Python +as well as in [JavaScript](https://www.npmjs.com/package/unidecode). 
+ +On the Python side, for example, to fold accents in French text using +`text-unidecode` or `unidecode` (depending on your licensing +preferences): + +```python +import json +from lunr import lunr, get_default_builder +from lunr.pipeline import Pipeline +from text_unidecode import unidecode + +def unifold(token, _idx=None, _tokens=None): + def wrap_unidecode(text, _metadata): + return unidecode(text) + return token.update(wrap_unidecode) + +Pipeline.register_function(unifold, "unifold") +builder = get_default_builder("fr") +builder.pipeline.add(unifold) +builder.search_pipeline.add(unifold) +index = lunr( + ref="id", + fields=["titre", "texte"], + documents=[ + {"id": "1314-2023-DEM", "titre": "Règlement de démolition", "texte": "Texte"} + ], + languages="fr", + builder=builder, +) +print(index.search("reglement de demolition")) +# [{'ref': '1314-2023-DEM', 'score': 0.4072935059634513, 'match_data': <MatchData "demolit,regl">}] +print(index.search("règlement de démolition")) +# [{'ref': '1314-2023-DEM', 'score': 0.4072935059634513, 'match_data': <MatchData "demolit,regl">}] +with open("index.json", "wt") as outfh: + json.dump(index.serialize(), outfh) +``` + +Note that it is important to do folding on both the indexing and +search pipelines to ensure that users who have the right keyboard and +can remember which accents go where will still get the expected +results. 
+ +On the JavaScript side [the +API](https://lunrjs.com/docs/lunr.Pipeline.html) is of course quite +similar: + +```js +const lunr = require("lunr"); +const fs = require("fs"); +const unidecode = require("unidecode"); +require("lunr-languages/lunr.stemmer.support.js")(lunr); +require("lunr-languages/lunr.fr.js")(lunr); + +lunr.Pipeline.registerFunction(token => token.update(unidecode), "unifold") +const index = lunr.Index.load(JSON.parse(fs.readFileSync("index.json", "utf8"))); +console.log(JSON.stringify(index.search("reglement de demolition"))); +// [{"ref":"1314-2023-DEM","score":0.4072935059634513,"matchData":{"metadata":{"regl":{"titre":{}},"demolit":{"titre":{}}}}}] +console.log(JSON.stringify(index.search("règlement de démolition"))); +// [{"ref":"1314-2023-DEM","score":0.4072935059634513,"matchData":{"metadata":{"regl":{"titre":{}},"demolit":{"titre":{}}}}}] +``` + ## Notes on language support - Using multiple languages means the terms will be stemmed once per language. This can yield unexpected results. From f3a95f1767024b922297e94e998ae9ebffde841f Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 4 Jul 2024 12:20:46 -0400 Subject: [PATCH 5/7] fix(docs): add a note about lunr-folding Not to use it (even though I "maintain" it) because it is not good --- docs/languages.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/languages.md b/docs/languages.md index e18ce14..40c7179 100644 --- a/docs/languages.md +++ b/docs/languages.md @@ -142,6 +142,10 @@ console.log(JSON.stringify(index.search("règlement de démolition"))); // [{"ref":"1314-2023-DEM","score":0.4072935059634513,"matchData":{"metadata":{"regl":{"titre":{}},"demolit":{"titre":{}}}}}] ``` +There is also `lunr-folding` for JavaScript, but its folding is not +the same as `unidecode` and it may not be fully compatible with +language support, so it is recommended to use the above method. 
+ ## Notes on language support - Using multiple languages means the terms will be stemmed once per language. This can yield unexpected results. From a5f64b4bd59dfe12fe25d6a50f70d078f0cb0df7 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 4 Jul 2024 12:21:20 -0400 Subject: [PATCH 6/7] fix(docs): add the url to lunr-folding --- docs/languages.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/languages.md b/docs/languages.md index 40c7179..a321632 100644 --- a/docs/languages.md +++ b/docs/languages.md @@ -142,9 +142,11 @@ console.log(JSON.stringify(index.search("règlement de démolition"))); // [{"ref":"1314-2023-DEM","score":0.4072935059634513,"matchData":{"metadata":{"regl":{"titre":{}},"demolit":{"titre":{}}}}}] ``` -There is also `lunr-folding` for JavaScript, but its folding is not -the same as `unidecode` and it may not be fully compatible with -language support, so it is recommended to use the above method. +There is also +[lunr-folding](https://www.npmjs.com/package/lunr-folding) for +JavaScript, but its folding is not the same as `unidecode` and it may +not be fully compatible with language support, so it is recommended to +use the above method. ## Notes on language support - Using multiple languages means the terms will be stemmed once per language. This can yield unexpected results. From d07b60f8d91466364bcad268805ee6d53a36829e Mon Sep 17 00:00:00 2001 From: Yeray Diaz <6739793+yeraydiazdiaz@users.noreply.github.com> Date: Sun, 8 Sep 2024 12:25:29 +0100 Subject: [PATCH 7/7] Bump codecov-action to 4.5.0 Hopefully fixing its configuration as well. 
--- .github/workflows/test-suite.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 9ac1668..4cfbaf5 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -46,7 +46,7 @@ jobs: - name: "Upload coverage to Codecov" if: "contains(env.USING_COVERAGE, matrix.python-version)" - uses: "codecov/codecov-action@v4.0.1" + uses: "codecov/codecov-action@v4.5.0" with: fail_ci_if_error: true token: ${{ secrets.CODECOV_TOKEN }}