diff --git a/src/morphodict/frontend/templatetags/morphodict_extras.py b/src/morphodict/frontend/templatetags/morphodict_extras.py index 31129c154..27d9be979 100644 --- a/src/morphodict/frontend/templatetags/morphodict_extras.py +++ b/src/morphodict/frontend/templatetags/morphodict_extras.py @@ -192,13 +192,15 @@ def sort_sources(sources: list): ret = sources return ret + @register.filter(name="next") def next(some_list, current_index): try: return some_list[int(current_index) + 1] except: - return '' - + return "" + + @register.filter(name="has_next") def has_next(some_list, current_index): return not current_index >= len(some_list) - 1 diff --git a/src/morphodict/frontend/views.py b/src/morphodict/frontend/views.py index a18bed786..88cf753a4 100644 --- a/src/morphodict/frontend/views.py +++ b/src/morphodict/frontend/views.py @@ -140,17 +140,19 @@ def index(request): # pragma: no cover ) return render(request, "morphodict/index.html", context) + def wordnet(request, user_query, results): def process_result(r): return { - "wn_entry" : r[0], + "wn_entry": r[0], "results": r[1].serialized_presentation_results( - display_mode=DisplayMode.current_value_from_request(request), - animate_emoji=AnimateEmoji.current_value_from_request(request), - show_emoji=ShowEmoji.current_value_from_request(request), - dict_source=get_dict_source(request), - ) + display_mode=DisplayMode.current_value_from_request(request), + animate_emoji=AnimateEmoji.current_value_from_request(request), + show_emoji=ShowEmoji.current_value_from_request(request), + dict_source=get_dict_source(request), + ), } + context = create_context_for_index_template( "search-page", word_search_form=WordSearchForm(), @@ -160,6 +162,7 @@ def process_result(r): ) return render(request, "morphodict/wordnet-search.html", context) + def search_results(request, query_string: str): # pragma: no cover """ returns rendered boxes of search results according to user query diff --git a/src/morphodict/lexicon/admin.py 
b/src/morphodict/lexicon/admin.py index 77c961377..750306f17 100644 --- a/src/morphodict/lexicon/admin.py +++ b/src/morphodict/lexicon/admin.py @@ -8,11 +8,12 @@ TargetLanguageKeyword, Wordform, SourceLanguageKeyword, - WordNetSynset + WordNetSynset, ) admin.site.register(WordNetSynset) + # https://stackoverflow.com/a/1720961/14558 def admin_url_for(obj): return reverse( diff --git a/src/morphodict/lexicon/management/commands/importjsondict.py b/src/morphodict/lexicon/management/commands/importjsondict.py index 8d28a3140..a6f177066 100644 --- a/src/morphodict/lexicon/management/commands/importjsondict.py +++ b/src/morphodict/lexicon/management/commands/importjsondict.py @@ -35,7 +35,7 @@ SourceLanguageKeyword, ImportStamp, RapidWords, - WordNetSynset + WordNetSynset, ) from morphodict.lexicon.util import to_source_language_keyword @@ -301,7 +301,7 @@ def run(self): existing_slugs = self.gather_slugs() form_definitions = [] - + for entry in tqdm(self.data, smoothing=0): if "formOf" in entry: form_definitions.append(entry) @@ -369,14 +369,16 @@ def run(self): # Make sure everything is saved for upcoming formOf queries self.flush_insert_buffers() - + wordforms = Wordform.objects.all() - for wf in tqdm(wordforms.iterator(),total=wordforms.count()): + for wf in tqdm(wordforms.iterator(), total=wordforms.count()): if not wf.linguist_info: continue if "rw_indices" in wf.linguist_info: - rapidwords = {rw for l in wf.linguist_info["rw_indices"].values() for rw in l} + rapidwords = { + rw for l in wf.linguist_info["rw_indices"].values() for rw in l + } for rw in rapidwords: index = rw.strip() try: @@ -385,20 +387,30 @@ def run(self): # Try flexible search try: try: - candidates = [RapidWords.objects.get(index=".".join(index.split(".")[:-1]))] + candidates = [ + RapidWords.objects.get( + index=".".join(index.split(".")[:-1]) + ) + ] except RapidWords.DoesNotExist: - query = Q(domain__iexact=wf.linguist_info["rw_domains"][0]) + query = Q( + 
domain__iexact=wf.linguist_info["rw_domains"][0] + ) for domain in wf.linguist_info["rw_domains"][1:]: query |= Q(domain__iexact=domain) universe = RapidWords.objects.filter(query) - candidates = [x for x in universe if index.startswith(x.index)] + candidates = [ + x for x in universe if index.startswith(x.index) + ] except: - candidates=[] - if len(candidates)>0: - candidates.sort(key=lambda x:len(x.index),reverse=True) + candidates = [] + if len(candidates) > 0: + candidates.sort(key=lambda x: len(x.index), reverse=True) rapidword = candidates[0] else: - print(f"WARNING: ImportJSON error: Slug {wf.slug} is annotated with nonexistent {index} RW index") + print( + f"WARNING: ImportJSON error: Slug {wf.slug} is annotated with nonexistent {index} RW index" + ) if rapidword: wf.rapidwords.add(rapidword) @@ -413,12 +425,15 @@ def run(self): # which stand for ADJ, ADJ_SAT, ADV, NOUN, VERB) # - entry annotated with a non-canonical lemma. Use the canonical lemma appearing in # "name" in our wordnet instance site. 
- print(f"WARNING: ImportJSON error: Slug {wf.slug} is annotated with nonexistent {wn.strip()} WN domain") + print( + f"WARNING: ImportJSON error: Slug {wf.slug} is annotated with nonexistent {wn.strip()} WN domain" + ) if normalized_name: - synset, _ = WordNetSynset.objects.get_or_create(name=normalized_name) + synset, _ = WordNetSynset.objects.get_or_create( + name=normalized_name + ) wf.synsets.add(synset) - for entry in form_definitions: if self.incremental and freshness_check.is_fresh(entry["formOf"]): continue diff --git a/src/morphodict/lexicon/migrations/0014_auto_20241128_2341.py b/src/morphodict/lexicon/migrations/0014_auto_20241128_2341.py index de8b0b68f..1bf184f71 100644 --- a/src/morphodict/lexicon/migrations/0014_auto_20241128_2341.py +++ b/src/morphodict/lexicon/migrations/0014_auto_20241128_2341.py @@ -5,18 +5,21 @@ from tqdm import tqdm import json + def load_rapidwords(apps, schema_editor): RapidWords = apps.get_model("lexicon", "RapidWords") - RAPIDWORDS_JSON_FILE = settings.BASE_DIR / ".." / "morphodict" / "resources" / "rapidwords.json" + RAPIDWORDS_JSON_FILE = ( + settings.BASE_DIR / ".." 
/ "morphodict" / "resources" / "rapidwords.json" + ) - with open(RAPIDWORDS_JSON_FILE,'r') as f: + with open(RAPIDWORDS_JSON_FILE, "r") as f: rw_data = json.load(f) - + rw_entries = [] for key, items in tqdm(rw_data.items()): rw_entries.append(RapidWords(index=key, domain=items["domain"])) - + RapidWords.objects.bulk_create(rw_entries) @@ -26,6 +29,4 @@ class Migration(migrations.Migration): ("lexicon", "0013_wordnetsynset_wordform_rapidwords_wordform_synsets"), ] - operations = [ - migrations.RunPython(load_rapidwords) - ] + operations = [migrations.RunPython(load_rapidwords)] diff --git a/src/morphodict/lexicon/migrations/0015_auto_20241128_2351.py b/src/morphodict/lexicon/migrations/0015_auto_20241128_2351.py index 263a5e66e..0234cb4fd 100644 --- a/src/morphodict/lexicon/migrations/0015_auto_20241128_2351.py +++ b/src/morphodict/lexicon/migrations/0015_auto_20241128_2351.py @@ -3,11 +3,12 @@ from django.db import migrations from morphodict.search.types import WordnetEntry + def migrate_semantic_domains(apps, schema_editor): RapidWords = apps.get_model("lexicon", "RapidWords") WordNetSynset = apps.get_model("lexicon", "WordNetSynset") Wordform = apps.get_model("lexicon", "Wordform") - + # For every wordform, collect the semantic domain information in the old # format and place it where it belongs. 
wordforms = Wordform.objects.all() @@ -18,10 +19,10 @@ def migrate_semantic_domains(apps, schema_editor): else: rapidwords = [] if wf.wn_synsets: - synsets = [x.strip() for x in wf.wn_synsets.split(";")] + synsets = [x.strip() for x in wf.wn_synsets.split(";")] else: - synsets = [] - + synsets = [] + for rw in rapidwords: try: if rw: @@ -32,9 +33,9 @@ def migrate_semantic_domains(apps, schema_editor): if wn: normalized_entry = str(WordnetEntry(wn)) wf.synsets.add( - WordNetSynset.objects.get_or_create( - name=normalized_entry - )) + WordNetSynset.objects.get_or_create(name=normalized_entry) + ) + class Migration(migrations.Migration): @@ -42,6 +43,4 @@ class Migration(migrations.Migration): ("lexicon", "0014_auto_20241128_2341"), ] - operations = [ - migrations.RunPython(migrate_semantic_domains) - ] + operations = [migrations.RunPython(migrate_semantic_domains)] diff --git a/src/morphodict/lexicon/migrations/0016_auto_20241202_1907.py b/src/morphodict/lexicon/migrations/0016_auto_20241202_1907.py index 780ab61f2..303ed1e78 100644 --- a/src/morphodict/lexicon/migrations/0016_auto_20241202_1907.py +++ b/src/morphodict/lexicon/migrations/0016_auto_20241202_1907.py @@ -3,11 +3,12 @@ from django.db import migrations from morphodict.search.types import WordnetEntry + def migrate_from_linguistinfo(apps, schema_editor): RapidWords = apps.get_model("lexicon", "RapidWords") WordNetSynset = apps.get_model("lexicon", "WordNetSynset") Wordform = apps.get_model("lexicon", "Wordform") - + # For every wordform, collect the semantic domain information in the old # format and place it where it belongs. 
wordforms = Wordform.objects.all() @@ -16,13 +17,17 @@ def migrate_from_linguistinfo(apps, schema_editor): if not wf.linguist_info: continue if "rw_indices" in wf.linguist_info: - rapidwords = {rw for l in wf.linguist_info["rw_indices"].values() for rw in l} + rapidwords = { + rw for l in wf.linguist_info["rw_indices"].values() for rw in l + } for rw in rapidwords: index = rw.strip() try: wf.rapidwords.add(RapidWords.objects.get(index=index)) except RapidWords.DoesNotExist: - print(f"ERROR: Slug {wf.slug} is annotated with nonexistent {index} RW index") + print( + f"ERROR: Slug {wf.slug} is annotated with nonexistent {index} RW index" + ) if "wn_domains" in wf.linguist_info: for wn in wf.linguist_info["wn_domains"]: @@ -36,11 +41,14 @@ def migrate_from_linguistinfo(apps, schema_editor): # which stand for ADJ, ADJ_SAT, ADV, NOUN, VERB) # - entry annotated with a non-canonical lemma. Use the canonical lemma appearing in # "name" in our wordnet instance site. - print(f"ERROR: Slug {wf.slug} is annotated with nonexistent {wn.strip()} WN domain") + print( + f"ERROR: Slug {wf.slug} is annotated with nonexistent {wn.strip()} WN domain" + ) if normalized_name: - synset, _ = WordNetSynset.objects.get_or_create(name=normalized_name) + synset, _ = WordNetSynset.objects.get_or_create( + name=normalized_name + ) wf.synsets.add(synset) - class Migration(migrations.Migration): @@ -49,6 +57,4 @@ class Migration(migrations.Migration): ("lexicon", "0015_auto_20241128_2351"), ] - operations = [ - migrations.RunPython(migrate_from_linguistinfo) - ] + operations = [migrations.RunPython(migrate_from_linguistinfo)] diff --git a/src/morphodict/lexicon/models.py b/src/morphodict/lexicon/models.py index 65b248d05..825c76e92 100644 --- a/src/morphodict/lexicon/models.py +++ b/src/morphodict/lexicon/models.py @@ -46,6 +46,7 @@ def __init__(self, *args, **kwargs): kwargs = {**kwargs, "ensure_ascii": False} super().__init__(*args, **kwargs) + class RapidWords(models.Model): index = 
models.CharField(max_length=MAX_WORDFORM_LENGTH, primary_key=True) domain = models.CharField(max_length=MAX_TEXT_LENGTH) @@ -53,12 +54,14 @@ class RapidWords(models.Model): def __str__(self): return self.index + class WordNetSynset(models.Model): name = models.CharField(max_length=MAX_TEXT_LENGTH, primary_key=True) def __str__(self): return self.name + class Wordform(models.Model): # Queries always do .select_related("lemma"): objects = WordformLemmaManager() @@ -165,7 +168,7 @@ class Meta: # - affix tree intialization # - sitemap generation models.Index(fields=["is_lemma", "text"]), - models.Index(fields=["slug"]) + models.Index(fields=["slug"]), ] def __str__(self): diff --git a/src/morphodict/search/__init__.py b/src/morphodict/search/__init__.py index a1072b940..f28c7cc38 100644 --- a/src/morphodict/search/__init__.py +++ b/src/morphodict/search/__init__.py @@ -4,6 +4,7 @@ from .query import Query from .wordnet import WordnetEntry + def search_with_affixes( query: str, include_auto_definitions=False, inflect_english_phrases=False ) -> SearchResults: @@ -38,10 +39,10 @@ def api_search( inflect_english_phrases=inflect_english_phrases, ).serialized_presentation_results() -def wordnet_search( - query:str) -> list[tuple[WordnetEntry, SearchResults]] | None : + +def wordnet_search(query: str) -> list[tuple[WordnetEntry, SearchResults]] | None: # If we are doing an english simple phrase search_query = Query(query) if search_query.wn: return wordnet_runner(search_query) - return None \ No newline at end of file + return None diff --git a/src/morphodict/search/lemma_freq.py b/src/morphodict/search/lemma_freq.py index 3a9eb8aea..7373b2c9f 100644 --- a/src/morphodict/search/lemma_freq.py +++ b/src/morphodict/search/lemma_freq.py @@ -16,7 +16,7 @@ def load_lemma_data(): # we want to normalize the lemma frequency # so I found the max of 32334 # and now we divide by that - LEMMA_FREQUENCY[l] = int(l_freq) #/ max + LEMMA_FREQUENCY[l] = int(l_freq) # / max def 
get_lemma_freq(search_results): diff --git a/src/morphodict/search/lookup.py b/src/morphodict/search/lookup.py index 6c3361c87..0d8f1aafe 100644 --- a/src/morphodict/search/lookup.py +++ b/src/morphodict/search/lookup.py @@ -28,7 +28,7 @@ def fetch_results(query: core.Query, search_results: core.SearchResults): fetch_results_from_target_language_keywords(query, search_results) fetch_results_from_source_language_keywords(query, search_results) - # Then we proceed to analyze the query, if successfull, we look for those + # Then we proceed to analyze the query, if successfull, we look for those # entries in the dictionary that share the analysis with the FST result. # This introduces source-level spelling relaxation if the FST supports it. diff --git a/src/morphodict/search/pos_matches.py b/src/morphodict/search/pos_matches.py index 6abb4ef1c..bbe18ba93 100644 --- a/src/morphodict/search/pos_matches.py +++ b/src/morphodict/search/pos_matches.py @@ -4,7 +4,9 @@ from morphodict.analysis import rich_analyze_relaxed -def find_pos_matches(tag_source: EsptSearch | None, search_results: SearchResults) -> None: +def find_pos_matches( + tag_source: EsptSearch | None, search_results: SearchResults +) -> None: if not tag_source: return tags = tag_source.tags diff --git a/src/morphodict/search/runner.py b/src/morphodict/search/runner.py index bfe77bbec..29409f405 100644 --- a/src/morphodict/search/runner.py +++ b/src/morphodict/search/runner.py @@ -20,6 +20,7 @@ from morphodict.search.wordnet import WordNetSearch from morphodict.lexicon.models import Wordform + def search( query: str, include_affixes=True, @@ -92,15 +93,17 @@ def sort_by_cvd(r: Result): do_cvd_search(search_query, search_results) # If we did an english phrase search, we have to inflect back the results! 
- if (search_query.espt or inflect_english_phrases) and ( - len(initial_query_terms) > 1 - ) and espt_search: + if ( + (search_query.espt or inflect_english_phrases) + and (len(initial_query_terms) > 1) + and espt_search + ): espt_search.inflect_search_results() - # Annotate every entry in search results with the POS match when that is available + # Annotate every entry in search results with the POS match when that is available if espt_search: find_pos_matches(espt_search, search_results) - + # Annotate every entry with a frequency count from the glossary get_glossary_count(search_results) @@ -135,7 +138,8 @@ def is_almost_certainly_cree(query: Query, search_results: SearchResults) -> boo return False -def wordnet_search(query:Query) -> list[tuple[WordnetEntry,SearchResults]] | None : + +def wordnet_search(query: Query) -> list[tuple[WordnetEntry, SearchResults]] | None: wordnet_search = WordNetSearch(query) if len(wordnet_search.synsets) > 0: # Wordnet search was successful _at the wordnet level_ @@ -143,21 +147,18 @@ def wordnet_search(query:Query) -> list[tuple[WordnetEntry,SearchResults]] | Non results = [] for synset in wordnet_search.synsets: wn_results = SearchResults() - wn_results.sort_function = lambda x: 0-x.lemma_freq if x.lemma_freq else 0 + wn_results.sort_function = lambda x: 0 - x.lemma_freq if x.lemma_freq else 0 wordforms = synset.wordforms.all() - if wordforms.count() > 0 : + if wordforms.count() > 0: for wordform in wordforms: - r = Result( - wordform, - target_language_wordnet_match=[synset.name] - ) + r = Result(wordform, target_language_wordnet_match=[synset.name]) wn_results.add_result(r) wn_entry = WordnetEntry(synset.name) wn_entry.original_str = " ".join(query.query_terms) get_lemma_freq(wn_results) for result in wn_results.unsorted_results(): result.relevance_score = result.lemma_freq - results.append((wn_entry,wn_results)) + results.append((wn_entry, wn_results)) return results - return None \ No newline at end of file + return None 
diff --git a/src/morphodict/search/types.py b/src/morphodict/search/types.py index 97c1dd6e8..60530a400 100644 --- a/src/morphodict/search/types.py +++ b/src/morphodict/search/types.py @@ -234,7 +234,6 @@ def create_related_result(self, new_wordform, **kwargs): target_language_wordnet_match: list[str] = field(default_factory=list) - def features(self): ret = {} for field in dataclasses.fields(Result): @@ -287,63 +286,74 @@ def __lt__(self, other: Result): def __str__(self): return f"Result" -format_regexp = r'^\s*\((?P<pos>\w+)\)\s+(?P<stem>.+)\s*\#\s*(?P<num>\d+)\s*\Z' + +format_regexp = r"^\s*\((?P<pos>\w+)\)\s+(?P<stem>.+)\s*\#\s*(?P<num>\d+)\s*\Z" + def wordnet_for_nltk(keyword: str) -> str: matches = re.match(format_regexp, keyword) if matches: - return "_".join([x for x in matches['stem'].split(" ") if x])+'.'+matches['pos']+'.'+matches['num'] + return ( + "_".join([x for x in matches["stem"].split(" ") if x]) + + "." + + matches["pos"] + + "." + + matches["num"] + ) return keyword + class WordnetEntry: synset: Synset - original_str : str - def __init__ (self, entry:str | Synset): + original_str: str + + def __init__(self, entry: str | Synset): if isinstance(entry, str): self.synset = wn.synset(wordnet_for_nltk(entry)) self.original_str = entry else: self.synset = entry self.original_str = entry.name() - - def __str__ (self): + + def __str__(self): data = self.synset.name().split(".") entry = ".".join(data[0:-2]) return f"({data[-2]}) {entry}#{int(data[-1])}" - - def hyponyms(self) -> list[WordnetEntry] : + + def hyponyms(self) -> list[WordnetEntry]: return produce_entries(self.original_str, self.synset.hyponyms()) - - def hypernyms(self) -> list[WordnetEntry] : + + def hypernyms(self) -> list[WordnetEntry]: return produce_entries(self.original_str, self.synset.hyponyms()) - + def member_holonyms(self) -> list[WordnetEntry]: return produce_entries(self.original_str, self.synset.member_holonyms()) - + def definition(self) -> str: return self.synset.definition() - + def heading(self) -> str:
- return self.synset.lemmas()[0].name()+f" ({self.synset.pos()})" - + return self.synset.lemmas()[0].name() + f" ({self.synset.pos()})" + def pos(self) -> str: return self.synset.pos() - + def synonyms(self) -> list[str]: return [" ".join(l.name().split("_")) for l in self.synset.lemmas()] - + def lemmas(self) -> list[WNLemma]: return self.synset.lemmas() def ranking(self) -> int: return sum([l.count() for l in self.synset.lemmas()]) - + def nltk_name(self) -> str: return self.synset.name() - -def produce_entries(origin: str, entries:Iterable[Synset]) -> list[WordnetEntry]: + + +def produce_entries(origin: str, entries: Iterable[Synset]) -> list[WordnetEntry]: ans = [WordnetEntry(e) for e in entries] for e in ans: e.original_str = origin - return ans \ No newline at end of file + return ans diff --git a/src/morphodict/search/wordnet.py b/src/morphodict/search/wordnet.py index cdb934565..b4d8e1f62 100644 --- a/src/morphodict/search/wordnet.py +++ b/src/morphodict/search/wordnet.py @@ -1,17 +1,23 @@ from morphodict.search.types import produce_entries, WordnetEntry from morphodict.search.query import Query from nltk.corpus import wordnet -from morphodict.lexicon.models import ( - WordNetSynset -) +from morphodict.lexicon.models import WordNetSynset + + class WordNetSearch: - synsets : list[WordNetSynset] - def __init__(self, query:Query) : + synsets: list[WordNetSynset] + + def __init__(self, query: Query): canonical_query = "_".join(query.query_terms) - self.synsets = list(WordNetSynset.objects.filter( - name__in=produce_entries(" ".join(query.query_terms), wordnet.synsets(canonical_query)) - )) - def ranking(synset: WordNetSynset) -> int : + self.synsets = list( + WordNetSynset.objects.filter( + name__in=produce_entries( + " ".join(query.query_terms), wordnet.synsets(canonical_query) + ) + ) + ) + + def ranking(synset: WordNetSynset) -> int: return WordnetEntry(synset.name).ranking() - + self.synsets.sort(key=ranking, reverse=True) diff --git 
a/src/morphodict/search/wordnet_test.py b/src/morphodict/search/wordnet_test.py index f870102c3..a27f98cbc 100644 --- a/src/morphodict/search/wordnet_test.py +++ b/src/morphodict/search/wordnet_test.py @@ -1,24 +1,28 @@ from morphodict.search import wordnet_search from morphodict.search.types import Result + def test_wordnet_fail(db): search_results = wordnet_search(query="failingsearchinwordnet wn:1") assert search_results is None + def test_wordnet_empty(db): search_results = wordnet_search(query="wn:1") assert search_results is None + def test_wordnet_success(db): search_results = wordnet_search(query="see wn:1") - + assert len(search_results) > 1 for wn_entry, results in search_results: assert len(results.sorted_results()) > 0 + def test_wordnet_space_success(db): search_results = wordnet_search(query="Ursa Major wn:1") - + assert len(search_results) > 0 for wn_entry, results in search_results: - assert len(results.sorted_results()) > 0 \ No newline at end of file + assert len(results.sorted_results()) > 0 diff --git a/src/morphodict/site/settings.py b/src/morphodict/site/settings.py index 156909b26..d783cb0b6 100644 --- a/src/morphodict/site/settings.py +++ b/src/morphodict/site/settings.py @@ -418,4 +418,4 @@ def defaultDatabasePath(): ) # Location of wordnet data for nltk -nltk_data.path = [ BASE_DIR / ".." / "morphodict" / "resources" / "nltk_data" ] \ No newline at end of file +nltk_data.path = [BASE_DIR / ".." / "morphodict" / "resources" / "nltk_data"]