Major dictionary fixes (#165)
* update dictionary builds to include testing against a clean dictionary (see the sketch below)
* fix tests
* force *_include files to be lower case
barrust authored Dec 28, 2023
1 parent f2e3a61 commit 82784ca
Showing 24 changed files with 1,620,241 additions and 646,862 deletions.
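At its core, the change gives each `clean_*` function a new `filepath_dictionary` argument and cross-checks the scraped frequency list against a known-good word list: words absent from the dictionary are dropped, and dictionary words absent from the list are seeded at `MINIMUM_FREQUENCY`. A minimal, self-contained sketch of that cross-check (illustrative names, not the repository's exact code):

```python
# Sketch of the clean-dictionary cross-check this commit adds to every
# clean_* function (illustrative only; the real diff follows below).
from collections import Counter

MINIMUM_FREQUENCY = 50  # same frequency floor the build script uses


def cross_check(word_frequency: Counter, dictionary_words: set) -> Counter:
    # Drop words the known-good dictionary does not contain.
    for word in [w for w in word_frequency if w not in dictionary_words]:
        word_frequency.pop(word)
    # Seed dictionary words the frequency list is missing.
    for word in dictionary_words:
        if word not in word_frequency:
            word_frequency[word] = MINIMUM_FREQUENCY
    return word_frequency


print(cross_check(Counter({"apple": 120, "zzqx": 9001}), {"apple", "plum"}))
# Counter({'apple': 120, 'plum': 50})
```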
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
# pyspellchecker

+## Version 0.7.4
+
+* Leveraged the dictionary files from [levidromelist](https://www.levidromelist.com/levidrome-list/dictionary) to clean up the `en`, `es`, `fr`, `pt`, `de`, and `nl` dictionaries; attempts to resolve issues #164, #155, #150, #140, #115, and #107; see [issue #126](https://github.com/barrust/pyspellchecker/issues/126)
+
## Version 0.7.3

* Remove relative imports in favor of absolute imports
189 changes: 154 additions & 35 deletions scripts/build_dictionary.py
@@ -22,13 +22,12 @@
required.
"""
import contextlib
-import json
import gzip
+import json
import os
import string
from collections import Counter


STRING_PUNCTUATION = tuple(string.punctuation)
DIGETS = tuple(string.digits)
MINIMUM_FREQUENCY = 50
@@ -80,10 +79,10 @@ def build_word_frequency(filepath, language, output_path):
"""
# NLTK is only needed in this portion of the project
try:
+        import nltk
        from nltk.tag import pos_tag
        from nltk.tokenize import WhitespaceTokenizer
        from nltk.tokenize.toktok import ToktokTokenizer
-        import nltk
except ImportError as ex:
        raise ImportError("To build a dictionary from scratch, NLTK is required!\n{}".format(ex))
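For context on the imports reordered here: `build_word_frequency` tokenizes raw text with NLTK and tallies counts (it also imports `pos_tag`, presumably for part-of-speech filtering). A hedged sketch of the counting idea only, not the function's full logic:

```python
# Hedged sketch of NLTK-based frequency counting; the real
# build_word_frequency does considerably more than this.
from collections import Counter

from nltk.tokenize import WhitespaceTokenizer


def sketch_word_frequency(text: str) -> Counter:
    # Split on whitespace, lowercase, and tally occurrences.
    tokens = WhitespaceTokenizer().tokenize(text.lower())
    return Counter(tokens)


print(sketch_word_frequency("the cat saw the dog"))
# Counter({'the': 2, 'cat': 1, 'saw': 1, 'dog': 1})
```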

@@ -140,7 +139,7 @@ def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
file.write("\n")


-def clean_english(word_frequency, filepath_exclude, filepath_include):
+def clean_english(word_frequency, filepath_exclude, filepath_include, filepath_dictionary):
"""Clean an English word frequency list
Args:
@@ -243,19 +242,38 @@ def clean_english(word_frequency, filepath_exclude, filepath_include):
if line in word_frequency:
word_frequency.pop(line)

+    # Use a dictionary to clean up everything else...
+    final_words_to_remove = []
+    with load_file(filepath_dictionary) as fobj:
+        dictionary_words = []
+        for line in fobj:
+            if line[0] in letters and line.islower():
+                line = line.strip()
+                dictionary_words.append(line)
+
+    for word in word_frequency:
+        if word not in dictionary_words:
+            final_words_to_remove.append(word)
+    for word in final_words_to_remove:
+        word_frequency.pop(word)
+
+    for word in dictionary_words:
+        if word not in word_frequency:
+            word_frequency[word] = MINIMUM_FREQUENCY
+
# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
-            line = line.strip()
+            line = line.strip().lower()
            if line in word_frequency:
-                print("{} is already found in the dictionary! Skipping!")
+                print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency
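A design note on the new block above: `dictionary_words` is a list, so each `word not in dictionary_words` check is a linear scan, and the filtering pass is effectively quadratic for large inputs. A sketch of a set-based loader that would make each lookup O(1) (an alternative, not what this commit ships):

```python
import string

# Sketch: load the clean dictionary into a set so each membership
# test is O(1) instead of a scan over a list (alternative approach,
# not the commit's actual code).
LETTERS = set(string.ascii_lowercase)


def load_dictionary_words(path: str) -> set:
    words = set()
    with open(path, encoding="utf-8") as fobj:
        for line in fobj:
            line = line.strip()
            if line and line[0] in LETTERS and line.islower():
                words.add(line)
    return words
```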


-def clean_spanish(word_frequency, filepath_exclude, filepath_include):
+def clean_spanish(word_frequency, filepath_exclude, filepath_include, filepath_dictionary):
"""Clean a Spanish word frequency list
Args:
@@ -336,19 +354,38 @@ def clean_spanish(word_frequency, filepath_exclude, filepath_include):
if line in word_frequency:
word_frequency.pop(line)

+    # Use a dictionary to clean up everything else...
+    final_words_to_remove = []
+    with load_file(filepath_dictionary) as fobj:
+        dictionary_words = []
+        for line in fobj:
+            if line[0] in letters and line.islower():
+                line = line.strip()
+                dictionary_words.append(line)
+
+    for word in word_frequency:
+        if word not in dictionary_words:
+            final_words_to_remove.append(word)
+    for word in final_words_to_remove:
+        word_frequency.pop(word)
+
+    for word in dictionary_words:
+        if word not in word_frequency:
+            word_frequency[word] = MINIMUM_FREQUENCY
+
# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
-            line = line.strip()
+            line = line.strip().lower()
            if line in word_frequency:
-                print("{} is already found in the dictionary! Skipping!")
+                print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency


-def clean_german(word_frequency, filepath_exclude, filepath_include):
+def clean_german(word_frequency, filepath_exclude, filepath_include, filepath_dictionary):
"""Clean a German word frequency list
Args:
@@ -393,19 +430,38 @@ def clean_german(word_frequency, filepath_exclude, filepath_include):
if line in word_frequency:
word_frequency.pop(line)

+    # Use a dictionary to clean up everything else...
+    final_words_to_remove = []
+    with load_file(filepath_dictionary) as fobj:
+        dictionary_words = []
+        for line in fobj:
+            if line[0] in letters and line.islower():
+                line = line.strip()
+                dictionary_words.append(line)
+
+    for word in word_frequency:
+        if word not in dictionary_words:
+            final_words_to_remove.append(word)
+    for word in final_words_to_remove:
+        word_frequency.pop(word)
+
+    for word in dictionary_words:
+        if word not in word_frequency:
+            word_frequency[word] = MINIMUM_FREQUENCY
+
# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
-            line = line.strip()
+            line = line.strip().lower()
            if line in word_frequency:
-                print("{} is already found in the dictionary! Skipping!")
+                print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency


-def clean_french(word_frequency, filepath_exclude, filepath_include):
+def clean_french(word_frequency, filepath_exclude, filepath_include, filepath_dictionary):
"""Clean a French word frequency list
Args:
@@ -450,19 +506,38 @@ def clean_french(word_frequency, filepath_exclude, filepath_include):
if line in word_frequency:
word_frequency.pop(line)

+    # Use a dictionary to clean up everything else...
+    final_words_to_remove = []
+    with load_file(filepath_dictionary) as fobj:
+        dictionary_words = []
+        for line in fobj:
+            if line[0] in letters and line.islower():
+                line = line.strip()
+                dictionary_words.append(line)
+
+    for word in word_frequency:
+        if word not in dictionary_words:
+            final_words_to_remove.append(word)
+    for word in final_words_to_remove:
+        word_frequency.pop(word)
+
+    for word in dictionary_words:
+        if word not in word_frequency:
+            word_frequency[word] = MINIMUM_FREQUENCY
+
# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
-            line = line.strip()
+            line = line.strip().lower()
            if line in word_frequency:
-                print("{} is already found in the dictionary! Skipping!")
+                print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency


-def clean_portuguese(word_frequency, filepath_exclude, filepath_include):
+def clean_portuguese(word_frequency, filepath_exclude, filepath_include, filepath_dictionary):
"""Clean a Portuguese word frequency list
Args:
@@ -507,12 +582,31 @@ def clean_portuguese(word_frequency, filepath_exclude, filepath_include):
if line in word_frequency:
word_frequency.pop(line)

+    # Use a dictionary to clean up everything else...
+    final_words_to_remove = []
+    with load_file(filepath_dictionary, encoding="latin-1") as fobj:
+        dictionary_words = []
+        for line in fobj:
+            if line[0] in letters and line.islower():
+                line = line.strip()
+                dictionary_words.append(line)
+
+    for word in word_frequency:
+        if word not in dictionary_words:
+            final_words_to_remove.append(word)
+    for word in final_words_to_remove:
+        word_frequency.pop(word)
+
+    for word in dictionary_words:
+        if word not in word_frequency:
+            word_frequency[word] = MINIMUM_FREQUENCY
+
# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
-            line = line.strip()
+            line = line.strip().lower()
            if line in word_frequency:
-                print("{} is already found in the dictionary! Skipping!")
+                print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY
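Note that only the Portuguese dictionary is opened with `encoding="latin-1"`, which implies `load_file` accepts an optional encoding. The helper itself is outside this diff; judging from the script's imports (`contextlib`, `gzip`) and the call sites here, it plausibly looks something like this (a guess, not the repository's code):

```python
import contextlib
import gzip

# Guess at load_file's shape based on its call sites; the real helper
# lives outside this diff.
@contextlib.contextmanager
def load_file(filepath, encoding="utf-8"):
    """Open a plain or gzipped text file and yield the file object."""
    if filepath.endswith(".gz"):
        with gzip.open(filepath, mode="rt", encoding=encoding) as fobj:
            yield fobj
    else:
        with open(filepath, encoding=encoding) as fobj:
            yield fobj
```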

@@ -586,9 +680,9 @@ def clean_russian(word_frequency, filepath_exclude, filepath_include):
# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
-            line = line.strip()
+            line = line.strip().lower()
            if line in word_frequency:
-                print("{} is already found in the dictionary! Skipping!")
+                print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

@@ -643,9 +737,9 @@ def clean_arabic(word_frequency, filepath_exclude, filepath_include):
# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
-            line = line.strip()
+            line = line.strip().lower()
            if line in word_frequency:
-                print("{} is already found in the dictionary! Skipping!")
+                print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

@@ -700,9 +794,9 @@ def clean_basque(word_frequency, filepath_exclude, filepath_include):
# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
-            line = line.strip()
+            line = line.strip().lower()
            if line in word_frequency:
-                print("{} is already found in the dictionary! Skipping!")
+                print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

@@ -785,16 +879,16 @@ def clean_latvian(word_frequency, filepath_exclude, filepath_include):
# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
-            line = line.strip()
+            line = line.strip().lower()
            if line in word_frequency:
-                print("{} is already found in the dictionary! Skipping!")
+                print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency


-def clean_dutch(word_frequency, filepath_exclude, filepath_include):
+def clean_dutch(word_frequency, filepath_exclude, filepath_include, filepath_dictionary):
"""Clean a Dutch word frequency list
Args:
@@ -897,12 +991,31 @@ def clean_dutch(word_frequency, filepath_exclude, filepath_include):
if line in word_frequency:
word_frequency.pop(line)

+    # Use a dictionary to clean up everything else...
+    final_words_to_remove = []
+    with load_file(filepath_dictionary) as fobj:
+        dictionary_words = []
+        for line in fobj:
+            if line[0] in letters and line.islower():
+                line = line.strip()
+                dictionary_words.append(line)
+
+    for word in word_frequency:
+        if word not in dictionary_words:
+            final_words_to_remove.append(word)
+    for word in final_words_to_remove:
+        word_frequency.pop(word)
+
+    for word in dictionary_words:
+        if word not in word_frequency:
+            word_frequency[word] = MINIMUM_FREQUENCY
+
# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
-            line = line.strip()
+            line = line.strip().lower()
            if line in word_frequency:
-                print("{} is already found in the dictionary! Skipping!")
+                print("{} is already found in the dictionary! Skipping!".format(line))
else:
word_frequency[line] = MINIMUM_FREQUENCY

@@ -984,15 +1097,20 @@ def _parse_args():

# clean up the dictionary
if args.language == "en":
-        word_frequency = clean_english(word_frequency, exclude_filepath, include_filepath)
+        dict_path = os.path.abspath("{}/levidromelist-dicts/american-english-large.txt".format(data_path))
+        word_frequency = clean_english(word_frequency, exclude_filepath, include_filepath, dict_path)
    elif args.language == "es":
-        word_frequency = clean_spanish(word_frequency, exclude_filepath, include_filepath)
+        dict_path = os.path.abspath("{}/levidromelist-dicts/spanish.txt".format(data_path))
+        word_frequency = clean_spanish(word_frequency, exclude_filepath, include_filepath, dict_path)
    elif args.language == "de":
-        word_frequency = clean_german(word_frequency, exclude_filepath, include_filepath)
+        dict_path = os.path.abspath("{}/levidromelist-dicts/new_german.txt".format(data_path))
+        word_frequency = clean_german(word_frequency, exclude_filepath, include_filepath, dict_path)
    elif args.language == "fr":
-        word_frequency = clean_french(word_frequency, exclude_filepath, include_filepath)
+        dict_path = os.path.abspath("{}/levidromelist-dicts/french.txt".format(data_path))
+        word_frequency = clean_french(word_frequency, exclude_filepath, include_filepath, dict_path)
    elif args.language == "pt":
-        word_frequency = clean_portuguese(word_frequency, exclude_filepath, include_filepath)
+        dict_path = os.path.abspath("{}/levidromelist-dicts/portuguese.txt".format(data_path))
+        word_frequency = clean_portuguese(word_frequency, exclude_filepath, include_filepath, dict_path)
elif args.language == "ru":
word_frequency = clean_russian(word_frequency, exclude_filepath, include_filepath)
elif args.language == "ar":
@@ -1002,7 +1120,8 @@ def _parse_args():
elif args.language == "lv":
word_frequency = clean_latvian(word_frequency, exclude_filepath, include_filepath)
elif args.language == "nl":
-        word_frequency = clean_dutch(word_frequency, exclude_filepath, include_filepath)
+        dict_path = os.path.abspath("{}/levidromelist-dicts/dutch.txt".format(data_path))
+        word_frequency = clean_dutch(word_frequency, exclude_filepath, include_filepath, dict_path)

# export word frequency for review!
word_frequency_path = os.path.join(script_path, "{}.json".format(args.language))
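The `if`/`elif` chain above repeats the same two lines per language; a table-driven lookup would isolate the filenames in one place (a sketch of an alternative, not what the commit ships):

```python
import os
from typing import Optional

# Sketch: table-driven lookup for the per-language dictionary files
# (alternative to the if/elif chain; not the commit's code).
DICTIONARY_FILES = {
    "en": "american-english-large.txt",
    "es": "spanish.txt",
    "de": "new_german.txt",
    "fr": "french.txt",
    "pt": "portuguese.txt",
    "nl": "dutch.txt",
}


def dictionary_path(data_path: str, language: str) -> Optional[str]:
    filename = DICTIONARY_FILES.get(language)
    if filename is None:
        return None  # languages not given a clean dictionary in this commit
    return os.path.abspath("{}/levidromelist-dicts/{}".format(data_path, filename))
```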