From 1fa5e049b01997f0b310b2615ffdb7bed12c0648 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 17:30:19 +0800 Subject: [PATCH 01/42] show chirptext & puchikarui version --- jamdict/tools.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/jamdict/tools.py b/jamdict/tools.py index 9ee028a..13922cb 100755 --- a/jamdict/tools.py +++ b/jamdict/tools.py @@ -12,6 +12,8 @@ import json import logging +from chirptext import __version__ as chirptext_version +from puchikarui import __version__ as puchikarui_version from chirptext import confirm, TextReport, Timer from chirptext.cli import CLIApp, setup_logging @@ -211,7 +213,10 @@ def show_info(cli, args): print(e) output.print("Error happened while retrieving database meta data") output.header("Others") - output.print(f"lxml availability: {jamdict.jmdict._LXML_AVAILABLE}") + output.print(f"puchikarui: version {puchikarui_version}") + output.print(f"chirptext : version {chirptext_version}") + output.print(f"lxml : {jamdict.jmdict._LXML_AVAILABLE}") + def show_version(cli, args): From c910bea3a12d2f560f219b259fb7dde4aa0af59b Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 17:31:42 +0800 Subject: [PATCH 02/42] easier version config --- jamdict/__init__.py | 2 +- jamdict/__version__.py | 26 +++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/jamdict/__init__.py b/jamdict/__init__.py index 4d5fe45..0fa1d2c 100644 --- a/jamdict/__init__.py +++ b/jamdict/__init__.py @@ -45,7 +45,7 @@ from . 
import __version__ as version_info from .__version__ import __author__, __email__, __copyright__, __maintainer__ from .__version__ import __credits__, __license__, __description__, __url__ -from .__version__ import __version_major__, __version_long__, __version__, __status__ +from .__version__ import __version__, __version_long__, __status__ from .jmdict_sqlite import JMDictSQLite from .kanjidic2_sqlite import KanjiDic2SQLite diff --git a/jamdict/__version__.py b/jamdict/__version__.py index da3fa26..baedb11 100644 --- a/jamdict/__version__.py +++ b/jamdict/__version__.py @@ -9,7 +9,27 @@ __description__ = "Python library for manipulating Jim Breen's JMdict, KanjiDic2, KRADFILE and JMnedict" __url__ = "https://github.com/neocl/jamdict" __maintainer__ = "Le Tuan Anh" -__version_major__ = "0.1" -__version__ = "{}a10".format(__version_major__) -__version_long__ = "{} - Alpha 10".format(__version_major__) +# ------------------------------------------------------------------------------ +# Version configuration (enforcing PEP 440) +# ------------------------------------------------------------------------------ __status__ = "3 - Alpha" +__version_tuple__ = (0, 1, 0, 10, 4) +__version_status__ = '' # a specific value ('rc', 'dev', etc.) 
or leave blank to be auto-filled +# ------------------------------------------------------------------------------ +__status_map__ = {'3 - Alpha': 'a', '4 - Beta': 'b', '5 - Production/Stable': '', '6 - Mature': ''} +if not __version_status__: + __version_status__ = __status_map__[__status__] +if len(__version_tuple__) == 3: + __version_build__ = '' +elif len(__version_tuple__) == 4: + __version_build__ = f"{__version_tuple__[3]}" +elif len(__version_tuple__) == 5: + __version_build__ = f"{__version_tuple__[3]}.post{__version_tuple__[4]}" +else: + raise ValueError("Invalid version information") +if __version_tuple__[2] == 0: + __version_main__ = f"{'.'.join(str(n) for n in __version_tuple__[:2])}" +else: + __version_main__ = f"{'.'.join(str(n) for n in __version_tuple__[:3])}" +__version__ = f"{__version_main__}{__version_status__}{__version_build__}" +__version_long__ = f"{__version_main__} - {__status__.split('-')[1].strip()} {__version_build__}" From 0b544ada34074f5fac8eafeda364acf0e66ac57a Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 17:32:11 +0800 Subject: [PATCH 03/42] support both chirptext 0.1.x and 0.2.x --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8d4d1b0..77e676c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -chirptext >= 0.1, < 0.2 +chirptext >= 0.1, <= 0.2 puchikarui >= 0.2a1, < 0.3 From 7c9f276f5f47b32b40efd94c44265dfe94b6b509 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 21:28:23 +0800 Subject: [PATCH 04/42] info shows jamdict_home when ~ is used --- jamdict/tools.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/jamdict/tools.py b/jamdict/tools.py index 13922cb..a68f2c5 100755 --- a/jamdict/tools.py +++ b/jamdict/tools.py @@ -179,15 +179,17 @@ def show_info(cli, args): jam = get_jam(cli, args) output.header("Basic configuration") jamdict_home = jamdict.config.home_dir() - 
if not os.path.isdir(jamdict_home): + if not os.path.isdir(os.path.expanduser(jamdict_home)): jamdict_home += " [Missing]" - output.print(f"JAMDICT_HOME : {jamdict_home}") + else: + jamdict_home += " [OK]" + output.print(f"JAMDICT_HOME: {jamdict_home}") if jamdict.util._JAMDICT_DATA_AVAILABLE: import jamdict_data data_pkg = f"version {jamdict_data.__version__} [OK]" else: data_pkg = "Not installed" - output.print(f"jamdict-data : {data_pkg}") + output.print(f"jamdict-data: {data_pkg}") if args.config: _config_path = args.config + " [Custom]" if not os.path.isfile(args.config): @@ -196,13 +198,13 @@ def show_info(cli, args): _config_path = jamdict.config._get_config_manager().locate_config() if not _config_path: _config_path = "Not available.\n Run `python3 -m jamdict config` to create configuration file if needed." - output.print(f"Config file location: {_config_path}") + output.print(f"Config file : {_config_path}") output.header("Data files") output.print(f"Jamdict DB location: {jam.db_file} - {file_status(jam.db_file)}") output.print(f"JMDict XML file : {jam.jmd_xml_file} - {file_status(jam.jmd_xml_file)}") output.print(f"KanjiDic2 XML file : {jam.kd2_xml_file} - {file_status(jam.kd2_xml_file)}") - output.print(f"JMnedict XML file : {jam.jmnedict_xml_file} - {file_status(jam.jmnedict_xml_file)}") + output.print(f"JMnedict XML file : {jam.jmnedict_xml_file} - {file_status(jam.jmnedict_xml_file)}") if jam.ready: output.header("Jamdict database metadata") From ac9ee192752bda9246375f568222af4b6d3e3277 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 21:30:41 +0800 Subject: [PATCH 05/42] search_ne bug --- jamdict/jmnedict_sqlite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jamdict/jmnedict_sqlite.py b/jamdict/jmnedict_sqlite.py index 5c582f5..6628906 100644 --- a/jamdict/jmnedict_sqlite.py +++ b/jamdict/jmnedict_sqlite.py @@ -80,7 +80,7 @@ def search_ne(self, query, ctx=None, **kwargs): # ensure context if ctx is None: with 
self.ctx() as ctx: - return self.search(query, ctx=ctx) + return self.search_ne(query, ctx=ctx) _is_wildcard_search = '_' in query or '@' in query or '%' in query if _is_wildcard_search: where = "idseq IN (SELECT idseq FROM NEKanji WHERE text like ?) OR idseq IN (SELECT idseq FROM NEKana WHERE text like ?) OR idseq IN (SELECT idseq FROM NETranslation JOIN NETransGloss ON NETranslation.ID == NETransGloss.tid WHERE NETransGloss.text like ?) OR idseq IN (SELECT idseq FROM NETranslation JOIN NETransType ON NETranslation.ID == NETransType.tid WHERE NETransType.text like ?)" From c3a1ad642038f1ac76216a6a361d19a896ebd1cf Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 21:30:57 +0800 Subject: [PATCH 06/42] use buckmode to import data & fix db path issues --- jamdict/util.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/jamdict/util.py b/jamdict/util.py index 320e2f7..0ed19dd 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -248,7 +248,7 @@ def __make_db_ctx(self): try: if not self.reuse_ctx: return self.jmdict.ctx() - elif self.__jm_ctx is None and self.db_file and os.path.isfile(self.db_file): + elif self.__jm_ctx is None and self.db_file and (self.db_file == ":memory:" or os.path.isfile(self.db_file)): self.__jm_ctx = self.jmdict.ctx() except Exception: getLogger().warning("JMdict data could not be accessed.") @@ -397,25 +397,31 @@ def is_available(self): def import_data(self): ''' Import JMDict and KanjiDic2 data from XML to SQLite ''' + ctx = self.__make_db_ctx() + ctx.buckmode() if self.jmdict and self.jmdict_xml: getLogger().info("Importing JMDict data") - self.jmdict.insert_entries(self.jmdict_xml, ctx=self.__make_db_ctx()) + self.jmdict.insert_entries(self.jmdict_xml, ctx=ctx) # import KanjiDic2 if self.kd2 is not None and self.kd2_xml and os.path.isfile(self.kd2_xml_file): getLogger().info("Importing KanjiDic2 data") - if self.jmdict is not None and id(self.kd2) == id(self.jmdict): - 
self.kd2.insert_chars(self.kd2_xml, ctx=self.__make_db_ctx()) + if self.jmdict is not None and self.kd2_file == self.db_file: + self.jmdict.insert_chars(self.kd2_xml, ctx=ctx) else: - self.kd2.insert_chars(self.kd2_xml) + getLogger().warning(f"Building Kanjidic2 DB using a different DB context {self.kd2_file} vs {self.db_file}") + with self.kd2.ctx() as kd_ctx: + self.kd2.insert_chars(self.kd2_xml, ctx=kd_ctx) else: getLogger().warning("KanjiDic2 XML data is not available - skipped!") # import JMNEdict if self.jmnedict is not None and self.jmne_xml and os.path.isfile(self.jmnedict_xml_file): getLogger().info("Importing JMNEdict data") - if self.jmdict is not None and id(self.jmnedict) == id(self.jmdict): - self.jmnedict.insert_name_entities(self.jmne_xml, ctx=self.__make_db_ctx()) + if self.jmdict is not None and self.jmnedict_file == self.db_file: + self.jmnedict.insert_name_entities(self.jmne_xml, ctx=ctx) else: - self.jmnedict.insert_name_entities(self.jmne_xml) + getLogger().warning(f"Building Kanjidic2 DB using a different DB context {self.jmne_file} vs {self.db_file}") + with self.jmnedict.ctx() as ne_ctx: + self.jmnedict.insert_name_entities(self.jmne_xml, ctx=ne_ctx) else: getLogger().warning("JMNEdict XML data is not available - skipped!") @@ -448,7 +454,8 @@ def get_entry(self, idseq): else: raise LookupError("There is no backend data available") - def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, lookup_ne=True, **kwargs): + def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, lookup_ne=True, + pos=None, name_type=None, **kwargs): ''' Search words, characters, and characters. 
Keyword arguments: @@ -496,7 +503,7 @@ def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, lookup if result is not None: chars.append(result) # lookup name-entities - if lookup_ne and self.has_jmne(): + if lookup_ne and self.has_jmne(ctx=ctx): names = self.jmnedict.search_ne(query, ctx=ctx) # finish return LookupResult(entries, chars, names) From 2cd84d6af2bc6c192f97f6899e7e8a435486f7e1 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 21:31:17 +0800 Subject: [PATCH 07/42] update test cases --- test/test_jamdict.py | 18 +++++++++++------- test/test_jmnedict.py | 38 +++++++++++++++++++++++++------------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/test/test_jamdict.py b/test/test_jamdict.py index f7c1064..e7685a3 100644 --- a/test/test_jamdict.py +++ b/test/test_jamdict.py @@ -118,7 +118,10 @@ def test_kanjidic2_json(self): def test_jamdict_xml(self): print("Test Jamdict search in XML files") - jam = Jamdict(jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, auto_config=False) + jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, + kd2_xml_file=MINI_KD2, + jmnedict_xml_file=MINI_JMNE, auto_config=True) + jam.import_data() result = jam.lookup('おみやげ') self.assertEqual(1, len(result.entries)) self.assertEqual(2, len(result.chars)) @@ -181,10 +184,16 @@ def test_home_dir(self): class TestJamdictSQLite(unittest.TestCase): + @classmethod + def tearDownClass(cls): + if os.path.isfile(TEST_DB): + os.unlink(TEST_DB) + def test_jamdict_sqlite_all(self): if os.path.isfile(TEST_DB): os.unlink(TEST_DB) - jam = Jamdict(db_file=TEST_DB, kd2_file=TEST_DB, jmnedict_file=TEST_DB, + TEST_DB.touch() + jam = Jamdict(db_file=TEST_DB, jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE) # Lookup using XML result = jam.jmdict_xml.lookup('おみやげ') @@ -197,11 +206,6 @@ def test_jamdict_sqlite_all(self): self.assertEqual(len(result.entries), 1) self.assertEqual(len(result.chars), 2) self.assertEqual({c.literal for c in 
result.chars}, {'土', '産'}) - - def test_memory_mode(self): - if not os.path.isfile(TEST_DB): - jam = Jamdict(db_file=TEST_DB, kd2_file=TEST_DB, jmnedict_file=TEST_DB, jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE) - jam.import_data() print("Test reading DB into RAM") ram_jam = Jamdict(TEST_DB, memory_mode=True) print("1st lookup") diff --git a/test/test_jmnedict.py b/test/test_jmnedict.py index 7d6853c..78e69fe 100644 --- a/test/test_jmnedict.py +++ b/test/test_jmnedict.py @@ -63,7 +63,8 @@ def test_ne_type_map(self): def test_jmne_support(self): ''' Test metadata ''' - with self.ramdb.ctx() as ctx: + ramdb = JMNEDictSQLite(":memory:", auto_expand_path=False) + with ramdb.ctx() as ctx: self.ramdb.insert_name_entities(self.xdb, ctx=ctx) m = ctx.meta.select_single('key=?', ('jmnedict.version',)) self.assertEqual(m.key, 'jmnedict.version') @@ -71,24 +72,25 @@ def test_jmne_support(self): def test_xml2ramdb(self): print("Testing XML to RAM") - with self.ramdb.ctx() as ctx: - self.ramdb.insert_name_entities(self.xdb, ctx=ctx) + ramdb = JMNEDictSQLite(":memory:", auto_expand_path=False) + with ramdb.ctx() as ctx: + ramdb.insert_name_entities(self.xdb, ctx=ctx) # all entries were inserted expected_idseqs = {int(e.idseq) for e in self.xdb} - inserted_idseqs = {e.idseq for e in self.ramdb.NEEntry.select(ctx=ctx)} + inserted_idseqs = {e.idseq for e in ramdb.NEEntry.select(ctx=ctx)} getLogger().info("Inserted entries: {}".format(len(inserted_idseqs))) self.assertEqual(expected_idseqs, inserted_idseqs) # make sure that the kanjis are inserted expected_kanjis = set() for e in self.xdb.entries: expected_kanjis.update(k.text for k in e.kanji_forms) - kanjis = {k.text for k in self.ramdb.NEKanji.select(ctx=ctx)} + kanjis = {k.text for k in ctx.NEKanji.select()} self.assertEqual(expected_kanjis, kanjis) # make sure that the kanas were inserted expected_readings = set() for e in self.xdb.entries: expected_readings.update(k.text for k in e.kana_forms) - 
readings = {k.text for k in self.ramdb.NEKana.select(ctx=ctx)} + readings = {k.text for k in ctx.NEKana.select()} self.assertEqual(expected_readings, readings) # make sure that the definitions were inserted expected_glosses = set() @@ -102,37 +104,47 @@ def test_xml2ramdb(self): ne_xml = self.xdb.lookup("id#{}".format(idseq))[0] ne_xml.idseq = int(ne_xml.idseq) getLogger().debug(ne_xml.to_json()) - ne = self.ramdb.get_ne(idseq, ctx=ctx) + ne = ramdb.get_ne(idseq, ctx=ctx) getLogger().debug(ne.to_json()) self.assertEqual(ne_xml.to_json(), ne.to_json()) # test search by idseq - shenron = self.ramdb.search_ne('id#5741815', ctx=ctx) + shenron = ramdb.search_ne('id#5741815', ctx=ctx) self.assertEqual(len(shenron), 1) self.assertEqual(shenron[0].idseq, 5741815) # test exact search - shenron2 = self.ramdb.search_ne('神龍', ctx=ctx) + shenron2 = ramdb.search_ne('神龍', ctx=ctx) self.assertEqual(len(shenron2), 1) self.assertEqual(shenron2[0].idseq, 5741815) # test search by kana - shenron3 = self.ramdb.search_ne('シェンロン', ctx=ctx) + shenron3 = ramdb.search_ne('シェンロン', ctx=ctx) self.assertEqual(len(shenron3), 1) self.assertEqual(shenron3[0].idseq, 5741815) # test search by definition - shenron4 = self.ramdb.search_ne('%spiritual%', ctx=ctx) + shenron4 = ramdb.search_ne('%spiritual%', ctx=ctx) self.assertEqual(len(shenron4), 1) self.assertEqual(shenron4[0].idseq, 5741815) # test search by wild card - all_shime_names = self.ramdb.search_ne('しめ%', ctx=ctx) + all_shime_names = ramdb.search_ne('しめ%', ctx=ctx) expected_idseqs = [5000001, 5000002, 5000003, 5000004, 5000005, 5000006, 5000007, 5000008, 5000009] actual = [x.idseq for x in all_shime_names] print(actual) self.assertEqual(expected_idseqs, actual) # test search by name_type - all_fems = self.ramdb.search_ne('person', ctx=ctx) + all_fems = ramdb.search_ne('person', ctx=ctx) expected_idseqs = [2831743, 5001644] actual = [x.idseq for x in all_fems] self.assertEqual(expected_idseqs, actual) + def test_query_netype(self): + ramdb = 
JMNEDictSQLite(":memory:", auto_expand_path=False) + ctx = ramdb.ctx() + ramdb.insert_name_entities(self.xdb, ctx=ctx) + shenron = ctx.search_ne('id#5741815', ctx=ctx)[0] + print(shenron) + print(shenron.to_json()) + + + # ------------------------------------------------------------------------------- # Main From ffd6029186b5b6e87738ddb4262cf181c88c00f8 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 23:04:42 +0800 Subject: [PATCH 08/42] update default logging settings --- test/logging.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/logging.json b/test/logging.json index 174c19f..9a914a0 100644 --- a/test/logging.json +++ b/test/logging.json @@ -44,7 +44,8 @@ "propagate": "no" } ,"test": { "level": "INFO" } - ,"puchikarui": { "level": "INFO" } + ,"jamdict.jmdict_sqlite": { "level": "INFO" } + ,"puchikarui": { "level": "ERROR" } }, "root": { From 61a23648d396c480c50687e7b9864cb61a3346fc Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 23:05:18 +0800 Subject: [PATCH 09/42] add all_ne_type method --- jamdict/jmnedict_sqlite.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/jamdict/jmnedict_sqlite.py b/jamdict/jmnedict_sqlite.py index 6628906..0ec3c21 100644 --- a/jamdict/jmnedict_sqlite.py +++ b/jamdict/jmnedict_sqlite.py @@ -76,6 +76,12 @@ class JMNEDictSQLite(JMNEDictSchema): def __init__(self, db_path, *args, **kwargs): super().__init__(db_path, *args, **kwargs) + def all_ne_type(self, ctx=None): + if ctx is None: + return self.all_ne_type(ctx=self.ctx()) + else: + return [x['text'] for x in ctx.select("SELECT DISTINCT text FROM NETransType")] + def search_ne(self, query, ctx=None, **kwargs): # ensure context if ctx is None: From 0aa9b03fdd0fb8cbe1a04bc964188777f8986b56 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 23:06:13 +0800 Subject: [PATCH 10/42] add all_pos() method and allow to lookup() by pos --- jamdict/jmdict_sqlite.py | 53 
++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/jamdict/jmdict_sqlite.py b/jamdict/jmdict_sqlite.py index 0762c82..a2b589a 100644 --- a/jamdict/jmdict_sqlite.py +++ b/jamdict/jmdict_sqlite.py @@ -110,29 +110,46 @@ def update_jmd_meta(self, version, url, ctx=None): ju.value = url ctx.meta.save(ju) - def search(self, query, ctx=None, **kwargs): + def all_pos(self, ctx=None): + if ctx is None: + return self.all_pos(ctx=self.ctx()) + else: + return [x['text'] for x in ctx.select("SELECT DISTINCT text FROM pos")] + + def search(self, query, ctx=None, pos=None, **kwargs): # ensure context if ctx is None: with self.ctx() as ctx: return self.search(query, ctx=ctx) - _is_wildcard_search = '_' in query or '@' in query or '%' in query - if _is_wildcard_search: - where = "idseq IN (SELECT idseq FROM Kanji WHERE text like ?) OR idseq IN (SELECT idseq FROM Kana WHERE text like ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text like ?)" - else: - where = "idseq IN (SELECT idseq FROM Kanji WHERE text == ?) OR idseq IN (SELECT idseq FROM Kana WHERE text == ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text == ?)" - getLogger().debug(where) - params = [query, query, query] - try: - if query.startswith('id#'): - query_int = int(query[3:]) - if query_int >= 0: - getLogger().debug("Searching by ID: {}".format(query_int)) - where = "idseq = ?" - params = [query_int] - except Exception: - pass + + where = [] + params = [] + if query.startswith('id#'): + query_int = int(query[3:]) + if query_int >= 0: + getLogger().debug("Searching by ID: {}".format(query_int)) + where.append("idseq = ?") + params.append(query_int) + elif query and query != "%": + _is_wildcard_search = '_' in query or '@' in query or '%' in query + if _is_wildcard_search: + where.append("(idseq IN (SELECT idseq FROM Kanji WHERE text like ?) 
OR idseq IN (SELECT idseq FROM Kana WHERE text like ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text like ?))") + else: + where.append("(idseq IN (SELECT idseq FROM Kanji WHERE text == ?) OR idseq IN (SELECT idseq FROM Kana WHERE text == ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text == ?))") + params += (query, query, query) + if pos: + if isinstance(pos, str): + getLogger().warning("POS filter should be a collection, not a string") + pos = [pos] + # allow to search by POS + slots = len(pos) + if where: + where.append("AND") + where.append(f"idseq IN (SELECT idseq FROM Sense WHERE ID IN (SELECT sid FROM pos WHERE text IN ({','.join('?' * slots)})))") + params += pos # else (a context is provided) - eids = self.Entry.select(where, params, ctx=ctx) + logging.getLogger(__name__).debug(f"Search query: {where} -- Params: {params}") + eids = self.Entry.select(' '.join(where), params, ctx=ctx) entries = [] for e in eids: entries.append(self.get_entry(e.idseq, ctx=ctx)) From 0c5cd4579dd396cdd5a4a04fbb4fdec37341019d Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 23:09:51 +0800 Subject: [PATCH 11/42] add pos kwarg to lookup(), add all_pos() and all_ne_type() methods --- jamdict/util.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/jamdict/util.py b/jamdict/util.py index 0ed19dd..b0f7e3f 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -454,8 +454,27 @@ def get_entry(self, idseq): else: raise LookupError("There is no backend data available") - def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, lookup_ne=True, - pos=None, name_type=None, **kwargs): + def all_pos(self, ctx=None): + """ Find all available part-of-speeches + + :returns: A list of part-of-speeches (a list of strings) + """ + if ctx is None: + ctx = self.__make_db_ctx() + return self.jmdict.all_pos(ctx=ctx) + + def 
all_ne_type(self, ctx=None): + """ Find all available named-entity types + + :returns: A list of named-entity types (a list of strings) + """ + if ctx is None: + ctx = self.__make_db_ctx() + return self.jmnedict.all_ne_type(ctx=ctx) + + + def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, + lookup_ne=True, pos=None, **kwargs): ''' Search words, characters, and characters. Keyword arguments: @@ -465,7 +484,9 @@ def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, lookup :type strict_lookup: bool :param: lookup_chars: set lookup_chars to False to disable character lookup :type lookup_chars: bool - :param: ctx: database access context, can be reused for better performance. Normally users do not have to touch this and database connections will be reused by default. + :param pos: Filter words by part-of-speeches + :type pos: list of strings + :param ctx: database access context, can be reused for better performance. Normally users do not have to touch this and database connections will be reused by default. :param lookup_ne: set lookup_ne to False to disable name-entities lookup :type lookup_ne: bool :returns: Return a LookupResult object. 
@@ -476,8 +497,8 @@ def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, lookup ''' if not self.is_available(): raise LookupError("There is no backend data available") - elif not query: - raise ValueError("Query cannot be empty") + elif (not query or query == "%") and not pos: + raise ValueError("Query and POS filter cannot be both empty") if ctx is None: ctx = self.__make_db_ctx() # Lookup words @@ -485,7 +506,7 @@ def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, lookup chars = [] names = [] if self.jmdict is not None: - entries = self.jmdict.search(query, ctx=ctx) + entries = self.jmdict.search(query, pos=pos, ctx=ctx) elif self.jmdict_xml: entries = self.jmdict_xml.lookup(query) if lookup_chars and self.has_kd2(): From 6700a49ef1b6e123703269e566daa4db386c1709 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sat, 22 May 2021 23:10:22 +0800 Subject: [PATCH 12/42] update test cases --- test/test_jamdict.py | 138 +++++++++++++++++++++++++++++++++++-- test/test_jmdict_sqlite.py | 1 - test/test_jmnedict.py | 6 +- 3 files changed, 135 insertions(+), 10 deletions(-) diff --git a/test/test_jamdict.py b/test/test_jamdict.py index e7685a3..95128ce 100644 --- a/test/test_jamdict.py +++ b/test/test_jamdict.py @@ -16,8 +16,9 @@ from jamdict import Jamdict, JMDictXML from jamdict import config -from jamdict.jmdict import JMDictXMLParser +from jamdict.jmdict import JMDictXMLParser, JMDEntry from jamdict.kanjidic2 import Kanjidic2XMLParser +from jamdict.util import _JAMDICT_DATA_AVAILABLE MY_DIR = Path(os.path.abspath(os.path.dirname(__file__))) @@ -32,6 +33,22 @@ def getLogger(): return logging.getLogger(__name__) +def all_kana(result, forms=None): + if forms is None: + forms = set() + for e in result.entries: + forms.update(f.text for f in e.kana_forms) + return forms + + +def all_kanji(result, forms=None): + if forms is None: + forms = set() + for e in result.entries: + forms.update(f.text for f in e.kanji_forms) + return forms + 
+ class TestConfig(unittest.TestCase): def test_config(self): @@ -58,7 +75,6 @@ def test_basic_models(self): def test_lookup_result(self): jam = Jamdict(jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, auto_config=False, auto_expand=False) result = jam.lookup('おみやげ') - print(repr(result)) self.assertTrue(result.entries) self.assertEqual(result.entries[0].kana_forms[0].text, 'おみやげ') # test lookup by ID @@ -86,7 +102,8 @@ def test_jmdict_fields(self): entries = parser.parse_file(MINI_JMD) jmd = JMDictXML(entries) results = jmd.lookup(u'おてんき') - print(results) + self.assertTrue(results) + self.assertIsInstance(results[0], JMDEntry) def test_jmdict_json(self): print("Test JMDict - XML to JSON") @@ -187,7 +204,120 @@ class TestJamdictSQLite(unittest.TestCase): @classmethod def tearDownClass(cls): if os.path.isfile(TEST_DB): - os.unlink(TEST_DB) + os.unlink(TEST_DB) + + def test_search_by_pos(self): + print("Test Jamdict search in XML files") + jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, + kd2_xml_file=MINI_KD2, + jmnedict_xml_file=MINI_JMNE, auto_config=True) + jam.import_data() + # test get all pos + poses = jam.all_pos() + expected = {'Godan verb - -aru special class', + "Godan verb with `ku' ending", + "Godan verb with `ru' ending", + "Godan verb with `su' ending", + "Godan verb with `u' ending", + 'Ichidan verb', + 'adjectival nouns or quasi-adjectives (keiyodoshi)', + 'adjective (keiyoushi)', + 'adverb (fukushi)', + "adverb taking the `to' particle", + 'auxiliary verb', + 'conjunction', + 'expressions (phrases, clauses, etc.)', + 'interjection (kandoushi)', + 'intransitive verb', + 'noun (common) (futsuumeishi)', + 'noun or participle which takes the aux. 
verb suru', + 'noun or verb acting prenominally', + "nouns which may take the genitive case particle `no'", + 'pre-noun adjectival (rentaishi)', + 'pronoun', + 'transitive verb'} + self.assertEqual(expected, set(poses)) + result = jam.lookup('おみやげ', pos=['noun (common) (futsuumeishi)']) + self.assertEqual(1, len(result.entries)) + with self.assertLogs('jamdict.jmdict_sqlite', level="WARNING") as cm: + result = jam.lookup('おみやげ', pos='noun (common) (futsuumeishi)') + self.assertEqual(1, len(result.entries)) + warned_pos_as_str = False + for line in cm.output: + if "POS filter should be a collection, not a string" in line: + warned_pos_as_str = True + break + self.assertTrue(warned_pos_as_str) + result = jam.lookup('おみやげ', pos=['intransitive verb']) + self.assertFalse(result.entries) + result = jam.lookup('おみやげ', pos=['intransitive verb', 'noun (common) (futsuumeishi)']) + self.assertTrue(result.entries) + + def test_search_by_ne_type(self): + print("Test Jamdict search in XML files") + jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, + kd2_xml_file=MINI_KD2, + jmnedict_xml_file=MINI_JMNE, auto_config=True) + jam.import_data() + netypes = jam.all_ne_type() + expected = ['company', 'fem', 'given', 'organization', 'person', 'place', 'surname', 'unclass'] + self.assertEqual(expected, netypes) + res = jam.lookup("place") + actual = set() + for n in res.names: + actual.update(k.text for k in n.kanji_forms) + self.assertIn("厦門", actual) + res = jam.lookup("company") + actual = set() + for n in res.names: + actual.update(k.text for k in n.kanji_forms) + expected = {'埼銀', 'IKEA'} + self.assertTrue(expected.issubset(actual)) + + def test_find_all_verbs(self): + jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, + kd2_xml_file=MINI_KD2, + jmnedict_xml_file=MINI_JMNE, auto_config=True) + jam.import_data() + # cannot search for everything + self.assertRaises(ValueError, lambda: jam.lookup("")) + self.assertRaises(ValueError, lambda: jam.lookup("%")) + self.assertRaises(ValueError, 
lambda: jam.lookup("%", pos="")) + res = jam.lookup("", pos="pronoun") + actual = all_kana(res) + pronouns = {'おい', 'おまい', 'おたく', 'あのひと', 'かしこ', 'あのかた', 'おめえ', + 'おまえ', 'おおい', 'おーい', 'あそこ', 'あこ', 'あしこ', 'あすこ'} + self.assertTrue(pronouns.issubset(actual)) + result = jam.lookup("%", pos=["intransitive verb", 'pronoun']) + forms = all_kana(result) + iverbs = {'いじける', 'イカす', 'うろたえる', 'いかす', 'おっこちる', + 'いらっしゃる', 'あぶれる', 'いななく', 'いちゃつく'} + self.assertTrue(iverbs.issubset(forms)) + self.assertTrue(pronouns.issubset(forms)) + + @unittest.skipIf(not _JAMDICT_DATA_AVAILABLE, "Jamdict data is not available. Data test is skipped") + def test_jamdict_data(self): + jam = Jamdict() + # search verb kaeru + res = jam.lookup("かえる", pos="transitive verb") + actual = [e.idseq for e in res.entries] + self.assertIn(1510650, actual) + self.assertIn(1589780, actual) + forms = all_kanji(res) + expected = {'変える', '代える', '換える', '替える'} + self.assertTrue(expected.issubset(forms)) + # search by noun kaeru + res2 = jam.lookup("かえる", pos='noun (common) (futsuumeishi)') + actual2 = [e.idseq for e in res2.entries] + forms2 = all_kanji(res2) + self.assertIn(1577460, actual2) + expected2 = {'蛙', '蛤', '蝦'} + self.assertTrue(expected2.issubset(forms2)) + # search both noun and verb + res3 = jam.lookup("かえる", pos=['noun (common) (futsuumeishi)', "transitive verb"]) + forms3 = all_kanji(res3) + self.assertTrue(expected.issubset(forms3)) + self.assertTrue(expected2.issubset(forms3)) def test_jamdict_sqlite_all(self): if os.path.isfile(TEST_DB): diff --git a/test/test_jmdict_sqlite.py b/test/test_jmdict_sqlite.py index 8ebb0ba..5c3e76d 100644 --- a/test/test_jmdict_sqlite.py +++ b/test/test_jmdict_sqlite.py @@ -91,7 +91,6 @@ def test_search(self): with self.ramdb.ds.open() as ctx: self.ramdb.insert_entries(self.xdb, ctx=ctx) entries = ctx.Entry.select() - print(len(entries)) # Search by kana es = self.ramdb.search('あの', ctx) self.assertEqual(len(es), 2) diff --git a/test/test_jmnedict.py 
b/test/test_jmnedict.py index 78e69fe..81c8197 100644 --- a/test/test_jmnedict.py +++ b/test/test_jmnedict.py @@ -127,7 +127,6 @@ def test_xml2ramdb(self): all_shime_names = ramdb.search_ne('しめ%', ctx=ctx) expected_idseqs = [5000001, 5000002, 5000003, 5000004, 5000005, 5000006, 5000007, 5000008, 5000009] actual = [x.idseq for x in all_shime_names] - print(actual) self.assertEqual(expected_idseqs, actual) # test search by name_type all_fems = ramdb.search_ne('person', ctx=ctx) @@ -140,10 +139,7 @@ def test_query_netype(self): ctx = ramdb.ctx() ramdb.insert_name_entities(self.xdb, ctx=ctx) shenron = ctx.search_ne('id#5741815', ctx=ctx)[0] - print(shenron) - print(shenron.to_json()) - - + self.assertTrue(shenron) # ------------------------------------------------------------------------------- From 201d8de1bdac390860e6e18e3a11c1f2104d1bf4 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sun, 23 May 2021 11:08:12 +0800 Subject: [PATCH 13/42] rename API to_json() to to_dict() --- jamdict/jmdict.py | 110 ++++++++++++++++++++++++++-------- jamdict/jmnedict_sqlite.py | 5 +- jamdict/kanjidic2.py | 84 ++++++++++++++++++++------ jamdict/tools.py | 2 +- jamdict/util.py | 107 +++++++++++++++++---------------- test/test_jamdict.py | 24 ++++++-- test/test_jmdict_sqlite.py | 4 +- test/test_jmnedict.py | 6 +- test/test_kanjidic2_sqlite.py | 6 +- 9 files changed, 237 insertions(+), 111 deletions(-) diff --git a/jamdict/jmdict.py b/jamdict/jmdict.py index eef21d7..a2232ce 100644 --- a/jamdict/jmdict.py +++ b/jamdict/jmdict.py @@ -10,6 +10,9 @@ import os import logging +import warnings +from typing import List + try: from lxml import etree _LXML_AVAILABLE = True @@ -37,10 +40,10 @@ class JMDEntry(object): def __init__(self, idseq=''): # A unique numeric sequence number for each entry self.idseq = idseq # ent_seq - self.kanji_forms = [] # k_ele* => KanjiForm[] - self.kana_forms = [] # r_ele+ => KanaForm[] - self.info = None # info? 
=> EntryInfo - self.senses = [] # sense+ + self.kanji_forms: List[KanjiForm] = [] # k_ele* + self.kana_forms: List[KanaForm] = [] # r_ele+ => KanaForm[] + self.info: EntryInfo = None # info? => EntryInfo + self.senses: List[Sense] = [] # sense+ def __len__(self): return len(self.senses) @@ -77,12 +80,17 @@ def __str__(self): return self.text(compact=False) def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): ed = {'idseq': self.idseq, - 'kanji': [x.to_json() for x in self.kanji_forms], - 'kana': [x.to_json() for x in self.kana_forms], - 'senses': [x.to_json() for x in self.senses]} + 'kanji': [x.to_dict() for x in self.kanji_forms], + 'kana': [x.to_dict() for x in self.kana_forms], + 'senses': [x.to_dict() for x in self.senses]} if self.info: - ed['info'] = self.info.to_json() + ed['info'] = self.info.to_dict() return ed @@ -153,6 +161,11 @@ def set_text(self, text): self.text = text def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): kjd = {'text': self.text} if self.info: kjd['info'] = self.info @@ -216,6 +229,11 @@ def set_text(self, text): self.text = text def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): knd = {'text': self.text, 'nokanji': self.nokanji} if self.restr: @@ -234,23 +252,28 @@ def __str__(self): class EntryInfo(object): - '''General coded information relating to the entry as a whole. + """General coded information relating to the entry as a whole. 
DTD: - ''' + """ def __init__(self): - self.links = [] # link* => Link[] - self.bibinfo = [] # bibl* => BibInfo[] + self.links: List[Link] = [] # link* + self.bibinfo: List[BibInfo] = [] # bibl* '''This field is used to hold information about the etymology of the kanji or kana parts of the entry. For gairaigo, etymological information may also be in the element.''' self.etym = [] # * - self.audit = [] # audit* => Audit[] + self.audit: List[Audit] = [] # audit* def to_json(self): - return {'links': [x.to_json() for x in self.links], - 'bibinfo': [x.to_json() for x in self.bibinfo], + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): + return {'links': [x.to_dict() for x in self.links], + 'bibinfo': [x.to_dict() for x in self.bibinfo], 'etym': self.etym, - 'audit': [x.to_json() for x in self.audit]} + 'audit': [x.to_dict() for x in self.audit]} class Link(object): @@ -261,11 +284,16 @@ class Link(object): link_uri contains the actual URI. ''' def __init__(self, tag, desc, uri): - self.tag = tag # - self.desc = desc # - self.uri = uri # + self.tag: str = tag # + self.desc: str = desc # + self.uri: str = uri # def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'tag': self.tag, 'desc': self.desc, 'uri': self.uri} @@ -280,8 +308,8 @@ class BibInfo(object): ''' def __init__(self, tag='', text=''): - self.tag = tag - self.text = text + self.tag: str = tag + self.text: str = text def set_tag(self, tag): if self.tag: @@ -294,6 +322,11 @@ def set_text(self, text): self.text = text def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. 
Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'tag': self.tag, 'text': self.text} @@ -307,6 +340,11 @@ def __init__(self, upd_date, upd_detl): self.upd_detl = upd_detl # def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'upd_date': self.upd_date, 'upd_detl': self.upd_detl} @@ -360,13 +398,13 @@ def __init__(self): regional variations, etc.''' self.info = [] # - self.lsource = [] # + self.lsource: List[LSource] = [] # '''For words specifically associated with regional dialects in Japanese, the entity code for that dialect, e.g. ksb for Kansaiben.''' self.dialect = [] # - self.gloss = [] # + self.gloss: List[SenseGloss] = [] # '''The example elements provide for pairs of short Japanese and target-language phrases or sentences which exemplify the usage of the @@ -389,6 +427,11 @@ def text(self, compact=True): return '/'.join(tmp) def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): sd = {} if self.stagk: sd['stagk'] = self.stagk @@ -407,11 +450,11 @@ def to_json(self): if self.info: sd['SenseInfo'] = self.info if self.lsource: - sd['SenseSource'] = [x.to_json() for x in self.lsource] + sd['SenseSource'] = [x.to_dict() for x in self.lsource] if self.dialect: sd['dialect'] = self.dialect if self.gloss: - sd['SenseGloss'] = [x.to_json() for x in self.gloss] + sd['SenseGloss'] = [x.to_dict() for x in self.gloss] return sd @@ -434,7 +477,12 @@ def text(self, compact=True): return '{gloss} ({types})'.format(gloss='/'.join(tmp), types=types) def to_json(self): - sd = super().to_json() + warnings.warn("to_json() is deprecated and will be removed in the next major release. 
Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): + sd = super().to_dict() sd['name_type'] = self.name_type return sd @@ -478,6 +526,11 @@ def __str__(self): return ' '.join(tmp) def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): gd = {} if self.lang: gd['lang'] = self.lang @@ -516,6 +569,11 @@ def __init__(self, lang, lstype, wasei, text): self.text = text def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'lang': self.lang, 'lstype': self.lstype, 'wasei': self.wasei, diff --git a/jamdict/jmnedict_sqlite.py b/jamdict/jmnedict_sqlite.py index 0ec3c21..f5e8140 100644 --- a/jamdict/jmnedict_sqlite.py +++ b/jamdict/jmnedict_sqlite.py @@ -14,6 +14,7 @@ import os import logging +from typing import Sequence from puchikarui import Schema from . 
import __version__ as JAMDICT_VERSION, __url__ as JAMDICT_URL @@ -82,7 +83,7 @@ def all_ne_type(self, ctx=None): else: return [x['text'] for x in ctx.select("SELECT DISTINCT text FROM NETransType")] - def search_ne(self, query, ctx=None, **kwargs): + def search_ne(self, query, ctx=None, **kwargs) -> Sequence[JMDEntry]: # ensure context if ctx is None: with self.ctx() as ctx: @@ -110,7 +111,7 @@ def search_ne(self, query, ctx=None, **kwargs): entries.append(self.get_ne(e.idseq, ctx=ctx)) return entries - def get_ne(self, idseq, ctx=None): + def get_ne(self, idseq, ctx=None) -> JMDEntry: # ensure context if ctx is None: with self.ctx() as new_context: diff --git a/jamdict/kanjidic2.py b/jamdict/kanjidic2.py index 7441582..ffdb69a 100644 --- a/jamdict/kanjidic2.py +++ b/jamdict/kanjidic2.py @@ -10,6 +10,9 @@ import os import logging +import warnings +from typing import List + try: from lxml import etree _LXML_AVAILABLE = True @@ -85,19 +88,19 @@ def __init__(self): """ self.ID = None self.literal = '' # The character itself in UTF8 coding. - self.codepoints = [] # - self.radicals = [] # + self.codepoints: List[CodePoint] = [] # + self.radicals: List[Radical] = [] # self.__canon_radical = None self.stroke_count = None # first stroke_count in misc self.grade = None # / self.stroke_miscounts = [] # /stroke_count[1:] - self.variants = [] # / + self.variants: List[Variant] = [] # / self.freq = None # / self.rad_names = [] # / a list of strings self.jlpt = None # / - self.dic_refs = [] # DicRef[] - self.query_codes = [] # QueryCode[] - self.rm_groups = [] # reading_meaning groups + self.dic_refs: List[DicRef] = [] # DicRef[] + self.query_codes: List[QueryCode] = [] # QueryCode[] + self.rm_groups: List[RMGroup] = [] # reading_meaning groups self.nanoris = [] # a list of strings def __repr__(self): @@ -134,19 +137,24 @@ def radical(self): return self.__canon_radical def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. 
Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'literal': self.literal, - 'codepoints': [cp.to_json() for cp in self.codepoints], - 'radicals': [r.to_json() for r in self.radicals], + 'codepoints': [cp.to_dict() for cp in self.codepoints], + 'radicals': [r.to_dict() for r in self.radicals], 'stroke_count': self.stroke_count, 'grade': self.grade if self.grade else '', 'stroke_miscounts': self.stroke_miscounts, - 'variants': [v.to_json() for v in self.variants], + 'variants': [v.to_dict() for v in self.variants], 'freq': self.freq if self.freq else 0, 'rad_names': self.rad_names, 'jlpt': self.jlpt if self.jlpt else '', - 'dic_refs': [r.to_json() for r in self.dic_refs], - 'q_codes': [q.to_json() for q in self.query_codes], - 'rm': [rm.to_json() for rm in self.rm_groups], + 'dic_refs': [r.to_dict() for r in self.dic_refs], + 'q_codes': [q.to_dict() for q in self.query_codes], + 'rm': [rm.to_dict() for rm in self.rm_groups], 'nanoris': list(self.nanoris)} @@ -173,6 +181,11 @@ def __str__(self): return self.value def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'type': self.cp_type, 'value': self.value} @@ -199,6 +212,11 @@ def __str__(self): return self.value def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'type': self.rad_type, 'value': self.value} @@ -241,6 +259,11 @@ def __str__(self): return self.value def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. 
Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'type': self.var_type, 'value': self.value} @@ -307,6 +330,11 @@ def __str__(self): return self.value def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'type': self.dr_type, 'value': self.value, "m_vol": self.m_vol, @@ -380,6 +408,11 @@ def __str__(self): return self.value def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'type': self.qc_type, 'value': self.value, "skip_misclass": self.skip_misclass} @@ -397,8 +430,8 @@ def __init__(self, readings=None, meanings=None): """ self.ID = None self.cid = None - self.readings = readings if readings else [] - self.meanings = meanings if meanings else [] + self.readings: List[Reading] = readings if readings else [] + self.meanings: List[Meaning] = meanings if meanings else [] def __repr__(self): return "R: {} | M: {}".format( @@ -409,8 +442,13 @@ def __str__(self): return repr(self) def to_json(self): - return {'readings': [r.to_json() for r in self.readings], - 'meanings': [m.to_json() for m in self.meanings]} + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): + return {'readings': [r.to_dict() for r in self.readings], + 'meanings': [m.to_dict() for m in self.meanings]} class Reading(object): @@ -470,6 +508,11 @@ def __str__(self): return self.value def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. 
Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'type': self.r_type, 'value': self.value, 'on_type': self.on_type, @@ -503,12 +546,17 @@ def __str__(self): return self.value def to_json(self): + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): return {'m_lang': self.m_lang, 'value': self.value} class Kanjidic2XMLParser(object): - '''JMDict XML parser - ''' + """ JMDict XML parser + """ def __init__(self): pass diff --git a/jamdict/tools.py b/jamdict/tools.py index a68f2c5..cec5b03 100755 --- a/jamdict/tools.py +++ b/jamdict/tools.py @@ -139,7 +139,7 @@ def lookup(cli, args): results = jam.lookup(args.query, strict_lookup=args.strict) report = TextReport(args.output) if args.format == 'json': - report.print(json.dumps(results.to_json(), + report.print(json.dumps(results.to_dict(), ensure_ascii=args.ensure_ascii, indent=args.indent if args.indent else None)) else: diff --git a/jamdict/util.py b/jamdict/util.py index b0f7e3f..44e27fa 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -11,9 +11,10 @@ import os import logging import threading +import warnings from collections import defaultdict as dd from collections import OrderedDict -from typing import List +from typing import List, Sequence from chirptext.deko import HIRAGANA, KATAKANA from puchikarui import MemorySource @@ -41,76 +42,72 @@ def getLogger(): ######################################################################## - -EntryList = List[JMDEntry] -CharacterList = List[Character] - - class LookupResult(object): - ''' Contain lookup results (words, Kanji characters, or named entities) from Jamdict. + """ Contain lookup results (words, Kanji characters, or named entities) from Jamdict. 
A typical jamdict lookup is like this: + >>> jam = Jamdict() >>> result = jam.lookup('食べ%る') The command above returns a :any:`LookupResult` object which contains found words (:any:`entries`), kanji characters (:any:`chars`), and named entities (:any:`names`). - ''' + """ def __init__(self, entries, chars, names=None): - self.__entries = entries if entries else [] - self.__chars = chars if chars else [] - self.__names = names if names else [] + self.__entries: Sequence[JMDEntry] = entries if entries else [] + self.__chars: Sequence[Character] = chars if chars else [] + self.__names: Sequence[JMDEntry] = names if names else [] @property - def entries(self): - ''' A list of words entries + def entries(self) -> Sequence[JMDEntry]: + """ A list of words entries :returns: a list of :class:`JMDEntry ` object - :rtype: EntryList - ''' + :rtype: List[JMDEntry] + """ return self.__entries @entries.setter - def entries(self, values): + def entries(self, values: Sequence[JMDEntry]): self.__entries = values @property - def chars(self): - ''' A list of found kanji characters + def chars(self) -> Sequence[Character]: + """ A list of found kanji characters :returns: a list of :class:`Character ` object - :rtype: CharacterList - ''' + :rtype: Sequence[Character] + """ return self.__chars @chars.setter - def chars(self, values): + def chars(self, values: Sequence[Character]): self.__chars = values @property - def names(self): - ''' A list of found named entities + def names(self) -> Sequence[JMDEntry]: + """ A list of found named entities :returns: a list of :class:`JMDEntry ` object - :rtype: EntryList - ''' + :rtype: Sequence[JMDEntry] + """ return self.__names @names.setter - def names(self, values): + def names(self, values: Sequence[JMDEntry]): self.__names = values - - def text(self, compact=True, entry_sep='。', separator=' | ', no_id=False, with_chars=True): - ''' Generate a text string that contains all found words, characters, and named entities. 
+ def text(self, compact=True, entry_sep='。', separator=' | ', no_id=False, with_chars=True) -> str: + """ Generate a text string that contains all found words, characters, and named entities. :param compact: Make the output string more compact (fewer info, fewer whitespaces, etc.) :param no_id: Do not include jamdict's internal object IDs (for direct query via API) + :param entry_sep: The text to separate entries :param with_chars: Include characters information :returns: A formatted string ready for display - ''' + """ output = [] if self.entries: entry_txts = [] @@ -128,7 +125,7 @@ def text(self, compact=True, entry_sep='。', separator=' | ', no_id=False, with else: chars_txt = ', '.join(repr(c) for c in self.chars) if output: - output.append(separator) + output.append(separator) # TODO: section separator? output.append("[Chars]") output.append(entry_sep) output.append(chars_txt) @@ -151,9 +148,14 @@ def __str__(self): return self.text(compact=False) def to_json(self): - return {'entries': [e.to_json() for e in self.entries], - 'chars': [c.to_json() for c in self.chars], - 'names': [n.to_json() for n in self.names]} + warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", + DeprecationWarning, stacklevel=2) + return self.to_dict() + + def to_dict(self): + return {'entries': [e.to_dict() for e in self.entries], + 'chars': [c.to_dict() for c in self.chars], + 'names': [n.to_dict() for n in self.names]} class JamdictSQLite(KanjiDic2SQLite, JMNEDictSQLite, JMDictSQLite): @@ -164,7 +166,7 @@ def __init__(self, db_file, *args, **kwargs): class Jamdict(object): - ''' Main entry point to access all available dictionaries in jamdict. + """ Main entry point to access all available dictionaries in jamdict. 
>>> from jamdict import Jamdict >>> jam = Jamdict() @@ -176,7 +178,7 @@ class Jamdict(object): >>> for c in result.chars: >>> print(repr(c)) - Jamdict >= 0.1a10 support memory_mode keyword argument for reading + Jamdict >= 0.1a10 support memory_mode keyword argument for reading the whole database into memory before querying to boost up search speed. The database may take about a minute to load. Here is the sample code: @@ -185,7 +187,7 @@ class Jamdict(object): Jamdict will use database from jamdict-data by default. If there is a custom database available in configuration file, Jamdict will prioritise to use it over jamdict-data package. - ''' + """ def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None, @@ -232,7 +234,7 @@ def __init__(self, db_file=None, kd2_file=None, @property def ready(self): - ''' Check if Jamdict database is available ''' + """ Check if Jamdict database is available """ return os.path.isfile(self.db_file) and self.jmdict is not None def __del__(self): @@ -244,7 +246,7 @@ def __del__(self): pass def __make_db_ctx(self): - ''' Try to reuse context if allowed ''' + """ Try to reuse context if allowed """ try: if not self.reuse_ctx: return self.jmdict.ctx() @@ -314,7 +316,7 @@ def kd2(self): @property def jmnedict(self): - ''' JM NE SQLite database access object ''' + """ JM NE SQLite database access object """ if self._jmne_sqlite is None: if self.jmnedict_file is not None: with threading.Lock(): @@ -335,12 +337,12 @@ def jmdict_xml(self): @property def krad(self): - ''' Break a kanji down to writing components + """ Break a kanji down to writing components >>> jam = Jamdict() >>> print(jam.krad['雲']) ['一', '雨', '二', '厶'] - ''' + """ if not self.__krad_map: with threading.Lock(): self.__krad_map = KRad() @@ -348,12 +350,12 @@ def krad(self): @property def radk(self): - ''' Find all kanji with a writing component + """ Find all kanji with a writing component >>> jam = Jamdict() >>> print(jam.radk['鼎']) {'鼏', '鼒', 
'鼐', '鼎', '鼑'} - ''' + """ if not self.__krad_map: with threading.Lock(): self.__krad_map = KRad() @@ -381,7 +383,7 @@ def has_kd2(self): return self.db_file is not None or self.kd2_file is not None or self.kd2_xml_file is not None def has_jmne(self, ctx=None): - ''' Check if current database has jmne support ''' + """ Check if current database has jmne support """ if ctx is None: ctx = self.__make_db_ctx() m = ctx.meta.select_single('key=?', ('jmnedict.version',)) if ctx is not None else None @@ -396,7 +398,7 @@ def is_available(self): self.jmnedict_file is not None or self.jmnedict_xml_file is not None) def import_data(self): - ''' Import JMDict and KanjiDic2 data from XML to SQLite ''' + """ Import JMDict and KanjiDic2 data from XML to SQLite """ ctx = self.__make_db_ctx() ctx.buckmode() if self.jmdict and self.jmdict_xml: @@ -426,7 +428,7 @@ def import_data(self): getLogger().warning("JMNEdict XML data is not available - skipped!") def get_ne(self, idseq, ctx=None): - ''' Get name entity by idseq in JMnedict ''' + """ Get name entity by idseq in JMNEdict """ if self.jmnedict is not None: if ctx is None: ctx = self.__make_db_ctx() @@ -475,7 +477,7 @@ def all_ne_type(self, ctx=None): def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, lookup_ne=True, pos=None, **kwargs): - ''' Search words, characters, and characters. + """ Search words, characters, and characters. 
Keyword arguments: @@ -493,8 +495,9 @@ def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, :rtype: :class:`jamdict.util.LookupResult` >>> # match any word that starts with "食べ" and ends with "る" (anything from between is fine) + >>> jam = Jamdict() >>> results = jam.lookup('食べ%る') - ''' + """ if not self.is_available(): raise LookupError("There is no backend data available") elif (not query or query == "%") and not pos: @@ -531,8 +534,8 @@ def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, class JMDictXML(object): - ''' JMDict API for looking up information in XML - ''' + """ JMDict API for looking up information in XML + """ def __init__(self, entries): self.entries = entries self._seqmap = {} # entryID - entryObj map @@ -551,7 +554,7 @@ def __len__(self): def __getitem__(self, idx): return self.entries[idx] - def lookup(self, a_query): + def lookup(self, a_query) -> Sequence[JMDEntry]: if a_query in self._textmap: return tuple(self._textmap[a_query]) elif a_query.startswith('id#'): diff --git a/test/test_jamdict.py b/test/test_jamdict.py index 95128ce..07f0e35 100644 --- a/test/test_jamdict.py +++ b/test/test_jamdict.py @@ -111,8 +111,8 @@ def test_jmdict_json(self): jmd = JMDictXML.from_file(MINI_JMD) e = jmd[10] self.assertIsNotNone(e) - self.assertTrue(e.to_json()) - self.assertTrue(jmd[-1].to_json()) + self.assertTrue(e.to_dict()) + self.assertTrue(jmd[-1].to_dict()) def test_kanjidic2_xml(self): print("Test KanjiDic2 XML") @@ -131,7 +131,7 @@ def test_kanjidic2_json(self): parser = Kanjidic2XMLParser() kd2 = parser.parse_file(MINI_KD2) for c in kd2: - self.assertIsNotNone(c.to_json()) + self.assertIsNotNone(c.to_dict()) def test_jamdict_xml(self): print("Test Jamdict search in XML files") @@ -199,6 +199,22 @@ def test_home_dir(self): os.environ['JAMDICT_HOME'] = _orig_home +class TestAPIWarning(unittest.TestCase): + + def test_warn_to_json_deprecated(self): + print("Test Jamdict search in XML files") + jam = 
Jamdict(":memory:", jmd_xml_file=MINI_JMD, + kd2_xml_file=MINI_KD2, + jmnedict_xml_file=MINI_JMNE) + jam.import_data() + with self.assertWarns(DeprecationWarning): + r = jam.lookup("おみやげ") + self.assertTrue(r.to_json()) + with self.assertWarns(DeprecationWarning): + r2 = jam.lookup("シェンロン") + self.assertTrue(r2.to_json()) + + class TestJamdictSQLite(unittest.TestCase): @classmethod @@ -316,7 +332,7 @@ def test_jamdict_data(self): # search both noun and verb res3 = jam.lookup("かえる", pos=['noun (common) (futsuumeishi)', "transitive verb"]) forms3 = all_kanji(res3) - self.assertTrue(expected.issubset(forms3)) + self.assertTrue(expected.issubset(forms3)) self.assertTrue(expected2.issubset(forms3)) def test_jamdict_sqlite_all(self): diff --git a/test/test_jmdict_sqlite.py b/test/test_jmdict_sqlite.py index 5c3e76d..ba3bf8b 100644 --- a/test/test_jmdict_sqlite.py +++ b/test/test_jmdict_sqlite.py @@ -70,9 +70,9 @@ def test_xml2sqlite(self): self.assertEqual(len(entries), len(self.xdb)) # test select entry by id e = self.db.get_entry(1001710) - ejson = e.to_json() + ejson = e.to_dict() self.assertEqual(ejson['kanji'][0]['text'], 'お菓子') - getLogger().debug(e.to_json()) + getLogger().debug(e.to_dict()) def test_import_to_ram(self): print("Testing XML to RAM") diff --git a/test/test_jmnedict.py b/test/test_jmnedict.py index 81c8197..395efc3 100644 --- a/test/test_jmnedict.py +++ b/test/test_jmnedict.py @@ -103,10 +103,10 @@ def test_xml2ramdb(self): for idseq in inserted_idseqs: ne_xml = self.xdb.lookup("id#{}".format(idseq))[0] ne_xml.idseq = int(ne_xml.idseq) - getLogger().debug(ne_xml.to_json()) + getLogger().debug(ne_xml.to_dict()) ne = ramdb.get_ne(idseq, ctx=ctx) - getLogger().debug(ne.to_json()) - self.assertEqual(ne_xml.to_json(), ne.to_json()) + getLogger().debug(ne.to_dict()) + self.assertEqual(ne_xml.to_dict(), ne.to_dict()) # test search by idseq shenron = ramdb.search_ne('id#5741815', ctx=ctx) self.assertEqual(len(shenron), 1) diff --git 
a/test/test_kanjidic2_sqlite.py b/test/test_kanjidic2_sqlite.py index fb33cc7..99f093c 100644 --- a/test/test_kanjidic2_sqlite.py +++ b/test/test_kanjidic2_sqlite.py @@ -62,9 +62,9 @@ def test_xml2sqlite(self): for c in self.xdb: db.insert_char(c, ctx) c2 = db.char_by_id(c.ID, ctx) - getLogger().debug("c-xml", c.to_json()) - getLogger().debug("c-sqlite", c2.to_json()) - self.assertEqual(c.to_json(), c2.to_json()) + getLogger().debug("c-xml", c.to_dict()) + getLogger().debug("c-sqlite", c2.to_dict()) + self.assertEqual(c.to_json(), c2.to_dict()) # test searching # by id c = ctx.char.select_single() From 03f3586cbd2475394525994f3cd6b7f0ff416e31 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sun, 23 May 2021 17:08:54 +0800 Subject: [PATCH 14/42] more type hint --- jamdict/util.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/jamdict/util.py b/jamdict/util.py index 44e27fa..b44905a 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -17,7 +17,7 @@ from typing import List, Sequence from chirptext.deko import HIRAGANA, KATAKANA -from puchikarui import MemorySource +from puchikarui import MemorySource, ExecutionContext from . import config from .jmdict import JMDictXMLParser, JMDEntry @@ -152,7 +152,7 @@ def to_json(self): DeprecationWarning, stacklevel=2) return self.to_dict() - def to_dict(self): + def to_dict(self) -> dict: return {'entries': [e.to_dict() for e in self.entries], 'chars': [c.to_dict() for c in self.chars], 'names': [n.to_dict() for n in self.names]} @@ -233,7 +233,7 @@ def __init__(self, db_file=None, kd2_file=None, getLogger().warning("JMNE database could NOT be found. Searching will be extremely slow. 
Please run `python3 -m jamdict import` first") @property - def ready(self): + def ready(self) -> bool: """ Check if Jamdict database is available """ return os.path.isfile(self.db_file) and self.jmdict is not None @@ -245,7 +245,7 @@ def __del__(self): except Exception: pass - def __make_db_ctx(self): + def __make_db_ctx(self) -> ExecutionContext: """ Try to reuse context if allowed """ try: if not self.reuse_ctx: @@ -379,17 +379,17 @@ def jmne_xml(self): getLogger().info("Loaded JMnedict entries: {}".format(len(self._jmne_xml))) return self._jmne_xml - def has_kd2(self): + def has_kd2(self) -> bool: return self.db_file is not None or self.kd2_file is not None or self.kd2_xml_file is not None - def has_jmne(self, ctx=None): + def has_jmne(self, ctx=None) -> bool: """ Check if current database has jmne support """ if ctx is None: ctx = self.__make_db_ctx() m = ctx.meta.select_single('key=?', ('jmnedict.version',)) if ctx is not None else None return m is not None and len(m.value) > 0 - def is_available(self): + def is_available(self) -> bool: # this function is for developer only # don't expose it to the public # ready should be used instead @@ -427,7 +427,7 @@ def import_data(self): else: getLogger().warning("JMNEdict XML data is not available - skipped!") - def get_ne(self, idseq, ctx=None): + def get_ne(self, idseq, ctx=None) -> JMDEntry: """ Get name entity by idseq in JMNEdict """ if self.jmnedict is not None: if ctx is None: @@ -438,7 +438,7 @@ def get_ne(self, idseq, ctx=None): else: raise LookupError("There is no JMnedict data source available") - def get_char(self, literal, ctx=None): + def get_char(self, literal, ctx=None) -> Character: if self.kd2 is not None: if ctx is None: ctx = self.__make_db_ctx() @@ -448,7 +448,7 @@ def get_char(self, literal, ctx=None): else: raise LookupError("There is no KanjiDic2 data source available") - def get_entry(self, idseq): + def get_entry(self, idseq) -> JMDEntry: if self.jmdict: return self.jmdict.get_entry(idseq) 
elif self.jmdict_xml: @@ -456,7 +456,7 @@ def get_entry(self, idseq): else: raise LookupError("There is no backend data available") - def all_pos(self, ctx=None): + def all_pos(self, ctx=None) -> List[str]: """ Find all available part-of-speeches :returns: A list of part-of-speeches (a list of strings) @@ -465,7 +465,7 @@ def all_pos(self, ctx=None): ctx = self.__make_db_ctx() return self.jmdict.all_pos(ctx=ctx) - def all_ne_type(self, ctx=None): + def all_ne_type(self, ctx=None) -> List[str]: """ Find all available named-entity types :returns: A list of named-entity types (a list of strings) @@ -476,7 +476,7 @@ def all_ne_type(self, ctx=None): def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, - lookup_ne=True, pos=None, **kwargs): + lookup_ne=True, pos=None, **kwargs) -> LookupResult: """ Search words, characters, and characters. Keyword arguments: From d1507b1e77fdd1b27e2c328c44acbf4cf9801a2b Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sun, 23 May 2021 17:14:33 +0800 Subject: [PATCH 15/42] more type hint --- jamdict/krad.py | 141 ++++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 70 deletions(-) diff --git a/jamdict/krad.py b/jamdict/krad.py index f676dc4..d7cd59c 100644 --- a/jamdict/krad.py +++ b/jamdict/krad.py @@ -1,70 +1,71 @@ -# -*- coding: utf-8 -*- - -""" -jamdict.krad is a module for retrieving kanji components (i.e. radicals) -""" - -# This code is a part of jamdict library: https://github.com/neocl/jamdict -# :copyright: (c) 2016 Le Tuan Anh -# :license: MIT, see LICENSE for more details. 
- -import os -import logging -import threading -from collections import defaultdict as dd - -from chirptext import chio - -# ------------------------------------------------------------------------------ -# Configuration -# ------------------------------------------------------------------------------ -MY_FOLDER = os.path.dirname(os.path.abspath(__file__)) -DATA_FOLDER = os.path.join(MY_FOLDER, 'data') -KRADFILE = os.path.join(DATA_FOLDER, 'kradfile-u.gz') -RADKFILE = os.path.join(DATA_FOLDER, 'radkfile.gz') - -logger = logging.getLogger(__name__) - - -######################################################################## - -class KRad: - ''' This class contains mapping from radicals to kanjis (radk) and kanjis to radicals (krad) - - ''' - def __init__(self, **kwargs): - """ Kanji-Radical mapping """ - self.__krad_map = None - self.__radk_map = None - self.__rads = {} - self.lock = threading.Lock() - - def _build_krad_map(self): - with self.lock: - lines = chio.read_file(KRADFILE, mode='rt').splitlines() - # build the krad map - self.__krad_map = {} - self.__radk_map = dd(set) - for line in lines: - if line.startswith("#"): - continue - else: - parts = line.split(':', maxsplit=1) - if len(parts) == 2: - rads = [r.strip() for r in parts[1].split()] - char_literal = parts[0].strip() - self.__krad_map[char_literal] = rads - for rad in rads: - self.__radk_map[rad].add(char_literal) - - @property - def radk(self): - if self.__radk_map is None: - self._build_krad_map() - return self.__radk_map - - @property - def krad(self): - if self.__krad_map is None: - self._build_krad_map() - return self.__krad_map +# -*- coding: utf-8 -*- + +""" +jamdict.krad is a module for retrieving kanji components (i.e. radicals) +""" + +# This code is a part of jamdict library: https://github.com/neocl/jamdict +# :copyright: (c) 2016 Le Tuan Anh +# :license: MIT, see LICENSE for more details. 
+ +import os +import logging +import threading +from collections import defaultdict as dd +from typing import Mapping + +from chirptext import chio + +# ------------------------------------------------------------------------------ +# Configuration +# ------------------------------------------------------------------------------ +MY_FOLDER = os.path.dirname(os.path.abspath(__file__)) +DATA_FOLDER = os.path.join(MY_FOLDER, 'data') +KRADFILE = os.path.join(DATA_FOLDER, 'kradfile-u.gz') +RADKFILE = os.path.join(DATA_FOLDER, 'radkfile.gz') + +logger = logging.getLogger(__name__) + + +######################################################################## + +class KRad: + ''' This class contains mapping from radicals to kanjis (radk) and kanjis to radicals (krad) + + ''' + def __init__(self, **kwargs): + """ Kanji-Radical mapping """ + self.__krad_map: Mapping = None + self.__radk_map: Mapping = None + self.__rads = {} + self.lock = threading.Lock() + + def _build_krad_map(self): + with self.lock: + lines = chio.read_file(KRADFILE, mode='rt').splitlines() + # build the krad map + self.__krad_map = {} + self.__radk_map = dd(set) + for line in lines: + if line.startswith("#"): + continue + else: + parts = line.split(':', maxsplit=1) + if len(parts) == 2: + rads = [r.strip() for r in parts[1].split()] + char_literal = parts[0].strip() + self.__krad_map[char_literal] = rads + for rad in rads: + self.__radk_map[rad].add(char_literal) + + @property + def radk(self) -> Mapping: + if self.__radk_map is None: + self._build_krad_map() + return self.__radk_map + + @property + def krad(self) -> Mapping: + if self.__krad_map is None: + self._build_krad_map() + return self.__krad_map From 63be6d873d8bab9cd5d55729690dc7fb90d9d3f8 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Sun, 23 May 2021 22:39:19 +0800 Subject: [PATCH 16/42] add jamdict search_iter --- jamdict/jmdict_sqlite.py | 33 ++++++++++++++++++++++++--------- test/test_jmdict_sqlite.py | 12 ++++++++++++ 2 files changed, 
36 insertions(+), 9 deletions(-) diff --git a/jamdict/jmdict_sqlite.py b/jamdict/jmdict_sqlite.py index a2b589a..3a137ef 100644 --- a/jamdict/jmdict_sqlite.py +++ b/jamdict/jmdict_sqlite.py @@ -116,12 +116,7 @@ def all_pos(self, ctx=None): else: return [x['text'] for x in ctx.select("SELECT DISTINCT text FROM pos")] - def search(self, query, ctx=None, pos=None, **kwargs): - # ensure context - if ctx is None: - with self.ctx() as ctx: - return self.search(query, ctx=ctx) - + def _build_search_query(self, query, pos=None): where = [] params = [] if query.startswith('id#'): @@ -149,12 +144,32 @@ def search(self, query, ctx=None, pos=None, **kwargs): params += pos # else (a context is provided) logging.getLogger(__name__).debug(f"Search query: {where} -- Params: {params}") - eids = self.Entry.select(' '.join(where), params, ctx=ctx) + return where, params + + def search(self, query, ctx=None, pos=None, **kwargs): + # ensure context + if ctx is None: + with self.ctx() as ctx: + return self.search(query, ctx=ctx, pos=pos) + where, params = self._build_search_query(query, pos=pos) + where.insert(0, 'SELECT idseq FROM Entry WHERE ') + idseqs = tuple(x['idseq'] for x in ctx.execute(' '.join(where), params)) entries = [] - for e in eids: - entries.append(self.get_entry(e.idseq, ctx=ctx)) + for idseq in idseqs: + entries.append(self.get_entry(idseq, ctx=ctx)) return entries + def search_iter(self, query, ctx=None, pos=None, **kwargs): + # ensure context + if ctx is None: + with self.ctx() as ctx: + return self.search(query, ctx=ctx, pos=pos, iter_mode=iter_mode) + where, params = self._build_search_query(query, pos=pos) + where.insert(0, 'SELECT idseq FROM Entry WHERE ') + idseqs = tuple(x['idseq'] for x in ctx.execute(' '.join(where), params)) + for idseq in idseqs: + yield self.get_entry(idseq, ctx=ctx) + def get_entry(self, idseq, ctx=None): # ensure context if ctx is None: diff --git a/test/test_jmdict_sqlite.py b/test/test_jmdict_sqlite.py index ba3bf8b..335c814 100644 
--- a/test/test_jmdict_sqlite.py +++ b/test/test_jmdict_sqlite.py @@ -104,6 +104,18 @@ def test_search(self): self.assertTrue(es) getLogger().info('%confections%: {}'.format('|'.join([str(x) for x in es]))) + def test_iter_search(self): + with self.ramdb.ds.open() as ctx: + self.ramdb.insert_entries(self.xdb, ctx=ctx) + # search iter + res = self.ramdb.search_iter("%あの%", iter_mode=True, ctx=ctx) + forms = set() + for e in res: + forms.update(f.text for f in e.kana_forms) + expected = {'あのー', 'あのう', 'あの', 'かの', 'あのかた', 'あのひと'} + self.assertTrue(expected.issubset(forms)) + + # ------------------------------------------------------------------------------- # Main # ------------------------------------------------------------------------------- From 0de82967537b4f40783dd5cb245d0f6db1d1920e Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 24 May 2021 18:59:13 +0800 Subject: [PATCH 17/42] use puchikarui >= 0.2a2 --- jamdict/jmdict_sqlite.py | 6 +++--- jamdict/jmnedict_sqlite.py | 2 +- requirements.txt | 2 +- test/test_kanjidic2_sqlite.py | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/jamdict/jmdict_sqlite.py b/jamdict/jmdict_sqlite.py index 3a137ef..119da68 100644 --- a/jamdict/jmdict_sqlite.py +++ b/jamdict/jmdict_sqlite.py @@ -114,7 +114,7 @@ def all_pos(self, ctx=None): if ctx is None: return self.all_pos(ctx=self.ctx()) else: - return [x['text'] for x in ctx.select("SELECT DISTINCT text FROM pos")] + return [x['text'] for x in ctx.execute("SELECT DISTINCT text FROM pos")] def _build_search_query(self, query, pos=None): where = [] @@ -166,8 +166,8 @@ def search_iter(self, query, ctx=None, pos=None, **kwargs): return self.search(query, ctx=ctx, pos=pos, iter_mode=iter_mode) where, params = self._build_search_query(query, pos=pos) where.insert(0, 'SELECT idseq FROM Entry WHERE ') - idseqs = tuple(x['idseq'] for x in ctx.execute(' '.join(where), params)) - for idseq in idseqs: + ctx_id = ctx.double(row_factory=None) + for (idseq,) in 
ctx_id.execute(' '.join(where), params): yield self.get_entry(idseq, ctx=ctx) def get_entry(self, idseq, ctx=None): diff --git a/jamdict/jmnedict_sqlite.py b/jamdict/jmnedict_sqlite.py index f5e8140..09f74f1 100644 --- a/jamdict/jmnedict_sqlite.py +++ b/jamdict/jmnedict_sqlite.py @@ -81,7 +81,7 @@ def all_ne_type(self, ctx=None): if ctx is None: return self.all_ne_type(ctx=self.ctx()) else: - return [x['text'] for x in ctx.select("SELECT DISTINCT text FROM NETransType")] + return [x['text'] for x in ctx.execute("SELECT DISTINCT text FROM NETransType")] def search_ne(self, query, ctx=None, **kwargs) -> Sequence[JMDEntry]: # ensure context diff --git a/requirements.txt b/requirements.txt index 77e676c..feec7c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ chirptext >= 0.1, <= 0.2 -puchikarui >= 0.2a1, < 0.3 +puchikarui >= 0.2a2, < 0.3 diff --git a/test/test_kanjidic2_sqlite.py b/test/test_kanjidic2_sqlite.py index 99f093c..5a95d90 100644 --- a/test/test_kanjidic2_sqlite.py +++ b/test/test_kanjidic2_sqlite.py @@ -64,7 +64,8 @@ def test_xml2sqlite(self): c2 = db.char_by_id(c.ID, ctx) getLogger().debug("c-xml", c.to_dict()) getLogger().debug("c-sqlite", c2.to_dict()) - self.assertEqual(c.to_json(), c2.to_dict()) + with self.assertWarns(DeprecationWarning): + self.assertEqual(c.to_json(), c2.to_dict()) # test searching # by id c = ctx.char.select_single() From 856e87b0d989a764cf6b88ccb7c119f73f1301c5 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 24 May 2021 19:50:08 +0800 Subject: [PATCH 18/42] make jamdict works with puchikarui 0.1 and 0.2 --- jamdict/jmdict_sqlite.py | 3 +-- jamdict/util.py | 29 +++++++++++++++++++++++++---- requirements.txt | 2 +- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/jamdict/jmdict_sqlite.py b/jamdict/jmdict_sqlite.py index 119da68..aef0bce 100644 --- a/jamdict/jmdict_sqlite.py +++ b/jamdict/jmdict_sqlite.py @@ -166,8 +166,7 @@ def search_iter(self, query, ctx=None, pos=None, **kwargs): return 
self.search(query, ctx=ctx, pos=pos, iter_mode=iter_mode) where, params = self._build_search_query(query, pos=pos) where.insert(0, 'SELECT idseq FROM Entry WHERE ') - ctx_id = ctx.double(row_factory=None) - for (idseq,) in ctx_id.execute(' '.join(where), params): + for (idseq,) in ctx.conn.cursor().execute(' '.join(where), params): yield self.get_entry(idseq, ctx=ctx) def get_entry(self, idseq, ctx=None): diff --git a/jamdict/util.py b/jamdict/util.py index b44905a..b3c3bef 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -17,7 +17,13 @@ from typing import List, Sequence from chirptext.deko import HIRAGANA, KATAKANA -from puchikarui import MemorySource, ExecutionContext +_MEMORY_MODE = False +try: + from puchikarui import MemorySource + _MEMORY_MODE = True +except ImportError: + pass +from puchikarui import ExecutionContext from . import config from .jmdict import JMDictXMLParser, JMDEntry @@ -299,7 +305,12 @@ def jmdict(self): if not self._db_sqlite and self.db_file: with threading.Lock(): # Use 1 DB for all - data_source = MemorySource(self.db_file) if self.memory_mode else self.db_file + if self.memory_mode and _MEMORY_MODE: + data_source = MemorySource(self.db_file) + else: + if self.memory_mode and not _MEMORY_MODE: + logging.getLogger(__name__).error("Memory mode could not be enabled because puchikarui version is too old. 
Fallback to normal file DB mode") + data_source = self.db_file self._db_sqlite = JamdictSQLite(data_source, auto_expand_path=self.auto_expand) return self._db_sqlite @@ -308,7 +319,12 @@ def kd2(self): if self._kd2_sqlite is None: if self.kd2_file is not None and os.path.isfile(self.kd2_file): with threading.Lock(): - data_source = MemorySource(self.kd2_file) if self.memory_mode else self.kd2_file + if self.memory_mode and _MEMORY_MODE: + data_source = MemorySource(self.kd2_file) + else: + if self.memory_mode and not _MEMORY_MODE: + logging.getLogger(__name__).error("Memory mode could not be enabled because puchikarui version is too old. Fallback to normal file DB mode") + data_source = self.kd2_file self._kd2_sqlite = KanjiDic2SQLite(data_source, auto_expand_path=self.auto_expand) elif not self.kd2_file or self.kd2_file == self.db_file: self._kd2_sqlite = self.jmdict @@ -320,7 +336,12 @@ def jmnedict(self): if self._jmne_sqlite is None: if self.jmnedict_file is not None: with threading.Lock(): - data_source = MemorySource(self.jmnedict_file) if self.memory_mode else self.jmnedict_file + if self.memory_mode and _MEMORY_MODE: + data_source = MemorySource(self.jmnedict_file) + else: + if self.memory_mode and not _MEMORY_MODE: + logging.getLogger(__name__).error("Memory mode could not be enabled because puchikarui version is too old. 
Fallback to normal file DB mode") + data_source = self.jmnedict_file self._jmne_sqlite = JMNEDictSQLite(data_source, auto_expand_path=self.auto_expand) elif not self.jmnedict_file or self.jmnedict_file == self.db_file: self._jmne_sqlite = self.jmdict diff --git a/requirements.txt b/requirements.txt index feec7c6..8476cde 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ chirptext >= 0.1, <= 0.2 -puchikarui >= 0.2a2, < 0.3 +puchikarui >= 0.1, < 0.3 From 5aeb6dff3246bc231f76e1ce51e21bb06bcbefc4 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 24 May 2021 23:25:44 +0800 Subject: [PATCH 19/42] add search_ne_iter() --- jamdict/jmnedict_sqlite.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/jamdict/jmnedict_sqlite.py b/jamdict/jmnedict_sqlite.py index 09f74f1..06df43e 100644 --- a/jamdict/jmnedict_sqlite.py +++ b/jamdict/jmnedict_sqlite.py @@ -82,35 +82,45 @@ def all_ne_type(self, ctx=None): return self.all_ne_type(ctx=self.ctx()) else: return [x['text'] for x in ctx.execute("SELECT DISTINCT text FROM NETransType")] - - def search_ne(self, query, ctx=None, **kwargs) -> Sequence[JMDEntry]: - # ensure context - if ctx is None: - with self.ctx() as ctx: - return self.search_ne(query, ctx=ctx) + + def _build_ne_search_query(self, query): _is_wildcard_search = '_' in query or '@' in query or '%' in query if _is_wildcard_search: where = "idseq IN (SELECT idseq FROM NEKanji WHERE text like ?) OR idseq IN (SELECT idseq FROM NEKana WHERE text like ?) OR idseq IN (SELECT idseq FROM NETranslation JOIN NETransGloss ON NETranslation.ID == NETransGloss.tid WHERE NETransGloss.text like ?) OR idseq IN (SELECT idseq FROM NETranslation JOIN NETransType ON NETranslation.ID == NETransType.tid WHERE NETransType.text like ?)" else: where = "idseq IN (SELECT idseq FROM NEKanji WHERE text == ?) OR idseq IN (SELECT idseq FROM NEKana WHERE text == ?) 
OR idseq IN (SELECT idseq FROM NETranslation JOIN NETransGloss ON NETranslation.ID == NETransGloss.tid WHERE NETransGloss.text == ?) or idseq in (SELECT idseq FROM NETranslation JOIN NETransType ON NETranslation.ID == NETransType.tid WHERE NETransType.text == ?)" - getLogger().debug(where) params = [query, query, query, query] try: if query.startswith('id#'): query_int = int(query[3:]) if query_int >= 0: - getLogger().debug("Searching NE by ID: {}".format(query_int)) where = "idseq = ?" params = [query_int] except Exception: pass - # else (a context is provided) - eids = self.NEEntry.select(where, params, ctx=ctx) + getLogger().debug(f"where={where} | params={params}") + return where, params + + def search_ne(self, query, ctx=None, **kwargs) -> Sequence[JMDEntry]: + if ctx is None: + with self.ctx() as ctx: + return self.search_ne(query, ctx=ctx) + where, params = self._build_ne_search_query(query) + where = 'SELECT idseq FROM NEEntry WHERE ' + where entries = [] - for e in eids: - entries.append(self.get_ne(e.idseq, ctx=ctx)) + for (idseq,) in ctx.conn.cursor().execute(where, params): + entries.append(self.get_ne(idseq, ctx=ctx)) return entries + def search_ne_iter(self, query, ctx=None, **kwargs): + if ctx is None: + with self.ctx() as ctx: + return self.search_ne(query, ctx=ctx) + where, params = self._build_ne_search_query(query) + where = 'SELECT idseq FROM NEEntry WHERE ' + where + for (idseq,) in ctx.conn.cursor().execute(where, params): + yield self.get_ne(idseq, ctx=ctx) + def get_ne(self, idseq, ctx=None) -> JMDEntry: # ensure context if ctx is None: From 4baf9284a32cc618414ba8f439e1ae81c2f41837 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 24 May 2021 23:25:53 +0800 Subject: [PATCH 20/42] add search_chars_iter() --- jamdict/kanjidic2_sqlite.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/jamdict/kanjidic2_sqlite.py b/jamdict/kanjidic2_sqlite.py index ac488f8..1b91c64 100644 --- a/jamdict/kanjidic2_sqlite.py +++ 
b/jamdict/kanjidic2_sqlite.py @@ -172,6 +172,15 @@ def insert_char(self, c, ctx=None): m.gid = rmg.ID ctx.meaning.save(m) + def search_chars_iter(self, chars, ctx=None): + if ctx is None: + with self.ctx() as ctx: + return self.search_chars_iter(chars, ctx=ctx) + for c in chars: + res = self.get_char(c, ctx=ctx) + if res is not None: + yield res + def get_char(self, literal, ctx=None): if ctx is None: with self.ctx() as ctx: From e85cf361c6d021f3e0c509e3eb950249c4df3bf9 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 24 May 2021 23:26:03 +0800 Subject: [PATCH 21/42] use another cursor to loop through idseqs in iter search --- jamdict/jmdict_sqlite.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/jamdict/jmdict_sqlite.py b/jamdict/jmdict_sqlite.py index aef0bce..bde14c4 100644 --- a/jamdict/jmdict_sqlite.py +++ b/jamdict/jmdict_sqlite.py @@ -145,7 +145,7 @@ def _build_search_query(self, query, pos=None): # else (a context is provided) logging.getLogger(__name__).debug(f"Search query: {where} -- Params: {params}") return where, params - + def search(self, query, ctx=None, pos=None, **kwargs): # ensure context if ctx is None: @@ -153,9 +153,8 @@ def search(self, query, ctx=None, pos=None, **kwargs): return self.search(query, ctx=ctx, pos=pos) where, params = self._build_search_query(query, pos=pos) where.insert(0, 'SELECT idseq FROM Entry WHERE ') - idseqs = tuple(x['idseq'] for x in ctx.execute(' '.join(where), params)) entries = [] - for idseq in idseqs: + for (idseq,) in ctx.conn.cursor().execute(' '.join(where), params): entries.append(self.get_entry(idseq, ctx=ctx)) return entries From 3b58f3bd2e9f52fb425f2700079b3a74717c064d Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 24 May 2021 23:26:24 +0800 Subject: [PATCH 22/42] add text alias for char.literal --- jamdict/kanjidic2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/jamdict/kanjidic2.py b/jamdict/kanjidic2.py index ffdb69a..9841e1a 100644 --- 
a/jamdict/kanjidic2.py +++ b/jamdict/kanjidic2.py @@ -79,7 +79,7 @@ def __getitem__(self, idx): class Character(object): """ Represent a kanji character. - + """ def __init__(self): @@ -103,6 +103,10 @@ def __init__(self): self.rm_groups: List[RMGroup] = [] # reading_meaning groups self.nanoris = [] # a list of strings + @property + def text(self): + return self.literal + def __repr__(self): meanings = self.meanings(english_only=True) return "{l}:{sc}:{meanings}".format(l=self.literal, sc=self.stroke_count, meanings=','.join(meanings)) From 6a3b042b1b90e00675c2bd7b800bca890ae0fcf2 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 24 May 2021 23:26:36 +0800 Subject: [PATCH 23/42] add first version of lookup_iter() --- jamdict/util.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 3 deletions(-) diff --git a/jamdict/util.py b/jamdict/util.py index b3c3bef..5bddeca 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -164,6 +164,41 @@ def to_dict(self) -> dict: 'names': [n.to_dict() for n in self.names]} +class IterLookupResult(object): + + """ Contain lookup results (words, Kanji characters, or named entities) from Jamdict. + + A typical jamdict lookup is like this: + + >>> jam = Jamdict() + >>> result = jam.lookup_iter('食べ%る') + + The command above returns a :class:`IterLookupResult` object which contains iterators + to scan through found words (``entries``), kanji characters (``chars``), + and named entities (:any:`names`) one by one. 
+ """ + + def __init__(self, entries, chars=None, names=None): + self.__entries = entries if entries is not None else [] + self.__chars = chars if chars is not None else [] + self.__names = names if names is not None else [] + + @property + def entries(self): + """ Iterator for looping one by one through all found entries, can only be used once """ + return self.__entries + + @property + def chars(self): + """ Iterator for looping one by one through all found kanji characters, can only be used once """ + return self.__chars + + @property + def names(self): + """ Iterator for looping one by one through all found named entities, can only be used once """ + return self.__names + + class JamdictSQLite(KanjiDic2SQLite, JMNEDictSQLite, JMDictSQLite): def __init__(self, db_file, *args, **kwargs): @@ -478,7 +513,7 @@ def get_entry(self, idseq) -> JMDEntry: raise LookupError("There is no backend data available") def all_pos(self, ctx=None) -> List[str]: - """ Find all available part-of-speeches + """ Find all available part-of-speeches :returns: A list of part-of-speeches (a list of strings) """ @@ -494,7 +529,6 @@ def all_ne_type(self, ctx=None) -> List[str]: if ctx is None: ctx = self.__make_db_ctx() return self.jmnedict.all_ne_type(ctx=ctx) - def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, lookup_ne=True, pos=None, **kwargs) -> LookupResult: @@ -525,7 +559,6 @@ def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, raise ValueError("Query and POS filter cannot be both empty") if ctx is None: ctx = self.__make_db_ctx() - # Lookup words entries = [] chars = [] names = [] @@ -553,6 +586,51 @@ def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, # finish return LookupResult(entries, chars, names) + def lookup_iter(self, query, strict_lookup=False, + lookup_chars=True, lookup_ne=True, + ctx=None, pos=None, **kwargs) -> LookupResult: + """ Search words, characters, and characters. 
+ + Keyword arguments: + + :param query: Text to query, may contains wildcard characters. Use `?` for 1 exact character and `%` to match any number of characters. + :param strict_lookup: only look up the Kanji characters in query (i.e. discard characters from variants) + :type strict_lookup: bool + :param: lookup_chars: set lookup_chars to False to disable character lookup + :type lookup_chars: bool + :param pos: Filter words by part-of-speeches + :type pos: list of strings + :param ctx: database access context, can be reused for better performance. Normally users do not have to touch this and database connections will be reused by default. + :param lookup_ne: set lookup_ne to False to disable name-entities lookup + :type lookup_ne: bool + :returns: Return an IterLookupResult object. + :rtype: :class:`jamdict.util.IterLookupResult` + + >>> # match any word that starts with "食べ" and ends with "る" (anything from between is fine) + >>> jam = Jamdict() + >>> results = jam.lookup_iter('食べ%る') + """ + if not self.is_available(): + raise LookupError("There is no backend data available") + elif (not query or query == "%") and not pos: + raise ValueError("Query and POS filter cannot be both empty") + if ctx is None: + ctx = self.__make_db_ctx() + # Lookup entries, chars, and names + entries = None + chars = None + names = None + if self.jmdict is not None: + entries = self.jmdict.search_iter(query, pos=pos, ctx=ctx) + if lookup_chars and self.has_kd2(): + chars_to_search = OrderedDict({c: c for c in query if c not in HIRAGANA and c not in KATAKANA}) + chars = self.kd2.search_chars_iter(chars_to_search, ctx=ctx) + # lookup name-entities + if lookup_ne and self.has_jmne(ctx=ctx): + names = self.jmnedict.search_ne_iter(query, ctx=ctx) + # finish + return IterLookupResult(entries, chars, names) + class JMDictXML(object): """ JMDict API for looking up information in XML From a2b78becbbeccc7c9c3145cbe6fe5d80b9f61e7a Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 24 May 
2021 23:26:51 +0800 Subject: [PATCH 24/42] clean up test code --- test/test_jmdict_sqlite.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/test_jmdict_sqlite.py b/test/test_jmdict_sqlite.py index 335c814..8bf88db 100644 --- a/test/test_jmdict_sqlite.py +++ b/test/test_jmdict_sqlite.py @@ -105,12 +105,10 @@ def test_search(self): getLogger().info('%confections%: {}'.format('|'.join([str(x) for x in es]))) def test_iter_search(self): - with self.ramdb.ds.open() as ctx: + with self.ramdb.open() as ctx: self.ramdb.insert_entries(self.xdb, ctx=ctx) - # search iter - res = self.ramdb.search_iter("%あの%", iter_mode=True, ctx=ctx) forms = set() - for e in res: + for e in self.ramdb.search_iter("%あの%", iter_mode=True, ctx=ctx): forms.update(f.text for f in e.kana_forms) expected = {'あのー', 'あのう', 'あの', 'かの', 'あのかた', 'あのひと'} self.assertTrue(expected.issubset(forms)) From 2b2b30a79e03555428fff6ad5df88e0c0abcf3af Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Mon, 24 May 2021 23:26:57 +0800 Subject: [PATCH 25/42] test lookup_iter --- test/test_jamdict.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/test/test_jamdict.py b/test/test_jamdict.py index 07f0e35..1daf525 100644 --- a/test/test_jamdict.py +++ b/test/test_jamdict.py @@ -373,6 +373,37 @@ def test_jamdict_sqlite_all(self): self.assertEqual(2, len(result.chars)) self.assertEqual({c.literal for c in result.chars}, {'土', '産'}) + def test_lookup_iter(self): + jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, + kd2_xml_file=MINI_KD2, + jmnedict_xml_file=MINI_JMNE, auto_config=True) + jam.import_data() + # verify entries + res = jam.lookup_iter("おこ%", pos="noun (common) (futsuumeishi)") + entries = [e.text() for e in res.entries] + expected = ['おこのみやき (お好み焼き) : okonomiyaki/savoury pancake containing meat or seafood and ' + 'vegetables', + 'おこさん (お子さん) : child', + "おこさま (お子様) : child (someone else's)"] + self.assertEqual(expected, entries) + # verify characters 
+ res = jam.lookup_iter("お土産") + self.assertIsNotNone(res.entries) + self.assertIsNotNone(res.chars) + self.assertIsNotNone(res.names) + # verify characters + chars = [repr(c) for c in res.chars] + expected = ['土:3:soil,earth,ground,Turkey', + '産:11:products,bear,give birth,yield,childbirth,native,property'] + self.assertEqual(expected, chars) + # verify names + res = jam.lookup_iter("surname") + names = [n.text() for n in res.names] + expected = ['しめたに (〆谷) : Shimetani (surname)', + 'しめき (〆木) : Shimeki (surname)', + 'しめの (〆野) : Shimeno (surname)'] + self.assertEqual(expected, names) + ######################################################################## From 18a340379b8e2eea5fecb6bbdeb8436ea4b4f442 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 13:42:27 +0800 Subject: [PATCH 26/42] use the improved buckmode() in puchikarui >= 0.2a3 --- jamdict/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jamdict/util.py b/jamdict/util.py index 5bddeca..4235fee 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -482,6 +482,7 @@ def import_data(self): self.jmnedict.insert_name_entities(self.jmne_xml, ctx=ne_ctx) else: getLogger().warning("JMNEdict XML data is not available - skipped!") + ctx.commit() def get_ne(self, idseq, ctx=None) -> JMDEntry: """ Get name entity by idseq in JMNEdict """ From 13e10fa483de58838010deae464a210c4699e912 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 20:24:59 +0800 Subject: [PATCH 27/42] try to switch off buckmode after import --- jamdict/util.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jamdict/util.py b/jamdict/util.py index 4235fee..e64bddc 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -482,6 +482,9 @@ def import_data(self): self.jmnedict.insert_name_entities(self.jmne_xml, ctx=ne_ctx) else: getLogger().warning("JMNEdict XML data is not available - skipped!") + _buckmode_off = getattr(ctx, "buckmode_off", None) + if _buckmode_off is not None: + _buckmode_off() ctx.commit() 
def get_ne(self, idseq, ctx=None) -> JMDEntry: From bf38602d043d1687bbb760bed5222aa54d5b48c6 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 21:01:54 +0800 Subject: [PATCH 28/42] Update changelog --- docs/updates.rst | 91 +++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 47 deletions(-) diff --git a/docs/updates.rst b/docs/updates.rst index 388bbf2..dd42720 100644 --- a/docs/updates.rst +++ b/docs/updates.rst @@ -1,49 +1,46 @@ .. _updates: -Updates -======= - -2021-04-19 ----------- - -- [Version 0.1a9] -- Fix data audit query -- Enhanced Jamdict() constructor. ``Jamdict('/path/to/jamdict.db')`` - works properly. -- Code quality review -- Automated documentation build via - `readthedocs.org `__ - -.. _section-1: - -2021-04-15 ----------- - -- Make ``lxml`` optional -- Data package can be installed via PyPI with ``jamdict_data`` package -- Make configuration file optional as data files can be installed via - PyPI. - -.. _section-2: - -2020-05-31 ----------- - -- [Version 0.1a7] -- Added Japanese Proper Names Dictionary (JMnedict) support -- Included built-in KRADFILE/RADKFile support -- Improved command line tools (json, compact mode, etc.) - -.. _section-3: - -2017-08-18 ----------- - -- Support KanjiDic2 (XML/SQLite formats) - -.. _section-4: - -2016-11-09 ----------- - -- Release first version to Github +Jamdict Changelog +================= + +jamdict 0.1a10 +-------------- + +- Added ``memory_mode`` keyword to load database into memory before querying to boost up performance +- Improved import performance by using puchikarui's ``buckmode`` +- Tested with both puchikarui 0.1.* and 0.2.* + +jamdict 0.1a9 +------------- + +- 2021-04-19 + - Fix data audit query + - Enhanced ``Jamdict()`` constructor. ``Jamdict('/path/to/jamdict.db')`` + works properly. 
+ - Code quality review + - Automated documentation build via + `readthedocs.org `__ + +jamdict 0.1a8 +------------- + +- 2021-04-15 + - Make ``lxml`` optional + - Data package can be installed via PyPI with ``jamdict_data`` package + - Make configuration file optional as data files can be installed via PyPI. + +jamdict 0.1a7 +------------- + +- 2020-05-31 + - Added Japanese Proper Names Dictionary (JMnedict) support + - Included built-in KRADFILE/RADKFile support + - Improved command line tools (json, compact mode, etc.) + +Older versions +-------------- + +- 2017-08-18 + - Support KanjiDic2 (XML/SQLite formats) +- 2016-11-09 + - Release first version to Github From e4058f595288ad7a9947f867175aac0b61c89702 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 21:58:58 +0800 Subject: [PATCH 29/42] update API list --- docs/api.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 2f0a210..8badde3 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,3 +1,5 @@ +.. _api_index: + jamdict APIs ============ @@ -5,14 +7,18 @@ An overview of jamdict modules. .. module:: jamdict +.. autoclass:: jamdict.util.Jamdict + :members: + :member-order: groupwise + :exclude-members: get_ne, has_jmne, import_data, jmnedict + .. autoclass:: jamdict.util.LookupResult :members: :member-order: groupwise -.. autoclass:: jamdict.util.Jamdict +.. autoclass:: jamdict.util.IterLookupResult :members: :member-order: groupwise - :exclude-members: get_ne, has_jmne, import_data, jmnedict .. module:: jamdict.jmdict From 6351c1dcb0dd9a28093323c697553fe9fa37ffab Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 21:59:03 +0800 Subject: [PATCH 30/42] add welcome section and replit demo link --- docs/index.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/index.rst b/docs/index.rst index 0162514..d8bac5e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,6 +4,15 @@ Jamdict's documentation! 
`Jamdict `_ is a Python 3 library for manipulating Jim Breen's JMdict, KanjiDic2, JMnedict and kanji-radical mappings. +Welcome +------- + +Are you new to this documentation? Here are some useful pages: + +- Want to try out Jamdict package? Try `Jamdict online demo `_ +- Want some useful code samples? See :ref:`recipes`. +- Want to look deeper into the package? See :ref:`api_index`. + Main features ------------- @@ -35,6 +44,10 @@ For more information please see :ref:`installpage` page. pip install jamdict jamdict-data +There is a demo Jamdict virtual machine to try out online on Repl.it + +https://replit.com/@tuananhle/jamdict-demo + Sample jamdict Python code -------------------------- From 8deeedf0fb131a98e5b128eba3987a7789d39dbf Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 21:59:12 +0800 Subject: [PATCH 31/42] add iteration search and pos recipes --- docs/recipes.rst | 50 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/docs/recipes.rst b/docs/recipes.rst index 9ad5998..7813276 100644 --- a/docs/recipes.rst +++ b/docs/recipes.rst @@ -3,10 +3,9 @@ Common Recipes ============== -- Search words using wildcards. -- Searching for kanji characters. -- Decomposing kanji characters into components, or search kanji characters by components. -- Search for named entities. +.. contents:: + :local: + :depth: 2 .. 
warning:: 👉 ⚠️ THIS SECTION IS STILL UNDER CONSTRUCTION ⚠️ @@ -20,7 +19,7 @@ High-performance tuning ----------------------- When you need to do a lot of queries on the database, it is possible to load the whole database -into memory to boost up querying performance (This will takes about 400 MB of RAM) by using the ``memory_mode`` +into memory to boost up querying performance (This will take about 400 MB of RAM) by using the :class:`memory_mode ` keyword argument, like this: >>> from jamdict import Jamdict @@ -28,6 +27,47 @@ keyword argument, like this: The first query will be extremely slow (it may take about a minute for the whole database to be loaded into memory) but subsequent queries will be much faster. + +Iteration search +---------------- + +Sometimes people want to look through a set of search results only once and determine which items to keep +and then discard the rest. In these cases :func:`lookup_iter ` should be used. +This function returns an :class:`IterLookupResult ` object immediately after being called. +Users may loop through ``result.entries``, ``result.chars``, and ``result.names`` exactly once for each +set to find the items that they want. Users will have to store the desired word entries, characters, and names +by themselves since they are discarded after yield. + +>>> res = jam.lookup_iter("花見") +>>> for word in res.entries: +... print(word) # do something with the word +>>> for c in res.chars: +... print(c) +>>> for name in res.names: +... print(name) + +Part-of-speeches and named-entity types +--------------------------------------- + +Use :func:`Jamdict.all_pos ` to list all available part-of-speeches +and :func:`Jamdict.all_ne_type ` to list named-entity types: + +>>> for pos in jam.all_pos(): +... print(pos) # pos is a string +>>> for ne_type in jam.all_ne_type(): +... print(ne_type) # ne_type is a string + +To filter words by part-of-speech use the keyword argument ``pos`` +in :func:`lookup() ` or :func:`lookup_iter() ` +functions. 
+ +For example to look for all "かえる" that are nouns use: + +>>> result = jam.lookup("かえる", pos=["noun (common) (futsuumeishi)"]) + +To search for all named-entities that are "surname" use: + +>>> result = jam.lookup("surname") Kanjis and radical/components (KRAD/RADK mappings) -------------------------------------------------- From f1337dd4dbc1d188638a28bc2735687b58f908f7 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 21:59:25 +0800 Subject: [PATCH 32/42] update docstring --- jamdict/util.py | 50 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/jamdict/util.py b/jamdict/util.py index e64bddc..d1c2ac1 100644 --- a/jamdict/util.py +++ b/jamdict/util.py @@ -170,12 +170,18 @@ class IterLookupResult(object): A typical jamdict lookup is like this: - >>> jam = Jamdict() - >>> result = jam.lookup_iter('食べ%る') + >>> res = jam.lookup_iter("花見") - The command above returns a :class:`IterLookupResult` object which contains iterators + ``res`` is an :class:`IterLookupResult` object which contains iterators to scan through found words (``entries``), kanji characters (``chars``), and named entities (:any:`names`) one by one. + + >>> for word in res.entries: + ... print(word) # do something with the word + >>> for c in res.chars: + ... print(c) + >>> for name in res.names: + ... print(name) """ def __init__(self, entries, chars=None, names=None): @@ -219,15 +225,29 @@ class Jamdict(object): >>> for c in result.chars: >>> print(repr(c)) - Jamdict >= 0.1a10 support memory_mode keyword argument for reading + To filter results by ``pos``, for example look for all "かえる" that are nouns, use: + + >>> result = jam.lookup("かえる", pos=["noun (common) (futsuumeishi)"]) + + To search for named-entities by type, use the type string as query. 
+ For example to search for all "surname" use: + + >>> result = jam.lookup("surname") + + To find out which part-of-speeches or named-entity types are available in the + dictionary, use :func:`Jamdict.all_pos ` + and :func:`Jamdict.all_ne_type `. + + Jamdict >= 0.1a10 supports ``memory_mode`` keyword argument for reading the whole database into memory before querying to boost up search speed. The database may take about a minute to load. Here is the sample code: >>> jam = Jamdict(memory_mode=True) - Jamdict will use database from jamdict-data by default. + When there is no suitable database available, Jamdict will try to use database + from `jamdict-data `_ package by default. If there is a custom database available in configuration file, - Jamdict will prioritise to use it over jamdict-data package. + Jamdict will prioritise to use it over the ``jamdict-data`` package. """ def __init__(self, db_file=None, kd2_file=None, @@ -593,7 +613,19 @@ def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, def lookup_iter(self, query, strict_lookup=False, lookup_chars=True, lookup_ne=True, ctx=None, pos=None, **kwargs) -> LookupResult: - """ Search words, characters, and characters. + """ Search for words, characters, and named entities iteratively. + + An :class:`IterLookupResult` object will be returned instead of the normal ``LookupResult``. + ``res.entries``, ``res.chars``, ``res.names`` are iterators instead of lists and each of them + can only be looped through once. Users have to store the results manually. + + >>> res = jam.lookup_iter("花見") + >>> for word in res.entries: + ... print(word) # do something with the word + >>> for c in res.chars: + ... print(c) + >>> for name in res.names: + ... print(name) Keyword arguments: @@ -609,10 +641,6 @@ def lookup_iter(self, query, strict_lookup=False, :type lookup_ne: bool :returns: Return an IterLookupResult object. 
:rtype: :class:`jamdict.util.IterLookupResult` - - >>> # match any word that starts with "食べ" and ends with "る" (anything from between is fine) - >>> jam = Jamdict() - >>> results = jam.lookup_iter('食べ%る') """ if not self.is_available(): raise LookupError("There is no backend data available") From 5765830945090fb8dc47645a290fff3e85aedd3a Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 22:03:25 +0800 Subject: [PATCH 33/42] update changelog for jamdict 0.1a11 --- docs/updates.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/updates.rst b/docs/updates.rst index dd42720..2750bb5 100644 --- a/docs/updates.rst +++ b/docs/updates.rst @@ -3,6 +3,14 @@ Jamdict Changelog ================= +jamdict 0.1a11 +-------------- + +- Added ``lookup_iter()`` for iteration search +- Added ``pos`` filter for filtering words by part-of-speeches +- Added ``all_pos()`` and ``all_ne_type()`` to Jamdict to list part-of-speeches and named-entity types +- Improved documentation + jamdict 0.1a10 -------------- From 8ad8b8eb633641c5e9e950c711496275a1ac7a1d Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 22:05:35 +0800 Subject: [PATCH 34/42] update changelog --- docs/updates.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/updates.rst b/docs/updates.rst index 2750bb5..6c66804 100644 --- a/docs/updates.rst +++ b/docs/updates.rst @@ -9,6 +9,7 @@ jamdict 0.1a11 - Added ``lookup_iter()`` for iteration search - Added ``pos`` filter for filtering words by part-of-speeches - Added ``all_pos()`` and ``all_ne_type()`` to Jamdict to list part-of-speeches and named-entity types +- Better version checking in ``__version__.py`` - Improved documentation jamdict 0.1a10 From 0ce0c8ce5f29f4c5e3b449bfe96aa004fd6ad53f Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 22:05:40 +0800 Subject: [PATCH 35/42] update package info in setup.py --- setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/setup.py 
b/setup.py index 74a82f6..8244e70 100644 --- a/setup.py +++ b/setup.py @@ -50,6 +50,7 @@ def read(*filenames, **kwargs): author=pkg_info['__author__'], tests_require=requirements, install_requires=requirements, + python_requires=">=3.6", author_email=pkg_info['__email__'], description=pkg_info['__description__'], long_description=long_description, @@ -60,7 +61,11 @@ def read(*filenames, **kwargs): platforms='any', test_suite='test', # Reference: https://pypi.python.org/pypi?%3Aaction=list_classifiers - classifiers=['Programming Language :: Python', + classifiers=['Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', 'Development Status :: {}'.format(pkg_info['__status__']), 'Natural Language :: Japanese', 'Natural Language :: English', From 90c06f77bfc57170394af617ef23babe527d25b4 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 22:11:37 +0800 Subject: [PATCH 36/42] update package info --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 4804807..81f2bc7 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,6 @@ * Fast look up (dictionaries are stored in SQLite databases) * Command-line lookup tool [(Example)](#command-line-tools) -Homepage: [https://github.com/neocl/jamdict](https://github.com/neocl/jamdict) - [Contributors](#contributors) are welcome! 🙇. If you want to help, please see [Contributing](https://jamdict.readthedocs.io/en/latest/contributing.html) page. 
# Try Jamdict out From 320af2bd20e55e76accbdf3e17907724b03b0c4c Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 22:11:45 +0800 Subject: [PATCH 37/42] update index.rst --- docs/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index d8bac5e..7c8b45f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -36,7 +36,7 @@ If you want to help developing Jamdict, please visit :ref:`contributing` page. Installation ------------ -Jamdict is `available on PyPI `_ and +Jamdict and `jamdict-data `_ are both `available on PyPI `_ and can be installed using pip. For more information please see :ref:`installpage` page. @@ -44,7 +44,7 @@ For more information please see :ref:`installpage` page. pip install jamdict jamdict-data -There is a demo Jamdict virtual machine to try out online on Repl.it +Also, there is an online demo Jamdict virtual machine to try out on Repl.it https://replit.com/@tuananhle/jamdict-demo From 286560a1d36fc0a6fb91cdc1c3ab5b0f71ab2cf2 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 22:15:22 +0800 Subject: [PATCH 38/42] add under construction note --- docs/api.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/api.rst b/docs/api.rst index 8badde3..de48a08 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -5,6 +5,9 @@ jamdict APIs An overview of jamdict modules. +.. warning:: + 👉 ⚠️ THIS SECTION IS STILL UNDER CONSTRUCTION ⚠️ Help is much needed. + .. module:: jamdict .. autoclass:: jamdict.util.Jamdict From 8a58d16ec4112a0b6af7a277468561c52166c741 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 22:15:29 +0800 Subject: [PATCH 39/42] shout for help --- docs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.rst b/docs/index.rst index 7c8b45f..2d31eee 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,6 +12,7 @@ Are you new to this documentation? Here are some useful pages: - Want to try out Jamdict package? 
Try `Jamdict online demo `_ - Want some useful code samples? See :ref:`recipes`. - Want to look deeper into the package? See :ref:`api_index`. +- If you want to help developing Jamdict, please visit :ref:`contributing` page. Main features ------------- From 3a5fb1eb6a0f1451eb16b80e997aaa6f97f6307d Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 22:23:34 +0800 Subject: [PATCH 40/42] add changelog to table of contents --- docs/index.rst | 1 + jamdict/__version__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 2d31eee..9d7106a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -139,6 +139,7 @@ Documentation recipes api contributing + updates Other info ========== diff --git a/jamdict/__version__.py b/jamdict/__version__.py index baedb11..fd0acc4 100644 --- a/jamdict/__version__.py +++ b/jamdict/__version__.py @@ -13,7 +13,7 @@ # Version configuration (enforcing PEP 440) # ------------------------------------------------------------------------------ __status__ = "3 - Alpha" -__version_tuple__ = (0, 1, 0, 10, 4) +__version_tuple__ = (0, 1, 0, 10, 6) __version_status__ = '' # a specific value ('rc', 'dev', etc.) 
or leave blank to be auto-filled # ------------------------------------------------------------------------------ __status_map__ = {'3 - Alpha': 'a', '4 - Beta': 'b', '5 - Production/Stable': '', '6 - Mature': ''} From 916f9c258818522265aa8e9e2835fa1592ca2f04 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 22:23:43 +0800 Subject: [PATCH 41/42] update changelog --- docs/updates.rst | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/updates.rst b/docs/updates.rst index 6c66804..c929edc 100644 --- a/docs/updates.rst +++ b/docs/updates.rst @@ -6,23 +6,28 @@ Jamdict Changelog jamdict 0.1a11 -------------- -- Added ``lookup_iter()`` for iteration search -- Added ``pos`` filter for filtering words by part-of-speeches -- Added ``all_pos()`` and ``all_ne_type()`` to Jamdict to list part-of-speeches and named-entity types -- Better version checking in ``__version__.py`` -- Improved documentation +- 2021-05-25 + + - Added ``lookup_iter()`` for iteration search + - Added ``pos`` filter for filtering words by part-of-speeches + - Added ``all_pos()`` and ``all_ne_type()`` to Jamdict to list part-of-speeches and named-entity types + - Better version checking in ``__version__.py`` + - Improved documentation jamdict 0.1a10 -------------- -- Added ``memory_mode`` keyword to load database into memory before querying to boost up performance -- Improved import performance by using puchikarui's ``buckmode`` -- Tested with both puchikarui 0.1.* and 0.2.* +- 2021-05-19 + + - Added ``memory_mode`` keyword to load database into memory before querying to boost up performance + - Improved import performance by using puchikarui's ``buckmode`` + - Tested with both puchikarui 0.1.* and 0.2.* jamdict 0.1a9 ------------- - 2021-04-19 + - Fix data audit query - Enhanced ``Jamdict()`` constructor. ``Jamdict('/path/to/jamdict.db')`` works properly. 
@@ -34,6 +39,7 @@ jamdict 0.1a8 ------------- - 2021-04-15 + - Make ``lxml`` optional - Data package can be installed via PyPI with ``jamdict_data`` package - Make configuration file optional as data files can be installed via PyPI. @@ -42,6 +48,7 @@ jamdict 0.1a7 ------------- - 2020-05-31 + - Added Japanese Proper Names Dictionary (JMnedict) support - Included built-in KRADFILE/RADKFile support - Improved command line tools (json, compact mode, etc.) @@ -50,6 +57,9 @@ Older versions -------------- - 2017-08-18 + - Support KanjiDic2 (XML/SQLite formats) + - 2016-11-09 + - Release first version to Github From 1b1b90cd384aff19a9f86faa6f7a3c6137331408 Mon Sep 17 00:00:00 2001 From: Le Tuan Anh Date: Tue, 25 May 2021 22:26:41 +0800 Subject: [PATCH 42/42] jamdict version 0.1a11 ready --- jamdict/__version__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jamdict/__version__.py b/jamdict/__version__.py index fd0acc4..19b5e74 100644 --- a/jamdict/__version__.py +++ b/jamdict/__version__.py @@ -6,15 +6,15 @@ __copyright__ = "Copyright (c) 2016, Le Tuan Anh" __credits__ = [] __license__ = "MIT License" -__description__ = "Python library for manipulating Jim Breen's JMdict, KanjiDic2, KRADFILE and JMnedict" +__description__ = "Python library for using Japanese dictionaries and resources (Jim Breen's JMdict, KanjiDic2, KRADFILE, JMnedict)" __url__ = "https://github.com/neocl/jamdict" __maintainer__ = "Le Tuan Anh" # ------------------------------------------------------------------------------ # Version configuration (enforcing PEP 440) # ------------------------------------------------------------------------------ __status__ = "3 - Alpha" -__version_tuple__ = (0, 1, 0, 10, 6) -__version_status__ = '' # a specific value ('rc', 'dev', etc.) or leave blank to be auto-filled +__version_tuple__ = (0, 1, 0, 11) +__version_status__ = '' # a specific value ('rc', 'dev', etc.) 
or leave blank to be auto-filled # ------------------------------------------------------------------------------ __status_map__ = {'3 - Alpha': 'a', '4 - Beta': 'b', '5 - Production/Stable': '', '6 - Mature': ''} if not __version_status__: