From 93c2ab15447a283355f4df681b2cd323fd3e4be3 Mon Sep 17 00:00:00 2001 From: Michael Ringgaard Date: Tue, 22 Jan 2019 10:35:43 +0100 Subject: [PATCH] Alias transfer (#326) --- app/lib/docview.js | 45 +- data/wiki/calendar.sling | 2 +- doc/guide/install.md | 2 +- doc/guide/myelin.md | 6 +- python/run.py | 73 +-- python/task/download.py | 1 + python/task/wiki.py | 7 +- python/task/workflow.py | 42 ++ sling/frame/object.h | 3 +- sling/frame/store.cc | 5 +- sling/frame/store.h | 10 +- sling/nlp/document/BUILD | 1 + sling/nlp/document/document.cc | 49 +- sling/nlp/document/document.h | 18 +- sling/nlp/kb/BUILD | 63 ++- sling/nlp/kb/app/index.html | 15 +- sling/nlp/kb/app/kb.css | 4 + sling/nlp/kb/calendar.cc | 37 +- sling/nlp/kb/calendar.h | 19 +- sling/nlp/kb/facts.cc | 42 +- sling/nlp/kb/facts.h | 7 + sling/nlp/kb/knowledge-service.cc | 28 +- sling/nlp/kb/knowledge-service.h | 3 +- sling/nlp/{wiki => kb}/name-table-builder.cc | 0 sling/nlp/{wiki => kb}/name-table.cc | 2 +- sling/nlp/{wiki => kb}/name-table.h | 0 sling/nlp/kb/phrase-table-builder.cc | 466 ++++++++++++++++++ sling/nlp/{wiki => kb}/phrase-table.cc | 75 +-- sling/nlp/{wiki => kb}/phrase-table.h | 24 +- sling/nlp/wiki/BUILD | 64 +-- .../wiki/{profile-aliases.cc => aliases.cc} | 54 +- sling/nlp/wiki/phrase-table-builder.cc | 249 ---------- sling/nlp/wiki/wiki-annotator.cc | 39 +- sling/nlp/wiki/wiki-annotator.h | 4 +- sling/nlp/wiki/wiki-macros.cc | 12 +- sling/nlp/wiki/wiki.cc | 1 + sling/nlp/wiki/wiki.h | 3 +- sling/nlp/wiki/wikidata-converter.cc | 12 +- sling/nlp/wiki/wikidata-converter.h | 1 + sling/nlp/wiki/wikipedia-documents.cc | 6 - sling/pyapi/BUILD | 9 +- sling/pyapi/pyphrase.h | 2 +- sling/pyapi/pytask.cc | 4 +- sling/task/frames.cc | 20 +- sling/task/frames.h | 3 - sling/util/unicode.cc | 9 + sling/util/unicode.h | 11 +- 47 files changed, 1013 insertions(+), 539 deletions(-) rename sling/nlp/{wiki => kb}/name-table-builder.cc (100%) rename sling/nlp/{wiki => kb}/name-table.cc (98%) rename sling/nlp/{wiki => kb}/name-table.h (100%) create mode 100644 sling/nlp/kb/phrase-table-builder.cc rename sling/nlp/{wiki => kb}/phrase-table.cc (57%) rename sling/nlp/{wiki => kb}/phrase-table.h (88%) rename sling/nlp/wiki/{profile-aliases.cc => aliases.cc} (85%) delete mode 100644 sling/nlp/wiki/phrase-table-builder.cc diff --git a/app/lib/docview.js b/app/lib/docview.js index 61f0b7b7..c56ba904 100644 --- a/app/lib/docview.js +++ b/app/lib/docview.js @@ -451,7 +451,7 @@ export class DocumentViewer extends Component { } BuildPanel(phrase, fidx) { - let mention = this.document.frames[fidx]; + let frame = this.document.frames[fidx]; let panel = document.createElement("div"); panel.className = "panel"; panel.id = "p" + next_panel++; @@ -462,9 +462,11 @@ export class DocumentViewer extends Component { let title = document.createElement("span"); title.className = "panel-title"; - title.appendChild(document.createTextNode(phrase)); - titlebar.appendChild(title); - this.AddTypes(titlebar, mention.types); + if (phrase) { + title.appendChild(document.createTextNode(phrase)); + titlebar.appendChild(title); + this.AddTypes(titlebar, frame.types); + } let icon = document.createElement("span"); icon.className = "panel-icon"; @@ -476,19 +478,24 @@ export class DocumentViewer extends Component { let contents = document.createElement("div"); contents.className = "panel-content" - let slots = mention.slots; - if (slots) { - for (let i = 0; i < slots.length; i += 2) { - let n = slots[i]; - let v = slots[i + 1]; - if (this.document.frames[n].id == "evokes" || - this.document.frames[n].id == "is") { - let avm = this.BuildAVM(v, {}); - contents.appendChild(avm); + if (phrase) { + let rendered = {}; + let slots = frame.slots; + if (slots) { + for (let i = 0; i < slots.length; i += 2) { + let n = slots[i]; + let v = slots[i + 1]; + if (this.document.frames[n].id == "evokes" || + this.document.frames[n].id == "is") { + let avm = this.BuildAVM(v, rendered); + contents.appendChild(avm); + } } } + } else { + let avm = this.BuildAVM(fidx, {}); + contents.appendChild(avm); } - panel.appendChild(contents); return panel; } @@ -504,7 +511,11 @@ export class DocumentViewer extends Component { let span = e.currentTarget; let phrase = span.getAttribute("phrase"); let fidx = parseInt(span.getAttribute("frame")); - this.AddPanel('"' + phrase + '"', fidx); + if (phrase) { + this.AddPanel('"' + phrase + '"', fidx); + } else { + this.AddPanel(null, fidx); + } } ClosePanel(e) { @@ -519,7 +530,6 @@ export class DocumentViewer extends Component { chip.className = "chip"; chip.id = "t" + fidx; chip.setAttribute("frame", fidx); - chip.setAttribute("phrase", name); chip.appendChild(document.createTextNode(name)); return chip; @@ -548,13 +558,14 @@ export class DocumentViewer extends Component { let fidx = parseInt(span.getAttribute("frame")) let mention = this.document.frames[fidx]; + let rendered = {}; let slots = mention.slots; if (slots) { for (let i = 0; i < slots.length; i += 2) { let n = slots[i]; let v = slots[i + 1]; if (this.document.frames[n].id == "evokes") { - let avm = this.BuildAVM(v, {}); + let avm = this.BuildAVM(v, rendered); callout.appendChild(avm); } } diff --git a/data/wiki/calendar.sling b/data/wiki/calendar.sling index d33bb939..790d93df 100644 --- a/data/wiki/calendar.sling +++ b/data/wiki/calendar.sling @@ -211,7 +211,7 @@ 625: Q2662 626: Q2661 627: Q2664 - 628: Q2664 + 628: Q2660 629: Q2659 630: Q2657 diff --git a/doc/guide/install.md b/doc/guide/install.md index 3586a175..89a2f462 100644 --- a/doc/guide/install.md +++ b/doc/guide/install.md @@ -43,7 +43,7 @@ SLING uses [Bazel](https://bazel.build/) as the build system, so you need to to build the SLING parser. ```shell -sudo apt-get install pkg-config zip g++ zlib1g-dev unzip python +sudo apt-get install pkg-config zip g++ zlib1g-dev unzip python2.7 python2.7-dev wget -P /tmp https://github.com/bazelbuild/bazel/releases/download/0.13.0/bazel-0.13.0-installer-linux-x86_64.sh chmod +x /tmp/bazel-0.13.0-installer-linux-x86_64.sh sudo /tmp/bazel-0.13.0-installer-linux-x86_64.sh diff --git a/doc/guide/myelin.md b/doc/guide/myelin.md index c3e8e593..a382970b 100644 --- a/doc/guide/myelin.md +++ b/doc/guide/myelin.md @@ -377,7 +377,7 @@ flow = "flow" var = <#flags> (IN=1, OUT=2, REF=4, LEARNABLE=8 UNIQUE=16, from version 5) - <#aliases> + <#aliases> * <#bytes> value @@ -397,11 +397,11 @@ blob = <#flags> (unused, from version 5) func = <#flags> (TRAINING=1, from version 5) - <#ops> + <#ops> * cnx = <#flags> (unused, from version 5) - <#vars> + <#vars> * shape = <#dims> * diff --git a/python/run.py b/python/run.py index ac5385f8..9323ce2e 100755 --- a/python/run.py +++ b/python/run.py @@ -120,37 +120,6 @@ default=False, action='store_true') -flags.define("--dryrun", - help="build worflows but do not run them", - default=False, - action='store_true') - -flags.define("--monitor", - help="port number for task monitor (0 means no monitor)", - default=6767, - type=int, - metavar="PORT") - -flags.define("--logdir", - help="directory where workflow logs are stored", - default="local/logs", - metavar="DIR") - -def run_workflow(wf): - # In dryrun mode the workflow is just dumped without running it. - if flags.arg.dryrun: - print wf.wf.dump() - return - - # Start workflow. - log.info("start workflow") - wf.wf.start() - - # Wait until workflow completes. Poll every second to make the workflow - # interruptible. - done = False - while not done: done = wf.wf.wait(1000) - def download_corpora(): if flags.arg.download_wikidata or flags.arg.download_wikipedia: wf = download.DownloadWorkflow("wiki-download") @@ -164,7 +133,7 @@ def download_corpora(): for language in flags.arg.languages: wf.download_wikipedia(language=language) - run_workflow(wf) + workflow.run(wf.wf) def import_wiki(): if flags.arg.import_wikidata or flags.arg.import_wikipedia: @@ -180,7 +149,7 @@ def import_wiki(): log.info("Import " + language + " wikipedia") wf.wikipedia(language=language) - run_workflow(wf) + workflow.run(wf.wf) def parse_wikipedia(): # Convert wikipedia pages to SLING documents. @@ -189,7 +158,7 @@ def parse_wikipedia(): log.info("Parse " + language + " wikipedia") wf = wiki.WikiWorkflow(language + "-wikipedia-parsing") wf.parse_wikipedia(language=language) - run_workflow(wf) + workflow.run(wf.wf) def fuse_items(): # Merge categories from wikipedias. @@ -197,28 +166,28 @@ def fuse_items(): log.info("Merge wikipedia categories") wf = wiki.WikiWorkflow("category-merging") wf.merge_wikipedia_categories() - run_workflow(wf) + workflow.run(wf.wf) # Invert categories. if flags.arg.invert_categories: log.info("Invert categories") wf = wiki.WikiWorkflow("category-inversion") wf.invert_wikipedia_categories() - run_workflow(wf) + workflow.run(wf.wf) # Compute item popularity. if flags.arg.compute_item_popularity: log.info("Compute item popularity") wf = wiki.WikiWorkflow("item-popularity") wf.compute_item_popularity() - run_workflow(wf) + workflow.run(wf.wf) # Fuse items. if flags.arg.fuse_items: log.info("Fuse items") wf = wiki.WikiWorkflow("fuse-items") wf.fuse_items() - run_workflow(wf) + workflow.run(wf.wf) def build_knowledge_base(): @@ -227,7 +196,7 @@ def build_knowledge_base(): log.info("Build knowledge base repository") wf = wiki.WikiWorkflow("knowledge-base") wf.build_knowledge_base() - run_workflow(wf) + workflow.run(wf.wf) # Extract item names from wikidata and wikipedia. if flags.arg.extract_names: @@ -235,7 +204,7 @@ def build_knowledge_base(): log.info("Extract " + language + " names") wf = wiki.WikiWorkflow(language + "-name-extraction") wf.extract_names(language=language) - run_workflow(wf) + workflow.run(wf.wf) # Build name table. if flags.arg.build_nametab: @@ -243,7 +212,7 @@ def build_knowledge_base(): log.info("Build " + language + " name table") wf = wiki.WikiWorkflow(language + "-name-table") wf.build_name_table(language=language) - run_workflow(wf) + workflow.run(wf.wf) # Build phrase table. if flags.arg.build_phrasetab: @@ -251,7 +220,7 @@ def build_knowledge_base(): log.info("Build " + language + " phrase table") wf = wiki.WikiWorkflow(language + "-phrase-table") wf.build_phrase_table(language=language) - run_workflow(wf) + workflow.run(wf.wf) def train_embeddings(): # Extract vocabulary for word embeddings. @@ -260,7 +229,7 @@ def train_embeddings(): log.info("Extract " + language + " vocabulary") wf = embedding.EmbeddingWorkflow(language + "-vocabulary") wf.extract_vocabulary(language=language) - run_workflow(wf) + workflow.run(wf.wf) # Train word embeddings. if flags.arg.train_word_embeddings: @@ -268,28 +237,28 @@ def train_embeddings(): log.info("Train " + language + " word embeddings") wf = embedding.EmbeddingWorkflow(language + "-word-embeddings") wf.train_word_embeddings(language=language) - run_workflow(wf) + workflow.run(wf.wf) # Extract vocabulary for fact and category embeddings. if flags.arg.extract_fact_lexicon: log.info("Extract fact and category lexicons") wf = embedding.EmbeddingWorkflow("fact-lexicon") wf.extract_fact_lexicon() - run_workflow(wf) + workflow.run(wf.wf) # Extract facts from knowledge base. if flags.arg.extract_facts: log.info("Extract facts from knowledge base") wf = embedding.EmbeddingWorkflow("fact-extraction") wf.extract_facts() - run_workflow(wf) + workflow.run(wf.wf) # Train fact and category embeddings. if flags.arg.train_fact_embeddings: log.info("Train fact and category embeddings") wf = embedding.EmbeddingWorkflow("fact-embeddings") wf.train_fact_embeddings() - run_workflow(wf) + workflow.run(wf.wf) if __name__ == '__main__': @@ -309,20 +278,16 @@ def train_embeddings(): flags.arg.build_nametab = True flags.arg.build_phrasetab = True - # Start task monitor. - if flags.arg.monitor > 0: workflow.start_monitor(flags.arg.monitor) - # Run workflows. + workflow.startup() download_corpora() import_wiki() parse_wikipedia() fuse_items() build_knowledge_base() train_embeddings() - - # Stop task monitor. - if flags.arg.monitor > 0: workflow.stop_monitor() - workflow.save_workflow_log(flags.arg.logdir) + workflow.shutdown() # Done. log.info("Done") + diff --git a/python/task/download.py b/python/task/download.py index dc50fcb4..57ac719b 100644 --- a/python/task/download.py +++ b/python/task/download.py @@ -16,6 +16,7 @@ import os import urllib2 +import _strptime import time from workflow import * diff --git a/python/task/wiki.py b/python/task/wiki.py index b4b6d97b..46cdef72 100644 --- a/python/task/wiki.py +++ b/python/task/wiki.py @@ -583,7 +583,7 @@ def extract_names(self, aliases=None, language=None): if aliases == None: # Get language-dependent aliases from Wikidata and Wikpedia. wikidata_aliases = self.wf.map(self.fused_items(), - "profile-alias-extractor", + "alias-extractor", params={ "language": language, "skip_aux": True, @@ -599,7 +599,7 @@ def extract_names(self, aliases=None, language=None): merged_aliases = self.wf.shuffle(aliases, len(names)) # Filter and select aliases. - self.wf.reduce(merged_aliases, names, "profile-alias-reducer", + self.wf.reduce(merged_aliases, names, "alias-reducer", params={"language": language}) return names @@ -648,8 +648,11 @@ def build_phrase_table(self, names=None, language=None): with self.wf.namespace("phrase-table"): builder = self.wf.task("phrase-table-builder") builder.add_param("language", language) + builder.add_param("transfer_aliases", True) self.wf.connect(self.wf.read(names, name="name-reader"), builder) + kb = self.knowledge_base() repo = self.phrase_table(language) + builder.attach_input("commons", kb) builder.attach_output("repository", repo) return repo diff --git a/python/task/workflow.py b/python/task/workflow.py index ebf5ae4a..b3408e78 100644 --- a/python/task/workflow.py +++ b/python/task/workflow.py @@ -21,8 +21,25 @@ import time import sling import sling.pysling as api +import sling.flags as flags import sling.log as log +flags.define("--dryrun", + help="build worflows but do not run them", + default=False, + action='store_true') + +flags.define("--monitor", + help="port number for task monitor (0 means no monitor)", + default=6767, + type=int, + metavar="PORT") + +flags.define("--logdir", + help="directory where workflow logs are stored", + default="local/logs", + metavar="DIR") + # Input readers. readers = { "records": "record-file-reader", @@ -705,3 +722,28 @@ def save_workflow_log(path): log.info("workflow stats saved in " + logfn) return True +def run(wf): + # In dryrun mode the workflow is just dumped without running it. + if flags.arg.dryrun: + print wf.dump() + return + + # Start workflow. + wf.start() + + # Wait until workflow completes. Poll every second to make the workflow + # interruptible. + done = False + while not done: done = wf.wait(1000) + +def startup(): + # Start task monitor. + if flags.arg.monitor > 0: start_monitor(flags.arg.monitor) + +def shutdown(): + # Stop task monitor. + if flags.arg.monitor > 0: stop_monitor() + + # Save log to log directory. + save_workflow_log(flags.arg.logdir) + diff --git a/sling/frame/object.h b/sling/frame/object.h index f39663a2..6a300cfd 100644 --- a/sling/frame/object.h +++ b/sling/frame/object.h @@ -69,7 +69,8 @@ class HandleSpace : public Space, public External { }; // Hash map and set keyed by handle. -template using HandleMap = std::unordered_map; +template using HandleMap = + std::unordered_map; typedef std::unordered_set HandleSet; // Name with lazy lookup that can be initialized as static variables and diff --git a/sling/frame/store.cc b/sling/frame/store.cc index 8c8473a4..97d115a4 100644 --- a/sling/frame/store.cc +++ b/sling/frame/store.cc @@ -1088,7 +1088,10 @@ Handle Store::AllocateHandleSlow(Datum *object) { DCHECK(free_handle_ == nullptr); // Expand handle table. - handles_.reserve(handles_.size() * 2); + if (handles_.size() >= kMaxHandlesSize) LOG(FATAL) << "Handle overflow"; + size_t newsize = handles_.size() * 2; + if (newsize > kMaxHandlesSize) newsize = kMaxHandlesSize; + handles_.reserve(newsize); // Update the pool pointer to handle table. pools_[store_tag_] = reinterpret_cast
(handles_.base()); diff --git a/sling/frame/store.h b/sling/frame/store.h index c24dd334..0feef8a5 100644 --- a/sling/frame/store.h +++ b/sling/frame/store.h @@ -204,8 +204,10 @@ inline Word Align(Word n) { // The handle class is implemented as a POD type to make it efficient to pass by // value. struct Handle { - static const int kIntShift = 2; // integers are shifted two bits - static const int kTagBits = 2; // the two lowest bits are tag bits + static const int kIntShift = 2; // integers are shifted two bits + static const int kHandleBits = 32; // handles are 32-bit integers + static const int kTagBits = 2; // the two lowest bits are tag bits + static const int kRefTagBits = 3; // tag bits plus mark bit static const Word kTagMask = 0x00000003; // bit mask for handle tag static const Word kRefTagMask = 0x00000007; // object reference tag bit mask @@ -240,6 +242,9 @@ struct Handle { static const int kMinInt = -2147483648 >> kIntShift; static const int kMaxInt = 2147483647 >> kIntShift; + // Maximum number of handles (local or global). + static const int kMaxHandles = 1 << (kHandleBits - kRefTagBits); + // Returns the tag bits for the value. Word tag() const { return bits & kTagMask; } @@ -1355,6 +1360,7 @@ class Store { // objects go through the handle table, which provides a level of indirection // that allows object to move dynamically, e.g. during garbage collection and // when symbols are resolved. + static const size_t kMaxHandlesSize = Handle::kMaxHandles * sizeof(Reference); Reference *free_handle_; Space handles_; diff --git a/sling/nlp/document/BUILD b/sling/nlp/document/BUILD index 42d6a709..a3e3a866 100644 --- a/sling/nlp/document/BUILD +++ b/sling/nlp/document/BUILD @@ -18,6 +18,7 @@ cc_library( "//sling/frame:object", "//sling/frame:store", "//sling/string:text", + "//sling/util:unicode", ], ) diff --git a/sling/nlp/document/document.cc b/sling/nlp/document/document.cc index bfd23763..d68320a0 100644 --- a/sling/nlp/document/document.cc +++ b/sling/nlp/document/document.cc @@ -34,6 +34,18 @@ uint64 Token::Fingerprint() const { return fingerprint_; } +CaseForm Token::Form() const { + if (form_ == CASE_INVALID) { + // Case for first token in a sentence is indeterminate. + if (index_ == 0 || brk_ >= SENTENCE_BREAK) { + form_ = CASE_NONE; + } else { + form_ = UTF8::Case(word_); + } + } + return form_; +} + void Span::Evoke(const Frame &frame) { mention_.Add(document_->names_->n_evokes, frame); document_->AddMention(frame.handle(), this); @@ -123,11 +135,19 @@ uint64 Span::Fingerprint() const { return fp; } +CaseForm Span::Form() const { + if (form_ == CASE_INVALID) form_ = document_->Form(begin_, end_); + return form_; +} + Document::Document(Store *store, const DocumentNames *names) : themes_(store), names_(names) { // Bind names. - if (names_ == nullptr) names_ = new DocumentNames(store); - names_->AddRef(); + if (names_ == nullptr) { + names_ = new DocumentNames(store); + } else { + names_->AddRef(); + } // Build empty document. Builder builder(store); @@ -138,8 +158,11 @@ Document::Document(Store *store, const DocumentNames *names) Document::Document(const Frame &top, const DocumentNames *names) : top_(top), themes_(top.store()), names_(names) { // Bind names. - if (names_ == nullptr) names_ = new DocumentNames(top.store()); - names_->AddRef(); + if (names_ == nullptr) { + names_ = new DocumentNames(top.store()); + } else { + names_->AddRef(); + } // Add document frame if it is missing. if (!top_.valid()) { @@ -190,6 +213,7 @@ Document::Document(const Frame &top, const DocumentNames *names) t.brk_ = i == 0 ? NO_BREAK : SPACE_BREAK; } t.fingerprint_ = 0; + t.form_ = CASE_INVALID; t.span_ = nullptr; } } @@ -311,6 +335,7 @@ void Document::AddToken(Text word, int begin, int end, BreakType brk) { t.word_.assign(word.data(), word.size()); t.brk_ = brk; t.fingerprint_ = 0; + t.form_ = CASE_INVALID; t.span_ = nullptr; tokens_changed_ = true; } @@ -412,6 +437,20 @@ uint64 Document::PhraseFingerprint(int begin, int end) { return fp; } +CaseForm Document::Form(int begin, int end) { + CaseForm form = CASE_INVALID; + for (int t = begin; t < end; ++t) { + if (token(t).skipped()) continue; + CaseForm token_form = token(t).Form(); + if (form == CASE_INVALID) { + form = token_form; + } else if (form != token_form) { + form = CASE_NONE; + } + } + return form; +} + string Document::PhraseText(int begin, int end) const { string phrase; for (int t = begin; t < end; ++t) { @@ -466,7 +505,7 @@ Span *Document::Insert(int begin, int end) { bool crossing = false; Span *enclosing = EnclosingSpan(begin, end, &crossing); if (crossing) return nullptr; - + // Check if span already exists. if (enclosing != nullptr && enclosing->begin() == begin && enclosing->end() == end) { diff --git a/sling/nlp/document/document.h b/sling/nlp/document/document.h index 41185687..a120309a 100644 --- a/sling/nlp/document/document.h +++ b/sling/nlp/document/document.h @@ -24,6 +24,7 @@ #include "sling/frame/store.h" #include "sling/nlp/document/token-breaks.h" #include "sling/string/text.h" +#include "sling/util/unicode.h" namespace sling { namespace nlp { @@ -89,6 +90,12 @@ class Token { // Token fingerprint. uint64 Fingerprint() const; + // Token case form. + CaseForm Form() const; + + // Punctuation tokens etc. are skipped in phrase comparison. + bool skipped() const { return Fingerprint() == 1; } + private: Document *document_; // document the token belongs to Handle handle_; // handle for token in the store @@ -101,6 +108,8 @@ class Token { BreakType brk_; // break level before token mutable uint64 fingerprint_; // fingerprint for token text + mutable CaseForm form_; // case form for token + Span *span_; // lowest span covering the token friend class Document; @@ -183,6 +192,9 @@ class Span { // Returns fingerprint for span phrase. uint64 Fingerprint() const; + // Returns case form for span phrase. + CaseForm Form() const; + private: // Document that span belongs to. Document *document_; @@ -203,8 +215,9 @@ class Span { Span *sibling_ = nullptr; // first sibling to the right enclosed by parent Span *children_ = nullptr; // left-most enclosed sub-span - // Span fingerprint. This is lazily initialized and cached. + // Span fingerprint and case form. This is lazily initialized and cached. mutable uint64 fingerprint_ = 0; + mutable CaseForm form_ = CASE_INVALID; friend class Document; }; @@ -295,6 +308,9 @@ class Document { // Returns the fingerprint for [begin, end). uint64 PhraseFingerprint(int begin, int end); + // Returns case form forphrase [begin, end). + CaseForm Form(int begin, int end); + // Returns the phrase text for span. string PhraseText(int begin, int end) const; diff --git a/sling/nlp/kb/BUILD b/sling/nlp/kb/BUILD index 7212f270..3a52eb53 100644 --- a/sling/nlp/kb/BUILD +++ b/sling/nlp/kb/BUILD @@ -26,6 +26,64 @@ cc_library( ], ) +cc_library( + name = "name-table-builder", + srcs = ["name-table-builder.cc"], + deps = [ + "//sling/base", + "//sling/file:repository", + "//sling/frame:object", + "//sling/task", + "//sling/task:frames", + "//sling/util:mutex", + "//sling/util:unicode", + ], + alwayslink = 1, +) + +cc_library( + name = "name-table", + srcs = ["name-table.cc"], + hdrs = ["name-table.h"], + deps = [ + "//sling/base", + "//sling/file:repository", + "//sling/string:text", + "//sling/util:unicode", + ], +) + +cc_library( + name = "phrase-table-builder", + srcs = ["phrase-table-builder.cc"], + deps = [ + ":facts", + "//sling/base", + "//sling/file:repository", + "//sling/frame:object", + "//sling/frame:serialization", + "//sling/nlp/document:phrase-tokenizer", + "//sling/nlp/wiki", + "//sling/task", + "//sling/task:frames", + "//sling/util:mutex", + ], + alwayslink = 1, +) + +cc_library( + name = "phrase-table", + srcs = ["phrase-table.cc"], + hdrs = ["phrase-table.h"], + deps = [ + "//sling/base", + "//sling/file:repository", + "//sling/frame:store", + "//sling/frame:object", + "//sling/string:text", + ], +) + embed_data( name = "app", srcs = [ @@ -41,15 +99,15 @@ cc_library( srcs = ["knowledge-service.cc"], hdrs = ["knowledge-service.h"], deps = [ - ":calendar", ":app", + ":calendar", + ":name-table", "//sling/frame:object", "//sling/frame:serialization", "//sling/frame:store", "//sling/http:http-server", "//sling/http:static-content", "//sling/http:web-service", - "//sling/nlp/wiki:name-table", ], ) @@ -68,3 +126,4 @@ cc_binary( "//sling/string:strcat", ], ) + diff --git a/sling/nlp/kb/app/index.html b/sling/nlp/kb/app/index.html index 32c8b6e2..13b57c74 100644 --- a/sling/nlp/kb/app/index.html +++ b/sling/nlp/kb/app/index.html @@ -127,7 +127,12 @@

{{active.title}}

-
{{item.description}}
+
+ {{item.description}} +
+
+ Data type: {{item.type}} +
@@ -151,9 +156,12 @@

{{active.title}}

{{v.text}} + + ({{v.lang}}) + - +
diff --git a/sling/nlp/kb/app/kb.css b/sling/nlp/kb/app/kb.css index 6ca887b8..47848503 100644 --- a/sling/nlp/kb/app/kb.css +++ b/sling/nlp/kb/app/kb.css @@ -91,6 +91,10 @@ cursor: hand; } +.prop-lang { + color: #808080; +} + .prop-value a { color: #0b0080; text-decoration: none; diff --git a/sling/nlp/kb/calendar.cc b/sling/nlp/kb/calendar.cc index bc803384..a3d3187e 100644 --- a/sling/nlp/kb/calendar.cc +++ b/sling/nlp/kb/calendar.cc @@ -125,6 +125,7 @@ void Date::ParseFromString(Text str) { void Date::ParseFromFrame(const Frame &frame) { // Try to get the 'point in time' property from frame and parse it. + if (frame.invalid()) return; Store *store = frame.store(); Object time(store, store->Resolve(frame.GetHandle("P585"))); if (time.invalid()) return; @@ -256,19 +257,24 @@ void Calendar::Init(Store *store) { if (!cal.valid()) return; // Build calendar mappings. - BuildCalendarMapping(&weekdays_, cal.GetFrame("/w/weekdays")); - BuildCalendarMapping(&months_, cal.GetFrame("/w/months")); - BuildCalendarMapping(&days_, cal.GetFrame("/w/days")); - BuildCalendarMapping(&years_, cal.GetFrame("/w/years")); - BuildCalendarMapping(&decades_, cal.GetFrame("/w/decades")); - BuildCalendarMapping(¢uries_, cal.GetFrame("/w/centuries")); - BuildCalendarMapping(&millennia_, cal.GetFrame("/w/millennia")); + BuildCalendarMapping(&weekdays_, nullptr, cal.GetFrame("/w/weekdays")); + BuildCalendarMapping(&months_, &month_items_, cal.GetFrame("/w/months")); + BuildCalendarMapping(&days_, &day_items_, cal.GetFrame("/w/days")); + BuildCalendarMapping(&years_, nullptr, cal.GetFrame("/w/years")); + BuildCalendarMapping(&decades_, nullptr, cal.GetFrame("/w/decades")); + BuildCalendarMapping(¢uries_, nullptr, cal.GetFrame("/w/centuries")); + BuildCalendarMapping(&millennia_, nullptr, cal.GetFrame("/w/millennia")); }; -bool Calendar::BuildCalendarMapping(CalendarMap *mapping, const Frame &source) { +bool Calendar::BuildCalendarMapping(CalendarMap *mapping, + CalendarItemMap *items, + const Frame &source) { if (!source.valid()) return false; for (const Slot &s : source) { (*mapping)[s.name.AsInt()] = s.value; + if (items != nullptr) { + (*items)[s.value] = s.name.AsInt(); + } } return true; } @@ -352,6 +358,21 @@ string Calendar::DateAsString(const Date &date) const { return "???"; } +bool Calendar::GetDayAndMonth(Handle item, Date *date) const { + auto f = day_items_.find(item); + if (f == day_items_.end()) return false; + date->day = f->second % 100; + date->month = f->second / 100; + return true; +} + +bool Calendar::GetMonth(Handle item, Date *date) const { + auto f = month_items_.find(item); + if (f == month_items_.end()) return false; + date->month = f->second; + return true; +} + Handle Calendar::Day(const Date &date) const { if (date.precision < Date::DAY) return Handle::nil(); return Day(date.month, date.day); diff --git a/sling/nlp/kb/calendar.h b/sling/nlp/kb/calendar.h index 834e96c4..a9640bcf 100644 --- a/sling/nlp/kb/calendar.h +++ b/sling/nlp/kb/calendar.h @@ -96,6 +96,12 @@ class Calendar { return DateAsString(Date(object)); } + // Get day and month for calendar item. + bool GetDayAndMonth(Handle item, Date *date) const; + + // Get month for item. + bool GetMonth(Handle item, Date *date) const; + // Get item for day. Handle Day(const Date &date) const; Handle Day(int month, int day) const; @@ -130,11 +136,16 @@ class Calendar { // Mapping from calendar item key to the corresponding calendar item. typedef std::unordered_map CalendarMap; + // Mapping from calendar item to the corresponding key. + typedef HandleMap CalendarItemMap; + // Get name for item. Text ItemName(Handle item) const; // Build calendar mapping. - bool BuildCalendarMapping(CalendarMap *mapping, const Frame &source); + bool BuildCalendarMapping(CalendarMap *mapping, + CalendarItemMap *items, + const Frame &source); // Store with calendar. Store *store_ = nullptr; @@ -170,6 +181,12 @@ class Calendar { // Millennia. The millennia are numbered as (year-1)/1000+1 for AD and // (year+1)/1000-1 for BC. CalendarMap millennia_; + + // Mapping from calendar item to day of year (month*100+day). + CalendarItemMap day_items_; + + // Mapping from calendar item to month. + CalendarItemMap month_items_; }; // Date parser and generator based on language-dependent format, e.g.: diff --git a/sling/nlp/kb/facts.cc b/sling/nlp/kb/facts.cc index ec911be6..74873dc2 100644 --- a/sling/nlp/kb/facts.cc +++ b/sling/nlp/kb/facts.cc @@ -56,6 +56,7 @@ void FactCatalog::Init(Store *store) { SetExtractor(p_occupation_, &Facts::ExtractOccupation); SetExtractor(p_position_, &Facts::ExtractPosition); SetExtractor(p_member_of_sports_team_, &Facts::ExtractTeam); + SetExtractor(p_time_period_, &Facts::ExtractTimePeriod); // Set up items that stops closure expansion. static const char *baseids[] = { @@ -115,7 +116,7 @@ Taxonomy *FactCatalog::CreateDefaultTaxonomy() { "Q186081", // time interval "Q11563", // number "Q17376908", // languoid - "Q2198779", // unit + "Q47574", // unit of measurement "Q39875001", // measure "Q3695082", // sign "Q2996394", // biological process @@ -133,11 +134,7 @@ Taxonomy *FactCatalog::CreateDefaultTaxonomy() { "Q35120", // entity nullptr, }; - std::vector types; - for (const char **type = default_taxonomy; *type != nullptr; ++type) { - types.emplace_back(*type); - } - return new Taxonomy(this, types); + return new Taxonomy(this, default_taxonomy); } void Facts::Extract(Handle item) { @@ -263,6 +260,26 @@ void Facts::ExtractDate(Handle value) { AddFact(catalog_->calendar_.Century(date)); } +void Facts::ExtractTimePeriod(Handle period) { + // Add fact for period. + ExtractSimple(period); + + // Add facts for start and end time of period. + Frame f(store_, store_->Resolve(period)); + Handle start = f.GetHandle(catalog_->p_start_time_); + if (!start.IsNil()) { + push(catalog_->p_start_time_); + ExtractDate(start); + pop(); + } + Handle end = f.GetHandle(catalog_->p_end_time_); + if (!end.IsNil()) { + push(catalog_->p_end_time_); + ExtractDate(end); + pop(); + } +} + void Facts::ExtractLocation(Handle location) { ExtractClosure(location, catalog_->p_located_in_.handle()); } @@ -326,6 +343,19 @@ Taxonomy::Taxonomy(const FactCatalog *catalog, const std::vector &types) { } } +Taxonomy::Taxonomy(const FactCatalog *catalog, const char **types) { + catalog_ = catalog; + for (const char **type = types; *type != nullptr; ++type) { + Handle t = catalog->store_->LookupExisting(*type); + if (t.IsNil()) { + LOG(WARNING) << "Ignoring unknown type in taxonomy: " << *type; + continue; + } + int rank = typemap_.size(); + typemap_[t] = rank; + } +} + Handle Taxonomy::Classify(const Frame &item) { // Get immediate types for item. Handles types(item.store()); diff --git a/sling/nlp/kb/facts.h b/sling/nlp/kb/facts.h index 155e910c..79e18b8e 100644 --- a/sling/nlp/kb/facts.h +++ b/sling/nlp/kb/facts.h @@ -82,6 +82,9 @@ class FactCatalog { Name p_academic_degree_{names_, "P512"}; Name p_member_of_sports_team_{names_, "P54"}; Name p_league_{names_, "P118"}; + Name p_time_period_{names_, "P2348"}; + Name p_start_time_{names_, "P580"}; + Name p_end_time_{names_, "P582"}; Name n_time_{names_, "/w/time"}; Name n_item_{names_, "/w/item"}; @@ -126,6 +129,9 @@ class Facts { // Extract date-valued fact with backoff to year, decade and century. void ExtractDate(Handle value); + // Extract time period. + void ExtractTimePeriod(Handle period); + // Extract location of item with containment backoff. void ExtractPlacement(Handle item); @@ -181,6 +187,7 @@ class Taxonomy { public: // Initialize taxonomy from a ranked type list. Taxonomy(const FactCatalog *catalog, const std::vector &types); + Taxonomy(const FactCatalog *catalog, const char **types); // Classify item according to taxonomy. Handle Classify(const Frame &item); diff --git a/sling/nlp/kb/knowledge-service.cc b/sling/nlp/kb/knowledge-service.cc index 840cdb5b..c7d25dfd 100644 --- a/sling/nlp/kb/knowledge-service.cc +++ b/sling/nlp/kb/knowledge-service.cc @@ -185,6 +185,13 @@ void KnowledgeService::HandleGetItem(HTTPRequest *request, } Builder b(ws.store()); GetStandardProperties(item, &b); + Handle datatype = item.GetHandle(n_target_); + if (!datatype.IsNil()) { + Frame dt(kb_, datatype); + if (dt.valid()) { + b.Add(n_type_, dt.GetHandle(n_name_)); + } + } // Fetch properties. Item info(ws.store()); @@ -253,8 +260,12 @@ void KnowledgeService::FetchProperties(const Frame &item, Item *info) { Handle value = h; bool qualified = false; if (kb_->IsFrame(h)) { - Handle qua = Frame(kb_, h).GetHandle(Handle::is()); - if (!qua.IsNil()) { + // Handle the ambiguity between qualified frames and mono-lingual text + // by checking for the presence of a language slot. + Frame f(kb_, h); + Handle qua = f.GetHandle(Handle::is()); + Handle lang = f.GetHandle(n_lang_); + if (!qua.IsNil() && lang.IsNil()) { value = qua; qualified = true; } @@ -282,8 +293,17 @@ void KnowledgeService::FetchProperties(const Frame &item, Item *info) { // Add string value. v.Add(n_text_, value); } else if (property.datatype == n_text_type_) { - // Add text value. - v.Add(n_text_, value); + // Add text value with language. + if (kb_->IsFrame(value)) { + Frame monotext(kb_, value); + v.Add(n_text_, monotext.GetHandle(Handle::is())); + Frame lang = monotext.GetFrame(n_lang_); + if (lang.valid()) { + v.Add(n_lang_, lang.GetHandle(n_name_)); + } + } else { + v.Add(n_text_, value); + } } else if (property.datatype == n_url_type_) { // Add URL value. v.Add(n_text_, value); diff --git a/sling/nlp/kb/knowledge-service.h b/sling/nlp/kb/knowledge-service.h index 6872453c..ce5e99fb 100644 --- a/sling/nlp/kb/knowledge-service.h +++ b/sling/nlp/kb/knowledge-service.h @@ -23,7 +23,7 @@ #include "sling/http/http-server.h" #include "sling/http/static-content.h" #include "sling/nlp/kb/calendar.h" -#include "sling/nlp/wiki/name-table.h" +#include "sling/nlp/kb/name-table.h" namespace sling { namespace nlp { @@ -117,6 +117,7 @@ class KnowledgeService { Name n_url_{names_, "url"}; Name n_thumbnail_{names_, "thumbnail"}; Name n_matches_{names_, "matches"}; + Name n_lang_{names_, "lang"}; Name n_xref_type_{names_, "/w/xref"}; Name n_item_type_{names_, "/w/item"}; diff --git a/sling/nlp/wiki/name-table-builder.cc b/sling/nlp/kb/name-table-builder.cc similarity index 100% rename from sling/nlp/wiki/name-table-builder.cc rename to sling/nlp/kb/name-table-builder.cc diff --git a/sling/nlp/wiki/name-table.cc b/sling/nlp/kb/name-table.cc similarity index 98% rename from sling/nlp/wiki/name-table.cc rename to sling/nlp/kb/name-table.cc index b24c38a0..f2974d87 100644 --- a/sling/nlp/wiki/name-table.cc +++ b/sling/nlp/kb/name-table.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "sling/nlp/wiki/name-table.h" +#include "sling/nlp/kb/name-table.h" #include #include diff --git a/sling/nlp/wiki/name-table.h b/sling/nlp/kb/name-table.h similarity index 100% rename from sling/nlp/wiki/name-table.h rename to sling/nlp/kb/name-table.h diff --git a/sling/nlp/kb/phrase-table-builder.cc b/sling/nlp/kb/phrase-table-builder.cc new file mode 100644 index 00000000..492075ee --- /dev/null +++ b/sling/nlp/kb/phrase-table-builder.cc @@ -0,0 +1,466 @@ +// Copyright 2017 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "sling/base/logging.h" +#include "sling/base/types.h" +#include "sling/file/repository.h" +#include "sling/frame/object.h" +#include "sling/nlp/document/phrase-tokenizer.h" +#include "sling/nlp/kb/facts.h" +#include "sling/nlp/wiki/wiki.h" +#include "sling/task/frames.h" +#include "sling/task/task.h" +#include "sling/util/mutex.h" + +namespace sling { +namespace nlp { + +// Build phrase table repository from aliases. +class PhraseTableBuilder : public task::FrameProcessor { + public: + void Startup(task::Task *task) override { + // Get language for names. + string lang = task->Get("language", "en"); + language_ = commons_->Lookup("/lang/" + lang); + task->Fetch("reliable_alias_sources", &reliable_alias_sources_); + task->Fetch("transfer_aliases", &transfer_aliases_); + + // Set phrase normalization. + tokenizer_.set_normalization( + ParseNormalization(task->Get("normalization", "lcn"))); + + // Initialize alias transfer. + if (transfer_aliases_) InitAliasTransfer(); + + // Statistics. + num_aliases_ = task->GetCounter("aliases"); + num_phrases_ = task->GetCounter("phrases"); + num_entities_ = task->GetCounter("entities"); + num_instances_ = task->GetCounter("instances"); + num_transfers_ = task->GetCounter("alias_transfers"); + num_zero_transfers_ = task->GetCounter("alias_zero_transfers"); + num_instance_transfers_ = task->GetCounter("alias_instance_transfers"); + } + + void Process(Slice key, const Frame &frame) override { + MutexLock lock(&mu_); + + // Get index for entity. + int index; + string id(key.data(), key.size()); + auto fe = entity_mapping_.find(id); + if (fe == entity_mapping_.end()) { + index = entity_table_.size(); + entity_table_.emplace_back(id); + num_entities_->Increment(); + entity_mapping_[id] = index; + } else { + index = fe->second; + } + + // Add aliases. + for (const Slot &s : frame) { + if (s.name == n_alias_) { + // Check language. + Frame alias(frame.store(), s.value); + if (alias.GetHandle(n_lang_) != language_) continue; + num_aliases_->Increment(); + + // Compute phrase fingerprint. + Text name = alias.GetText(n_name_); + int count = alias.GetInt(n_count_, 1); + int sources = alias.GetInt(n_sources_, 0); + int form = alias.GetInt(n_form_, 0); + uint64 fp = tokenizer_.Fingerprint(name); + if (fp == 1) continue; + + // Look up or add phrase for entity to phrase table. + Phrase *&phrase = phrase_table_[fp]; + if (phrase == nullptr) { + phrase = new Phrase(fp); + num_phrases_->Increment(); + } + + // Add entity to phrase. + bool reliable = (sources & reliable_alias_sources_); + phrase->entities.emplace_back(index, count, form, reliable); + + // Add alias count to entity frequency. + entity_table_[index].count += count; + num_instances_->Increment(count); + } + } + } + + void Flush(task::Task *task) override { + // Prune phrase table by transfering unreliable aliases to reliable + // aliases for related items. + if (transfer_aliases_) { + LOG(INFO) << "Transfer aliases"; + TransferAliases(); + } + + // Build phrase repository. + Repository repository; + + // Add normalization flags to repository. + string norm = NormalizationString(tokenizer_.normalization()); + repository.AddBlock("normalization", norm.data(), norm.size()); + + // Write entity map. + LOG(INFO) << "Build entity map"; + File *entity_index_block = repository.AddBlock("EntityIndex"); + File *entity_item_block = repository.AddBlock("EntityItems"); + uint32 offset = 0; + for (Entity &entity : entity_table_) { + // Write entity index entry. + entity_index_block->WriteOrDie(&offset, sizeof(uint32)); + + // Write count and id to entity entry. + CHECK_LT(entity.id.size(), 256); + uint8 idlen = entity.id.size(); + entity_item_block->WriteOrDie(&entity.count, sizeof(uint32)); + entity_item_block->WriteOrDie(&idlen, sizeof(uint8)); + entity_item_block->WriteOrDie(entity.id.data(), idlen); + + // Compute offset of next entry. + offset += sizeof(uint32) + sizeof(uint8) + idlen; + } + + // Write phrase map. + LOG(INFO) << "Build phrase map"; + int num_phrases = phrase_table_.size(); + int num_buckets = (num_phrases + 32) / 32; + std::vector items; + items.reserve(num_phrases); + for (auto &it : phrase_table_) { + Phrase *phrase = it.second; + items.push_back(phrase); + + // Sort entities in decreasing order. + std::sort(phrase->entities.begin(), phrase->entities.end(), + [](const EntityPhrase &a, const EntityPhrase &b) { + return a.count() > b.count(); + }); + } + repository.WriteMap("Phrase", &items, num_buckets); + + // Write repository to file. + const string &filename = task->GetOutput("repository")->resource()->name(); + CHECK(!filename.empty()); + LOG(INFO) << "Write phrase repository to " << filename; + repository.Write(filename); + LOG(INFO) << "Repository done"; + + // Clear collected data. + for (auto &it : phrase_table_) delete it.second; + phrase_table_.clear(); + entity_table_.clear(); + entity_mapping_.clear(); + } + + private: + // Entity with id and frequency. + struct Entity { + Entity(const string &id) : id(id) {} + string id; + uint32 count = 0; + }; + + // Entity phrase with index and frequency. The count_and_flags field contains + // the count in the lower 29 bit. Bit 29 and 30 contain the case form, and + // bit 31 contains the reliable source flag. + struct EntityPhrase { + EntityPhrase() = default; + EntityPhrase(int index, uint32 count, uint32 form, bool reliable) + : index(index), + count_and_flags(count | (form << 29) | (reliable ? (1 << 31) : 0)) {} + uint32 index; + uint32 count_and_flags; + + // Phrase frequency. + int count() const { return count_and_flags & ((1 << 29) - 1); } + void set_count(uint32 count) { + count_and_flags = (count_and_flags & ~((1 << 29) - 1)) | count; + } + + // Alias reliability. + bool reliable() const { return count_and_flags & (1 << 31); } + + // Phrase form. + int form() const { return (count_and_flags >> 29) & 3; } + }; + + // Phrase with fingerprint and entity distribution. + struct Phrase : public RepositoryMapItem { + // Initialize new phrase. + Phrase(uint64 fingerprint) : fingerprint(fingerprint) {} + + // Write phrase to repository. + int Write(File *file) const override { + file->WriteOrDie(&fingerprint, sizeof(uint64)); + uint32 count = entities.size(); + file->WriteOrDie(&count, sizeof(uint32)); + for (const EntityPhrase &ep : entities) { + file->WriteOrDie(&ep, sizeof(EntityPhrase)); + } + return sizeof(uint64) + sizeof(uint32) + count * sizeof(EntityPhrase); + } + + // Use phrase fingerprint as the hash code. + uint64 Hash() const override { return fingerprint; } + + uint64 fingerprint; // phrase fingerprint + std::vector entities; // list of entities for name phrase + }; + + // Transfer alias counts from source to target. + bool Transfer(EntityPhrase *source, EntityPhrase *target) { + // Check for conflicting case forms. + int source_form = source->form(); + int target_form = target->form(); + if (source_form != CASE_NONE && + target_form != CASE_NONE && + source_form != target_form) { + return false; + } + + // Check for zero transfers. + int source_count = source->count(); + int target_count = target->count(); + if (source_count == 0) { + num_zero_transfers_->Increment(); + return false; + } + + // Transfer alias counts from source to target. + target->set_count(target_count + source_count); + source->set_count(0); + num_transfers_->Increment(); + num_instance_transfers_->Increment(source_count); + return true; + } + + // Exchange aliases between items. + bool Exchange(EntityPhrase *a, EntityPhrase *b) { + if (a->reliable() && !b->reliable()) { + return Transfer(b, a); + } else if (b->reliable() && !a->reliable()) { + return Transfer(a, b); + } else { + return false; + } + } + + void TransferAliases() { + // Run over all phrases in phrase table. + for (auto &it : phrase_table_) { + Phrase *phrase = it.second; + + // There must be more than one entity for any transfers to take place. + if (phrase->entities.size() < 2) continue; + + // Build mappings between entity items and entity indices. + Store store(commons_); + int num_items = phrase->entities.size(); + Handles entity_item(&store); + HandleMap entity_index; + entity_item.resize(num_items); + for (int i = 0; i < num_items; ++i) { + const EntityPhrase &e = phrase->entities[i]; + const Entity &entity = entity_table_[e.index]; + Handle item = store.Lookup(entity.id); + entity_item[i] = item; + entity_index[item] = i; + } + + // Find potential targets for alias transfer. + bool pruned = false; + std::vector numbers; + std::vector years; + for (int source = 0; source < num_items; ++source) { + // Get set of facts for item. + Facts facts(&catalog_, &store); + facts.Extract(entity_item[source]); + for (Handle h : facts.list()) { + Array fact(&store, h); + DCHECK_GE(fact.length(), 2); + + // Get head property and target value. + Handle p = fact.get(0); + Handle t = fact.get(fact.length() - 1); + + // Collect numbers and years. + if (p == n_instance_of_) { + if (t == n_natural_number_) { + numbers.push_back(source); + } + if (t == n_year_ || t == n_year_bc_ || t == n_decade_) { + years.push_back(source); + } + } + + // Check for property exceptions. + if (transfer_exceptions_.count(p) > 0) continue; + + // Check if target has the phrase as an alias. + auto f = entity_index.find(t); + if (f == entity_index.end()) continue; + int target = f->second; + if (target == source) continue; + + // Transfer alias from unreliable to reliable alias. + auto &src = phrase->entities[source]; + auto &tgt = phrase->entities[target]; + if (Exchange(&src, &tgt)) pruned = true; + } + } + + // Transfer aliases for years. + if (!years.empty()) { + for (int source = 0; source < years.size(); ++source) { + for (int target = 0; target < years.size(); ++target) { + if (source == target) continue; + auto &src = phrase->entities[years[source]]; + auto &tgt = phrase->entities[years[target]]; + if (Exchange(&src, &tgt)) pruned = true; + } + } + } + + // Transfer aliases for numbers. + if (!numbers.empty()) { + for (int source = 0; source < numbers.size(); ++source) { + for (int target = 0; target < numbers.size(); ++target) { + if (source == target) continue; + auto &src = phrase->entities[numbers[source]]; + auto &tgt = phrase->entities[numbers[target]]; + if (Exchange(&src, &tgt)) pruned = true; + } + } + } + + // Prune aliases with zero count. + if (pruned) { + int j = 0; + for (int i = 0; i < num_items; ++i) { + if (phrase->entities[i].count() == 0) continue; + if (i != j) phrase->entities[j] = phrase->entities[i]; + j++; + } + phrase->entities.resize(j); + } + } + } + + void InitAliasTransfer() { + // Initialize alias transfer exceptions. + static const char *exceptions[] = { + "P1889", // different from + "P460", // said to be the same as + "P1533", // identical to this given name + "P138", // named after + "P2959", // permanent duplicated item + "P734", // family name + "P735", // given name + "P112", // founded by + "P115", // home venue + "P144", // based on + "P1950", // second family name in Spanish name + "P2359", // Roman nomen gentilicium + "P2358", // Roman praenomen + "P2365", // Roman cognomen + "P2366", // Roman agnomen + "P941", // inspired by + "P629", // edition or translation of + "P37", // official language + "P103", // native language + "P566", // basionym + nullptr + }; + for (const char **p = exceptions; *p != nullptr; ++p) { + transfer_exceptions_.insert(commons_->LookupExisting(*p)); + } + + // Initialize fact catalog. + catalog_.Init(commons_); + } + + // Symbols. + Name n_lang_{names_, "lang"}; + Name n_name_{names_, "name"}; + Name n_alias_{names_, "alias"}; + Name n_count_{names_, "count"}; + Name n_form_{names_, "form"}; + Name n_sources_{names_, "sources"}; + + Name n_instance_of_{names_, "P31"}; + Name n_natural_number_{names_, "Q21199"}; + Name n_year_{names_, "Q577"}; + Name n_year_bc_{names_, "Q29964144"}; + Name n_decade_{names_, "Q39911"}; + + // Language for aliases. + Handle language_; + + // Reliable alias sources. + int reliable_alias_sources_ = + (1 << SRC_WIKIDATA_LABEL) | + (1 << SRC_WIKIDATA_ALIAS) | + (1 << SRC_WIKIDATA_NAME) | + (1 << SRC_WIKIDATA_DEMONYM); + + // Phrase tokenizer. + PhraseTokenizer tokenizer_; + + // Sorted name table mapping phrase fingerprints to entities. + std::unordered_map phrase_table_; + + // Entity table with id and frequency count. + std::vector entity_table_; + + // Mapping of entity id to entity index in entity table. + std::unordered_map entity_mapping_; + + // Alias transfer. + bool transfer_aliases_ = false; + + // Fact catalog for alias transfer. + FactCatalog catalog_; + + // Property exceptions for alias transfer. + HandleSet transfer_exceptions_; + + // Statistics. + task::Counter *num_phrases_ = nullptr; + task::Counter *num_entities_ = nullptr; + task::Counter *num_aliases_ = nullptr; + task::Counter *num_instances_ = nullptr; + task::Counter *num_transfers_ = nullptr; + task::Counter *num_zero_transfers_ = nullptr; + task::Counter *num_instance_transfers_ = nullptr; + + // Mutex for serializing access to repository. + Mutex mu_; +}; + +REGISTER_TASK_PROCESSOR("phrase-table-builder", PhraseTableBuilder); + +} // namespace nlp +} // namespace sling + diff --git a/sling/nlp/wiki/phrase-table.cc b/sling/nlp/kb/phrase-table.cc similarity index 57% rename from sling/nlp/wiki/phrase-table.cc rename to sling/nlp/kb/phrase-table.cc index b8e746c0..4c67adda 100644 --- a/sling/nlp/wiki/phrase-table.cc +++ b/sling/nlp/kb/phrase-table.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "sling/nlp/wiki/phrase-table.h" +#include "sling/nlp/kb/phrase-table.h" namespace sling { namespace nlp { @@ -39,7 +39,7 @@ void PhraseTable::Load(Store *store, const string &filename) { entity_table_->resize(entity_index_.size()); } -Handle PhraseTable::GetEntityHandle(int index) { +Handle PhraseTable::GetEntityHandle(int index) const { Handle handle = (*entity_table_)[index]; if (handle.IsNil()) { const EntityItem *entity = entity_index_.GetEntity(index); @@ -52,47 +52,56 @@ Handle PhraseTable::GetEntityHandle(int index) { return handle; } -void PhraseTable::Lookup(uint64 fp, Handles *matches) { - matches->clear(); +const PhraseTable::Phrase *PhraseTable::Find(uint64 fp) const { int bucket = fp % phrase_index_.num_buckets(); const PhraseItem *phrase = phrase_index_.GetBucket(bucket); const PhraseItem *end = phrase_index_.GetBucket(bucket + 1); while (phrase < end) { - if (phrase->fingerprint() == fp) { - const EntityPhrase *entities = phrase->entities(); - for (int i = 0; i < phrase->num_entities(); ++i) { - int index = entities[i].index; - Handle handle = GetEntityHandle(index); - matches->push_back(handle); - } - break; - } + if (phrase->fingerprint() == fp) return phrase; phrase = phrase->next(); } + return nullptr; } -void PhraseTable::Lookup(uint64 fp, MatchList *matches) { - matches->clear(); - int bucket = fp % phrase_index_.num_buckets(); - const PhraseItem *phrase = phrase_index_.GetBucket(bucket); - const PhraseItem *end = phrase_index_.GetBucket(bucket + 1); - while (phrase < end) { - if (phrase->fingerprint() == fp) { - const EntityPhrase *entities = phrase->entities(); - for (int i = 0; i < phrase->num_entities(); ++i) { - int index = entities[i].index; - Text id = entity_index_.GetEntityId(index); - Handle handle = GetEntityHandle(index); - int count = entities[i].count(); - int form = entities[i].form(); - bool reliable = entities[i].reliable(); - matches->emplace_back(id, handle, count, form, reliable); - } - break; - } - phrase = phrase->next(); +void PhraseTable::GetMatches(const Phrase *phrase, Handles *matches) const { + if (phrase == nullptr) { + matches->clear(); + return; + } + const EntityPhrase *entities = phrase->entities(); + matches->resize(phrase->num_entities()); + for (int i = 0; i < phrase->num_entities(); ++i) { + int index = entities[i].index; + (*matches)[i] = GetEntityHandle(index); } } +void PhraseTable::GetMatches(const Phrase *phrase, MatchList *matches) const { + if (phrase == nullptr) { + matches->clear(); + return; + } + const EntityPhrase *entities = phrase->entities(); + matches->resize(phrase->num_entities()); + for (int i = 0; i < phrase->num_entities(); ++i) { + int index = entities[i].index; + Match &match = (*matches)[i]; + match.id = entity_index_.GetEntityId(index); + match.item = GetEntityHandle(index); + auto &entity = entities[i]; + match.count = entity.count(); + match.form = entity.form(); + match.reliable = entity.reliable(); + } +} + +void PhraseTable::Lookup(uint64 fp, Handles *matches) const { + GetMatches(Find(fp), matches); +} + +void PhraseTable::Lookup(uint64 fp, MatchList *matches) const { + GetMatches(Find(fp), matches); +} + } // namespace nlp } // namespace sling diff --git a/sling/nlp/wiki/phrase-table.h b/sling/nlp/kb/phrase-table.h similarity index 88% rename from sling/nlp/wiki/phrase-table.h rename to sling/nlp/kb/phrase-table.h index 43e02973..e9dab4c9 100644 --- a/sling/nlp/wiki/phrase-table.h +++ b/sling/nlp/kb/phrase-table.h @@ -32,8 +32,6 @@ namespace nlp { class PhraseTable { public: struct Match { - Match(Text id, Handle item, int count, int form, bool reliable) - : id(id), item(item), count(count), form(form), reliable(reliable) {} Text id; // entity id of matching item Handle item; // matching item int count; // frequency of matching item @@ -49,17 +47,17 @@ class PhraseTable { void Load(Store *store, const string &filename); // Find all entities matching a phrase fingerprint. - void Lookup(uint64 fp, Handles *matches); + void Lookup(uint64 fp, Handles *matches) const; // Find all entities matching a phrase fingerprint and return list of matches. - void Lookup(uint64 fp, MatchList *matches); + void Lookup(uint64 fp, MatchList *matches) const; // Text normalization flags. const string &normalization() const { return normalization_; } private: // Get handle for entity. - Handle GetEntityHandle(int index); + Handle GetEntityHandle(int index) const; // Entity phrase with entity index and frequency. The count_and_flags field // contains the count in the lower 29 bit. Bit 29 and 30 contain the case @@ -150,6 +148,20 @@ class PhraseTable { } }; + public: + // Opaque public type for phrase items in the phrase table. + typedef PhraseItem Phrase; + + // Find matching phrase in phrase table. Return null if phrase is not found. + const Phrase *Find(uint64 fp) const; + + // Get matching handles for phrase. + void GetMatches(const Phrase *phrase, Handles *matches) const; + + // Get matching items for phrase. + void GetMatches(const Phrase *phrase, MatchList *matches) const; + + private: // Repository with name table. Repository repository_; @@ -166,7 +178,7 @@ class PhraseTable { Handles *entity_table_ = nullptr; // Text normalization flags. - string normalization_ = "lcp"; + string normalization_ = "lcn"; }; } // namespace nlp diff --git a/sling/nlp/wiki/BUILD b/sling/nlp/wiki/BUILD index 9d01c262..1766edce 100644 --- a/sling/nlp/wiki/BUILD +++ b/sling/nlp/wiki/BUILD @@ -66,7 +66,7 @@ cc_library( deps = [ ":wiki-extractor", "//sling/base:registry", - "//sling/frame:object", + "//sling/frame", "//sling/nlp/document", "//sling/string:numbers", "//sling/string:strcat", @@ -151,13 +151,13 @@ cc_library( ) cc_library( - name = "profile-aliases", - srcs = ["profile-aliases.cc"], + name = "aliases", + srcs = ["aliases.cc"], deps = [ ":wiki", "//sling/base", "//sling/file:textmap", - "//sling/frame:object", + "//sling/frame", "//sling/nlp/document:phrase-tokenizer", "//sling/task", "//sling/task:frames", @@ -167,62 +167,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "name-table-builder", - srcs = ["name-table-builder.cc"], - deps = [ - "//sling/base", - "//sling/file:repository", - "//sling/frame:object", - "//sling/task", - "//sling/task:frames", - "//sling/util:mutex", - "//sling/util:unicode", - ], - alwayslink = 1, -) - -cc_library( - name = "name-table", - srcs = ["name-table.cc"], - hdrs = ["name-table.h"], - deps = [ - "//sling/base", - "//sling/file:repository", - "//sling/string:text", - "//sling/util:unicode", - ], -) - -cc_library( - name = "phrase-table-builder", - srcs = ["phrase-table-builder.cc"], - deps = [ - ":wiki", - "//sling/base", - "//sling/file:repository", - "//sling/frame:object", - "//sling/nlp/document:phrase-tokenizer", - "//sling/task", - "//sling/task:frames", - "//sling/util:mutex", - ], - alwayslink = 1, -) - -cc_library( - name = "phrase-table", - srcs = ["phrase-table.cc"], - hdrs = ["phrase-table.h"], - deps = [ - "//sling/base", - "//sling/file:repository", - "//sling/frame:store", - "//sling/frame:object", - "//sling/string:text", - ], -) - cc_binary( name = "parse-wiki-text", srcs = ["parse-wiki-text.cc"], diff --git a/sling/nlp/wiki/profile-aliases.cc b/sling/nlp/wiki/aliases.cc similarity index 85% rename from sling/nlp/wiki/profile-aliases.cc rename to sling/nlp/wiki/aliases.cc index 38abc665..afd642d6 100644 --- a/sling/nlp/wiki/profile-aliases.cc +++ b/sling/nlp/wiki/aliases.cc @@ -29,8 +29,8 @@ namespace sling { namespace nlp { -// Extract aliases from profiles. -class ProfileAliasExtractor : public task::FrameProcessor { +// Extract aliases for items. +class AliasExtractor : public task::FrameProcessor { public: void Startup(task::Task *task) override { string lang = task->Get("language", "en"); @@ -72,12 +72,27 @@ class ProfileAliasExtractor : public task::FrameProcessor { } else if (s.name == n_demonym_) { // Output demonyms as demonym aliases. AddAlias(&a, store->Resolve(s.value), SRC_WIKIDATA_DEMONYM); + } else if (s.name == n_iso3166_country_code_2_ || + s.name == n_iso3166_country_code_3_) { + // Output country codes as alternative names. + AddAlias(&a, store->Resolve(s.value), SRC_WIKIDATA_NAME); + } else if (s.name == n_short_name_) { + // Output short names as alternative or foreign names. + Handle lang = Handle::nil(); + Frame f(store, s.value); + if (f.valid()) lang = f.GetHandle(n_lang_); + if (lang.IsNil() || lang == language_) { + AddAlias(&a, store->Resolve(s.value), SRC_WIKIDATA_NAME); + } else { + AddAlias(&a, store->Resolve(s.value), SRC_WIKIDATA_FOREIGN); + } } else if (s.name == n_instance_of_) { // Discard categories, disambiguations, info boxes and templates. - if (wikitypes_.IsCategory(s.value) || - wikitypes_.IsDisambiguation(s.value) || - wikitypes_.IsInfobox(s.value) || - wikitypes_.IsTemplate(s.value)) { + Handle type = store->Resolve(s.value); + if (wikitypes_.IsCategory(type) || + wikitypes_.IsDisambiguation(type) || + wikitypes_.IsInfobox(type) || + wikitypes_.IsTemplate(type)) { return; } } @@ -124,12 +139,16 @@ class ProfileAliasExtractor : public task::FrameProcessor { Name n_native_name_{names_, "P1559"}; Name n_native_label_{names_, "P1705"}; Name n_demonym_{names_, "P1549"}; + Name n_short_name_{names_, "P1813"}; + Name n_iso3166_country_code_2_{names_, "P297"}; + Name n_iso3166_country_code_3_{names_, "P298"}; + Name n_instance_of_{names_, "P31"}; }; -REGISTER_TASK_PROCESSOR("profile-alias-extractor", ProfileAliasExtractor); +REGISTER_TASK_PROCESSOR("alias-extractor", AliasExtractor); -class ProfileAliasReducer : public task::Reducer { +class AliasReducer : public task::Reducer { public: struct Alias { std::unordered_map variants; @@ -165,11 +184,11 @@ class ProfileAliasReducer : public task::Reducer { Store store(&commons_); std::unordered_map aliases; for (task::Message *message : input.messages()) { - // Get next alias profile. - Frame profile = DecodeMessage(&store, message); + // Get next set of aliases for item. + Frame batch = DecodeMessage(&store, message); - // Get all aliases from profile. - for (const Slot &slot : profile) { + // Get all aliases for item. + for (const Slot &slot : batch) { if (slot.name != n_alias_) continue; Frame alias(&store, slot.value); string name = alias.GetString(n_name_); @@ -238,7 +257,7 @@ class ProfileAliasReducer : public task::Reducer { merged.Add(n_alias_, a.Create()); } - // Output alias profile. + // Output selected aliases. Output(input.shard(), task::CreateMessage(qid, merged.Create())); // Delete alias table. @@ -247,7 +266,7 @@ class ProfileAliasReducer : public task::Reducer { // Check if alias should be selected. bool SelectAlias(Alias *alias, bool toxic) { - // Keep aliases from trusted sources. + // Keep aliases from "trusted" sources. if (alias->sources & (WIKIDATA_LABEL | WIKIPEDIA_TITLE | WIKIPEDIA_REDIRECT)) { @@ -255,7 +274,9 @@ class ProfileAliasReducer : public task::Reducer { } // Only keep Wikidata alias if it is not toxic. - if ((alias->sources & WIKIDATA_ALIAS) && !toxic) return true; + if (alias->sources & (WIKIDATA_ALIAS | WIKIDATA_NAME)) { + return !toxic; + } // Keep foreign, native and demonym aliases supported by Wikipedia aliases. if (alias->sources & (WIKIDATA_FOREIGN | @@ -295,6 +316,7 @@ class ProfileAliasReducer : public task::Reducer { WIKIDATA_NATIVE = 1 << SRC_WIKIDATA_NATIVE, WIKIDATA_DEMONYM = 1 << SRC_WIKIDATA_DEMONYM, WIKIPEDIA_LINK = 1 << SRC_WIKIPEDIA_LINK, + WIKIDATA_NAME = 1 << SRC_WIKIDATA_NAME, }; // Commons store. @@ -326,7 +348,7 @@ class ProfileAliasReducer : public task::Reducer { std::set toxic_aliases_; }; -REGISTER_TASK_PROCESSOR("profile-alias-reducer", ProfileAliasReducer); +REGISTER_TASK_PROCESSOR("alias-reducer", AliasReducer); } // namespace nlp } // namespace sling diff --git a/sling/nlp/wiki/phrase-table-builder.cc b/sling/nlp/wiki/phrase-table-builder.cc deleted file mode 100644 index d19ad345..00000000 --- a/sling/nlp/wiki/phrase-table-builder.cc +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright 2017 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include "sling/base/logging.h" -#include "sling/base/types.h" -#include "sling/file/repository.h" -#include "sling/nlp/document/phrase-tokenizer.h" -#include "sling/nlp/wiki/wiki.h" -#include "sling/task/frames.h" -#include "sling/task/task.h" -#include "sling/util/mutex.h" - -namespace sling { -namespace nlp { - -// Build phrase table repository from aliases. -class PhraseTableBuilder : public task::FrameProcessor { - public: - void Startup(task::Task *task) override { - // Get language for names. - string lang = task->Get("language", "en"); - language_ = commons_->Lookup("/lang/" + lang); - task->Fetch("noisy_alias_sources", &noisy_alias_sources_); - - // Set phrase normalization. - tokenizer_.set_normalization( - ParseNormalization(task->Get("normalization", "lcp"))); - - // Statistics. - num_aliases_ = task->GetCounter("aliases"); - num_phrases_ = task->GetCounter("phrases"); - num_entities_ = task->GetCounter("entities"); - num_instances_ = task->GetCounter("instances"); - } - - void Process(Slice key, const Frame &frame) override { - MutexLock lock(&mu_); - - // Get index for entity. - int index; - string id(key.data(), key.size()); - auto fe = entity_mapping_.find(id); - if (fe == entity_mapping_.end()) { - index = entity_table_.size(); - entity_table_.emplace_back(id); - num_entities_->Increment(); - entity_mapping_[id] = index; - } else { - index = fe->second; - } - - // Add aliases. - for (const Slot &s : frame) { - if (s.name == n_alias_) { - // Check language. - Frame alias(frame.store(), s.value); - if (alias.GetHandle(n_lang_) != language_) continue; - num_aliases_->Increment(); - - // Compute phrase fingerprint. - Text name = alias.GetText(n_name_); - int count = alias.GetInt(n_count_, 1); - int sources = alias.GetInt(n_sources_, 0); - int form = alias.GetInt(n_form_, 0); - uint64 fp = tokenizer_.Fingerprint(name); - if (fp == 1) continue; - - // Look up or add phrase for entity to phrase table. - Phrase *&phrase = phrase_table_[fp]; - if (phrase == nullptr) { - phrase = new Phrase(fp); - num_phrases_->Increment(); - } - - // Add entity to phrase. - bool reliable = (sources & ~noisy_alias_sources_); - uint32 count_and_flags = count | (form << 29); - if (reliable) count_and_flags |= (1 << 31); - phrase->entities.emplace_back(index, count_and_flags); - - // Add alias count to entity frequency. - entity_table_[index].count += count; - num_instances_->Increment(count); - } - } - } - - void Flush(task::Task *task) override { - // Build phrase repository. - Repository repository; - - // Add normalization flags to repository. - string norm = NormalizationString(tokenizer_.normalization()); - repository.AddBlock("normalization", norm.data(), norm.size()); - - // Write entity map. - LOG(INFO) << "Build entity map"; - File *entity_index_block = repository.AddBlock("EntityIndex"); - File *entity_item_block = repository.AddBlock("EntityItems"); - uint32 offset = 0; - for (Entity &entity : entity_table_) { - // Write entity index entry. - entity_index_block->WriteOrDie(&offset, sizeof(uint32)); - - // Write count and id to entity entry. - CHECK_LT(entity.id.size(), 256); - uint8 idlen = entity.id.size(); - entity_item_block->WriteOrDie(&entity.count, sizeof(uint32)); - entity_item_block->WriteOrDie(&idlen, sizeof(uint8)); - entity_item_block->WriteOrDie(entity.id.data(), idlen); - - // Compute offset of next entry. - offset += sizeof(uint32) + sizeof(uint8) + idlen; - } - - // Write phrase map. - LOG(INFO) << "Build phrase map"; - int num_phrases = phrase_table_.size(); - int num_buckets = (num_phrases + 32) / 32; - std::vector items; - items.reserve(num_phrases); - for (auto &it : phrase_table_) { - Phrase *phrase = it.second; - items.push_back(phrase); - - // Sort entities in decreasing order. - std::sort(phrase->entities.begin(), phrase->entities.end(), - [](const EntityPhrase &a, const EntityPhrase &b) { - return a.count() > b.count(); - }); - } - repository.WriteMap("Phrase", &items, num_buckets); - - // Write repository to file. - const string &filename = task->GetOutput("repository")->resource()->name(); - CHECK(!filename.empty()); - LOG(INFO) << "Write phrase repository to " << filename; - repository.Write(filename); - LOG(INFO) << "Repository done"; - - // Clear collected data. - for (auto &it : phrase_table_) delete it.second; - phrase_table_.clear(); - entity_table_.clear(); - entity_mapping_.clear(); - } - - private: - // Entity with id and frequency. - struct Entity { - Entity(const string &id) : id(id) {} - string id; - uint32 count = 0; - }; - - // Entity phrase with index and frequency. The count_and_flags field contains - // the count in the lower 29 bit. Bit 29 and 30 contain the case form, and - // bit 31 contains the reliable source flag. - struct EntityPhrase { - EntityPhrase(int index, uint32 count_and_flags) - : index(index), count_and_flags(count_and_flags) {} - uint32 index; - uint32 count_and_flags; - int count() const { return count_and_flags & ((1 << 29) - 1); } - }; - - // Phrase with fingerprint and entity distribution. - struct Phrase : public RepositoryMapItem { - // Initialize new phrase. - Phrase(uint64 fingerprint) : fingerprint(fingerprint) {} - - // Write phrase to repository. - int Write(File *file) const override { - file->WriteOrDie(&fingerprint, sizeof(uint64)); - uint32 count = entities.size(); - file->WriteOrDie(&count, sizeof(uint32)); - for (const EntityPhrase &ep : entities) { - file->WriteOrDie(&ep, sizeof(EntityPhrase)); - } - return sizeof(uint64) + sizeof(uint32) + count * sizeof(EntityPhrase); - } - - // Use phrase fingerprint as the hash code. - uint64 Hash() const override { return fingerprint; } - - uint64 fingerprint; // phrase fingerprint - std::vector entities; // list of entities for name phrase - }; - - // Symbols. - Name n_lang_{names_, "lang"}; - Name n_name_{names_, "name"}; - Name n_alias_{names_, "alias"}; - Name n_count_{names_, "count"}; - Name n_form_{names_, "form"}; - Name n_sources_{names_, "sources"}; - - // Language for aliases. - Handle language_; - - // Noisy alias sources. Aliases that are only backed by noisy alias sources - // are not marked as reliable. - int noisy_alias_sources_ = - (1 << SRC_WIKIPEDIA_ANCHOR) | - (1 << SRC_WIKIPEDIA_LINK) | - (1 << SRC_WIKIPEDIA_DISAMBIGUATION); - - // Phrase tokenizer. - PhraseTokenizer tokenizer_; - - // Sorted name table mapping phrase fingerprints to entities. - std::unordered_map phrase_table_; - - // Entity table with id and frequency count. - std::vector entity_table_; - - // Mapping of entity id to entity index in entity table. - std::unordered_map entity_mapping_; - - // Statistics. - task::Counter *num_phrases_ = nullptr; - task::Counter *num_entities_ = nullptr; - task::Counter *num_aliases_ = nullptr; - task::Counter *num_instances_ = nullptr; - - // Mutex for serializing access to repository. - Mutex mu_; -}; - -REGISTER_TASK_PROCESSOR("phrase-table-builder", PhraseTableBuilder); - -} // namespace nlp -} // namespace sling - diff --git a/sling/nlp/wiki/wiki-annotator.cc b/sling/nlp/wiki/wiki-annotator.cc index ad97f5c8..6f623da3 100644 --- a/sling/nlp/wiki/wiki-annotator.cc +++ b/sling/nlp/wiki/wiki-annotator.cc @@ -209,25 +209,23 @@ void WikiAnnotator::Link(const Node &node, bool unanchored) { // Resolve link. Text link = resolver_->ResolveLink(node.name()); - if (link.empty()) { - if (!unanchored) extractor->ExtractChildren(node); - return; - } if (unanchored) { - // Extract anchor as plain text. - WikiPlainTextSink plain; - extractor->Enter(&plain); - extractor->ExtractChildren(node); - extractor->Leave(&plain); - - // Add thematic frame for link. - if (!plain.text().empty()) { - Builder theme(store_); - theme.AddIsA(n_link_); - theme.Add(n_name_, plain.text()); - theme.AddIs(store_->Lookup(link)); - AddTheme(theme.Create().handle()); + if (!link.empty()) { + // Extract anchor as plain text. + WikiPlainTextSink plain; + extractor->Enter(&plain); + extractor->ExtractChildren(node); + extractor->Leave(&plain); + + // Add thematic frame for link. + if (!plain.text().empty()) { + Builder theme(store_); + theme.AddIsA(n_link_); + theme.Add(n_name_, plain.text()); + theme.AddIs(store_->Lookup(link)); + AddTheme(theme.Create().handle()); + } } } else { // Output anchor text. @@ -237,7 +235,8 @@ void WikiAnnotator::Link(const Node &node, // Evoke frame for link. if (begin != end) { - AddMention(begin, end, store_->Lookup(link)); + Handle evoke = link.empty() ? Handle::nil() : store_->Lookup(link); + AddMention(begin, end, evoke); } } } @@ -277,7 +276,9 @@ void WikiAnnotator::AddToDocument(Document *document) { int begin = document->Locate(a.begin.AsInt()); int end = document->Locate(a.end.AsInt()); Span *span = document->AddSpan(begin, end); - span->Evoke(a.evoked); + if (!a.evoked.IsNil()) { + span->Evoke(a.evoked); + } } // Add thematic frames. diff --git a/sling/nlp/wiki/wiki-annotator.h b/sling/nlp/wiki/wiki-annotator.h index 1a9d3eab..31b2cb39 100644 --- a/sling/nlp/wiki/wiki-annotator.h +++ b/sling/nlp/wiki/wiki-annotator.h @@ -166,8 +166,8 @@ class WikiAnnotator : public WikiTextSink { // the store and links will be resolved using the resolver. WikiAnnotator(Store *store, WikiLinkResolver *resolver); - // Initialize sub-annotator based on another annotator. Plase notice that this - // is not a copy constructor. + // Initialize sub-annotator based on another annotator. Please notice that + // this is not a copy constructor. explicit WikiAnnotator(WikiAnnotator *other); // Wiki sink interface receiving the annotations from the extractor. diff --git a/sling/nlp/wiki/wiki-macros.cc b/sling/nlp/wiki/wiki-macros.cc index d5d50982..61083625 100644 --- a/sling/nlp/wiki/wiki-macros.cc +++ b/sling/nlp/wiki/wiki-macros.cc @@ -513,8 +513,11 @@ REGISTER_WIKI_MACRO("measure", MeasureTemplate); // Template macro for info boxes. class InfoboxTemplate : public WikiMacro { public: + ~InfoboxTemplate() { if (docnames_) docnames_->Release(); } + void Init(const Frame &config) override { Store *store = config.store(); + docnames_ = new DocumentNames(store); Handle n_class = store->Lookup("class"); Handle n_fields = store->Lookup("fields"); Handle n_group = store->Lookup("group"); @@ -566,7 +569,7 @@ class InfoboxTemplate : public WikiMacro { field = &f->second; if (!field->group.IsNil()) index = 0; } else { - // Try to remove number suffix for repreated field. + // Try to remove number suffix for repeated field. int i = name.size() - 1; int power = 1; index = 0; @@ -596,7 +599,7 @@ class InfoboxTemplate : public WikiMacro { templ.extractor()->Leave(&value); // Convert field value to LEX format. - Document document(store); + Document document(store, docnames_); document.SetText(value.text()); GetTokenizer()->Tokenize(&document); value.AddToDocument(&document); @@ -609,7 +612,7 @@ class InfoboxTemplate : public WikiMacro { } else { auto &group = groups[field->group]; if (group.size() < index + 1) group.resize(index + 1); - auto &element = group[index]; + Builder *&element = group[index]; if (element == nullptr) element = new Builder(store); element->Add(field->key, lex); } @@ -650,6 +653,9 @@ class InfoboxTemplate : public WikiMacro { // Infobox fields keyed by field name. std::unordered_map fields_; + + // Document names. + DocumentNames *docnames_ = nullptr; }; REGISTER_WIKI_MACRO("infobox", InfoboxTemplate); diff --git a/sling/nlp/wiki/wiki.cc b/sling/nlp/wiki/wiki.cc index 8826a33f..4729b25d 100644 --- a/sling/nlp/wiki/wiki.cc +++ b/sling/nlp/wiki/wiki.cc @@ -42,6 +42,7 @@ const char *kAliasSourceName[kNumAliasSources] = { "wikidata_native", "wikidata_demonym", "wikipedia_link", + "wikidata_name", }; void Wiki::SplitTitle(const string &title, diff --git a/sling/nlp/wiki/wiki.h b/sling/nlp/wiki/wiki.h index 1aa39ab6..9dba523e 100644 --- a/sling/nlp/wiki/wiki.h +++ b/sling/nlp/wiki/wiki.h @@ -56,9 +56,10 @@ enum AliasSource { SRC_WIKIDATA_NATIVE = 8, // 256 0x0100 SRC_WIKIDATA_DEMONYM = 9, // 512 0x0200 SRC_WIKIPEDIA_LINK = 10, // 1024 0x0400 + SRC_WIKIDATA_NAME = 11, // 2048 0x0800 }; -static const int kNumAliasSources = 11; +static const int kNumAliasSources = 12; extern const char *kAliasSourceName[kNumAliasSources]; diff --git a/sling/nlp/wiki/wikidata-converter.cc b/sling/nlp/wiki/wikidata-converter.cc index bbc7eab1..7ab4b704 100644 --- a/sling/nlp/wiki/wikidata-converter.cc +++ b/sling/nlp/wiki/wikidata-converter.cc @@ -86,6 +86,7 @@ WikidataConverter::WikidataConverter(Store *commons, const string &language) { lang++; } language_map_["mul"] = n_lang_mul_.handle(); + language_map_["zxx"] = n_lang_none_.handle(); } Frame WikidataConverter::Convert(const Frame &item) { @@ -141,7 +142,12 @@ Frame WikidataConverter::Convert(const Frame &item) { } } if (!label.IsNil()) builder.Add(n_name_, label); - if (!label_language.IsNil()) builder.Add(n_lang_, label_language); + if (!label_language.IsNil()) { + auto f = languages_.find(label_language); + if (f != languages_.end()) { + builder.Add(n_lang_, f->second.language); + } + } } // Pick description matching label language. @@ -363,7 +369,9 @@ Handle WikidataConverter::ConvertText(const Frame &value) { string langid = value.GetString(s_language_); auto f = language_map_.find(langid); if (f == language_map_.end()) return Handle::nil(); - if (f->second == n_lang_mul_) return text.handle(); + if (f->second == n_lang_mul_ || f->second == n_lang_none_) { + return text.handle(); + } // Convert text to string qualified by language. Builder monoling(store); diff --git a/sling/nlp/wiki/wikidata-converter.h b/sling/nlp/wiki/wikidata-converter.h index 08711ad9..bbef0bd4 100644 --- a/sling/nlp/wiki/wikidata-converter.h +++ b/sling/nlp/wiki/wikidata-converter.h @@ -138,6 +138,7 @@ class WikidataConverter { Name n_lng_{names_, "/w/lng"}; Name n_globe_{names_, "/w/globe"}; Name n_lang_mul_{names_, "/lang/mul"}; + Name n_lang_none_{names_, "/lang/zxx"}; Name n_alias_{names_, "alias"}; Name n_sources_{names_, "sources"}; diff --git a/sling/nlp/wiki/wikipedia-documents.cc b/sling/nlp/wiki/wikipedia-documents.cc index 55660f68..0db8aa72 100644 --- a/sling/nlp/wiki/wikipedia-documents.cc +++ b/sling/nlp/wiki/wikipedia-documents.cc @@ -213,12 +213,6 @@ class WikipediaDocumentBuilder : public task::FrameProcessor, annotator.AddToDocument(&document); num_article_tokens_->Increment(document.num_tokens()); document.Update(); - - // Output intro text as alternative title alias. - WikiPlainTextSink intro; - if (extractor.ExtractIntro(&intro) && !intro.text().empty()) { - OutputAlias(qid, intro.text(), SRC_WIKIPEDIA_TITLE); - } } // Output alias for article title. diff --git a/sling/pyapi/BUILD b/sling/pyapi/BUILD index 4677936b..9e8ab870 100644 --- a/sling/pyapi/BUILD +++ b/sling/pyapi/BUILD @@ -42,12 +42,12 @@ cc_library( "//sling/myelin:compiler", "//sling/myelin:compute", "//sling/nlp/document", - "//sling/nlp/wiki:phrase-table", "//sling/nlp/document:document-tokenizer", "//sling/nlp/document:lex", "//sling/nlp/document:phrase-tokenizer", "//sling/nlp/kb:calendar", "//sling/nlp/kb:facts", + "//sling/nlp/kb:phrase-table", "//sling/nlp/parser", "//sling/nlp/parser/trainer:frame-evaluation", "//sling/nlp/wiki:wikidata-converter", @@ -69,12 +69,13 @@ cc_library( cc_library( name = "tasks", deps = [ - "//sling/nlp/wiki:profile-aliases", + "//sling/nlp/wiki:aliases", "//sling/nlp/wiki:wikidata-importer", "//sling/nlp/wiki:wikipedia-importer", "//sling/nlp/wiki:wikipedia-documents", - "//sling/nlp/wiki:name-table-builder", - "//sling/nlp/wiki:phrase-table-builder", + + "//sling/nlp/kb:name-table-builder", + "//sling/nlp/kb:phrase-table-builder", "//sling/nlp/embedding:fact-embeddings", "//sling/nlp/embedding:word-embeddings", diff --git a/sling/pyapi/pyphrase.h b/sling/pyapi/pyphrase.h index dd4180bb..b249f2ea 100644 --- a/sling/pyapi/pyphrase.h +++ b/sling/pyapi/pyphrase.h @@ -16,7 +16,7 @@ #define SLING_PYAPI_PYPHRASE_H_ #include "sling/nlp/document/phrase-tokenizer.h" -#include "sling/nlp/wiki/phrase-table.h" +#include "sling/nlp/kb/phrase-table.h" #include "sling/pyapi/pybase.h" #include "sling/pyapi/pystore.h" diff --git a/sling/pyapi/pytask.cc b/sling/pyapi/pytask.cc index 8b887838..c1c346d4 100644 --- a/sling/pyapi/pytask.cc +++ b/sling/pyapi/pytask.cc @@ -294,7 +294,7 @@ PyMemberDef PyResource::members[] = { }; void PyResource::Define(PyObject *module) { - InitType(&type, "sling.Resource", sizeof(PyResource), false); + InitType(&type, "sling.api.Resource", sizeof(PyResource), false); type.tp_init = method_cast(&PyResource::Init); type.tp_dealloc = method_cast(&PyResource::Dealloc); type.tp_members = members; @@ -512,7 +512,7 @@ PyObject *PyStartTaskMonitor(PyObject *self, PyObject *args) { // Start HTTP server. bool start_http_server = false; if (http == nullptr) { - LOG(INFO) << "Start HTTP server in port " << port; + LOG(INFO) << "Start HTTP server on port " << port; HTTPServerOptions options; http = new HTTPServer(options, port); start_http_server = true; diff --git a/sling/task/frames.cc b/sling/task/frames.cc index 7b98d253..d22da977 100644 --- a/sling/task/frames.cc +++ b/sling/task/frames.cc @@ -19,6 +19,7 @@ #include "sling/frame/decoder.h" #include "sling/frame/object.h" #include "sling/frame/reader.h" +#include "sling/frame/serialization.h" #include "sling/frame/store.h" #include "sling/frame/wire.h" #include "sling/stream/file.h" @@ -34,7 +35,7 @@ void FrameProcessor::Start(Task *task) { // Load commons store from file. for (Binding *binding : task->GetInputs("commons")) { - LoadStore(commons_, binding->resource()); + LoadStore(binding->resource()->name(), commons_); } // Get output channel (optional). @@ -151,23 +152,6 @@ Frame DecodeMessage(Store *store, Message *message) { } } -void LoadStore(Store *store, Resource *file) { - store->LockGC(); - FileInputStream stream(file->name()); - Input input(&stream); - if (input.Peek() == WIRE_BINARY_MARKER) { - Decoder decoder(store, &input); - decoder.DecodeAll(); - } else { - Reader reader(store, &input); - while (!reader.done()) { - reader.Read(); - CHECK(!reader.error()) << reader.GetErrorMessage(file->name()); - } - } - store->UnlockGC(); -} - } // namespace task } // namespace sling diff --git a/sling/task/frames.h b/sling/task/frames.h index 8d8f62a5..32337451 100644 --- a/sling/task/frames.h +++ b/sling/task/frames.h @@ -84,9 +84,6 @@ Message *CreateMessage(const Frame &frame, bool shallow = false); // Decode message as frame. Frame DecodeMessage(Store *store, Message *message); -// Load repository into store from input file. -void LoadStore(Store *store, Resource *file); - } // namespace task } // namespace sling diff --git a/sling/util/unicode.cc b/sling/util/unicode.cc index af737144..8d1af38d 100644 --- a/sling/util/unicode.cc +++ b/sling/util/unicode.cc @@ -49,6 +49,7 @@ Normalization ParseNormalization(const string &spec) { case 'd': flags |= NORMALIZE_DIGITS; break; case 'p': flags |= NORMALIZE_PUNCTUATION; break; case 'w': flags |= NORMALIZE_WHITESPACE; break; + case 'n': flags |= NORMALIZE_NAME; break; default: LOG(FATAL) << "Unknown normalization specifier: " << spec; } @@ -63,6 +64,7 @@ string NormalizationString(Normalization normalization) { if (normalization & NORMALIZE_DIGITS) str.push_back('d'); if (normalization & NORMALIZE_PUNCTUATION) str.push_back('p'); if (normalization & NORMALIZE_WHITESPACE) str.push_back('w'); + if (normalization & NORMALIZE_NAME) str.push_back('n'); return str; } @@ -127,6 +129,10 @@ bool Unicode::IsPunctuation(int c) { return Is(c, CATMASK_PUNCTUATION); } +bool Unicode::IsNamePunctuation(int c) { + return Is(c, CATMASK_NAME_PUNCTUATION) || c == '.'; +} + int Unicode::ToLower(int c) { if (c & unicode_tab_mask) return c; return unicode_lower_tab[c]; @@ -151,6 +157,9 @@ int Unicode::Normalize(int c, int flags) { if (flags & NORMALIZE_PUNCTUATION) { if (IsPunctuation(c)) c = 0; } + if (flags & NORMALIZE_NAME) { + if (IsNamePunctuation(c)) c = 0; + } if (flags & NORMALIZE_WHITESPACE) { if (IsWhitespace(c)) c = 0; } diff --git a/sling/util/unicode.h b/sling/util/unicode.h index 60956798..6987d8bc 100644 --- a/sling/util/unicode.h +++ b/sling/util/unicode.h @@ -95,6 +95,10 @@ enum UnicodeCategoryMask { (1 << CHARCAT_LINE_SEPARATOR) | (1 << CHARCAT_PARAGRAPH_SEPARATOR), + // Name punctuation. + CATMASK_NAME_PUNCTUATION = + (1 << CHARCAT_DASH_PUNCTUATION), + // Punctuation. CATMASK_PUNCTUATION = (1 << CHARCAT_DASH_PUNCTUATION) | @@ -118,9 +122,10 @@ enum Normalization { NORMALIZE_DIGITS = 0x04, // replace all digits with 9 NORMALIZE_PUNCTUATION = 0x08, // remove punctuation NORMALIZE_WHITESPACE = 0x10, // remove whitespace + NORMALIZE_NAME = 0x20, // remove name punctuation (periods and dashes) // Default normalization. - NORMALIZE_DEFAULT = NORMALIZE_CASE | NORMALIZE_LETTERS | NORMALIZE_PUNCTUATION + NORMALIZE_DEFAULT = NORMALIZE_CASE | NORMALIZE_LETTERS | NORMALIZE_NAME }; // Parse a list of normalization specifiers to a normalization bit mask. @@ -130,6 +135,7 @@ enum Normalization { // d: NORMALIZE_DIGITS // p: NORMALIZE_PUNCTUATION // w: NORMALIZE_WHITESPACE +// n: NORMALIZE_NAME Normalization ParseNormalization(const string &spec); // Return string with normalization specifiers for flags. @@ -174,6 +180,9 @@ class Unicode { // Check if code point is punctuation. static bool IsPunctuation(int c); + // Check if code point is name punctuation. + static bool IsNamePunctuation(int c); + // Convert code point to lower case. static int ToLower(int c);
{{qp.property}} @@ -163,6 +171,9 @@

{{active.title}}

{{qv.text}} + + ({{qv.lang}}) +