Skip to content

Commit

Permalink
Add ability to filter entities by kb prefix in markdown corpus ingest
Browse files Browse the repository at this point in the history
  • Loading branch information
andychisholm committed Jan 6, 2016
1 parent a242838 commit 2a246d5
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions nel/corpora/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ def trim_link_subsection(s):
@PrepareCorpus.Register
class MarkdownPrepare(object):
""" Ingest a set of markdown documents with neleval formatted annotations """
- def __init__(self, docs_path, annotations_path, redirect_model_tag):
+ def __init__(self, docs_path, annotations_path, redirect_model_tag, target_entity_filter):
self.docs_path = docs_path
self.annotations_path = annotations_path
self.redirect_model = Redirects(redirect_model_tag)
+ self.target_entity_filter = target_entity_filter

def iter_mentions(self):
with open(self.annotations_path, 'r') as f:
Expand All @@ -43,6 +44,10 @@ def iter_mentions(self):
if len(parts) > 5:
tag = parts[5].lower().strip()

+ if self.target_entity_filter and resolution_id:
+ if not resolution_id.startswith(self.target_entity_filter):
+ resolution_id = None
+
yield {
'doc': parts[0],
'span': slice(int(parts[1]), int(parts[2])),
Expand Down Expand Up @@ -100,6 +105,7 @@ def __call__(self):
def add_arguments(cls, p):
p.add_argument('docs_path', metavar='SOURCE_DOCS_PATH')
p.add_argument('annotations_path', metavar='ANNOTATIONS_TSV_PATH')
- p.add_argument('--redirect_model_tag', default='wikipedia', required=False, metavar='REDIRECT_MODEL')
+ p.add_argument('--redirect-model-tag', dest='redirect_model_tag', default='wikipedia', required=False, metavar='REDIRECT_MODEL')
+ p.add_argument('--target-entity-filter', dest='target_entity_filter', default=None, required=False, metavar='FILTER')
p.set_defaults(parsecls=cls)
return p

0 comments on commit 2a246d5

Please sign in to comment.