Skip to content

Commit

Permalink
Implemented advanced log in CRFSlotFiller inference
Browse files Browse the repository at this point in the history
  • Loading branch information
ClemDoum committed Mar 29, 2019
1 parent 760c278 commit cc63b50
Show file tree
Hide file tree
Showing 2 changed files with 282 additions and 10 deletions.
99 changes: 92 additions & 7 deletions snips_nlu/slot_filler/crf_slot_filler.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,13 @@ def get_slots(self, text):
if not tokens:
return []
features = self.compute_features(tokens)
tags = [_decode_tag(tag) for tag in
self.crf_model.predict_single(features)]
return tags_to_slots(text, tokens, tags, self.config.tagging_scheme,
tags = self.crf_model.predict_single(features)
logger.debug(DifferedLoggingMessage(
self.log_inference_weights, text, tokens=tokens, features=features,
tags=tags))
decoded_tags = [_decode_tag(t) for t in tags]
return tags_to_slots(text, tokens, decoded_tags,
self.config.tagging_scheme,
self.slot_name_mapping)

def compute_features(self, tokens, drop_out=False):
Expand Down Expand Up @@ -251,23 +255,100 @@ def log_weights(self):
log = ""
transition_features = self.crf_model.transition_features_
transition_features = sorted(
iteritems(transition_features),
key=lambda transition_weight: math.fabs(transition_weight[1]),
iteritems(transition_features), key=_weight_absolute_value,
reverse=True)
log += "\nTransition weights: \n\n"
for (state_1, state_2), weight in transition_features:
log += "\n%s %s: %s" % (
_decode_tag(state_1), _decode_tag(state_2), weight)
feature_weights = self.crf_model.state_features_
feature_weights = sorted(
iteritems(feature_weights),
key=lambda feature_weight: math.fabs(feature_weight[1]),
iteritems(feature_weights), key=_weight_absolute_value,
reverse=True)
log += "\n\nFeature weights: \n\n"
for (feat, tag), weight in feature_weights:
log += "\n%s %s: %s" % (feat, _decode_tag(tag), weight)
return log

def log_inference_weights(self, text, tokens, features, tags):
    """Builds a readable report of the CRF weights involved in tagging *text*

    For each token, the report contains:

    - the transition weights going out of the tag of the previous token
      (skipped for the first token)
    - the transition weights coming into the tag of the next token
      (skipped for the last token)
    - the state weights attached to each feature of the token
    - the features of the token which are absent from the state features
      of the model

    Every list of weights is sorted by decreasing absolute value.

    Args:
        text: the input string, only used in the header of the report
        tokens: the tokens of *text*, each one exposing a ``value``
            attribute
        features: one mapping per token, from feature name to feature
            value; names and values are joined with ":" to build the keys
            looked up in the state features of the model
        tags: one encoded tag per token, in the format accepted by
            ``_decode_tag``

    Returns:
        str: the formatted report
    """
    model_features = set(
        f for (f, _), _ in iteritems(self.crf_model.state_features_))
    log = "Feature weights for \"%s\":\n\n" % text
    max_index = len(tokens) - 1
    tokens_logs = []
    for i, (token, feats, tag) in enumerate(zip(tokens, features, tags)):
        token_log = "# Token \"%s\" (tagged as %s):" \
                    % (token.value, _decode_tag(tag))
        if i != 0:
            # Transitions leaving the tag assigned to the previous token
            weights = sorted(self._get_outgoing_weights(tags[i - 1]),
                             key=_weight_absolute_value, reverse=True)
            if weights:
                token_log += "\n\nTransition weights from previous tag:"
                weight_lines = (
                    "- (%s, %s) -> %s"
                    % (_decode_tag(a), _decode_tag(b), w)
                    for (a, b), w in weights
                )
                token_log += "\n" + "\n".join(weight_lines)
            else:
                token_log += \
                    "\n\nNo transition from previous tag seen at" \
                    " train time !"

        if i != max_index:
            # Transitions arriving on the tag assigned to the next token
            weights = sorted(self._get_incoming_weights(tags[i + 1]),
                             key=_weight_absolute_value, reverse=True)
            if weights:
                token_log += "\n\nTransition weights to next tag:"
                weight_lines = (
                    "- (%s, %s) -> %s"
                    % (_decode_tag(a), _decode_tag(b), w)
                    for (a, b), w in weights
                )
                token_log += "\n" + "\n".join(weight_lines)
            else:
                token_log += \
                    "\n\nNo transition to next tag seen at train time !"

        # State features are looked up through "<name>:<value>" keys, so
        # the (name, value) items of the mapping must be joined first.
        # Joining the names alone and then calling iteritems on the
        # resulting list fails, since a list has no items.
        feature_keys = [":".join(f) for f in iteritems(feats)]
        weights = (w for f in feature_keys
                   for w in self._get_feature_weight(f))
        weights = sorted(weights, key=_weight_absolute_value, reverse=True)
        if weights:
            token_log += "\n\nFeature weights:\n"
            token_log += "\n".join(
                "- (%s, %s) -> %s"
                % (f, _decode_tag(t), w) for (f, t), w in weights
            )
        else:
            token_log += "\n\nNo feature weights !"

        unseen_features = sorted(
            set(f for f in feature_keys if f not in model_features))
        if unseen_features:
            token_log += "\n\nFeatures not seen at train time:\n%s" % \
                         "\n".join("- %s" % f for f in unseen_features)
        tokens_logs.append(token_log)

    log += "\n\n\n".join(tokens_logs)
    return log

@fitted_required
def _get_incoming_weights(self, tag):
    """Returns the transition weights whose destination tag is *tag*

    Each returned item is a ``((source, destination), weight)`` tuple taken
    from the transition features of the CRF model.
    """
    transitions = self.crf_model.transition_features_
    incoming = []
    for (source, destination), weight in iteritems(transitions):
        if destination == tag:
            incoming.append(((source, destination), weight))
    return incoming

@fitted_required
def _get_outgoing_weights(self, tag):
    """Returns the transition weights whose source tag is *tag*

    Each returned item is a ``((source, destination), weight)`` tuple taken
    from the transition features of the CRF model.
    """
    transitions = self.crf_model.transition_features_
    outgoing = []
    for (source, destination), weight in iteritems(transitions):
        if source == tag:
            outgoing.append(((source, destination), weight))
    return outgoing

@fitted_required
def _get_feature_weight(self, feature):
    """Returns the state weights attached to *feature*, one per tag

    Each returned item is a ``((feature, tag), weight)`` tuple taken from
    the state features of the CRF model.
    """
    state_features = self.crf_model.state_features_
    matching = []
    for (name, tag), weight in iteritems(state_features):
        if name == feature:
            matching.append(((name, tag), weight))
    return matching

@check_persisted_path
def persist(self, path):
"""Persists the object at the given path"""
Expand Down Expand Up @@ -375,3 +456,7 @@ def _ensure_safe(X, Y):
safe_X.append([""]) # empty feature
safe_Y.append([OUTSIDE]) # outside label
return safe_X, safe_Y


def _weight_absolute_value(x):
return math.fabs(x[1])
193 changes: 190 additions & 3 deletions snips_nlu/tests/test_crf_slot_filler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from builtins import range
from pathlib import Path

from mock import MagicMock
from mock import MagicMock, PropertyMock
from sklearn_crfsuite import CRF

from snips_nlu.constants import (
Expand All @@ -14,9 +14,10 @@
from snips_nlu.entity_parser import CustomEntityParserUsage
from snips_nlu.exceptions import NotTrained
from snips_nlu.pipeline.configs import CRFSlotFillerConfig
from snips_nlu.preprocessing import tokenize
from snips_nlu.preprocessing import tokenize, Token
from snips_nlu.result import unresolved_slot
from snips_nlu.slot_filler.crf_slot_filler import CRFSlotFiller, _ensure_safe
from snips_nlu.slot_filler.crf_slot_filler import (
CRFSlotFiller, _ensure_safe, _encode_tag)
from snips_nlu.slot_filler.crf_utils import TaggingScheme
from snips_nlu.slot_filler.feature_factory import (
IsDigitFactory, NgramFactory, ShapeNgramFactory)
Expand Down Expand Up @@ -804,3 +805,189 @@ def test_ensure_safe(self):
x, y = _ensure_safe(x, y)
model = CRF().fit(x, y)
model.predict_single([""])

def test_log_inference_weights(self):
# Checks the report built by CRFSlotFiller.log_inference_weights against a
# hand-written expected string, using mocked CRF weights instead of a
# trained model.
# Given
self.maxDiff = None # pylint: disable=invalid-name
text = "this is a slot in a text"
# log_inference_weights only reads the ``value`` of each token; the two 0
# arguments are placeholders (presumably start/end offsets - TODO confirm)
tokens = [
Token("this", 0, 0),
Token("is", 0, 0),
Token("a", 0, 0),
Token("slot", 0, 0),
Token("in", 0, 0),
Token("a", 0, 0),
Token("text", 0, 0),
]
# One feature mapping (feature name -> feature value) per token
features = [
{
"ngram_1": "this",
"is_first": "1",
},
{
"ngram_1": "is",
"common": "1",
},
{
"ngram_1": "a"
},
{
"ngram_1": "slot",
},
{
"ngram_1": "in",
},
{
"ngram_1": "a",
},
{
"ngram_1": "text",
},
]
tags = ["O", "O", "B-slot", "I-slot", "O", "O", "O"]
# The report decodes tags with _decode_tag, so the inputs are encoded
# first (presumably _encode_tag is its inverse - TODO confirm)
tags = [_encode_tag(t) for t in tags]

# Stand-in for crf_model.transition_features_:
# (source tag, destination tag) -> weight
transitions_weights = {
(_encode_tag("O"), _encode_tag("O")): 2,
(_encode_tag("O"), _encode_tag("B-slot")): 1,
(_encode_tag("B-slot"), _encode_tag("I-slot")): 2,
(_encode_tag("B-slot"), _encode_tag("O")): 1.5,
}

# Stand-in for crf_model.state_features_: (feature, tag) -> weight
states_weights = {
("ngram_1:this", _encode_tag("O")): 5,
("ngram_1:this", _encode_tag("B-slot")): -2,
("ngram_1:slot", _encode_tag("B-slot")): 5,
("ngram_1:slot", _encode_tag("I-slot")): -3,
("ngram_1:slot", _encode_tag("O")): -1
}

# Subclass which skips the parent constructor and exposes the weights
# above through a mocked crf_model
# pylint: disable=super-init-not-called
class MockedSlotFiller(CRFSlotFiller):
def __init__(self, transition_features, state_features):
mocked_model = MagicMock()
# PropertyMock instances are attached to the type of the mock so that
# attribute access on the mock returns the configured values
type(mocked_model).transition_features_ = PropertyMock(
return_value=transition_features)
type(mocked_model).state_features_ = PropertyMock(
return_value=state_features)
self.crf_model = mocked_model
self.slot_name_mapping = 1

# NOTE(review): presumably neutralizes a parent __del__ relying on
# attributes this mock never sets - confirm
def __del__(self):
pass

slot_filler = MockedSlotFiller(transitions_weights, states_weights)

# When
log = slot_filler.log_inference_weights(
text=text, tokens=tokens, features=features, tags=tags)

# Then
# NOTE(review): log_inference_weights separates its sections with blank
# lines, none of which appear in the literal below as displayed here -
# confirm against the original file before relying on this expectation
expected_log = """Feature weights for "this is a slot in a text":
# Token "this" (tagged as O):
Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5
Feature weights:
- (ngram_1:this, O) -> 5
- (ngram_1:this, B-slot) -> -2
Features not seen at train time:
- is_first
- ngram_1
# Token "is" (tagged as O):
Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1
Transition weights to next tag:
- (O, B-slot) -> 1
No feature weights !
Features not seen at train time:
- common
- ngram_1
# Token "a" (tagged as B-slot):
Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1
Transition weights to next tag:
- (B-slot, I-slot) -> 2
No feature weights !
Features not seen at train time:
- ngram_1
# Token "slot" (tagged as I-slot):
Transition weights from previous tag:
- (B-slot, I-slot) -> 2
- (B-slot, O) -> 1.5
Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5
Feature weights:
- (ngram_1:slot, B-slot) -> 5
- (ngram_1:slot, I-slot) -> -3
- (ngram_1:slot, O) -> -1
Features not seen at train time:
- ngram_1
# Token "in" (tagged as O):
No transition from previous tag seen at train time !
Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5
No feature weights !
Features not seen at train time:
- ngram_1
# Token "a" (tagged as O):
Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1
Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5
No feature weights !
Features not seen at train time:
- ngram_1
# Token "text" (tagged as O):
Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1
No feature weights !
Features not seen at train time:
- ngram_1"""
self.assertEqual(expected_log, log)

0 comments on commit cc63b50

Please sign in to comment.