Implemented advanced log in CRFSlotFiller inference

snipsco · Mar 29, 2019 · cc63b50 · cc63b50
1 parent 760c278
commit cc63b50
Show file tree

Hide file tree

Showing 2 changed files with 282 additions and 10 deletions.
diff --git a/snips_nlu/slot_filler/crf_slot_filler.py b/snips_nlu/slot_filler/crf_slot_filler.py
@@ -182,9 +182,13 @@ def get_slots(self, text):
         if not tokens:
             return []
         features = self.compute_features(tokens)
-        tags = [_decode_tag(tag) for tag in
-                self.crf_model.predict_single(features)]
-        return tags_to_slots(text, tokens, tags, self.config.tagging_scheme,
+        tags = self.crf_model.predict_single(features)
+        logger.debug(DifferedLoggingMessage(
+            self.log_inference_weights, text, tokens=tokens, features=features,
+            tags=tags))
+        decoded_tags = [_decode_tag(t) for t in tags]
+        return tags_to_slots(text, tokens, decoded_tags,
+                             self.config.tagging_scheme,
                              self.slot_name_mapping)
 
     def compute_features(self, tokens, drop_out=False):
@@ -251,23 +255,100 @@ def log_weights(self):
         log = ""
         transition_features = self.crf_model.transition_features_
         transition_features = sorted(
-            iteritems(transition_features),
-            key=lambda transition_weight: math.fabs(transition_weight[1]),
+            iteritems(transition_features), key=_weight_absolute_value,
             reverse=True)
         log += "\nTransition weights: \n\n"
         for (state_1, state_2), weight in transition_features:
             log += "\n%s %s: %s" % (
                 _decode_tag(state_1), _decode_tag(state_2), weight)
         feature_weights = self.crf_model.state_features_
         feature_weights = sorted(
-            iteritems(feature_weights),
-            key=lambda feature_weight: math.fabs(feature_weight[1]),
+            iteritems(feature_weights), key=_weight_absolute_value,
             reverse=True)
         log += "\n\nFeature weights: \n\n"
         for (feat, tag), weight in feature_weights:
             log += "\n%s %s: %s" % (feat, _decode_tag(tag), weight)
         return log
 
+    def log_inference_weights(self, text, tokens, features, tags):
+        """Builds a readable report of the CRF weights involved in a tagging
+
+        For each token, the report contains the transition weights going
+        out of the previous tag, the transition weights coming into the
+        next tag, the weights of the model's state features matching the
+        token's features, and the feature names which are not found among
+        the model's state features.
+
+        Args:
+            text (str): input text, only used in the report header
+            tokens (list): tokens of the text, each exposing a ``value``
+                attribute
+            features (list of dict): for each token, a mapping from
+                feature name to feature value (both strings)
+            tags (list): for each token, the tag as encoded by the CRF
+                model
+
+        Returns:
+            str: the formatted report
+        """
+        model_features = set(f for f, _ in self.crf_model.state_features_)
+        log = "Feature weights for \"%s\":\n\n" % text
+        max_index = len(tokens) - 1
+        tokens_logs = []
+        for i, (token, feats, tag) in enumerate(zip(tokens, features, tags)):
+            token_log = "# Token \"%s\" (tagged as %s):" \
+                        % (token.value, _decode_tag(tag))
+            if i != 0:
+                weights = sorted(self._get_outgoing_weights(tags[i - 1]),
+                                 key=_weight_absolute_value, reverse=True)
+                if weights:
+                    token_log += "\n\nTransition weights from previous tag:"
+                    weight_lines = (
+                        "- (%s, %s) -> %s"
+                        % (_decode_tag(a), _decode_tag(b), w)
+                        for (a, b), w in weights
+                    )
+                    token_log += "\n" + "\n".join(weight_lines)
+                else:
+                    token_log += \
+                        "\n\nNo transition from previous tag seen at" \
+                        " train time !"
+
+            if i != max_index:
+                weights = sorted(self._get_incoming_weights(tags[i + 1]),
+                                 key=_weight_absolute_value, reverse=True)
+                if weights:
+                    token_log += "\n\nTransition weights to next tag:"
+                    weight_lines = (
+                        "- (%s, %s) -> %s"
+                        % (_decode_tag(a), _decode_tag(b), w)
+                        for (a, b), w in weights
+                    )
+                    token_log += "\n" + "\n".join(weight_lines)
+                else:
+                    token_log += \
+                        "\n\nNo transition to next tag seen at train time !"
+            # _get_feature_weight matches on a single string, so every
+            # (name, value) pair is joined as "name:value" before the
+            # lookup. The joined keys go into their own list: rebinding
+            # ``feats`` to a list made the ``iteritems`` call below it fail.
+            feature_keys = [":".join(f) for f in iteritems(feats)]
+            weights = (w for f in feature_keys
+                       for w in self._get_feature_weight(f))
+            weights = sorted(weights, key=_weight_absolute_value, reverse=True)
+            if weights:
+                token_log += "\n\nFeature weights:\n"
+                token_log += "\n".join(
+                    "- (%s, %s) -> %s"
+                    % (f, _decode_tag(t), w) for (f, t), w in weights
+                )
+            else:
+                token_log += "\n\nNo feature weights !"
+
+            # NOTE(review): ``feats`` is still the name -> value mapping
+            # here, so bare feature names are compared with the model's
+            # state-feature keys; this is what test_log_inference_weights
+            # expects, but confirm the joined form was not intended.
+            unseen_features = sorted(
+                set(f for f in feats if f not in model_features))
+            if unseen_features:
+                token_log += "\n\nFeatures not seen at train time:\n%s" % \
+                             "\n".join("- %s" % f for f in unseen_features)
+            tokens_logs.append(token_log)
+
+        log += "\n\n\n".join(tokens_logs)
+        return log
+
+    @fitted_required
+    def _get_incoming_weights(self, tag):
+        """Returns the transition weights of the model which end in *tag*
+
+        Each item is a ``((source_tag, target_tag), weight)`` pair taken
+        from the model's transition features.
+        """
+        incoming = []
+        for (source, target), weight in iteritems(
+                self.crf_model.transition_features_):
+            if target == tag:
+                incoming.append(((source, target), weight))
+        return incoming
+
+    @fitted_required
+    def _get_outgoing_weights(self, tag):
+        """Returns the transition weights of the model which start from *tag*
+
+        Each item is a ``((source_tag, target_tag), weight)`` pair taken
+        from the model's transition features.
+        """
+        outgoing = []
+        for (source, target), weight in iteritems(
+                self.crf_model.transition_features_):
+            if source == tag:
+                outgoing.append(((source, target), weight))
+        return outgoing
+
+    @fitted_required
+    def _get_feature_weight(self, feature):
+        """Returns the state-feature weights of the model for *feature*
+
+        Each item is a ``((feature, tag), weight)`` pair taken from the
+        model's state features; there is one item per tag the feature is
+        associated with.
+        """
+        matching = []
+        for (name, tag), weight in iteritems(self.crf_model.state_features_):
+            if name == feature:
+                matching.append(((name, tag), weight))
+        return matching
+
     @check_persisted_path
     def persist(self, path):
         """Persists the object at the given path"""
@@ -375,3 +456,7 @@ def _ensure_safe(X, Y):
         safe_X.append([""])  # empty feature
         safe_Y.append([OUTSIDE])  # outside label
     return safe_X, safe_Y
+
+
+def _weight_absolute_value(x):
+    """Sort key giving the absolute value of the weight of a (key, weight)
+    pair"""
+    weight = x[1]
+    return math.fabs(weight)
diff --git a/snips_nlu/tests/test_crf_slot_filler.py b/snips_nlu/tests/test_crf_slot_filler.py
@@ -5,7 +5,7 @@
 from builtins import range
 from pathlib import Path
 
-from mock import MagicMock
+from mock import MagicMock, PropertyMock
 from sklearn_crfsuite import CRF
 
 from snips_nlu.constants import (
@@ -14,9 +14,10 @@
 from snips_nlu.entity_parser import CustomEntityParserUsage
 from snips_nlu.exceptions import NotTrained
 from snips_nlu.pipeline.configs import CRFSlotFillerConfig
-from snips_nlu.preprocessing import tokenize
+from snips_nlu.preprocessing import tokenize, Token
 from snips_nlu.result import unresolved_slot
-from snips_nlu.slot_filler.crf_slot_filler import CRFSlotFiller, _ensure_safe
+from snips_nlu.slot_filler.crf_slot_filler import (
+    CRFSlotFiller, _ensure_safe, _encode_tag)
 from snips_nlu.slot_filler.crf_utils import TaggingScheme
 from snips_nlu.slot_filler.feature_factory import (
     IsDigitFactory, NgramFactory, ShapeNgramFactory)
@@ -804,3 +805,189 @@ def test_ensure_safe(self):
             x, y = _ensure_safe(x, y)
             model = CRF().fit(x, y)
             model.predict_single([""])
+
+    def test_log_inference_weights(self):
+        # Checks the exact report produced by
+        # CRFSlotFiller.log_inference_weights on a hand-built tagging, using
+        # a mocked CRF model exposing fixed transition and state weights.
+
+        # Given
+        self.maxDiff = None  # pylint: disable=invalid-name
+        text = "this is a slot in a text"
+        # Only the token values appear in the expected log; the two trailing
+        # Token arguments are all set to 0 (presumably offsets - TODO confirm)
+        tokens = [
+            Token("this", 0, 0),
+            Token("is", 0, 0),
+            Token("a", 0, 0),
+            Token("slot", 0, 0),
+            Token("in", 0, 0),
+            Token("a", 0, 0),
+            Token("text", 0, 0),
+        ]
+        # One feature name -> feature value mapping per token
+        features = [
+            {
+                "ngram_1": "this",
+                "is_first": "1",
+            },
+            {
+                "ngram_1": "is",
+                "common": "1",
+            },
+            {
+                "ngram_1": "a"
+            },
+            {
+                "ngram_1": "slot",
+            },
+            {
+                "ngram_1": "in",
+            },
+            {
+                "ngram_1": "a",
+            },
+            {
+                "ngram_1": "text",
+            },
+        ]
+        # One tag per token, encoded the same way as the keys of the mocked
+        # weights below
+        tags = ["O", "O", "B-slot", "I-slot", "O", "O", "O"]
+        tags = [_encode_tag(t) for t in tags]
+
+        # Transition weights keyed by (source tag, target tag); no entry
+        # starts from "I-slot", which triggers the "No transition from
+        # previous tag" message for the token following the slot
+        transitions_weights = {
+            (_encode_tag("O"), _encode_tag("O")): 2,
+            (_encode_tag("O"), _encode_tag("B-slot")): 1,
+            (_encode_tag("B-slot"), _encode_tag("I-slot")): 2,
+            (_encode_tag("B-slot"), _encode_tag("O")): 1.5,
+        }
+
+        # State weights keyed by ("name:value" feature, tag)
+        states_weights = {
+            ("ngram_1:this", _encode_tag("O")): 5,
+            ("ngram_1:this", _encode_tag("B-slot")): -2,
+            ("ngram_1:slot", _encode_tag("B-slot")): 5,
+            ("ngram_1:slot", _encode_tag("I-slot")): -3,
+            ("ngram_1:slot", _encode_tag("O")): -1
+        }
+
+        # Minimal slot filler which skips CRFSlotFiller.__init__ and only
+        # sets the attributes assigned below, with a mocked CRF model
+        # returning the weights defined above
+        # pylint: disable=super-init-not-called
+        class MockedSlotFiller(CRFSlotFiller):
+            def __init__(self, transition_features, state_features):
+                mocked_model = MagicMock()
+                type(mocked_model).transition_features_ = PropertyMock(
+                    return_value=transition_features)
+                type(mocked_model).state_features_ = PropertyMock(
+                    return_value=state_features)
+                self.crf_model = mocked_model
+                self.slot_name_mapping = 1
+
+            # No-op destructor; presumably shields the test from the parent
+            # class cleanup, which is not visible here - TODO confirm
+            def __del__(self):
+                pass
+
+        slot_filler = MockedSlotFiller(transitions_weights, states_weights)
+
+        # When
+        log = slot_filler.log_inference_weights(
+            text=text, tokens=tokens, features=features, tags=tags)
+
+        # Then
+        expected_log = """Feature weights for "this is a slot in a text":
+
+# Token "this" (tagged as O):
+
+Transition weights to next tag:
+- (O, O) -> 2
+- (B-slot, O) -> 1.5
+
+Feature weights:
+- (ngram_1:this, O) -> 5
+- (ngram_1:this, B-slot) -> -2
+
+Features not seen at train time:
+- is_first
+- ngram_1
+
+
+# Token "is" (tagged as O):
+
+Transition weights from previous tag:
+- (O, O) -> 2
+- (O, B-slot) -> 1
+
+Transition weights to next tag:
+- (O, B-slot) -> 1
+
+No feature weights !
+
+Features not seen at train time:
+- common
+- ngram_1
+
+
+# Token "a" (tagged as B-slot):
+
+Transition weights from previous tag:
+- (O, O) -> 2
+- (O, B-slot) -> 1
+
+Transition weights to next tag:
+- (B-slot, I-slot) -> 2
+
+No feature weights !
+
+Features not seen at train time:
+- ngram_1
+
+
+# Token "slot" (tagged as I-slot):
+
+Transition weights from previous tag:
+- (B-slot, I-slot) -> 2
+- (B-slot, O) -> 1.5
+
+Transition weights to next tag:
+- (O, O) -> 2
+- (B-slot, O) -> 1.5
+
+Feature weights:
+- (ngram_1:slot, B-slot) -> 5
+- (ngram_1:slot, I-slot) -> -3
+- (ngram_1:slot, O) -> -1
+
+Features not seen at train time:
+- ngram_1
+
+
+# Token "in" (tagged as O):
+
+No transition from previous tag seen at train time !
+
+Transition weights to next tag:
+- (O, O) -> 2
+- (B-slot, O) -> 1.5
+
+No feature weights !
+
+Features not seen at train time:
+- ngram_1
+
+
+# Token "a" (tagged as O):
+
+Transition weights from previous tag:
+- (O, O) -> 2
+- (O, B-slot) -> 1
+
+Transition weights to next tag:
+- (O, O) -> 2
+- (B-slot, O) -> 1.5
+
+No feature weights !
+
+Features not seen at train time:
+- ngram_1
+
+
+# Token "text" (tagged as O):
+
+Transition weights from previous tag:
+- (O, O) -> 2
+- (O, B-slot) -> 1
+
+No feature weights !
+
+Features not seen at train time:
+- ngram_1"""
+        self.assertEqual(expected_log, log)