diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 4a6174bc3d8..2ef639cad52 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -4,8 +4,8 @@
import numpy as np
import srsly
 from thinc.api import Config, Model, NumpyOps, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints2d
+from thinc.types import ArrayXd, Floats2d, Ints1d
from .. import util
from ..errors import Errors
@@ -22,6 +22,9 @@
TOP_K_GUARDRAIL = 20
+ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+
+
default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"
@@ -50,6 +53,7 @@
"overwrite": False,
"top_k": 1,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
+ "save_activations": False,
},
default_score_weights={"lemma_acc": 1.0},
)
@@ -62,6 +66,7 @@ def make_edit_tree_lemmatizer(
overwrite: bool,
top_k: int,
scorer: Optional[Callable],
+ save_activations: bool,
):
"""Construct an EditTreeLemmatizer component."""
return EditTreeLemmatizer(
@@ -73,6 +78,7 @@ def make_edit_tree_lemmatizer(
overwrite=overwrite,
top_k=top_k,
scorer=scorer,
+ save_activations=save_activations,
)
@@ -92,6 +98,7 @@ def __init__(
overwrite: bool = False,
top_k: int = 1,
scorer: Optional[Callable] = lemmatizer_score,
+ save_activations: bool = False,
):
"""
Construct an edit tree lemmatizer.
@@ -103,6 +110,7 @@ def __init__(
frequency in the training data.
overwrite (bool): overwrite existing lemma annotations.
top_k (int): try to apply at most the k most probable edit trees.
+ save_activations (bool): save model activations in Doc when annotating.
"""
self.vocab = vocab
self.model = model
@@ -117,7 +125,7 @@ def __init__(
self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer
         self.numpy_ops = NumpyOps()
+        self.save_activations = save_activations
def get_loss(
self, examples: Iterable[Example], scores: List[Floats2d]
@@ -146,31 +154,24 @@ def get_loss(
return float(loss), d_scores
-    def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         if self.top_k == 1:
             scores2guesses = self._scores2guesses_top_k_equals_1
         elif self.top_k <= TOP_K_GUARDRAIL:
             scores2guesses = self._scores2guesses_top_k_greater_1
         else:
             scores2guesses = self._scores2guesses_top_k_guardrail
         # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
         # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
         # for its principal purpose of lemmatizing tokens. However, the code could also
         # be used for other purposes, and with very large values of *top_k* the method
         # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
         # instead.
n_docs = len(list(docs))
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
n_labels = len(self.cfg["labels"])
- guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
+ guesses: List[Ints1d] = [
+ self.model.ops.alloc((0,), dtype="i") for doc in docs
+ ]
+        scores: List[Floats2d] = [
+            self.model.ops.alloc((0, n_labels), dtype="f") for doc in docs
+        ]
assert len(guesses) == n_docs
- return guesses
+ return {"probabilities": scores, "tree_ids": guesses}
scores = self.model.predict(docs)
assert len(scores) == n_docs
guesses = scores2guesses(docs, scores)
assert len(guesses) == n_docs
- return guesses
+ return {"probabilities": scores, "tree_ids": guesses}
def _scores2guesses_top_k_equals_1(self, docs, scores):
guesses = []
@@ -230,8 +231,13 @@ def _scores2guesses_top_k_guardrail(self, docs, scores):
return guesses
- def set_annotations(self, docs: Iterable[Doc], batch_tree_ids):
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
+ batch_tree_ids = activations["tree_ids"]
for i, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ for act_name, acts in activations.items():
+ doc.activations[self.name][act_name] = acts[i]
doc_tree_ids = batch_tree_ids[i]
if hasattr(doc_tree_ids, "get"):
doc_tree_ids = doc_tree_ids.get()
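
With the lemmatizer changes above, stored activations can be read straight off the `Doc`. A minimal sketch, assuming a trained pipeline that contains a "trainable_lemmatizer" component (the text and variable names are illustrative, not part of this diff):

    # Toggle activation storage on the pipe, then re-run the pipeline.
    lemmatizer = nlp.get_pipe("trainable_lemmatizer")
    lemmatizer.save_activations = True
    doc = nlp("She was reading.")
    acts = doc.activations["trainable_lemmatizer"]
    probs = acts["probabilities"]  # (n_tokens, n_trees) scores from the model
    tree_ids = acts["tree_ids"]    # (n_tokens,) the edit tree chosen per token
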
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index a730ece1bfa..bab79282d5b 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,3 +1,10 @@
+from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
+from typing import cast
+from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
import random
from itertools import islice
from pathlib import Path
@@ -21,6 +28,11 @@
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe
+
+ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
+
+KNOWLEDGE_BASE_IDS = "kb_ids"
+
# See #9050
BACKWARD_OVERWRITE = True
@@ -60,6 +72,7 @@
"use_gold_ents": True,
"candidates_batch_size": 1,
"threshold": None,
+ "save_activations": False,
},
default_score_weights={
"nel_micro_f": 1.0,
@@ -87,6 +100,7 @@ def make_entity_linker(
use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None,
+ save_activations: bool,
):
"""Construct an EntityLinker component.
@@ -110,6 +124,7 @@ def make_entity_linker(
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
+ save_activations (bool): save model activations in Doc when annotating.
"""
if not model.attrs.get("include_span_maker", False):
@@ -144,6 +159,7 @@ def make_entity_linker(
use_gold_ents=use_gold_ents,
candidates_batch_size=candidates_batch_size,
threshold=threshold,
+ save_activations=save_activations,
)
@@ -185,6 +201,7 @@ def __init__(
use_gold_ents: bool,
candidates_batch_size: int,
threshold: Optional[float] = None,
+ save_activations: bool = False,
) -> None:
"""Initialize an entity linker.
@@ -239,6 +256,7 @@ def __init__(
self.use_gold_ents = use_gold_ents
self.candidates_batch_size = candidates_batch_size
self.threshold = threshold
+ self.save_activations = save_activations
if candidates_batch_size < 1:
raise ValueError(Errors.E1044)
@@ -427,7 +445,7 @@ def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
loss = loss / len(entity_encodings)
return float(loss), out
- def predict(self, docs: Iterable[Doc]) -> List[str]:
+ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is
no prediction.
@@ -440,129 +458,138 @@ def predict(self, docs: Iterable[Doc]) -> List[str]:
self.validate_kb()
entity_count = 0
final_kb_ids: List[str] = []
- xp = self.model.ops.xp
+ ops = self.model.ops
+ xp = ops.xp
+ docs_ents: List[Ragged] = []
+ docs_scores: List[Ragged] = []
if not docs:
- return final_kb_ids
+            return {
+                KNOWLEDGE_BASE_IDS: final_kb_ids,
+                "ents": docs_ents,
+                "scores": docs_scores,
+            }
if isinstance(docs, Doc):
docs = [docs]
- for i, doc in enumerate(docs):
+ for doc in docs:
+ doc_ents: List[Ints1d] = []
+ doc_scores: List[Floats1d] = []
if len(doc) == 0:
+ docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0)))
+ docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0)))
continue
sentences = [s for s in doc.sents]
- # Loop over entities in batches.
- for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
- ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
-
- # Look up candidate entities.
- valid_ent_idx = [
- idx
- for idx in range(len(ent_batch))
- if ent_batch[idx].label_ not in self.labels_discard
- ]
-
- batch_candidates = list(
- self.get_candidates_batch(
- self.kb, [ent_batch[idx] for idx in valid_ent_idx]
- )
- if self.candidates_batch_size > 1
- else [
- self.get_candidates(self.kb, ent_batch[idx])
- for idx in valid_ent_idx
- ]
- )
-
- # Looping through each entity in batch (TODO: rewrite)
- for j, ent in enumerate(ent_batch):
- assert hasattr(ent, "sents")
- sents = list(ent.sents)
- sent_indices = (
- sentences.index(sents[0]),
- sentences.index(sents[-1]),
+            # Looping through each entity (TODO: rewrite)
+            for ent in doc.ents:
+                sent = ent.sent
+                sent_index = sentences.index(sent)
+                assert sent_index >= 0
+
+                if self.incl_context:
+ # get n_neighbour sentences, clipped to the length of the document
+ start_sentence = max(0, sent_index - self.n_sents)
+ end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
+ start_token = sentences[start_sentence].start
+ end_token = sentences[end_sentence].end
+ sent_doc = doc[start_token:end_token].as_doc()
+ # currently, the context is the same for each entity in a sentence (should be refined)
+ sentence_encoding = self.model.predict([sent_doc])[0]
+ sentence_encoding_t = sentence_encoding.T
+ sentence_norm = xp.linalg.norm(sentence_encoding_t)
+ entity_count += 1
+ if ent.label_ in self.labels_discard:
+ # ignoring this entity - setting to NIL
+ final_kb_ids.append(self.NIL)
+ self._add_activations(
+ doc_scores=doc_scores,
+ doc_ents=doc_ents,
+ scores=[0.0],
+ ents=[0],
)
- assert sent_indices[1] >= sent_indices[0] >= 0
-
- if self.incl_context:
- # get n_neighbour sentences, clipped to the length of the document
- start_sentence = max(0, sent_indices[0] - self.n_sents)
- end_sentence = min(
- len(sentences) - 1, sent_indices[1] + self.n_sents
- )
- start_token = sentences[start_sentence].start
- end_token = sentences[end_sentence].end
- sent_doc = doc[start_token:end_token].as_doc()
-
- # currently, the context is the same for each entity in a sentence (should be refined)
- sentence_encoding = self.model.predict([sent_doc])[0]
- sentence_encoding_t = sentence_encoding.T
- sentence_norm = xp.linalg.norm(sentence_encoding_t)
- entity_count += 1
- if ent.label_ in self.labels_discard:
- # ignoring this entity - setting to NIL
+ else:
+ candidates = list(self.get_candidates(self.kb, ent))
+ if not candidates:
+ # no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
+ self._add_activations(
+ doc_scores=doc_scores,
+ doc_ents=doc_ents,
+ scores=[0.0],
+ ents=[0],
+ )
+ elif len(candidates) == 1 and self.threshold is None:
+ # shortcut for efficiency reasons: take the 1 candidate
+ final_kb_ids.append(candidates[0].entity_)
+ self._add_activations(
+ doc_scores=doc_scores,
+ doc_ents=doc_ents,
+ scores=[1.0],
+                        ents=[candidates[0].entity],
+ )
else:
- candidates = list(batch_candidates[j])
- if not candidates:
- # no prediction possible for this entity - setting to NIL
- final_kb_ids.append(self.NIL)
- elif len(candidates) == 1 and self.threshold is None:
- # shortcut for efficiency reasons: take the 1 candidate
- final_kb_ids.append(candidates[0].entity_)
- else:
- random.shuffle(candidates)
- # set all prior probabilities to 0 if incl_prior=False
- prior_probs = xp.asarray([c.prior_prob for c in candidates])
- if not self.incl_prior:
- prior_probs = xp.asarray([0.0 for _ in candidates])
- scores = prior_probs
- # add in similarity from the context
- if self.incl_context:
- entity_encodings = xp.asarray(
- [c.entity_vector for c in candidates]
- )
- entity_norm = xp.linalg.norm(entity_encodings, axis=1)
- if len(entity_encodings) != len(prior_probs):
- raise RuntimeError(
- Errors.E147.format(
- method="predict",
- msg="vectors not of equal length",
- )
+ random.shuffle(candidates)
+ # set all prior probabilities to 0 if incl_prior=False
+ prior_probs = xp.asarray([c.prior_prob for c in candidates])
+ if not self.incl_prior:
+ prior_probs = xp.asarray([0.0 for _ in candidates])
+ scores = prior_probs
+ # add in similarity from the context
+ if self.incl_context:
+ entity_encodings = xp.asarray(
+ [c.entity_vector for c in candidates]
+ )
+ entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+ if len(entity_encodings) != len(prior_probs):
+ raise RuntimeError(
+ Errors.E147.format(
+ method="predict",
+ msg="vectors not of equal length",
)
- # cosine similarity
- sims = xp.dot(entity_encodings, sentence_encoding_t) / (
- sentence_norm * entity_norm
)
- if sims.shape != prior_probs.shape:
- raise ValueError(Errors.E161)
- scores = prior_probs + sims - (prior_probs * sims)
- final_kb_ids.append(
- candidates[scores.argmax().item()].entity_
- if self.threshold is None
- or scores.max() >= self.threshold
- else EntityLinker.NIL
+ # cosine similarity
+ sims = xp.dot(entity_encodings, sentence_encoding_t) / (
+ sentence_norm * entity_norm
)
-
+ if sims.shape != prior_probs.shape:
+ raise ValueError(Errors.E161)
+ scores = prior_probs + sims - (prior_probs * sims)
+ final_kb_ids.append(
+ candidates[scores.argmax().item()].entity_
+ if self.threshold is None or scores.max() >= self.threshold
+ else EntityLinker.NIL
+ )
+ self._add_activations(
+ doc_scores=doc_scores,
+ doc_ents=doc_ents,
+ scores=scores,
+ ents=[c.entity for c in candidates],
+ )
+ self._add_doc_activations(
+ docs_scores=docs_scores,
+ docs_ents=docs_ents,
+ doc_scores=doc_scores,
+ doc_ents=doc_ents,
+ )
if not (len(final_kb_ids) == entity_count):
err = Errors.E147.format(
method="predict", msg="result variables not of equal length"
)
raise RuntimeError(err)
- return final_kb_ids
+        return {
+            KNOWLEDGE_BASE_IDS: final_kb_ids,
+            "ents": docs_ents,
+            "scores": docs_scores,
+        }
- def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
- kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
+ activations (ActivationsT): The activations used for setting annotations, produced
+ by EntityLinker.predict.
DOCS: https://spacy.io/api/entitylinker#set_annotations
"""
+ kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS])
count_ents = len([ent for doc in docs for ent in doc.ents])
if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0
overwrite = self.cfg["overwrite"]
- for doc in docs:
+ for j, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ for act_name, acts in activations.items():
+ if act_name != KNOWLEDGE_BASE_IDS:
+ # We only copy activations that are Ragged.
+ doc.activations[self.name][act_name] = cast(Ragged, acts[j])
+
for ent in doc.ents:
kb_id = kb_ids[i]
i += 1
@@ -661,3 +688,32 @@ def rehearse(self, examples, *, sgd=None, losses=None, **config):
def add_label(self, label):
raise NotImplementedError
+
+ def _add_doc_activations(
+ self,
+ *,
+ docs_scores: List[Ragged],
+ docs_ents: List[Ragged],
+ doc_scores: List[Floats1d],
+ doc_ents: List[Ints1d],
+ ):
+ if not self.save_activations:
+ return
+ ops = self.model.ops
+ lengths = ops.asarray1i([s.shape[0] for s in doc_scores])
+ docs_scores.append(Ragged(ops.flatten(doc_scores), lengths))
+ docs_ents.append(Ragged(ops.flatten(doc_ents), lengths))
+
+ def _add_activations(
+ self,
+ *,
+ doc_scores: List[Floats1d],
+ doc_ents: List[Ints1d],
+ scores: Sequence[float],
+ ents: Sequence[int],
+ ):
+ if not self.save_activations:
+ return
+ ops = self.model.ops
+ doc_scores.append(ops.asarray1f(scores))
+ doc_ents.append(ops.asarray1i(ents, dtype="uint64"))
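
The entity linker stores two Ragged arrays per doc: one row of candidate KB-ID hashes and one row of candidate scores per entity, padded with a single 0/0.0 entry for discarded or candidate-less mentions. A sketch of reading them back, assuming a pipeline whose `entity_linker` has `save_activations` enabled (names are illustrative):

    acts = doc.activations["entity_linker"]
    ents, scores = acts["ents"], acts["scores"]  # both thinc.types.Ragged
    for i, ent in enumerate(doc.ents):
        cand_hashes = ents[i].dataXd.ravel()     # uint64 hashes of KB IDs
        cand_scores = scores[i].dataXd.ravel()
        for h, s in zip(cand_hashes, cand_scores):
            if int(h) != 0:  # 0 marks "no candidate" or a discarded label
                print(ent.text, nlp.vocab.strings[int(h)], float(s))
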
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index bdbe75fd824..cc8f87936b9 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,4 +1,8 @@
 # cython: infer_types=True, binding=True
 from itertools import islice
-from typing import Callable, Dict, Optional, Union
+from typing import Callable, Dict, Iterable, Optional, Union
@@ -8,6 +12,12 @@ from ..morphology cimport Morphology
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
+from .tagger import ActivationsT, Tagger
from .. import util
from ..errors import Errors
from ..language import Language
@@ -50,8 +60,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"morphologizer",
assigns=["token.morph", "token.pos"],
- default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False,
- "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0},
+    default_config={
+        "model": DEFAULT_MORPH_MODEL,
+        "overwrite": True,
+        "extend": False,
+        "label_smoothing": 0.0,
+        "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"},
+        "save_activations": False,
+    },
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
@@ -62,8 +77,10 @@ def make_morphologizer(
extend: bool,
label_smoothing: float,
scorer: Optional[Callable],
+ save_activations: bool,
):
-    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
+    return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend,
+                         label_smoothing=label_smoothing, scorer=scorer, save_activations=save_activations)
def morphologizer_score(examples, **kwargs):
@@ -99,6 +116,7 @@ class Morphologizer(Tagger):
extend: bool = BACKWARD_EXTEND,
label_smoothing: float = 0.0,
scorer: Optional[Callable] = morphologizer_score,
+ save_activations: bool = False,
):
"""Initialize a morphologizer.
@@ -109,6 +127,7 @@ class Morphologizer(Tagger):
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
+ save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/morphologizer#init
"""
@@ -129,6 +148,7 @@ class Morphologizer(Tagger):
}
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
+ self.save_activations = save_activations
@property
def labels(self):
@@ -222,14 +242,15 @@ class Morphologizer(Tagger):
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
- def set_annotations(self, docs, batch_tag_ids):
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
- batch_tag_ids: The IDs to set, produced by Morphologizer.predict.
+ activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict.
DOCS: https://spacy.io/api/morphologizer#set_annotations
"""
+ batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
@@ -240,6 +261,10 @@ class Morphologizer(Tagger):
# to allocate a compatible container out of the iterable.
labels = tuple(self.labels)
for i, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ for act_name, acts in activations.items():
+ doc.activations[self.name][act_name] = acts[i]
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
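
As with the tagger, the morphologizer's "label_ids" activation maps back to its label strings. A short sketch, assuming a trained pipeline with `save_activations` enabled on the component (illustrative only):

    acts = doc.activations["morphologizer"]
    labels = nlp.get_pipe("morphologizer").labels
    # One combined morph+POS label per token, e.g. "POS=DET" or "Case=Nom|POS=NOUN"
    print([labels[int(i)] for i in acts["label_ids"]])
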
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index df093baa9c6..521afe1d181 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,12 +1,16 @@
 # cython: infer_types=True, binding=True
 from itertools import islice
-from typing import Callable, Optional
+from typing import Callable, Iterable, Optional
 from thinc.api import Config, Model, SequenceCategoricalCrossentropy
from ..tokens.doc cimport Doc
-from .. import util
+from .tagger import ActivationsT, Tagger
from ..errors import Errors
from ..language import Language
from ..scorer import Scorer
@@ -37,11 +41,21 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"senter",
assigns=["token.is_sent_start"],
- default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}},
+ default_config={
+ "model": DEFAULT_SENTER_MODEL,
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.senter_scorer.v1"},
+ "save_activations": False,
+ },
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
-def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]):
- return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer)
+def make_senter(
+    nlp: Language,
+    name: str,
+    model: Model,
+    overwrite: bool,
+    scorer: Optional[Callable],
+    save_activations: bool,
+):
+    return SentenceRecognizer(
+        nlp.vocab,
+        model,
+        name,
+        overwrite=overwrite,
+        scorer=scorer,
+        save_activations=save_activations,
+    )
def senter_score(examples, **kwargs):
@@ -71,6 +85,7 @@ class SentenceRecognizer(Tagger):
*,
overwrite=BACKWARD_OVERWRITE,
scorer=senter_score,
+ save_activations: bool = False,
):
"""Initialize a sentence recognizer.
@@ -80,6 +95,7 @@ class SentenceRecognizer(Tagger):
losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
+ save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/sentencerecognizer#init
"""
@@ -89,6 +105,7 @@ class SentenceRecognizer(Tagger):
self._rehearsal_model = None
self.cfg = {"overwrite": overwrite}
self.scorer = scorer
+ self.save_activations = save_activations
@property
def labels(self):
@@ -106,19 +123,24 @@ class SentenceRecognizer(Tagger):
def label_data(self):
return None
- def set_annotations(self, docs, batch_tag_ids):
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
- batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict.
+ activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict.
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations
"""
+ batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
for i, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ for act_name, acts in activations.items():
+ doc.activations[self.name][act_name] = acts[i]
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
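
For the sentence recognizer the two activations follow the same layout; the "probabilities" columns correspond to its internal "I"/"S" labels. A sketch, assuming `save_activations` is enabled (illustrative):

    acts = doc.activations["senter"]
    probs = acts["probabilities"]  # (n_tokens, 2): scores for "I" vs. "S"
    for token, row in zip(doc, probs):
        print(token.text, token.is_sent_start, float(row[1]))
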
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 08a5478a912..1450bb5d6cb 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -16,6 +18,9 @@
from ..vocab import Vocab
from .trainable_pipe import TrainablePipe
+ActivationsT = Dict[str, Union[Floats2d, Ragged]]
+
+
spancat_default_config = """
[model]
@architectures = "spacy.SpanCategorizer.v1"
@@ -170,6 +175,7 @@ def build_preset_spans_suggester(spans_key: str) -> Suggester:
"model": DEFAULT_SPANCAT_MODEL,
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
+ "save_activations": False,
},
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
@@ -182,6 +188,7 @@ def make_spancat(
scorer: Optional[Callable],
threshold: float,
max_positive: Optional[int],
+ save_activations: bool,
) -> "SpanCategorizer":
"""Create a SpanCategorizer component and configure it for multi-label
classification to be able to assign multiple labels for each span.
@@ -209,6 +216,7 @@ def make_spancat(
0.5.
max_positive (Optional[int]): Maximum number of labels to consider positive
per span. Defaults to None, indicating no limit.
+ save_activations (bool): save model activations in Doc when annotating.
"""
return SpanCategorizer(
nlp.vocab,
@@ -287,6 +295,7 @@ def make_spancat_singlelabel(
add_negative_label=True,
threshold=None,
scorer=scorer,
+ save_activations=save_activations,
)
@@ -349,6 +358,7 @@ def __init__(
max_positive: Optional[int] = None,
threshold: Optional[float] = 0.5,
scorer: Optional[Callable] = spancat_score,
+ save_activations: bool = False,
) -> None:
"""Initialize the multi-label or multi-class span categorizer.
@@ -398,9 +408,7 @@ def __init__(
self.model = model
self.name = name
self.scorer = scorer
         self.add_negative_label = add_negative_label
         if not allow_overlap and max_positive is not None and max_positive > 1:
             raise ValueError(Errors.E1051.format(max_positive=max_positive))
+        self.save_activations = save_activations
@property
def key(self) -> str:
@@ -458,28 +466,7 @@ def label_data(self) -> List[str]:
"""
return list(self.labels)
     @property
     def _label_map(self) -> Dict[str, int]:
         """RETURNS (Dict[str, int]): The label map."""
         return {label: i for i, label in enumerate(self.labels)}

     @property
     def _n_labels(self) -> int:
         """RETURNS (int): Number of labels."""
         if self.add_negative_label:
             return len(self.labels) + 1
         else:
             return len(self.labels)

     @property
     def _negative_label_i(self) -> Union[int, None]:
         """RETURNS (Union[int, None]): Index of the negative label."""
         if self.add_negative_label:
             return len(self.label_data)
         else:
             return None

-    def predict(self, docs: Iterable[Doc]):
+    def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict.
@@ -488,11 +475,8 @@ def predict(self, docs: Iterable[Doc]):
DOCS: https://spacy.io/api/spancategorizer#predict
"""
indices = self.suggester(docs, ops=self.model.ops)
         if indices.lengths.sum() == 0:
             scores = self.model.ops.alloc2f(0, 0)
         else:
             scores = self.model.predict((docs, indices))  # type: ignore
-        return indices, scores
+        return {"indices": indices, "scores": scores}
def set_candidates(
self, docs: Iterable[Doc], *, candidates_key: str = "candidates"
@@ -512,32 +496,32 @@ def set_candidates(
for index in candidates.dataXd:
doc.spans[candidates_key].append(doc[index[0] : index[1]])
- def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None:
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of Doc objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
- scores: The scores to set, produced by SpanCategorizer.predict.
+ activations: ActivationsT: The activations, produced by SpanCategorizer.predict.
DOCS: https://spacy.io/api/spancategorizer#set_annotations
"""
-        indices, scores = indices_scores
+        indices = activations["indices"]
+        assert isinstance(indices, Ragged)
+        scores = cast(Floats2d, activations["scores"])
+
         offset = 0
         for i, doc in enumerate(docs):
             indices_i = indices[i].dataXd
+            if self.save_activations:
+                doc.activations[self.name] = {}
+                doc.activations[self.name]["indices"] = indices_i
+                doc.activations[self.name]["scores"] = scores[
+                    offset : offset + indices.lengths[i]
+                ]
             allow_overlap = cast(bool, self.cfg["allow_overlap"])
             if self.cfg["max_positive"] == 1:
                 doc.spans[self.key] = self._make_span_group_singlelabel(
                     doc,
                     indices_i,
                     scores[offset : offset + indices.lengths[i]],
                     allow_overlap,
                 )
             else:
                 doc.spans[self.key] = self._make_span_group_multilabel(
                     doc,
                     indices_i,
                     scores[offset : offset + indices.lengths[i]],
                 )
             offset += indices.lengths[i]
def update(
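
For spancat, "indices" holds the suggester's candidate spans and "scores" one row of label scores per candidate, so the two can be zipped back onto the doc. A sketch, assuming a trained pipeline with `save_activations` enabled (illustrative):

    acts = doc.activations["spancat"]
    spancat = nlp.get_pipe("spancat")
    for (start, end), row in zip(acts["indices"], acts["scores"]):
        span = doc[int(start) : int(end)]
        best = int(row.argmax())
        print(span.text, spancat.labels[best], float(row[best]))
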
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 34e85d49c2b..8ecd0c46ee0 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,4 +1,10 @@
 # cython: infer_types=True, binding=True
 from itertools import islice
-from typing import Callable, Optional
+from typing import Callable, Dict, Iterable, List, Optional, Union
+from thinc.types import Floats2d, Ints1d
@@ -15,6 +21,9 @@ from ..training import validate_examples, validate_get_examples
from ..util import registry
from .trainable_pipe import TrainablePipe
+
+ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
+
# See #9050
BACKWARD_OVERWRITE = False
@@ -38,7 +47,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"tagger",
assigns=["token.tag"],
- default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
+ default_config={
+ "model": DEFAULT_TAGGER_MODEL,
+ "overwrite": False,
+ "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
+ "neg_prefix": "!",
+ "save_activations": False,
+ },
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(
@@ -48,7 +63,7 @@ def make_tagger(
overwrite: bool,
scorer: Optional[Callable],
neg_prefix: str,
     label_smoothing: float,
+    save_activations: bool,
):
"""Construct a part-of-speech tagger component.
@@ -57,7 +72,8 @@ def make_tagger(
in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1).
"""
-    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
+    return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
+                  label_smoothing=label_smoothing, save_activations=save_activations)
def tagger_score(examples, **kwargs):
@@ -83,7 +99,7 @@ class Tagger(TrainablePipe):
overwrite=BACKWARD_OVERWRITE,
scorer=tagger_score,
neg_prefix="!",
         label_smoothing=0.0,
+        save_activations: bool = False,
):
"""Initialize a part-of-speech tagger.
@@ -93,6 +109,7 @@ class Tagger(TrainablePipe):
losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "tag".
+ save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/tagger#init
"""
@@ -103,6 +120,7 @@ class Tagger(TrainablePipe):
cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing}
self.cfg = dict(sorted(cfg.items()))
self.scorer = scorer
+ self.save_activations = save_activations
@property
def labels(self):
@@ -121,7 +139,7 @@ class Tagger(TrainablePipe):
"""Data about the labels currently added to the component."""
return tuple(self.cfg["labels"])
- def predict(self, docs):
+ def predict(self, docs) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict.
@@ -134,12 +152,12 @@ class Tagger(TrainablePipe):
n_labels = len(self.labels)
-            guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
+            guesses = [self.model.ops.alloc((0,), dtype="i") for doc in docs]
+            scores = [self.model.ops.alloc((0, n_labels), dtype="f") for doc in docs]
             assert len(guesses) == len(docs)
-            return guesses
+            return {"probabilities": scores, "label_ids": guesses}
scores = self.model.predict(docs)
assert len(scores) == len(docs), (len(scores), len(docs))
guesses = self._scores2guesses(scores)
assert len(guesses) == len(docs)
- return guesses
+ return {"probabilities": scores, "label_ids": guesses}
def _scores2guesses(self, scores):
guesses = []
@@ -150,20 +168,25 @@ class Tagger(TrainablePipe):
guesses.append(doc_guesses)
return guesses
- def set_annotations(self, docs, batch_tag_ids):
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
- batch_tag_ids: The IDs to set, produced by Tagger.predict.
+ activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
DOCS: https://spacy.io/api/tagger#set_annotations
"""
+ batch_tag_ids = activations["label_ids"]
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
cdef bint overwrite = self.cfg["overwrite"]
labels = self.labels
for i, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ for act_name, acts in activations.items():
+ doc.activations[self.name][act_name] = acts[i]
doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get()
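
Since the tagger keeps both the raw probabilities and the argmaxed "label_ids", the per-token distribution can be inspected next to the chosen tag. A sketch, assuming a trained tagger with `save_activations` enabled (illustrative):

    acts = doc.activations["tagger"]
    tagger = nlp.get_pipe("tagger")
    for token, row, tag_id in zip(doc, acts["probabilities"], acts["label_ids"]):
        print(token.text, tagger.labels[int(tag_id)], float(row.max()))
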
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index ae227017a9f..6cb33109891 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -14,6 +18,9 @@
from ..vocab import Vocab
from .trainable_pipe import TrainablePipe
+ActivationsT = Dict[str, Floats2d]
+
+
single_label_default_config = """
[model]
@architectures = "spacy.TextCatEnsemble.v2"
@@ -80,7 +87,8 @@
default_config={
"threshold": 0.0,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
- "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
+ "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
+ "save_activations": False,
},
default_score_weights={
"cats_score": 1.0,
@@ -101,6 +109,7 @@ def make_textcat(
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
+ save_activations: bool,
) -> "TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
@@ -110,8 +119,16 @@ def make_textcat(
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
+ save_activations (bool): save model activations in Doc when annotating.
"""
- return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
+ return TextCategorizer(
+ nlp.vocab,
+ model,
+ name,
+ threshold=threshold,
+ scorer=scorer,
+ save_activations=save_activations,
+ )
def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
@@ -142,6 +159,7 @@ def __init__(
*,
threshold: float,
scorer: Optional[Callable] = textcat_score,
+ save_activations: bool = False,
) -> None:
"""Initialize a text categorizer for single-label classification.
@@ -167,6 +185,7 @@ def __init__(
}
self.cfg = dict(cfg)
self.scorer = scorer
+ self.save_activations = save_activations
@property
def support_missing_values(self):
@@ -191,7 +210,7 @@ def label_data(self) -> List[str]:
"""
return self.labels # type: ignore[return-value]
- def predict(self, docs: Iterable[Doc]):
+ def predict(self, docs: Iterable[Doc]) -> ActivationsT:
"""Apply the pipeline's model to a batch of docs, without modifying them.
docs (Iterable[Doc]): The documents to predict.
@@ -204,12 +223,12 @@ def predict(self, docs: Iterable[Doc]):
tensors = [doc.tensor for doc in docs]
xp = self.model.ops.xp
scores = xp.zeros((len(list(docs)), len(self.labels)))
- return scores
+ return {"probabilities": scores}
scores = self.model.predict(docs)
scores = self.model.ops.asarray(scores)
- return scores
+ return {"probabilities": scores}
- def set_annotations(self, docs: Iterable[Doc], scores) -> None:
+ def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of Doc objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
@@ -217,9 +236,13 @@ def set_annotations(self, docs: Iterable[Doc], scores) -> None:
DOCS: https://spacy.io/api/textcategorizer#set_annotations
"""
+ probs = activations["probabilities"]
for i, doc in enumerate(docs):
+ if self.save_activations:
+ doc.activations[self.name] = {}
+ doc.activations[self.name]["probabilities"] = probs[i]
for j, label in enumerate(self.labels):
- doc.cats[label] = float(scores[i, j])
+ doc.cats[label] = float(probs[i, j])
def update(
self,
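
For textcat the stored "probabilities" row is the same vector that set_annotations writes into `doc.cats`, which makes it easy to compare raw scores with the final categories. A sketch, assuming `save_activations` is enabled (illustrative):

    acts = doc.activations["textcat"]
    for label, p in zip(nlp.get_pipe("textcat").labels, acts["probabilities"]):
        print(label, float(p), doc.cats[label])
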
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 2f8d5e60437..ac024ba3639 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -78,7 +82,8 @@
default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
- "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
+ "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+ "save_activations": False,
},
default_score_weights={
"cats_score": 1.0,
@@ -99,8 +104,9 @@ def make_multilabel_textcat(
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
-) -> "MultiLabel_TextCategorizer":
- """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
+ save_activations: bool,
+) -> "TextCategorizer":
+ """Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
to be non-mutually exclusive, which means that there can be zero or more labels
per doc).
@@ -111,7 +117,12 @@ def make_multilabel_textcat(
scorer (Optional[Callable]): The scoring method.
"""
return MultiLabel_TextCategorizer(
- nlp.vocab, model, name, threshold=threshold, scorer=scorer
+ nlp.vocab,
+ model,
+ name,
+ threshold=threshold,
+ scorer=scorer,
+ save_activations=save_activations,
)
@@ -143,6 +154,7 @@ def __init__(
*,
threshold: float,
scorer: Optional[Callable] = textcat_multilabel_score,
+ save_activations: bool = False,
) -> None:
"""Initialize a text categorizer for multi-label classification.
@@ -151,7 +163,7 @@ def __init__(
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
         scorer (Optional[Callable]): The scoring method.
+        save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/textcategorizer#init
"""
@@ -162,6 +174,7 @@ def __init__(
cfg = {"labels": [], "threshold": threshold}
self.cfg = dict(cfg)
self.scorer = scorer
+ self.save_activations = save_activations
@property
def support_missing_values(self):
diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd
index b1d2550a1ce..3e9a0a9584d 100644
--- a/spacy/pipeline/trainable_pipe.pxd
+++ b/spacy/pipeline/trainable_pipe.pxd
@@ -7,3 +7,4 @@ cdef class TrainablePipe(Pipe):
cdef public object model
cdef public object cfg
cdef public object scorer
+ cdef bint _save_activations
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 8f219b32797..bd360c9501b 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -342,3 +346,11 @@ cdef class TrainablePipe(Pipe):
deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude)
return self
+
+ @property
+ def save_activations(self):
+ return self._save_activations
+
+ @save_activations.setter
+ def save_activations(self, save_activations: bool):
+ self._save_activations = save_activations
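
Because `save_activations` is now a TrainablePipe property backed by the `cdef bint` field, it can be set either in the component config or toggled at runtime. A short sketch (component name illustrative):

    # In the factory config:
    nlp.add_pipe("tagger", config={"save_activations": True})
    # ...or on an existing pipe:
    nlp.get_pipe("tagger").save_activations = True
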
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index 5a8f0aee2ab..ba2ed4e5ff3 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -1,3 +1,4 @@
+from typing import cast
import pickle
import hypothesis.strategies as st
@@ -8,6 +9,8 @@
from spacy.lang.en import English
from spacy.language import Language
from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees
+from spacy.pipeline.trainable_pipe import TrainablePipe
from spacy.strings import StringStore
from spacy.training import Example
from spacy.util import make_tempdir
@@ -331,3 +334,26 @@ def test_empty_strings():
no_change = trees.add("xyz", "xyz")
empty = trees.add("", "")
assert no_change == empty
+
+
+def test_save_activations():
+ nlp = English()
+ lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer"))
+ lemmatizer.min_tree_freq = 1
+ train_examples = []
+ for t in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+ nlp.initialize(get_examples=lambda: train_examples)
+ nO = lemmatizer.model.get_dim("nO")
+
+ doc = nlp("This is a test.")
+ assert "trainable_lemmatizer" not in doc.activations
+
+ lemmatizer.save_activations = True
+ doc = nlp("This is a test.")
+ assert list(doc.activations["trainable_lemmatizer"].keys()) == [
+ "probabilities",
+ "tree_ids",
+ ]
+ assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO)
+ assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 00771a0f0f8..844bacb3b1f 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -1,7 +1,8 @@
-from typing import Any, Callable, Dict, Iterable, Tuple
+from typing import Any, Callable, Dict, Iterable, Tuple, cast
import pytest
from numpy.testing import assert_equal
+from thinc.types import Ragged
from spacy import Language, registry, util
from spacy.attrs import ENT_KB_ID
@@ -9,8 +10,7 @@
from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase, get_candidates
from spacy.lang.en import English
from spacy.ml import load_kb
 from spacy.ml.models.entity_linker import build_span_maker
-from spacy.pipeline import EntityLinker
+from spacy.pipeline import EntityLinker, TrainablePipe
from spacy.pipeline.legacy import EntityLinker_v1
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
@@ -1194,16 +1194,64 @@ def create_kb(vocab):
assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
 def test_span_maker_forward_with_empty():
     """The forward pass of the span maker may have a doc with no entities."""
     nlp = English()
     doc1 = nlp("a b c")
     ent = doc1[0:1]
     ent.label_ = "X"
     doc1.ents = [ent]
     # no entities
     doc2 = nlp("x y z")

     # just to get a model
     span_maker = build_span_maker()
     span_maker([doc1, doc2], False)
+
+
+def test_save_activations():
+    nlp = English()
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+ # Convert the texts to docs to make sure we have doc.ents set for the training examples
+ train_examples = []
+ for text, annotation in TRAIN_DATA:
+ doc = nlp(text)
+ train_examples.append(Example.from_dict(doc, annotation))
+
+ def create_kb(vocab):
+ # create artificial KB - assign same prior weight to the two russ cochran's
+ # Q2146908 (Russ Cochran): American golfer
+ # Q7381115 (Russ Cochran): publisher
+ mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+ mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+ mykb.add_alias(
+ alias="Russ Cochran",
+ entities=["Q2146908", "Q7381115"],
+ probabilities=[0.5, 0.5],
+ )
+ return mykb
+
+ # Create the Entity Linker component and add it to the pipeline
+ entity_linker = cast(TrainablePipe, nlp.add_pipe("entity_linker", last=True))
+ assert isinstance(entity_linker, EntityLinker)
+ entity_linker.set_kb(create_kb)
+ assert "Q2146908" in entity_linker.vocab.strings
+ assert "Q2146908" in entity_linker.kb.vocab.strings
+
+ # initialize the NEL pipe
+ nlp.initialize(get_examples=lambda: train_examples)
+
+ nO = entity_linker.model.get_dim("nO")
+
+ nlp.add_pipe("sentencizer", first=True)
+ patterns = [
+ {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
+ {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
+ ]
+ ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+ ruler.add_patterns(patterns)
+
+ doc = nlp("Russ Cochran was a publisher")
+ assert "entity_linker" not in doc.activations
+
+ entity_linker.save_activations = True
+ doc = nlp("Russ Cochran was a publisher")
+ assert set(doc.activations["entity_linker"].keys()) == {"ents", "scores"}
+ ents = doc.activations["entity_linker"]["ents"]
+ assert isinstance(ents, Ragged)
+ assert ents.data.shape == (2, 1)
+ assert ents.data.dtype == "uint64"
+ assert ents.lengths.shape == (1,)
+ scores = doc.activations["entity_linker"]["scores"]
+ assert isinstance(scores, Ragged)
+ assert scores.data.shape == (2, 1)
+ assert scores.data.dtype == "float32"
+ assert scores.lengths.shape == (1,)
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 0d895f23688..c2b65977ac3 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -1,3 +1,4 @@
+from typing import cast
import pytest
from numpy.testing import assert_almost_equal, assert_equal
from thinc.api import get_current_ops
@@ -7,7 +8,8 @@
from spacy.lang.en import English
from spacy.language import Language
from spacy.morphology import Morphology
+from spacy.pipeline import TrainablePipe
 from spacy.tests.util import make_tempdir
from spacy.tokens import Doc
from spacy.training import Example
@@ -224,3 +226,25 @@ def test_overfitting_IO():
gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags
+
+
+def test_save_activations():
+ nlp = English()
+ morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer"))
+ train_examples = []
+ for inst in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
+ nlp.initialize(get_examples=lambda: train_examples)
+
+ doc = nlp("This is a test.")
+ assert "morphologizer" not in doc.activations
+
+ morphologizer.save_activations = True
+ doc = nlp("This is a test.")
+ assert "morphologizer" in doc.activations
+ assert set(doc.activations["morphologizer"].keys()) == {
+ "label_ids",
+ "probabilities",
+ }
+ assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6)
+ assert doc.activations["morphologizer"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index 6c76558123f..2e40d86ff48 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -1,3 +1,4 @@
+from typing import cast
import pytest
from numpy.testing import assert_equal
@@ -5,6 +6,7 @@
from spacy.attrs import SENT_START
from spacy.lang.en import English
from spacy.language import Language
+from spacy.pipeline import TrainablePipe
from spacy.tests.util import make_tempdir
from spacy.training import Example
@@ -101,3 +103,26 @@ def test_overfitting_IO():
# test internal pipe labels vs. Language.pipe_labels with hidden labels
assert nlp.get_pipe("senter").labels == ("I", "S")
assert "senter" not in nlp.pipe_labels
+
+
+def test_save_activations():
+ # Test if activations are correctly added to Doc when requested.
+ nlp = English()
+ senter = cast(TrainablePipe, nlp.add_pipe("senter"))
+
+ train_examples = []
+ for t in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+ nlp.initialize(get_examples=lambda: train_examples)
+ nO = senter.model.get_dim("nO")
+
+ doc = nlp("This is a test.")
+ assert "senter" not in doc.activations
+
+ senter.save_activations = True
+ doc = nlp("This is a test.")
+ assert "senter" in doc.activations
+ assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"}
+ assert doc.activations["senter"]["probabilities"].shape == (5, nO)
+ assert doc.activations["senter"]["label_ids"].shape == (5,)
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index c143d193fa6..9678e9b63b8 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -594,19 +594,21 @@ def test_set_candidates(name):
assert docs[0].spans["candidates"][4].text == "Just a"
-@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
-@pytest.mark.parametrize("n_process", [1, 2])
-def test_spancat_multiprocessing(name, n_process):
- if isinstance(get_current_ops, NumpyOps) or n_process < 2:
- nlp = Language()
- spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
- train_examples = make_examples(nlp)
- nlp.initialize(get_examples=lambda: train_examples)
- texts = [
- "Just a sentence.",
- "I like London and Berlin",
- "I like Berlin",
- "I eat ham.",
- ]
- docs = list(nlp.pipe(texts, n_process=n_process))
- assert len(docs) == len(texts)
+def test_save_activations():
+ # Test if activations are correctly added to Doc when requested.
+ nlp = English()
+ spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
+ train_examples = make_examples(nlp)
+ nlp.initialize(get_examples=lambda: train_examples)
+ nO = spancat.model.get_dim("nO")
+ assert nO == 2
+ assert set(spancat.labels) == {"LOC", "PERSON"}
+
+ doc = nlp("This is a test.")
+ assert "spancat" not in doc.activations
+
+ spancat.save_activations = True
+ doc = nlp("This is a test.")
+ assert set(doc.activations["spancat"].keys()) == {"indices", "scores"}
+ assert doc.activations["spancat"]["indices"].shape == (12, 2)
+ assert doc.activations["spancat"]["scores"].shape == (12, nO)
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 4b5f1ee99fc..5deb323dd71 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,3 +1,4 @@
+from typing import cast
import pytest
from numpy.testing import assert_almost_equal, assert_equal
from thinc.api import compounding, get_current_ops
@@ -6,7 +7,8 @@
from spacy.attrs import TAG
from spacy.lang.en import English
from spacy.language import Language
+from spacy.pipeline import TrainablePipe
 from spacy.training import Example
from ..util import make_tempdir
@@ -235,6 +237,26 @@ def test_overfitting_IO():
assert doc3[0].tag_ != "N"
+def test_save_activations():
+ # Test if activations are correctly added to Doc when requested.
+ nlp = English()
+ tagger = cast(TrainablePipe, nlp.add_pipe("tagger"))
+ train_examples = []
+ for t in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+ nlp.initialize(get_examples=lambda: train_examples)
+
+ doc = nlp("This is a test.")
+ assert "tagger" not in doc.activations
+
+ tagger.save_activations = True
+ doc = nlp("This is a test.")
+ assert "tagger" in doc.activations
+ assert set(doc.activations["tagger"].keys()) == {"label_ids", "probabilities"}
+ assert doc.activations["tagger"]["probabilities"].shape == (5, len(TAGS))
+ assert doc.activations["tagger"]["label_ids"].shape == (5,)
+
+
def test_tagger_requires_labels():
nlp = English()
nlp.add_pipe("tagger")
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 8a0c1a9760d..710dac0571d 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,3 +1,4 @@
+from typing import cast
import random
import numpy.random
@@ -11,17 +12,13 @@
from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat
from spacy.lang.en import English
from spacy.language import Language
-from spacy.pipeline import TextCategorizer
+from spacy.pipeline import TextCategorizer, TrainablePipe
 from spacy.pipeline.textcat import (
     single_label_bow_config,
     single_label_cnn_config,
     single_label_default_config,
 )
 from spacy.pipeline.textcat_multilabel import (
     multi_label_bow_config,
     multi_label_cnn_config,
     multi_label_default_config,
 )
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
from spacy.tokens import Doc, DocBin
@@ -298,7 +295,7 @@ def test_issue9904():
nlp.initialize(get_examples)
examples = get_examples()
- scores = textcat.predict([eg.predicted for eg in examples])
+ scores = textcat.predict([eg.predicted for eg in examples])["probabilities"]
loss = textcat.get_loss(examples, scores)[0]
loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0]
@@ -949,24 +946,39 @@ def test_textcat_multi_threshold():
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
 @pytest.mark.parametrize(
     "component_name,scorer",
     [
         ("textcat", "spacy.textcat_scorer.v1"),
         ("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
     ],
 )
 def test_textcat_legacy_scorers(component_name, scorer):
     """Check that legacy scorers are registered and produce the expected score
     keys."""
     nlp = English()
     nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
     train_examples = []
     for text, annotations in TRAIN_DATA_SINGLE_LABEL:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
     nlp.initialize(get_examples=lambda: train_examples)

     # score the model (it's not actually trained but that doesn't matter)
     scores = nlp.evaluate(train_examples)
     assert 0 <= scores["cats_score"] <= 1
+
+
+def test_save_activations():
+    nlp = English()
+    textcat = cast(TrainablePipe, nlp.add_pipe("textcat"))
+    train_examples = []
+    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+    nO = textcat.model.get_dim("nO")
+
+    doc = nlp("This is a test.")
+    assert "textcat" not in doc.activations
+
+ textcat.save_activations = True
+ doc = nlp("This is a test.")
+ assert list(doc.activations["textcat"].keys()) == ["probabilities"]
+ assert doc.activations["textcat"]["probabilities"].shape == (nO,)
+
+
+def test_save_activations_multi():
+ nlp = English()
+ textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel"))
+
+ train_examples = []
+ for text, annotations in TRAIN_DATA_MULTI_LABEL:
+ train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+ nlp.initialize(get_examples=lambda: train_examples)
+ nO = textcat.model.get_dim("nO")
+
+ doc = nlp("This is a test.")
+ assert "textcat_multilabel" not in doc.activations
+
+ textcat.save_activations = True
+ doc = nlp("This is a test.")
+ assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
+ assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index d9719609cdc..5e8975ed337 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -50,6 +50,8 @@ cdef class Doc:
cdef public float sentiment
+ cdef public dict activations
+
cdef public dict user_hooks
cdef public dict user_token_hooks
cdef public dict user_span_hooks
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 0fae118b4b6..5fda6f2f789 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -16,7 +16,7 @@ from typing import (
import numpy as np
from cymem.cymem import Pool
-from thinc.types import Floats1d, Floats2d, Ints2d
+from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged
from .span import Span
from .token import Token
from .span_groups import SpanGroups
@@ -41,6 +41,7 @@ class Doc:
max_length: int
length: int
sentiment: float
+ activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]]
cats: Dict[str, float]
user_hooks: Dict[str, Callable[..., Any]]
user_token_hooks: Dict[str, Callable[..., Any]]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 8db8c1d6f37..497656b6570 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -281,6 +281,7 @@ cdef class Doc:
self.length = 0
self.sentiment = 0.0
self.cats = {}
+ self.activations = {}
self.user_hooks = {}
self.user_token_hooks = {}
self.user_span_hooks = {}
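`Doc.activations` itself is a plain dict, initialized empty and filled in by components under their pipe names. A small sketch of the resulting layout (the nested keys follow the per-component tables below):

```python
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("A short test.")
assert doc.activations == {}  # starts empty, like cats and the hook dicts

# After running components with save_activations enabled, entries are
# nested dicts keyed by pipe name, for example:
#   doc.activations["tagger"]["probabilities"]  # Floats2d, one row per token
#   doc.activations["tagger"]["label_ids"]      # Ints1d, one id per token
```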
diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx
index 0a582650076..310ce0dc88d 100644
--- a/website/docs/api/doc.mdx
+++ b/website/docs/api/doc.mdx
@@ -752,22 +752,23 @@ The L2 norm of the document's vector representation.
## Attributes {id="attributes"}
-| Name | Description |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `text` | A string representation of the document text. ~~str~~ |
-| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
-| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
-| `vocab` | The store of lexical types. ~~Vocab~~ |
-| `tensor` | Container for dense vector representations. ~~numpy.ndarray~~ |
-| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
-| `lang` | Language of the document's vocabulary. ~~int~~ |
-| `lang_` | Language of the document's vocabulary. ~~str~~ |
-| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
-| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
-| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
-| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
-| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
-| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
+| Name | Description |
+| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| `text` | A string representation of the document text. ~~str~~ |
+| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
+| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
+| `vocab` | The store of lexical types. ~~Vocab~~ |
+| `tensor` 2 | Container for dense vector representations. ~~numpy.ndarray~~ |
+| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
+| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ |
+| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ |
+| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
+| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
+| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
+| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
+| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ |
+| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
+| `activations` 4.0                          | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Dict[str, Union[ArrayXd, Ragged]]]~~ |
## Serialization fields {id="serialization-fields"}
diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx
index 82967482c90..17af19e8c38 100644
--- a/website/docs/api/edittreelemmatizer.mdx
+++ b/website/docs/api/edittreelemmatizer.mdx
@@ -44,14 +44,15 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("trainable_lemmatizer", config=config, name="lemmatizer")
> ```
-| Setting | Description |
-| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model` | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `backoff` | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~ |
-| `min_tree_freq` | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~ |
-| `overwrite` | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
-| `top_k` | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~ |
-| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ |
+| Setting | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model` | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `backoff` | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~ |
+| `min_tree_freq` | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~ |
+| `overwrite` | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `top_k` | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ |
+| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"tree_ids"`. ~~Union[bool, list[str]]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/edit_tree_lemmatizer.py
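A usage sketch for the new setting on the edit tree lemmatizer; the single training example and the `min_tree_freq` override are only there to keep the snippet self-contained:

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
config = {"save_activations": True, "min_tree_freq": 1}
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config=config, name="lemmatizer")
train_examples = [
    Example.from_dict(nlp.make_doc("I like cats"), {"lemmas": ["I", "like", "cat"]})
]
nlp.initialize(get_examples=lambda: train_examples)

doc = nlp("I like cats")
acts = doc.activations["lemmatizer"]          # stored under the pipe's name
assert acts["tree_ids"].shape == (len(doc),)  # one edit tree id per token
assert acts["probabilities"].shape[0] == len(doc)  # per-token tree distribution
```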
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 21d2e9015ce..85b872151fd 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -53,21 +53,20 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("entity_linker", config=config)
> ```
-| Setting | Description |
-| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
-| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
-| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
-| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
-| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
-| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
-| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
-| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
-| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
-| `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
-| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
-| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
-| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| Setting | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
+| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
+| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
+| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
+| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
+| `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~                                                                                                          |
+| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
+| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
+| `threshold` 3.4                                 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~   |
```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
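A complete entity linker example needs a trained model and a populated `KnowledgeBase`, so the sketch below only shows where the setting lives and which activation keys to expect:

```python
import spacy

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe("entity_linker", config={"save_activations": True})
assert entity_linker.save_activations is True

# Once the component is trained against a populated KnowledgeBase,
# annotating a doc exposes:
#   doc.activations["entity_linker"]["ents"]    # candidate KB ids
#   doc.activations["entity_linker"]["scores"]  # the corresponding scores
```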
diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx
index ce16f534219..1fda807cb32 100644
--- a/website/docs/api/morphologizer.mdx
+++ b/website/docs/api/morphologizer.mdx
@@ -42,13 +42,13 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("morphologizer", config=config)
> ```
-| Setting | Description |
-| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
-| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
-| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
-| `label_smoothing` 3.6 | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ |
+| Setting | Description |
+| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
+| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
+| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx
@@ -400,8 +400,8 @@ coarse-grained POS as the feature `POS`.
> assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels
> ```
-| Name | Description |
-| ----------- | ------------------------------------------------------ |
+| Name | Description |
+| ----------- | --------------------------------------------------------- |
| **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ |
## Morphologizer.label_data {id="label_data",tag="property",version="3"}
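The morphologizer saves the same pair of activations as the tagger, but over its composite `POS`/feature labels. A minimal sketch with invented annotations:

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
morphologizer = nlp.add_pipe("morphologizer", config={"save_activations": True})
train_examples = [
    Example.from_dict(
        nlp.make_doc("I like cats"),
        {"morphs": ["Case=Nom", "VerbForm=Fin", "Number=Plur"],
         "pos": ["PRON", "VERB", "NOUN"]},
    )
]
nlp.initialize(get_examples=lambda: train_examples)

doc = nlp("I like cats")
acts = doc.activations["morphologizer"]
assert acts["label_ids"].shape == (len(doc),)      # one label id per token
assert acts["probabilities"].shape[0] == len(doc)  # per-token label distribution
```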
diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx
index 5435399f956..d5d096d7659 100644
--- a/website/docs/api/sentencerecognizer.mdx
+++ b/website/docs/api/sentencerecognizer.mdx
@@ -39,11 +39,12 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("senter", config=config)
> ```
-| Setting | Description |
-| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
-| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ |
+| Setting | Description |
+| ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ |
+| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/senter.pyx
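For the senter, the saved `"probabilities"` form a two-class distribution per token (sentence start or not). A sketch with invented data; the spaces around the periods in the training text keep the token count aligned with `sent_starts`:

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
senter = nlp.add_pipe("senter", config={"save_activations": True})
train_examples = [
    Example.from_dict(
        nlp.make_doc("I like cats . Dogs too ."),
        {"sent_starts": [1, 0, 0, 0, 1, 0, 0]},
    )
]
nlp.initialize(get_examples=lambda: train_examples)

doc = nlp("I like cats. Dogs too.")
acts = doc.activations["senter"]
assert acts["probabilities"].shape == (len(doc), 2)
assert acts["label_ids"].shape == (len(doc),)
```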
diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx
index 98a1948eeab..258db794786 100644
--- a/website/docs/api/spancategorizer.mdx
+++ b/website/docs/api/spancategorizer.mdx
@@ -62,32 +62,15 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("spancat", config=config)
> ```
-> #### Example (spancat_singlelabel)
->
-> ```python
-> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
-> config = {
-> "spans_key": "labeled_spans",
-> "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
-> "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
-> # Additional spancat_singlelabel parameters
-> "negative_weight": 0.8,
-> "allow_overlap": True,
-> }
-> nlp.add_pipe("spancat_singlelabel", config=config)
-> ```
-
-| Setting | Description |
-| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
-| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
-| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
-| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-class `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~ |
-| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~ |
-| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
-| `add_negative_label` 3.5.1 | Whether to learn to predict a special negative label for each unannotated `Span` . This should be `True` when using a `Softmax` classifier layer and so its `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
-| `negative_weight` 3.5.1 | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
-| `allow_overlap` 3.5.1 | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
+| Setting | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
+| `model`                                         | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~    |
+| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
+| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
+| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
+| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"indices"` and `"scores"`. ~~Union[bool, list[str]]~~ |
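The span categorizer is the one component whose activations are not purely token- or doc-aligned: `"indices"` records the suggester's candidate offsets, so `"scores"` can be interpreted row by row. A minimal sketch with an invented span annotation under the default `"sc"` key:

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
spancat = nlp.add_pipe("spancat", config={"save_activations": True})
train_examples = [
    Example.from_dict(nlp.make_doc("London is nice"),
                      {"spans": {"sc": [(0, 6, "LOC")]}})
]
nlp.initialize(get_examples=lambda: train_examples)

doc = nlp("London is nice")
acts = doc.activations["spancat"]
# "indices" is a Ragged of candidate (start, end) token offsets from the
# suggester; "scores" has one row of label scores per candidate span
print(acts["indices"].dataXd.shape, acts["scores"].shape)
```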
diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx
index d9b0506fb17..20852e8eb94 100644
--- a/website/docs/api/tagger.mdx
+++ b/website/docs/api/tagger.mdx
@@ -40,13 +40,13 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("tagger", config=config)
> ```
-| Setting | Description |
-| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
-| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ |
-| `neg_prefix` 3.2.1 | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ |
-| `label_smoothing` 3.6 | [Label smoothing](https://arxiv.org/abs/1906.02629) factor. Defaults to `0.0`. ~~float~~ |
+| Setting | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ |
+| `neg_prefix` 3.2.1 | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ |
+| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/tagger.pyx
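A sketch for the tagger, mirroring the pattern above; the tag values are invented:

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
tagger = nlp.add_pipe("tagger", config={"save_activations": True})
train_examples = [
    Example.from_dict(nlp.make_doc("I like cats"), {"tags": ["PRP", "VBP", "NNS"]})
]
nlp.initialize(get_examples=lambda: train_examples)

doc = nlp("I like cats")
acts = doc.activations["tagger"]
assert acts["probabilities"].shape == (len(doc), len(tagger.labels))
assert acts["label_ids"].shape == (len(doc),)
```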
diff --git a/website/docs/api/textcategorizer.mdx b/website/docs/api/textcategorizer.mdx
index a259b7b3c65..a1dfb6dd88e 100644
--- a/website/docs/api/textcategorizer.mdx
+++ b/website/docs/api/textcategorizer.mdx
@@ -116,14 +116,15 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe).
-| Name | Description |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | The shared vocabulary. ~~Vocab~~ |
-| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| _keyword-only_ | |
-| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ |
-| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
+| Name | Description |
+| ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
+| `save_activations` 4.0 | Save activations in `Doc` when annotating. The only saved activation is `"probabilities"`. ~~Union[bool, list[str]]~~            |
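Unlike the token-level components, the text categorizer saves a single document-level probability vector. A sketch of toggling the attribute at runtime, as the tests earlier in this diff do; labels and texts are invented:

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
textcat = nlp.add_pipe("textcat")
train_examples = [
    Example.from_dict(nlp.make_doc("Great film."), {"cats": {"POS": 1.0, "NEG": 0.0}}),
    Example.from_dict(nlp.make_doc("Dull film."), {"cats": {"POS": 0.0, "NEG": 1.0}}),
]
nlp.initialize(get_examples=lambda: train_examples)

textcat.save_activations = True  # equivalent to the factory config setting
doc = nlp("Great film.")
probs = doc.activations["textcat"]["probabilities"]
assert probs.shape == (len(textcat.labels),)  # one score per label for the doc
```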
## TextCategorizer.\_\_call\_\_ {id="call",tag="method"}