From 43f6624325e3c0299b63a80a35e84710789dd94d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 11 Dec 2023 21:35:20 +0200 Subject: [PATCH 1/6] Update README.MD --- README.MD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.MD b/README.MD index 5fa8d1cc9..c44422471 100644 --- a/README.MD +++ b/README.MD @@ -35,7 +35,7 @@ Presidio _(Origin from Latin praesidium ‘protection, garrison’)_ helps to en --- -## :mailbox_with_mail: Are you using Presidio? We'd love to know how :mailbox_with_mail: +## Are you using Presidio? We'd love to know how Please help us improve by taking [this short anonymous survey](https://forms.office.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR9LagCGNW01LpMix2pnFWFJUQjJDTVkwSlJYRkFPSUNNVlVRRVRWVDVNSy4u). From 4551b9b0d779e112fee1f7ffeadd2dbbac7936db Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 13 Dec 2023 13:19:35 +0200 Subject: [PATCH 2/6] Bugfix in SpacyRecognizer (#1221) --- .../spacy_recognizer.py | 3 +- .../presidio_analyzer/recognizer_registry.py | 28 +++++++++++++++++ .../tests/test_recognizer_registry.py | 22 +++++++++++++ .../tests/test_spacy_recognizer.py | 31 +++++++++++++++++++ 4 files changed, 83 insertions(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py index 322242004..f16f623eb 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py @@ -103,7 +103,8 @@ def analyze(self, text: str, entities, nlp_artifacts=None): # noqa D102 ner_scores = nlp_artifacts.scores for ner_entity, ner_score in zip(ner_entities, ner_scores): - if ner_entity.label_ not in entities: + if (ner_entity.label_ not in entities + or ner_entity.label_ not in self.supported_entities): logger.debug( f"Skipping entity {ner_entity.label_} " f"as it is not in the supported entities list" diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry.py index 2f1f09833..7a83da178 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry.py @@ -318,3 +318,31 @@ def __instantiate_recognizer( if isinstance(inst, PatternRecognizer): inst.global_regex_flags = self.global_regex_flags return inst + + def _get_supported_languages(self) -> List[str]: + languages = [] + for rec in self.recognizers: + languages.append(rec.supported_language) + + return list(set(languages)) + + def get_supported_entities( + self, languages: Optional[List[str]] = None + ) -> List[str]: + """ + Return the supported entities by the set of recognizers loaded. + + :param languages: The languages to get the supported entities for. + If languages=None, returns all entities for all languages. + """ + if not languages: + languages = self._get_supported_languages() + + supported_entities = [] + for language in languages: + recognizers = self.get_recognizers(language=language, all_fields=True) + + for recognizer in recognizers: + supported_entities.extend(recognizer.get_supported_entities()) + + return list(set(supported_entities)) diff --git a/presidio-analyzer/tests/test_recognizer_registry.py b/presidio-analyzer/tests/test_recognizer_registry.py index 921014cf7..9e2bf398f 100644 --- a/presidio-analyzer/tests/test_recognizer_registry.py +++ b/presidio-analyzer/tests/test_recognizer_registry.py @@ -8,7 +8,9 @@ PatternRecognizer, EntityRecognizer, Pattern, + AnalyzerEngine ) +from presidio_analyzer.predefined_recognizers import SpacyRecognizer @pytest.fixture(scope="module") @@ -213,3 +215,23 @@ def test_predefined_pattern_recognizers_have_the_right_regex_flags(): for rec in registry.recognizers: if isinstance(rec, PatternRecognizer): assert rec.global_regex_flags == re.DOTALL + + +def test_recognizer_removed_and_returned_entities_are_correct(): + registry = RecognizerRegistry() + registry.load_predefined_recognizers() + registry.remove_recognizer("SpacyRecognizer") + sr = SpacyRecognizer(supported_entities=["DATE_TIME", "NRP"]) + registry.add_recognizer(sr) + + supported_entities = registry.get_supported_entities(languages=["en"]) + + assert "DATE_TIME" in supported_entities + assert "PERSON" not in supported_entities + + analyzer = AnalyzerEngine( + registry=registry, + supported_languages='en' + ) + + analyzer.analyze("My name is David", language="en") diff --git a/presidio-analyzer/tests/test_spacy_recognizer.py b/presidio-analyzer/tests/test_spacy_recognizer.py index 92ed0947c..c77969cf4 100644 --- a/presidio-analyzer/tests/test_spacy_recognizer.py +++ b/presidio-analyzer/tests/test_spacy_recognizer.py @@ -1,5 +1,9 @@ + import pytest +from spacy.tokens import Span, Doc +from spacy.util import get_lang_class +from presidio_analyzer.nlp_engine import NlpArtifacts, SpacyNlpEngine from presidio_analyzer.predefined_recognizers import SpacyRecognizer from tests import assert_result_within_score_range @@ -14,6 +18,24 @@ def nlp_recognizer(nlp_recognizers): return nlp_recognizers["spacy"] +@pytest.fixture(scope="module") +def mock_nlp_artifacts(): + en_vocab=get_lang_class("en")().vocab + doc = Doc(en_vocab, words=["My", "name", "is", "Mitchell"]) + doc.ents = [Span(doc, 2, 3, label="PERSON")] + + nlp_artifacts = NlpArtifacts( + entities=doc.ents, + tokens=doc, + tokens_indices=[token.idx for token in doc], + lemmas=[token.lemma_ for token in doc], + nlp_engine=None, + language="en", + scores=[0.9 for _ in doc.ents], + ) + return nlp_artifacts + + def prepare_and_analyze(nlp, recognizer, text, ents): nlp_artifacts = nlp.process_text(text, "en") results = recognizer.analyze(text, ents, nlp_artifacts) @@ -84,3 +106,12 @@ def test_analyze_no_nlp_artifacts(): spacy_recognizer = SpacyRecognizer() res = spacy_recognizer.analyze(text="text", nlp_artifacts=None, entities=["PERSON"]) assert len(res) == 0 + + +def test_entity_not_returned_if_not_in_supported_entities(mock_nlp_artifacts): + spacy_recognizer = SpacyRecognizer(supported_entities=["NRP"]) + + res = spacy_recognizer.analyze( + text="text", nlp_artifacts=mock_nlp_artifacts, entities=["DATE_TIME"] + ) + assert len(res) == 0 From b24dbeda54c41b8c3867de152666d82e339b8302 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 13 Dec 2023 18:57:10 +0200 Subject: [PATCH 3/6] Updates to demo website with new NLP Engine (#1181) --- ...rapper.py => azure_ai_language_wrapper.py} | 11 +- .../python/streamlit/flair_recognizer.py | 10 +- docs/samples/python/streamlit/flair_test.py | 27 -- .../streamlit/openai_fake_data_generator.py | 61 ++-- .../python/streamlit/presidio_helpers.py | 25 +- .../streamlit/presidio_nlp_engine_config.py | 158 +++++--- .../python/streamlit/presidio_streamlit.py | 71 ++-- .../samples/python/streamlit/requirements.txt | 5 +- .../python/streamlit/test_streamlit.py | 43 +++ .../streamlit/transformers_rec/__init__.py | 5 - .../transformers_rec/configuration.py | 124 ------- .../transformers_recognizer.py | 336 ------------------ 12 files changed, 263 insertions(+), 613 deletions(-) rename docs/samples/python/streamlit/{text_analytics_wrapper.py => azure_ai_language_wrapper.py} (94%) delete mode 100644 docs/samples/python/streamlit/flair_test.py create mode 100644 docs/samples/python/streamlit/test_streamlit.py delete mode 100644 docs/samples/python/streamlit/transformers_rec/__init__.py delete mode 100644 docs/samples/python/streamlit/transformers_rec/configuration.py delete mode 100644 docs/samples/python/streamlit/transformers_rec/transformers_recognizer.py diff --git a/docs/samples/python/streamlit/text_analytics_wrapper.py b/docs/samples/python/streamlit/azure_ai_language_wrapper.py similarity index 94% rename from docs/samples/python/streamlit/text_analytics_wrapper.py rename to docs/samples/python/streamlit/azure_ai_language_wrapper.py index c794fea77..36d43f4e9 100644 --- a/docs/samples/python/streamlit/text_analytics_wrapper.py +++ b/docs/samples/python/streamlit/azure_ai_language_wrapper.py @@ -10,8 +10,10 @@ logger = logging.getLogger("presidio-streamlit") -class TextAnalyticsWrapper(EntityRecognizer): + +class AzureAIServiceWrapper(EntityRecognizer): from azure.ai.textanalytics._models import PiiEntityCategory + TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory] def __init__( @@ -35,7 +37,7 @@ def __init__( super().__init__( supported_entities=supported_entities, supported_language=supported_language, - name="Azure Text Analytics PII", + name="Azure AI Language PII", ) self.ta_key = ta_key @@ -67,7 +69,7 @@ def analyze( for entity in res.entities: if entity.category not in self.supported_entities: continue - analysis_explanation = TextAnalyticsWrapper._build_explanation( + analysis_explanation = AzureAIServiceWrapper._build_explanation( original_score=entity.confidence_score, entity_type=entity.category, ) @@ -88,7 +90,7 @@ def _build_explanation( original_score: float, entity_type: str ) -> AnalysisExplanation: explanation = AnalysisExplanation( - recognizer=TextAnalyticsWrapper.__class__.__name__, + recognizer=AzureAIServiceWrapper.__class__.__name__, original_score=original_score, textual_explanation=f"Identified as {entity_type} by Text Analytics", ) @@ -100,6 +102,7 @@ def load(self) -> None: if __name__ == "__main__": import presidio_helpers + dotenv.load_dotenv() text = """ Here are a few example sentences we currently support: diff --git a/docs/samples/python/streamlit/flair_recognizer.py b/docs/samples/python/streamlit/flair_recognizer.py index acb69c887..f4c5ab096 100644 --- a/docs/samples/python/streamlit/flair_recognizer.py +++ b/docs/samples/python/streamlit/flair_recognizer.py @@ -59,9 +59,7 @@ class FlairRecognizer(EntityRecognizer): # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII ] - MODEL_LANGUAGES = { - "en": "flair/ner-english-large" - } + MODEL_LANGUAGES = {"en": "flair/ner-english-large"} PRESIDIO_EQUIVALENCES = { "PER": "PERSON", @@ -76,7 +74,7 @@ def __init__( supported_entities: Optional[List[str]] = None, check_label_groups: Optional[Tuple[Set, Set]] = None, model: SequenceTagger = None, - model_path: Optional[str] = None + model_path: Optional[str] = None, ): self.check_label_groups = ( check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS @@ -93,7 +91,9 @@ def __init__( self.model = SequenceTagger.load(model_path) else: print(f"Loading model for language {supported_language}") - self.model = SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language)) + self.model = SequenceTagger.load( + self.MODEL_LANGUAGES.get(supported_language) + ) super().__init__( supported_entities=supported_entities, diff --git a/docs/samples/python/streamlit/flair_test.py b/docs/samples/python/streamlit/flair_test.py deleted file mode 100644 index b237ff9bc..000000000 --- a/docs/samples/python/streamlit/flair_test.py +++ /dev/null @@ -1,27 +0,0 @@ -# Import generic wrappers -from transformers import AutoModel, AutoTokenizer - - -if __name__ == "__main__": - - from flair.data import Sentence - from flair.models import SequenceTagger - - # load tagger - tagger = SequenceTagger.load("flair/ner-english-large") - - # make example sentence - sentence = Sentence("George Washington went to Washington") - - # predict NER tags - tagger.predict(sentence) - - # print sentence - print(sentence) - - # print predicted NER spans - print('The following NER tags are found:') - # iterate over entities and print - for entity in sentence.get_spans('ner'): - print(entity) - diff --git a/docs/samples/python/streamlit/openai_fake_data_generator.py b/docs/samples/python/streamlit/openai_fake_data_generator.py index d89458f56..1c10c8307 100644 --- a/docs/samples/python/streamlit/openai_fake_data_generator.py +++ b/docs/samples/python/streamlit/openai_fake_data_generator.py @@ -2,51 +2,45 @@ from typing import Optional import openai +from openai import OpenAI, AzureOpenAI import logging logger = logging.getLogger("presidio-streamlit") OpenAIParams = namedtuple( "open_ai_params", - ["openai_key", "model", "api_base", "deployment_name", "api_version", "api_type"], + ["openai_key", "model", "api_base", "deployment_id", "api_version", "api_type"], ) -def set_openai_params(openai_params: OpenAIParams): - """Set the OpenAI API key. - :param openai_params: OpenAIParams object with the following fields: key, model, api version, deployment_name, - The latter only relate to Azure OpenAI deployments. - """ - openai.api_key = openai_params.openai_key - openai.api_version = openai_params.api_version - if openai_params.api_base: - openai.api_base = openai_params.api_base - openai.api_type = openai_params.api_type - - def call_completion_model( prompt: str, - model: str = "text-davinci-003", - max_tokens: int = 512, - deployment_id: Optional[str] = None, + openai_params: OpenAIParams, + max_tokens: Optional[int] = 256, ) -> str: """Creates a request for the OpenAI Completion service and returns the response. :param prompt: The prompt for the completion model - :param model: OpenAI model name - :param max_tokens: Model's max_tokens parameter - :param deployment_id: Azure OpenAI deployment ID + :param openai_params: OpenAI parameters for the completion model + :param max_tokens: The maximum number of tokens to generate. """ - if deployment_id: - response = openai.Completion.create( - deployment_id=deployment_id, model=model, prompt=prompt, max_tokens=max_tokens + if openai_params.api_type.lower() == "azure": + client = AzureOpenAI( + api_version=openai_params.api_version, + api_key=openai_params.openai_key, + azure_endpoint=openai_params.api_base, + azure_deployment=openai_params.deployment_id, ) else: - response = openai.Completion.create( - model=model, prompt=prompt, max_tokens=max_tokens - ) + client = OpenAI(api_key=openai_params.openai_key) - return response["choices"][0].text + response = client.completions.create( + model=openai_params.model, + prompt=prompt, + max_tokens=max_tokens, + ) + + return response.choices[0].text.strip() def create_prompt(anonymized_text: str) -> str: @@ -64,17 +58,18 @@ def create_prompt(anonymized_text: str) -> str: a. Use completely random numbers, so every digit is drawn between 0 and 9. b. Use realistic names that come from diverse genders, ethnicities and countries. - c. If there are no placeholders, return the text as is and provide an answer. + c. If there are no placeholders, return the text as is. d. Keep the formatting as close to the original as possible. e. If PII exists in the input, replace it with fake values in the output. + f. Remove whitespace before and after the generated text - input: How do I change the limit on my credit card {{credit_card_number}}? + input: [[TEXT STARTS]] How do I change the limit on my credit card {{credit_card_number}}?[[TEXT ENDS]] output: How do I change the limit on my credit card 2539 3519 2345 1555? - input: was the chief science officer at . + input: [[TEXT STARTS]] was the chief science officer at .[[TEXT ENDS]] output: Katherine Buckjov was the chief science officer at NASA. - input: Cameroon lives in . + input: [[TEXT STARTS]]Cameroon lives in .[[TEXT ENDS]] output: Vladimir lives in Moscow. - input: {anonymized_text} - output: - """ + + input: [[TEXT STARTS]]{anonymized_text}[[TEXT ENDS]] + output:""" return prompt diff --git a/docs/samples/python/streamlit/presidio_helpers.py b/docs/samples/python/streamlit/presidio_helpers.py index a64fe84ae..3cf53c3d4 100644 --- a/docs/samples/python/streamlit/presidio_helpers.py +++ b/docs/samples/python/streamlit/presidio_helpers.py @@ -16,16 +16,16 @@ from presidio_anonymizer.entities import OperatorConfig from openai_fake_data_generator import ( - set_openai_params, call_completion_model, - create_prompt, OpenAIParams, + create_prompt, ) from presidio_nlp_engine_config import ( create_nlp_engine_with_spacy, create_nlp_engine_with_flair, create_nlp_engine_with_transformers, - create_nlp_engine_with_azure_text_analytics, + create_nlp_engine_with_azure_ai_language, + create_nlp_engine_with_stanza, ) logger = logging.getLogger("presidio-streamlit") @@ -49,14 +49,16 @@ def nlp_engine_and_registry( """ # Set up NLP Engine according to the model of choice - if "spaCy" in model_family: + if "spacy" in model_family.lower(): return create_nlp_engine_with_spacy(model_path) - elif "flair" in model_family: + if "stanza" in model_family.lower(): + return create_nlp_engine_with_stanza(model_path) + elif "flair" in model_family.lower(): return create_nlp_engine_with_flair(model_path) - elif "HuggingFace" in model_family: + elif "huggingface" in model_family.lower(): return create_nlp_engine_with_transformers(model_path) - elif "Azure Text Analytics" in model_family: - return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint) + elif "azure ai language" in model_family.lower(): + return create_nlp_engine_with_azure_ai_language(ta_key, ta_endpoint) else: raise ValueError(f"Model family {model_family} not supported") @@ -215,14 +217,9 @@ def create_fake_data( if not openai_params.openai_key: return "Please provide your OpenAI key" results = anonymize(text=text, operator="replace", analyze_results=analyze_results) - set_openai_params(openai_params) prompt = create_prompt(results.text) print(f"Prompt: {prompt}") - fake = call_openai_api( - prompt=prompt, - openai_model_name=openai_params.model, - openai_deployment_name=openai_params.deployment_name, - ) + fake = call_completion_model(prompt=prompt, openai_params=openai_params) return fake diff --git a/docs/samples/python/streamlit/presidio_nlp_engine_config.py b/docs/samples/python/streamlit/presidio_nlp_engine_config.py index eab179b8c..d4fb8c6f5 100644 --- a/docs/samples/python/streamlit/presidio_nlp_engine_config.py +++ b/docs/samples/python/streamlit/presidio_nlp_engine_config.py @@ -1,8 +1,12 @@ -from typing import Tuple import logging +from typing import Tuple + import spacy from presidio_analyzer import RecognizerRegistry -from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider +from presidio_analyzer.nlp_engine import ( + NlpEngine, + NlpEngineProvider, +) logger = logging.getLogger("presidio-streamlit") @@ -12,21 +16,70 @@ def create_nlp_engine_with_spacy( ) -> Tuple[NlpEngine, RecognizerRegistry]: """ Instantiate an NlpEngine with a spaCy model - :param model_path: spaCy model path. + :param model_path: path to model / model name. """ + nlp_configuration = { + "nlp_engine_name": "spacy", + "models": [{"lang_code": "en", "model_name": model_path}], + "ner_model_configuration": { + "model_to_presidio_entity_mapping": { + "PER": "PERSON", + "PERSON": "PERSON", + "NORP": "NRP", + "FAC": "FACILITY", + "LOC": "LOCATION", + "GPE": "LOCATION", + "LOCATION": "LOCATION", + "ORG": "ORGANIZATION", + "ORGANIZATION": "ORGANIZATION", + "DATE": "DATE_TIME", + "TIME": "DATE_TIME", + }, + "low_confidence_score_multiplier": 0.4, + "low_score_entity_names": ["ORG", "ORGANIZATION"], + }, + } + + nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() + registry = RecognizerRegistry() - registry.load_predefined_recognizers() + registry.load_predefined_recognizers(nlp_engine=nlp_engine) - if not spacy.util.is_package(model_path): - spacy.cli.download(model_path) + return nlp_engine, registry + +def create_nlp_engine_with_stanza( + model_path: str, +) -> Tuple[NlpEngine, RecognizerRegistry]: + """ + Instantiate an NlpEngine with a stanza model + :param model_path: path to model / model name. + """ nlp_configuration = { - "nlp_engine_name": "spacy", + "nlp_engine_name": "stanza", "models": [{"lang_code": "en", "model_name": model_path}], + "ner_model_configuration": { + "model_to_presidio_entity_mapping": { + "PER": "PERSON", + "PERSON": "PERSON", + "NORP": "NRP", + "FAC": "FACILITY", + "LOC": "LOCATION", + "GPE": "LOCATION", + "LOCATION": "LOCATION", + "ORG": "ORGANIZATION", + "ORGANIZATION": "ORGANIZATION", + "DATE": "DATE_TIME", + "TIME": "DATE_TIME", + } + }, } nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() + registry = RecognizerRegistry() + registry.load_predefined_recognizers(nlp_engine=nlp_engine) + return nlp_engine, registry @@ -39,41 +92,62 @@ def create_nlp_engine_with_transformers( would return NlpArtifacts such as POS and lemmas. :param model_path: HuggingFace model path. """ + print(f"Loading Transformers model: {model_path} of type {type(model_path)}") - from transformers_rec import ( - STANFORD_COFIGURATION, - BERT_DEID_CONFIGURATION, - TransformersRecognizer, - ) - - registry = RecognizerRegistry() - registry.load_predefined_recognizers() - - if not spacy.util.is_package("en_core_web_sm"): - spacy.cli.download("en_core_web_sm") - # Using a small spaCy model + a HF NER model - transformers_recognizer = TransformersRecognizer(model_path=model_path) - - if model_path == "StanfordAIMI/stanford-deidentifier-base": - transformers_recognizer.load_transformer(**STANFORD_COFIGURATION) - elif model_path == "obi/deid_roberta_i2b2": - transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION) - else: - print(f"Warning: Model has no configuration, loading default.") - transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION) - - # Use small spaCy model, no need for both spacy and HF models - # The transformers model is used here as a recognizer, not as an NlpEngine nlp_configuration = { - "nlp_engine_name": "spacy", - "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}], + "nlp_engine_name": "transformers", + "models": [ + { + "lang_code": "en", + "model_name": {"spacy": "en_core_web_sm", "transformers": model_path}, + } + ], + "ner_model_configuration": { + "model_to_presidio_entity_mapping": { + "PER": "PERSON", + "PERSON": "PERSON", + "LOC": "LOCATION", + "LOCATION": "LOCATION", + "GPE": "LOCATION", + "ORG": "ORGANIZATION", + "ORGANIZATION": "ORGANIZATION", + "NORP": "NRP", + "AGE": "AGE", + "ID": "ID", + "EMAIL": "EMAIL", + "PATIENT": "PERSON", + "STAFF": "PERSON", + "HOSP": "ORGANIZATION", + "PATORG": "ORGANIZATION", + "DATE": "DATE_TIME", + "TIME": "DATE_TIME", + "PHONE": "PHONE_NUMBER", + "HCW": "PERSON", + "HOSPITAL": "ORGANIZATION", + "FACILITY": "LOCATION", + }, + "low_confidence_score_multiplier": 0.4, + "low_score_entity_names": ["ID"], + "labels_to_ignore": [ + "CARDINAL", + "EVENT", + "LANGUAGE", + "LAW", + "MONEY", + "ORDINAL", + "PERCENT", + "PRODUCT", + "QUANTITY", + "WORK_OF_ART", + ], + }, } - registry.add_recognizer(transformers_recognizer) - registry.remove_recognizer("SpacyRecognizer") - nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() + registry = RecognizerRegistry() + registry.load_predefined_recognizers(nlp_engine=nlp_engine) + return nlp_engine, registry @@ -91,6 +165,8 @@ def create_nlp_engine_with_flair( registry = RecognizerRegistry() registry.load_predefined_recognizers() + # there is no official Flair NlpEngine, hence we load it as an additional recognizer + if not spacy.util.is_package("en_core_web_sm"): spacy.cli.download("en_core_web_sm") # Using a small spaCy model + a Flair NER model @@ -107,7 +183,7 @@ def create_nlp_engine_with_flair( return nlp_engine, registry -def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str): +def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str): """ Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model. The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model @@ -115,7 +191,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str): :param ta_key: Azure Text Analytics key. :param ta_endpoint: Azure Text Analytics endpoint. """ - from text_analytics_wrapper import TextAnalyticsWrapper + from azure_ai_language_wrapper import AzureAIServiceWrapper if not ta_key or not ta_endpoint: raise RuntimeError("Please fill in the Text Analytics endpoint details") @@ -123,7 +199,9 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str): registry = RecognizerRegistry() registry.load_predefined_recognizers() - ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key) + azure_ai_language_recognizer = AzureAIServiceWrapper( + ta_endpoint=ta_endpoint, ta_key=ta_key + ) nlp_configuration = { "nlp_engine_name": "spacy", "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}], @@ -131,7 +209,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str): nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() - registry.add_recognizer(ta_recognizer) + registry.add_recognizer(azure_ai_language_recognizer) registry.remove_recognizer("SpacyRecognizer") return nlp_engine, registry diff --git a/docs/samples/python/streamlit/presidio_streamlit.py b/docs/samples/python/streamlit/presidio_streamlit.py index 33a268d75..14adc4973 100644 --- a/docs/samples/python/streamlit/presidio_streamlit.py +++ b/docs/samples/python/streamlit/presidio_streamlit.py @@ -56,7 +56,8 @@ "flair/ner-english-large", "HuggingFace/obi/deid_roberta_i2b2", "HuggingFace/StanfordAIMI/stanford-deidentifier-base", - "Azure Text Analytics PII", + "stanza/en", + "Azure AI Language", "Other", ] if not allow_other_models: @@ -75,22 +76,22 @@ # Remove package prefix (if needed) st_model = ( st_model - if st_model_package not in ("spaCy", "HuggingFace") + if st_model_package.lower() not in ("spacy", "stanza", "huggingface") else "/".join(st_model.split("/")[1:]) ) if st_model == "Other": st_model_package = st.sidebar.selectbox( - "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"] + "NER model OSS package", options=["spaCy", "stanza", "Flair", "HuggingFace"] ) st_model = st.sidebar.text_input(f"NER model name", value="") -if st_model == "Azure Text Analytics PII": +if st_model == "Azure AI Language": st_ta_key = st.sidebar.text_input( - f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password" + f"Azure AI Language key", value=os.getenv("TA_KEY", ""), type="password" ) st_ta_endpoint = st.sidebar.text_input( - f"Text Analytics endpoint", + f"Azure AI Language endpoint", value=os.getenv("TA_ENDPOINT", default=""), help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview", # noqa: E501 ) @@ -124,23 +125,18 @@ logger.debug(f"st_operator: {st_operator}") -if st_operator == "mask": - st_number_of_chars = st.sidebar.number_input( - "number of chars", value=st_number_of_chars, min_value=0, max_value=100 - ) - st_mask_char = st.sidebar.text_input( - "Mask character", value=st_mask_char, max_chars=1 - ) -elif st_operator == "encrypt": - st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key) -elif st_operator == "synthesize": + +def set_up_openai_synthesis(): + """Set up the OpenAI API key and model for text synthesis.""" + if os.getenv("OPENAI_TYPE", default="openai") == "Azure": openai_api_type = "azure" st_openai_api_base = st.sidebar.text_input( "Azure OpenAI base URL", value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""), ) - st_deployment_name = st.sidebar.text_input( + openai_key = os.getenv("AZURE_OPENAI_KEY", default="") + st_deployment_id = st.sidebar.text_input( "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="") ) st_openai_version = st.sidebar.text_input( @@ -148,11 +144,13 @@ value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"), ) else: - st_openai_version = openai_api_type = st_openai_api_base = None - st_deployment_name = "" + openai_api_type = "openai" + st_openai_version = st_openai_api_base = None + st_deployment_id = "" + openai_key = os.getenv("OPENAI_KEY", default="") st_openai_key = st.sidebar.text_input( "OPENAI_KEY", - value=os.getenv("OPENAI_KEY", default=""), + value=openai_key, help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.", type="password", ) @@ -161,12 +159,40 @@ value=os.getenv("OPENAI_MODEL", default="text-davinci-003"), help="See more here: https://platform.openai.com/docs/models/", ) + return ( + openai_api_type, + st_openai_api_base, + st_deployment_id, + st_openai_version, + st_openai_key, + st_openai_model, + ) + + +if st_operator == "mask": + st_number_of_chars = st.sidebar.number_input( + "number of chars", value=st_number_of_chars, min_value=0, max_value=100 + ) + st_mask_char = st.sidebar.text_input( + "Mask character", value=st_mask_char, max_chars=1 + ) +elif st_operator == "encrypt": + st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key) +elif st_operator == "synthesize": + ( + openai_api_type, + st_openai_api_base, + st_deployment_id, + st_openai_version, + st_openai_key, + st_openai_model, + ) = set_up_openai_synthesis() open_ai_params = OpenAIParams( openai_key=st_openai_key, model=st_openai_model, api_base=st_openai_api_base, - deployment_name=st_deployment_name, + deployment_id=st_deployment_id, api_version=st_openai_version, api_type=openai_api_type, ) @@ -214,7 +240,8 @@ \n\n[Code](https://aka.ms/presidio) | [Tutorial](https://microsoft.github.io/presidio/tutorial/) | [Installation](https://microsoft.github.io/presidio/installation/) | - [FAQ](https://microsoft.github.io/presidio/faq/) |""" + [FAQ](https://microsoft.github.io/presidio/faq/) | + [Feedback](https://forms.office.com/r/9ufyYjfDaY) |""" ) st.info( diff --git a/docs/samples/python/streamlit/requirements.txt b/docs/samples/python/streamlit/requirements.txt index c33ef18fa..62d28df7f 100644 --- a/docs/samples/python/streamlit/requirements.txt +++ b/docs/samples/python/streamlit/requirements.txt @@ -1,4 +1,5 @@ -presidio-analyzer +presidio-analyzer[transformers] +presidio-analyzer[stanza] presidio-anonymizer streamlit streamlit-tags @@ -6,8 +7,6 @@ pandas python-dotenv st-annotated-text torch -transformers flair openai -spacy azure-ai-textanalytics \ No newline at end of file diff --git a/docs/samples/python/streamlit/test_streamlit.py b/docs/samples/python/streamlit/test_streamlit.py new file mode 100644 index 000000000..8c3f15c5c --- /dev/null +++ b/docs/samples/python/streamlit/test_streamlit.py @@ -0,0 +1,43 @@ +from presidio_helpers import analyzer_engine, analyze, anonymize + + +def test_streamlit_logic(): + st_model = "en" # st_model = "StanfordAIMI/stanford-deidentifier-base" + st_model_package = "stanza" ##st_model_package = "HuggingFace" + st_ta_key = None + st_ta_endpoint = None + + analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint) + + # Read default text + with open("demo_text.txt") as f: + demo_text = f.readlines() + + st_text = "".join(demo_text) + + # instantiate and cache AnalyzerEngine + analyzer_engine(*analyzer_params) + + # Analyze + st_analyze_results = analyze( + *analyzer_params, + text=st_text, + entities="All", + language="en", + score_threshold=0.35, + return_decision_process=True, + allow_list=[], + deny_list=[], + ) + + # Anonymize + st_anonymize_results = anonymize( + text=st_text, + operator="replace", + mask_char=None, + number_of_chars=None, + encrypt_key=None, + analyze_results=st_analyze_results, + ) + + assert st_anonymize_results.text != "" diff --git a/docs/samples/python/streamlit/transformers_rec/__init__.py b/docs/samples/python/streamlit/transformers_rec/__init__.py deleted file mode 100644 index b1de880ed..000000000 --- a/docs/samples/python/streamlit/transformers_rec/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .configuration import BERT_DEID_CONFIGURATION, STANFORD_COFIGURATION -from .transformers_recognizer import TransformersRecognizer - -__all__ = ["BERT_DEID_CONFIGURATION", "STANFORD_COFIGURATION", "TransformersRecognizer"] - diff --git a/docs/samples/python/streamlit/transformers_rec/configuration.py b/docs/samples/python/streamlit/transformers_rec/configuration.py deleted file mode 100644 index ebf0439c5..000000000 --- a/docs/samples/python/streamlit/transformers_rec/configuration.py +++ /dev/null @@ -1,124 +0,0 @@ -## Taken from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/configuration.py - -STANFORD_COFIGURATION = { - "DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base", - "PRESIDIO_SUPPORTED_ENTITIES": [ - "LOCATION", - "PERSON", - "ORGANIZATION", - "AGE", - "PHONE_NUMBER", - "EMAIL", - "DATE_TIME", - "DEVICE", - "ZIP", - "PROFESSION", - "USERNAME", - "ID" - - ], - "LABELS_TO_IGNORE": ["O"], - "DEFAULT_EXPLANATION": "Identified as {} by the StanfordAIMI/stanford-deidentifier-base NER model", - "SUB_WORD_AGGREGATION": "simple", - "DATASET_TO_PRESIDIO_MAPPING": { - "DATE": "DATE_TIME", - "DOCTOR": "PERSON", - "PATIENT": "PERSON", - "HOSPITAL": "LOCATION", - "MEDICALRECORD": "ID", - "IDNUM": "ID", - "ORGANIZATION": "ORGANIZATION", - "ZIP": "ZIP", - "PHONE": "PHONE_NUMBER", - "USERNAME": "USERNAME", - "STREET": "LOCATION", - "PROFESSION": "PROFESSION", - "COUNTRY": "LOCATION", - "LOCATION-OTHER": "LOCATION", - "FAX": "PHONE_NUMBER", - "EMAIL": "EMAIL", - "STATE": "LOCATION", - "DEVICE": "DEVICE", - "ORG": "ORGANIZATION", - "AGE": "AGE", - }, - "MODEL_TO_PRESIDIO_MAPPING": { - "PER": "PERSON", - "PERSON": "PERSON", - "LOC": "LOCATION", - "ORG": "ORGANIZATION", - "AGE": "AGE", - "PATIENT": "PERSON", - "HCW": "PERSON", - "HOSPITAL": "LOCATION", - "PATORG": "ORGANIZATION", - "DATE": "DATE_TIME", - "PHONE": "PHONE_NUMBER", - "VENDOR": "ORGANIZATION", - }, - "CHUNK_OVERLAP_SIZE": 40, - "CHUNK_SIZE": 600, - "ID_SCORE_MULTIPLIER": 0.4, - "ID_ENTITY_NAME": "ID" -} - - -BERT_DEID_CONFIGURATION = { - "PRESIDIO_SUPPORTED_ENTITIES": [ - "LOCATION", - "PERSON", - "ORGANIZATION", - "AGE", - "PHONE_NUMBER", - "EMAIL", - "DATE_TIME", - "ZIP", - "PROFESSION", - "USERNAME", - "ID" - ], - "DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2", - "LABELS_TO_IGNORE": ["O"], - "DEFAULT_EXPLANATION": "Identified as {} by the obi/deid_roberta_i2b2 NER model", - "SUB_WORD_AGGREGATION": "simple", - "DATASET_TO_PRESIDIO_MAPPING": { - "DATE": "DATE_TIME", - "DOCTOR": "PERSON", - "PATIENT": "PERSON", - "HOSPITAL": "ORGANIZATION", - "MEDICALRECORD": "O", - "IDNUM": "O", - "ORGANIZATION": "ORGANIZATION", - "ZIP": "O", - "PHONE": "PHONE_NUMBER", - "USERNAME": "", - "STREET": "LOCATION", - "PROFESSION": "PROFESSION", - "COUNTRY": "LOCATION", - "LOCATION-OTHER": "LOCATION", - "FAX": "PHONE_NUMBER", - "EMAIL": "EMAIL", - "STATE": "LOCATION", - "DEVICE": "O", - "ORG": "ORGANIZATION", - "AGE": "AGE", - }, - "MODEL_TO_PRESIDIO_MAPPING": { - "PER": "PERSON", - "LOC": "LOCATION", - "ORG": "ORGANIZATION", - "AGE": "AGE", - "ID": "ID", - "EMAIL": "EMAIL", - "PATIENT": "PERSON", - "STAFF": "PERSON", - "HOSP": "ORGANIZATION", - "PATORG": "ORGANIZATION", - "DATE": "DATE_TIME", - "PHONE": "PHONE_NUMBER", - }, - "CHUNK_OVERLAP_SIZE": 40, - "CHUNK_SIZE": 600, - "ID_SCORE_MULTIPLIER": 0.4, - "ID_ENTITY_NAME": "ID" -} diff --git a/docs/samples/python/streamlit/transformers_rec/transformers_recognizer.py b/docs/samples/python/streamlit/transformers_rec/transformers_recognizer.py deleted file mode 100644 index ddf997808..000000000 --- a/docs/samples/python/streamlit/transformers_rec/transformers_recognizer.py +++ /dev/null @@ -1,336 +0,0 @@ -# Modified from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/transformer_recognizer.py - -import copy -import logging -from typing import Optional, List - -import torch -from presidio_analyzer import ( - RecognizerResult, - EntityRecognizer, - AnalysisExplanation, -) -from presidio_analyzer.nlp_engine import NlpArtifacts - -from .configuration import BERT_DEID_CONFIGURATION - - -logger = logging.getLogger("presidio-analyzer") - -try: - from transformers import ( - AutoTokenizer, - AutoModelForTokenClassification, - pipeline, - TokenClassificationPipeline, - ) - -except ImportError: - logger.error("transformers_rec is not installed") - - -class TransformersRecognizer(EntityRecognizer): - """ - Wrapper for a transformers_rec model, if needed to be used within Presidio Analyzer. - The class loads models hosted on HuggingFace - https://huggingface.co/ - and loads the model and tokenizer into a TokenClassification pipeline. - Samples are split into short text chunks, ideally shorter than max_length input_ids of the individual model, - to avoid truncation by the Tokenizer and loss of information - - A configuration object should be maintained for each dataset-model combination and translate - entities names into a standardized view. A sample of a configuration file is attached in - the example. - :param supported_entities: List of entities to run inference on - :type supported_entities: Optional[List[str]] - :param pipeline: Instance of a TokenClassificationPipeline including a Tokenizer and a Model, defaults to None - :type pipeline: Optional[TokenClassificationPipeline], optional - :param model_path: string referencing a HuggingFace uploaded model to be used for Inference, defaults to None - :type model_path: Optional[str], optional - - :example - >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry - >model_path = "obi/deid_roberta_i2b2" - >transformers_recognizer = TransformersRecognizer(model_path=model_path, - >supported_entities = model_configuration.get("PRESIDIO_SUPPORTED_ENTITIES")) - >transformers_recognizer.load_transformer(**model_configuration) - >registry = RecognizerRegistry() - >registry.add_recognizer(transformers_recognizer) - >analyzer = AnalyzerEngine(registry=registry) - >sample = "My name is Christopher and I live in Irbid." - >results = analyzer.analyze(sample, language="en",return_decision_process=True) - - >for result in results: - > print(result,'----', sample[result.start:result.end]) - """ - - def load(self) -> None: - pass - - def __init__( - self, - model_path: Optional[str] = None, - pipeline: Optional[TokenClassificationPipeline] = None, - supported_entities: Optional[List[str]] = None, - ): - if not supported_entities: - supported_entities = BERT_DEID_CONFIGURATION[ - "PRESIDIO_SUPPORTED_ENTITIES" - ] - super().__init__( - supported_entities=supported_entities, - name=f"Transformers model {model_path}", - ) - - self.model_path = model_path - self.pipeline = pipeline - self.is_loaded = False - - self.aggregation_mechanism = None - self.ignore_labels = None - self.model_to_presidio_mapping = None - self.entity_mapping = None - self.default_explanation = None - self.text_overlap_length = None - self.chunk_length = None - self.id_entity_name = None - self.id_score_reduction = None - - def load_transformer(self, **kwargs) -> None: - """Load external configuration parameters and set default values. - - :param kwargs: define default values for class attributes and modify pipeline behavior - **DATASET_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from dataset format to Presidio format - **MODEL_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from chosen model format to Presidio format - **SUB_WORD_AGGREGATION(str) - define how to aggregate sub-word tokens into full words and spans as defined - in HuggingFace https://huggingface.co/transformers/v4.8.0/main_classes/pipelines.html#transformers.TokenClassificationPipeline # noqa - **CHUNK_OVERLAP_SIZE (int) - number of overlapping characters in each text chunk - when splitting a single text into multiple inferences - **CHUNK_SIZE (int) - number of characters in each chunk of text - **LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"] - **DEFAULT_EXPLANATION (str) - string format to use for prediction explanations - **ID_ENTITY_NAME (str) - name of the ID entity - **ID_SCORE_REDUCTION (float) - score multiplier for ID entities - """ - - self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {}) - self.model_to_presidio_mapping = kwargs.get("MODEL_TO_PRESIDIO_MAPPING", {}) - self.ignore_labels = kwargs.get("LABELS_TO_IGNORE", ["O"]) - self.aggregation_mechanism = kwargs.get("SUB_WORD_AGGREGATION", "simple") - self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None) - self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40) - self.chunk_length = kwargs.get("CHUNK_SIZE", 600) - self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID") - self.id_score_reduction = kwargs.get("ID_SCORE_REDUCTION", 0.5) - - if not self.pipeline: - if not self.model_path: - self.model_path = "obi/deid_roberta_i2b2" - logger.warning( - f"Both 'model' and 'model_path' arguments are None. Using default model_path={self.model_path}" - ) - - self._load_pipeline() - - def _load_pipeline(self) -> None: - """Initialize NER transformers_rec pipeline using the model_path provided""" - - logging.debug(f"Initializing NER pipeline using {self.model_path} path") - device = 0 if torch.cuda.is_available() else -1 - self.pipeline = pipeline( - "ner", - model=AutoModelForTokenClassification.from_pretrained(self.model_path), - tokenizer=AutoTokenizer.from_pretrained(self.model_path), - # Will attempt to group sub-entities to word level - aggregation_strategy=self.aggregation_mechanism, - device=device, - framework="pt", - ignore_labels=self.ignore_labels, - ) - - self.is_loaded = True - - def get_supported_entities(self) -> List[str]: - """ - Return supported entities by this model. - :return: List of the supported entities. - """ - return self.supported_entities - - # Class to use transformers_rec with Presidio as an external recognizer. - def analyze( - self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None - ) -> List[RecognizerResult]: - """ - Analyze text using transformers_rec model to produce NER tagging. - :param text : The text for analysis. - :param entities: Not working properly for this recognizer. - :param nlp_artifacts: Not used by this recognizer. - :return: The list of Presidio RecognizerResult constructed from the recognized - transformers_rec detections. - """ - - results = list() - # Run transformer model on the provided text - ner_results = self._get_ner_results_for_text(text) - - for res in ner_results: - res["entity_group"] = self.__check_label_transformer(res["entity_group"]) - if not res["entity_group"]: - continue - - if res["entity_group"] == self.id_entity_name: - print(f"ID entity found, multiplying score by {self.id_score_reduction}") - res["score"] = res["score"] * self.id_score_reduction - - textual_explanation = self.default_explanation.format(res["entity_group"]) - explanation = self.build_transformers_explanation( - float(round(res["score"], 2)), textual_explanation, res["word"] - ) - transformers_result = self._convert_to_recognizer_result(res, explanation) - - results.append(transformers_result) - - return results - - @staticmethod - def split_text_to_word_chunks( - input_length: int, chunk_length: int, overlap_length: int - ) -> List[List]: - """The function calculates chunks of text with size chunk_length. Each chunk has overlap_length number of - words to create context and continuity for the model - - :param input_length: Length of input_ids for a given text - :type input_length: int - :param chunk_length: Length of each chunk of input_ids. - Should match the max input length of the transformer model - :type chunk_length: int - :param overlap_length: Number of overlapping words in each chunk - :type overlap_length: int - :return: List of start and end positions for individual text chunks - :rtype: List[List] - """ - if input_length < chunk_length: - return [[0, input_length]] - if chunk_length <= overlap_length: - logger.warning( - "overlap_length should be shorter than chunk_length, setting overlap_length to by half of chunk_length" - ) - overlap_length = chunk_length // 2 - return [ - [i, min([i + chunk_length, input_length])] - for i in range( - 0, input_length - overlap_length, chunk_length - overlap_length - ) - ] - - def _get_ner_results_for_text(self, text: str) -> List[dict]: - """The function runs model inference on the provided text. - The text is split into chunks with n overlapping characters. - The results are then aggregated and duplications are removed. - - :param text: The text to run inference on - :type text: str - :return: List of entity predictions on the word level - :rtype: List[dict] - """ - model_max_length = self.pipeline.tokenizer.model_max_length - # calculate inputs based on the text - text_length = len(text) - # split text into chunks - if text_length <= model_max_length: - predictions = self.pipeline(text) - else: - logger.info( - f"splitting the text into chunks, length {text_length} > {model_max_length}" - ) - predictions = list() - chunk_indexes = TransformersRecognizer.split_text_to_word_chunks( - text_length, self.chunk_length, self.text_overlap_length - ) - - # iterate over text chunks and run inference - for chunk_start, chunk_end in chunk_indexes: - chunk_text = text[chunk_start:chunk_end] - chunk_preds = self.pipeline(chunk_text) - - # align indexes to match the original text - add to each position the value of chunk_start - aligned_predictions = list() - for prediction in chunk_preds: - prediction_tmp = copy.deepcopy(prediction) - prediction_tmp["start"] += chunk_start - prediction_tmp["end"] += chunk_start - aligned_predictions.append(prediction_tmp) - - predictions.extend(aligned_predictions) - - # remove duplicates - predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}] - return predictions - - @staticmethod - def _convert_to_recognizer_result( - prediction_result: dict, explanation: AnalysisExplanation - ) -> RecognizerResult: - """The method parses NER model predictions into a RecognizerResult format to enable down the stream analysis - - :param prediction_result: A single example of entity prediction - :type prediction_result: dict - :param explanation: Textual representation of model prediction - :type explanation: str - :return: An instance of RecognizerResult which is used to model evaluation calculations - :rtype: RecognizerResult - """ - - transformers_results = RecognizerResult( - entity_type=prediction_result["entity_group"], - start=prediction_result["start"], - end=prediction_result["end"], - score=float(round(prediction_result["score"], 2)), - analysis_explanation=explanation, - ) - - return transformers_results - - def build_transformers_explanation( - self, - original_score: float, - explanation: str, - pattern: str, - ) -> AnalysisExplanation: - """ - Create explanation for why this result was detected. - :param original_score: Score given by this recognizer - :param explanation: Explanation string - :param pattern: Regex pattern used - :return Structured explanation and scores of a NER model prediction - :rtype: AnalysisExplanation - """ - explanation = AnalysisExplanation( - recognizer=self.__class__.__name__, - original_score=float(original_score), - textual_explanation=explanation, - pattern=pattern, - ) - return explanation - - def __check_label_transformer(self, label: str) -> Optional[str]: - """The function validates the predicted label is identified by Presidio - and maps the string into a Presidio representation - :param label: Predicted label by the model - :return: Returns the adjusted entity name - """ - - # convert model label to presidio label - entity = self.model_to_presidio_mapping.get(label, None) - - if entity in self.ignore_labels: - return None - - if entity is None: - logger.warning(f"Found unrecognized label {label}, returning entity as is") - return label - - if entity not in self.supported_entities: - logger.warning(f"Found entity {entity} which is not supported by Presidio") - return entity - return entity From 6c22e00e878a69baecbd0cdd1456d6520bde65a1 Mon Sep 17 00:00:00 2001 From: Ari Roffe Date: Sun, 17 Dec 2023 03:59:42 -0600 Subject: [PATCH 4/6] Update index.md (#1233) Seems like https://github.com/microsoft/presidio/pull/1177 broke the link from home page to samples --- docs/analyzer/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/analyzer/index.md b/docs/analyzer/index.md index 6412834ad..56824ef0b 100644 --- a/docs/analyzer/index.md +++ b/docs/analyzer/index.md @@ -84,4 +84,4 @@ Follow the [API Spec](https://microsoft.github.io/presidio/api-docs/api-docs.htm ## Samples -Samples illustrating the usage of the Presidio Analyzer can be found in the [Python samples](../samples/python/index.md). +Samples illustrating the usage of the Presidio Analyzer can be found in the [Python samples](../samples/). From e7197daa3551e0a71085ddd89df211faab94fd11 Mon Sep 17 00:00:00 2001 From: Ari Roffe Date: Sun, 17 Dec 2023 10:23:07 -0600 Subject: [PATCH 5/6] Update adding_recognizers.md (#1232) --- docs/analyzer/adding_recognizers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/analyzer/adding_recognizers.md b/docs/analyzer/adding_recognizers.md index 86738c767..c382d6edc 100644 --- a/docs/analyzer/adding_recognizers.md +++ b/docs/analyzer/adding_recognizers.md @@ -77,7 +77,7 @@ analyzer = AnalyzerEngine() analyzer.registry.add_recognizer(titles_recognizer) -results = analyzer.analyze(text=text,language="en") +results = analyzer.analyze(text=text, language="en") print(results) ``` From a6e253d0e7846c761144aa172844ddd6016a3970 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Bot=C3=ADa?= <62219950+ebotiab@users.noreply.github.com> Date: Thu, 21 Dec 2023 12:21:27 +0100 Subject: [PATCH 6/6] Bugfix in NerModelConfiguration (#1230) --- .../presidio_analyzer/nlp_engine/ner_model_configuration.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index a88545090..8408f776a 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -26,7 +26,7 @@ HOSPITAL="ORGANIZATION", ) -LOW_SCORE_ENTITY_NAMES = {} +LOW_SCORE_ENTITY_NAMES = set() LABELS_TO_IGNORE = { "O", "ORG", @@ -94,12 +94,12 @@ def __post_init__(self): @classmethod def _validate_input(cls, ner_model_configuration_dict: Dict) -> None: key_to_type = { - "labels_to_ignore": list, + "labels_to_ignore": Collection, "aggregation_strategy": str, "alignment_mode": str, "model_to_presidio_entity_mapping": dict, "low_confidence_score_multiplier": float, - "low_score_entity_names": list, + "low_score_entity_names": Collection, "stride": int, }