
Commit

Merge branch 'main' into feature/pl-lt-et-recognizers
omri374 authored Dec 24, 2023
2 parents eccb4c5 + a6e253d commit e21b1f1
Showing 20 changed files with 352 additions and 620 deletions.
2 changes: 1 addition & 1 deletion README.MD
@@ -35,7 +35,7 @@ Presidio _(Origin from Latin praesidium ‘protection, garrison’)_ helps to en

---

## :mailbox_with_mail: Are you using Presidio? We'd love to know how :mailbox_with_mail:
## Are you using Presidio? We'd love to know how

Please help us improve by taking [this short anonymous survey](https://forms.office.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR9LagCGNW01LpMix2pnFWFJUQjJDTVkwSlJYRkFPSUNNVlVRRVRWVDVNSy4u).

2 changes: 1 addition & 1 deletion docs/analyzer/adding_recognizers.md
@@ -77,7 +77,7 @@ analyzer = AnalyzerEngine()

analyzer.registry.add_recognizer(titles_recognizer)

results = analyzer.analyze(text=text,language="en")
results = analyzer.analyze(text=text, language="en")
print(results)
```

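For context, the docs snippet this hunk touches registers an ad-hoc recognizer and then runs analysis. A minimal, self-contained sketch of that flow (the deny-list below is illustrative, not copied from the docs page):

```python
from presidio_analyzer import AnalyzerEngine, PatternRecognizer

# Illustrative deny-list recognizer; the exact titles in the docs example are assumed.
titles_recognizer = PatternRecognizer(
    supported_entity="TITLE",
    deny_list=["Mr.", "Mrs.", "Ms.", "Dr.", "Prof."],
)

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(titles_recognizer)

text = "Dr. Jane Smith will see you now."
results = analyzer.analyze(text=text, language="en")
print(results)
```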
2 changes: 1 addition & 1 deletion docs/analyzer/index.md
@@ -84,4 +84,4 @@ Follow the [API Spec](https://microsoft.github.io/presidio/api-docs/api-docs.htm

## Samples

Samples illustrating the usage of the Presidio Analyzer can be found in the [Python samples](../samples/python/index.md).
Samples illustrating the usage of the Presidio Analyzer can be found in the [Python samples](../samples/).
@@ -10,8 +10,10 @@

logger = logging.getLogger("presidio-streamlit")

class TextAnalyticsWrapper(EntityRecognizer):

class AzureAIServiceWrapper(EntityRecognizer):
from azure.ai.textanalytics._models import PiiEntityCategory

TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

def __init__(
@@ -35,7 +37,7 @@ def __init__(
super().__init__(
supported_entities=supported_entities,
supported_language=supported_language,
name="Azure Text Analytics PII",
name="Azure AI Language PII",
)

self.ta_key = ta_key
@@ -67,7 +69,7 @@ def analyze(
for entity in res.entities:
if entity.category not in self.supported_entities:
continue
analysis_explanation = TextAnalyticsWrapper._build_explanation(
analysis_explanation = AzureAIServiceWrapper._build_explanation(
original_score=entity.confidence_score,
entity_type=entity.category,
)
@@ -88,7 +90,7 @@ def _build_explanation(
original_score: float, entity_type: str
) -> AnalysisExplanation:
explanation = AnalysisExplanation(
recognizer=TextAnalyticsWrapper.__class__.__name__,
recognizer=AzureAIServiceWrapper.__class__.__name__,
original_score=original_score,
textual_explanation=f"Identified as {entity_type} by Text Analytics",
)
@@ -100,6 +102,7 @@ def load(self) -> None:

if __name__ == "__main__":
import presidio_helpers

dotenv.load_dotenv()
text = """
Here are a few example sentences we currently support:
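The rename from `TextAnalyticsWrapper` to `AzureAIServiceWrapper` tracks the service's rebranding from Azure Text Analytics to Azure AI Language. A rough sketch of wiring the renamed wrapper into an analyzer; the module name and the `ta_key`/`ta_endpoint` constructor arguments are assumed from the surrounding hunks and are not fully shown in this diff:

```python
import os

from presidio_analyzer import AnalyzerEngine

# Hypothetical module name for the file shown above.
from azure_ai_language_wrapper import AzureAIServiceWrapper

azure_recognizer = AzureAIServiceWrapper(
    supported_language="en",
    ta_key=os.environ["TA_KEY"],            # Azure AI Language key (assumed parameter name)
    ta_endpoint=os.environ["TA_ENDPOINT"],  # Azure AI Language endpoint (assumed parameter name)
)

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(azure_recognizer)
print(analyzer.analyze(text="My name is David Johnson", language="en"))
```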
10 changes: 5 additions & 5 deletions docs/samples/python/streamlit/flair_recognizer.py
@@ -59,9 +59,7 @@ class FlairRecognizer(EntityRecognizer):
# ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
]

MODEL_LANGUAGES = {
"en": "flair/ner-english-large"
}
MODEL_LANGUAGES = {"en": "flair/ner-english-large"}

PRESIDIO_EQUIVALENCES = {
"PER": "PERSON",
@@ -76,7 +74,7 @@ def __init__(
supported_entities: Optional[List[str]] = None,
check_label_groups: Optional[Tuple[Set, Set]] = None,
model: SequenceTagger = None,
model_path: Optional[str] = None
model_path: Optional[str] = None,
):
self.check_label_groups = (
check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
@@ -93,7 +91,9 @@ def __init__(
self.model = SequenceTagger.load(model_path)
else:
print(f"Loading model for language {supported_language}")
self.model = SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
self.model = SequenceTagger.load(
self.MODEL_LANGUAGES.get(supported_language)
)

super().__init__(
supported_entities=supported_entities,
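The changes above are formatting only, but the hunks show how the recognizer falls back to `MODEL_LANGUAGES` when no model or path is given. A usage sketch; the module name and the `supported_language` argument are assumed rather than visible in the truncated signature:

```python
from presidio_analyzer import AnalyzerEngine

# Hypothetical module name for the file above.
from flair_recognizer import FlairRecognizer

# With no model/model_path supplied, __init__ falls back to
# MODEL_LANGUAGES = {"en": "flair/ner-english-large"} as shown in the hunk.
flair_recognizer = FlairRecognizer(supported_language="en")

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(flair_recognizer)
print(analyzer.analyze(text="My name is Catherine Zeta-Jones", language="en"))
```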
27 changes: 0 additions & 27 deletions docs/samples/python/streamlit/flair_test.py

This file was deleted.

61 changes: 28 additions & 33 deletions docs/samples/python/streamlit/openai_fake_data_generator.py
@@ -2,51 +2,45 @@
from typing import Optional

import openai
from openai import OpenAI, AzureOpenAI
import logging

logger = logging.getLogger("presidio-streamlit")

OpenAIParams = namedtuple(
"open_ai_params",
["openai_key", "model", "api_base", "deployment_name", "api_version", "api_type"],
["openai_key", "model", "api_base", "deployment_id", "api_version", "api_type"],
)


def set_openai_params(openai_params: OpenAIParams):
"""Set the OpenAI API key.
:param openai_params: OpenAIParams object with the following fields: key, model, api version, deployment_name,
The latter only relate to Azure OpenAI deployments.
"""
openai.api_key = openai_params.openai_key
openai.api_version = openai_params.api_version
if openai_params.api_base:
openai.api_base = openai_params.api_base
openai.api_type = openai_params.api_type


def call_completion_model(
prompt: str,
model: str = "text-davinci-003",
max_tokens: int = 512,
deployment_id: Optional[str] = None,
openai_params: OpenAIParams,
max_tokens: Optional[int] = 256,
) -> str:
"""Creates a request for the OpenAI Completion service and returns the response.
:param prompt: The prompt for the completion model
:param model: OpenAI model name
:param max_tokens: Model's max_tokens parameter
:param deployment_id: Azure OpenAI deployment ID
:param openai_params: OpenAI parameters for the completion model
:param max_tokens: The maximum number of tokens to generate.
"""
if deployment_id:
response = openai.Completion.create(
deployment_id=deployment_id, model=model, prompt=prompt, max_tokens=max_tokens
if openai_params.api_type.lower() == "azure":
client = AzureOpenAI(
api_version=openai_params.api_version,
api_key=openai_params.openai_key,
azure_endpoint=openai_params.api_base,
azure_deployment=openai_params.deployment_id,
)
else:
response = openai.Completion.create(
model=model, prompt=prompt, max_tokens=max_tokens
)
client = OpenAI(api_key=openai_params.openai_key)

return response["choices"][0].text
response = client.completions.create(
model=openai_params.model,
prompt=prompt,
max_tokens=max_tokens,
)

return response.choices[0].text.strip()


def create_prompt(anonymized_text: str) -> str:
@@ -64,17 +58,18 @@ def create_prompt(anonymized_text: str) -> str:
a. Use completely random numbers, so every digit is drawn between 0 and 9.
b. Use realistic names that come from diverse genders, ethnicities and countries.
c. If there are no placeholders, return the text as is and provide an answer.
c. If there are no placeholders, return the text as is.
d. Keep the formatting as close to the original as possible.
e. If PII exists in the input, replace it with fake values in the output.
f. Remove whitespace before and after the generated text
input: How do I change the limit on my credit card {{credit_card_number}}?
input: [[TEXT STARTS]] How do I change the limit on my credit card {{credit_card_number}}?[[TEXT ENDS]]
output: How do I change the limit on my credit card 2539 3519 2345 1555?
input: <PERSON> was the chief science officer at <ORGANIZATION>.
input: [[TEXT STARTS]]<PERSON> was the chief science officer at <ORGANIZATION>.[[TEXT ENDS]]
output: Katherine Buckjov was the chief science officer at NASA.
input: Cameroon lives in <LOCATION>.
input: [[TEXT STARTS]]Cameroon lives in <LOCATION>.[[TEXT ENDS]]
output: Vladimir lives in Moscow.
input: {anonymized_text}
output:
"""
input: [[TEXT STARTS]]{anonymized_text}[[TEXT ENDS]]
output:"""
return prompt
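
Taken together, this file now builds an explicit `OpenAI`/`AzureOpenAI` client per call instead of mutating module-level `openai.api_*` settings, and the few-shot prompt wraps inputs in `[[TEXT STARTS]]`/`[[TEXT ENDS]]` delimiters. A usage sketch under the non-Azure path, with placeholder credentials and an assumed completion-capable model name:

```python
from openai_fake_data_generator import (
    OpenAIParams,
    call_completion_model,
    create_prompt,
)

# Placeholder values; api_base/deployment_id/api_version only matter when api_type is "azure".
params = OpenAIParams(
    openai_key="sk-...",
    model="gpt-3.5-turbo-instruct",
    api_base=None,
    deployment_id=None,
    api_version=None,
    api_type="openai",
)

prompt = create_prompt("My name is <PERSON> and I live in <LOCATION>.")
fake_text = call_completion_model(prompt=prompt, openai_params=params)
print(fake_text)
```
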
25 changes: 11 additions & 14 deletions docs/samples/python/streamlit/presidio_helpers.py
@@ -16,16 +16,16 @@
from presidio_anonymizer.entities import OperatorConfig

from openai_fake_data_generator import (
set_openai_params,
call_completion_model,
create_prompt,
OpenAIParams,
create_prompt,
)
from presidio_nlp_engine_config import (
create_nlp_engine_with_spacy,
create_nlp_engine_with_flair,
create_nlp_engine_with_transformers,
create_nlp_engine_with_azure_text_analytics,
create_nlp_engine_with_azure_ai_language,
create_nlp_engine_with_stanza,
)

logger = logging.getLogger("presidio-streamlit")
@@ -49,14 +49,16 @@ def nlp_engine_and_registry(
"""

# Set up NLP Engine according to the model of choice
if "spaCy" in model_family:
if "spacy" in model_family.lower():
return create_nlp_engine_with_spacy(model_path)
elif "flair" in model_family:
if "stanza" in model_family.lower():
return create_nlp_engine_with_stanza(model_path)
elif "flair" in model_family.lower():
return create_nlp_engine_with_flair(model_path)
elif "HuggingFace" in model_family:
elif "huggingface" in model_family.lower():
return create_nlp_engine_with_transformers(model_path)
elif "Azure Text Analytics" in model_family:
return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint)
elif "azure ai language" in model_family.lower():
return create_nlp_engine_with_azure_ai_language(ta_key, ta_endpoint)
else:
raise ValueError(f"Model family {model_family} not supported")

Expand Down Expand Up @@ -215,14 +217,9 @@ def create_fake_data(
if not openai_params.openai_key:
return "Please provide your OpenAI key"
results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
set_openai_params(openai_params)
prompt = create_prompt(results.text)
print(f"Prompt: {prompt}")
fake = call_openai_api(
prompt=prompt,
openai_model_name=openai_params.model,
openai_deployment_name=openai_params.deployment_name,
)
fake = call_completion_model(prompt=prompt, openai_params=openai_params)
return fake


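The dispatch in `nlp_engine_and_registry` now compares the model family case-insensitively and gains a stanza branch, and fake-data generation passes the whole `OpenAIParams` object to `call_completion_model`. A sketch of how the helper is typically consumed, assuming it returns an `(nlp_engine, registry)` pair as its name suggests (the full signature is truncated in the hunk, so the argument names are assumptions):

```python
from presidio_analyzer import AnalyzerEngine

from presidio_helpers import nlp_engine_and_registry

# "flair" satisfies the case-insensitive family check shown above;
# the model path is illustrative.
nlp_engine, registry = nlp_engine_and_registry(
    model_family="flair",
    model_path="flair/ner-english-large",
)

analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
results = analyzer.analyze(text="My name is Morris", language="en")
print(results)
```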