
Commit

Merge branch 'main' into feature/pl-lt-et-recognizers
omri374 authored Dec 24, 2023
2 parents eccb4c5 + a6e253d commit e21b1f1
Showing 20 changed files with 352 additions and 620 deletions.
2 changes: 1 addition & 1 deletion README.MD
@@ -35,7 +35,7 @@ Presidio _(Origin from Latin praesidium ‘protection, garrison’)_ helps to en

---

## :mailbox_with_mail: Are you using Presidio? We'd love to know how :mailbox_with_mail:
## Are you using Presidio? We'd love to know how

Please help us improve by taking [this short anonymous survey](https://forms.office.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR9LagCGNW01LpMix2pnFWFJUQjJDTVkwSlJYRkFPSUNNVlVRRVRWVDVNSy4u).

2 changes: 1 addition & 1 deletion docs/analyzer/adding_recognizers.md
@@ -77,7 +77,7 @@ analyzer = AnalyzerEngine()

analyzer.registry.add_recognizer(titles_recognizer)

results = analyzer.analyze(text=text,language="en")
results = analyzer.analyze(text=text, language="en")
print(results)
```

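For context, the docs snippet this hunk touches registers an ad-hoc recognizer and then runs analysis. A minimal, self-contained sketch of that flow (the deny-list below is illustrative, not copied from the docs page):

```python
from presidio_analyzer import AnalyzerEngine, PatternRecognizer

# Illustrative deny-list recognizer; the exact titles in the docs example are assumed.
titles_recognizer = PatternRecognizer(
    supported_entity="TITLE",
    deny_list=["Mr.", "Mrs.", "Ms.", "Dr.", "Prof."],
)

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(titles_recognizer)

text = "Dr. Jane Smith will see you now."
results = analyzer.analyze(text=text, language="en")
print(results)
```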
2 changes: 1 addition & 1 deletion docs/analyzer/index.md
@@ -84,4 +84,4 @@ Follow the [API Spec](https://microsoft.github.io/presidio/api-docs/api-docs.htm

## Samples

Samples illustrating the usage of the Presidio Analyzer can be found in the [Python samples](../samples/python/index.md).
Samples illustrating the usage of the Presidio Analyzer can be found in the [Python samples](../samples/).
@@ -10,8 +10,10 @@

logger = logging.getLogger("presidio-streamlit")

class TextAnalyticsWrapper(EntityRecognizer):

class AzureAIServiceWrapper(EntityRecognizer):
from azure.ai.textanalytics._models import PiiEntityCategory

TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

def __init__(
@@ -35,7 +37,7 @@ def __init__(
super().__init__(
supported_entities=supported_entities,
supported_language=supported_language,
name="Azure Text Analytics PII",
name="Azure AI Language PII",
)

self.ta_key = ta_key
@@ -67,7 +69,7 @@ def analyze(
for entity in res.entities:
if entity.category not in self.supported_entities:
continue
analysis_explanation = TextAnalyticsWrapper._build_explanation(
analysis_explanation = AzureAIServiceWrapper._build_explanation(
original_score=entity.confidence_score,
entity_type=entity.category,
)
@@ -88,7 +90,7 @@ def _build_explanation(
original_score: float, entity_type: str
) -> AnalysisExplanation:
explanation = AnalysisExplanation(
recognizer=TextAnalyticsWrapper.__class__.__name__,
recognizer=AzureAIServiceWrapper.__class__.__name__,
original_score=original_score,
textual_explanation=f"Identified as {entity_type} by Text Analytics",
)
@@ -100,6 +102,7 @@ def load(self) -> None:

if __name__ == "__main__":
import presidio_helpers

dotenv.load_dotenv()
text = """
Here are a few example sentences we currently support:
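The rename from `TextAnalyticsWrapper` to `AzureAIServiceWrapper` tracks the service's rebranding from Azure Text Analytics to Azure AI Language. A rough sketch of wiring the renamed wrapper into an analyzer; the module name and the `ta_key`/`ta_endpoint` constructor arguments are assumed from the surrounding hunks and are not fully shown in this diff:

```python
import os

from presidio_analyzer import AnalyzerEngine

# Hypothetical module name for the file shown above.
from azure_ai_language_wrapper import AzureAIServiceWrapper

azure_recognizer = AzureAIServiceWrapper(
    supported_language="en",
    ta_key=os.environ["TA_KEY"],            # Azure AI Language key (assumed parameter name)
    ta_endpoint=os.environ["TA_ENDPOINT"],  # Azure AI Language endpoint (assumed parameter name)
)

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(azure_recognizer)
print(analyzer.analyze(text="My name is David Johnson", language="en"))
```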
10 changes: 5 additions & 5 deletions docs/samples/python/streamlit/flair_recognizer.py
@@ -59,9 +59,7 @@ class FlairRecognizer(EntityRecognizer):
# ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
]

MODEL_LANGUAGES = {
"en": "flair/ner-english-large"
}
MODEL_LANGUAGES = {"en": "flair/ner-english-large"}

PRESIDIO_EQUIVALENCES = {
"PER": "PERSON",
@@ -76,7 +74,7 @@ def __init__(
supported_entities: Optional[List[str]] = None,
check_label_groups: Optional[Tuple[Set, Set]] = None,
model: SequenceTagger = None,
model_path: Optional[str] = None
model_path: Optional[str] = None,
):
self.check_label_groups = (
check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
@@ -93,7 +91,9 @@ def __init__(
self.model = SequenceTagger.load(model_path)
else:
print(f"Loading model for language {supported_language}")
self.model = SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
self.model = SequenceTagger.load(
self.MODEL_LANGUAGES.get(supported_language)
)

super().__init__(
supported_entities=supported_entities,
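The changes above are formatting only, but the hunks show how the recognizer falls back to `MODEL_LANGUAGES` when no model or path is given. A usage sketch; the module name and the `supported_language` argument are assumed rather than visible in the truncated signature:

```python
from presidio_analyzer import AnalyzerEngine

# Hypothetical module name for the file above.
from flair_recognizer import FlairRecognizer

# With no model/model_path supplied, __init__ falls back to
# MODEL_LANGUAGES = {"en": "flair/ner-english-large"} as shown in the hunk.
flair_recognizer = FlairRecognizer(supported_language="en")

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(flair_recognizer)
print(analyzer.analyze(text="My name is Catherine Zeta-Jones", language="en"))
```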
27 changes: 0 additions & 27 deletions docs/samples/python/streamlit/flair_test.py

This file was deleted.

61 changes: 28 additions & 33 deletions docs/samples/python/streamlit/openai_fake_data_generator.py
@@ -2,51 +2,45 @@
from typing import Optional

import openai
from openai import OpenAI, AzureOpenAI
import logging

logger = logging.getLogger("presidio-streamlit")

OpenAIParams = namedtuple(
"open_ai_params",
["openai_key", "model", "api_base", "deployment_name", "api_version", "api_type"],
["openai_key", "model", "api_base", "deployment_id", "api_version", "api_type"],
)


def set_openai_params(openai_params: OpenAIParams):
"""Set the OpenAI API key.
:param openai_params: OpenAIParams object with the following fields: key, model, api version, deployment_name,
The latter only relate to Azure OpenAI deployments.
"""
openai.api_key = openai_params.openai_key
openai.api_version = openai_params.api_version
if openai_params.api_base:
openai.api_base = openai_params.api_base
openai.api_type = openai_params.api_type


def call_completion_model(
prompt: str,
model: str = "text-davinci-003",
max_tokens: int = 512,
deployment_id: Optional[str] = None,
openai_params: OpenAIParams,
max_tokens: Optional[int] = 256,
) -> str:
"""Creates a request for the OpenAI Completion service and returns the response.
:param prompt: The prompt for the completion model
:param model: OpenAI model name
:param max_tokens: Model's max_tokens parameter
:param deployment_id: Azure OpenAI deployment ID
:param openai_params: OpenAI parameters for the completion model
:param max_tokens: The maximum number of tokens to generate.
"""
if deployment_id:
response = openai.Completion.create(
deployment_id=deployment_id, model=model, prompt=prompt, max_tokens=max_tokens
if openai_params.api_type.lower() == "azure":
client = AzureOpenAI(
api_version=openai_params.api_version,
api_key=openai_params.openai_key,
azure_endpoint=openai_params.api_base,
azure_deployment=openai_params.deployment_id,
)
else:
response = openai.Completion.create(
model=model, prompt=prompt, max_tokens=max_tokens
)
client = OpenAI(api_key=openai_params.openai_key)

return response["choices"][0].text
response = client.completions.create(
model=openai_params.model,
prompt=prompt,
max_tokens=max_tokens,
)

return response.choices[0].text.strip()


def create_prompt(anonymized_text: str) -> str:
@@ -64,17 +58,18 @@ def create_prompt(anonymized_text: str) -> str:
a. Use completely random numbers, so every digit is drawn between 0 and 9.
b. Use realistic names that come from diverse genders, ethnicities and countries.
c. If there are no placeholders, return the text as is and provide an answer.
c. If there are no placeholders, return the text as is.
d. Keep the formatting as close to the original as possible.
e. If PII exists in the input, replace it with fake values in the output.
f. Remove whitespace before and after the generated text
input: How do I change the limit on my credit card {{credit_card_number}}?
input: [[TEXT STARTS]] How do I change the limit on my credit card {{credit_card_number}}?[[TEXT ENDS]]
output: How do I change the limit on my credit card 2539 3519 2345 1555?
input: <PERSON> was the chief science officer at <ORGANIZATION>.
input: [[TEXT STARTS]]<PERSON> was the chief science officer at <ORGANIZATION>.[[TEXT ENDS]]
output: Katherine Buckjov was the chief science officer at NASA.
input: Cameroon lives in <LOCATION>.
input: [[TEXT STARTS]]Cameroon lives in <LOCATION>.[[TEXT ENDS]]
output: Vladimir lives in Moscow.
input: {anonymized_text}
output:
"""
input: [[TEXT STARTS]]{anonymized_text}[[TEXT ENDS]]
output:"""
return prompt
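
Taken together, this file now builds an explicit `OpenAI`/`AzureOpenAI` client per call instead of mutating module-level `openai.api_*` settings, and the few-shot prompt wraps inputs in `[[TEXT STARTS]]`/`[[TEXT ENDS]]` delimiters. A usage sketch under the non-Azure path, with placeholder credentials and an assumed completion-capable model name:

```python
from openai_fake_data_generator import (
    OpenAIParams,
    call_completion_model,
    create_prompt,
)

# Placeholder values; api_base/deployment_id/api_version only matter when api_type is "azure".
params = OpenAIParams(
    openai_key="sk-...",
    model="gpt-3.5-turbo-instruct",
    api_base=None,
    deployment_id=None,
    api_version=None,
    api_type="openai",
)

prompt = create_prompt("My name is <PERSON> and I live in <LOCATION>.")
fake_text = call_completion_model(prompt=prompt, openai_params=params)
print(fake_text)
```
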
25 changes: 11 additions & 14 deletions docs/samples/python/streamlit/presidio_helpers.py
@@ -16,16 +16,16 @@
from presidio_anonymizer.entities import OperatorConfig

from openai_fake_data_generator import (
set_openai_params,
call_completion_model,
create_prompt,
OpenAIParams,
create_prompt,
)
from presidio_nlp_engine_config import (
create_nlp_engine_with_spacy,
create_nlp_engine_with_flair,
create_nlp_engine_with_transformers,
create_nlp_engine_with_azure_text_analytics,
create_nlp_engine_with_azure_ai_language,
create_nlp_engine_with_stanza,
)

logger = logging.getLogger("presidio-streamlit")
@@ -49,14 +49,16 @@ def nlp_engine_and_registry(
"""

# Set up NLP Engine according to the model of choice
if "spaCy" in model_family:
if "spacy" in model_family.lower():
return create_nlp_engine_with_spacy(model_path)
elif "flair" in model_family:
if "stanza" in model_family.lower():
return create_nlp_engine_with_stanza(model_path)
elif "flair" in model_family.lower():
return create_nlp_engine_with_flair(model_path)
elif "HuggingFace" in model_family:
elif "huggingface" in model_family.lower():
return create_nlp_engine_with_transformers(model_path)
elif "Azure Text Analytics" in model_family:
return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint)
elif "azure ai language" in model_family.lower():
return create_nlp_engine_with_azure_ai_language(ta_key, ta_endpoint)
else:
raise ValueError(f"Model family {model_family} not supported")

Expand Down Expand Up @@ -215,14 +217,9 @@ def create_fake_data(
if not openai_params.openai_key:
return "Please provide your OpenAI key"
results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
set_openai_params(openai_params)
prompt = create_prompt(results.text)
print(f"Prompt: {prompt}")
fake = call_openai_api(
prompt=prompt,
openai_model_name=openai_params.model,
openai_deployment_name=openai_params.deployment_name,
)
fake = call_completion_model(prompt=prompt, openai_params=openai_params)
return fake


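The dispatch in `nlp_engine_and_registry` now compares the model family case-insensitively and gains a stanza branch, and fake-data generation passes the whole `OpenAIParams` object to `call_completion_model`. A sketch of how the helper is typically consumed, assuming it returns an `(nlp_engine, registry)` pair as its name suggests (the full signature is truncated in the hunk, so the argument names are assumptions):

```python
from presidio_analyzer import AnalyzerEngine

from presidio_helpers import nlp_engine_and_registry

# "flair" satisfies the case-insensitive family check shown above;
# the model path is illustrative.
nlp_engine, registry = nlp_engine_and_registry(
    model_family="flair",
    model_path="flair/ner-english-large",
)

analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
results = analyzer.analyze(text="My name is Morris", language="en")
print(results)
```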