Skip to content

Commit

Permalink
move to entity_recognizer.py
Browse files Browse the repository at this point in the history
  • Loading branch information
SharonHart committed Jan 26, 2025
1 parent 1cf3924 commit 5d5d57d
Show file tree
Hide file tree
Showing 19 changed files with 61 additions and 89 deletions.
15 changes: 14 additions & 1 deletion presidio-analyzer/presidio_analyzer/entity_recognizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from abc import abstractmethod
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple

from presidio_analyzer import RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts
Expand Down Expand Up @@ -196,3 +196,16 @@ def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult]
filtered_results.append(result)

return filtered_results

@staticmethod
def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
    """
    Apply every (search, replacement) substitution to the text, in order.

    :param text: string to clean up
    :param replacement_pairs: ordered (search, replacement) tuples; each
        occurrence of the search string is swapped for the replacement
    :return: the text after all substitutions have been applied
    """
    sanitized = text
    for pair in replacement_pairs:
        # Pairs are applied sequentially, so a later pair sees the output
        # of the earlier ones.
        old, new = pair
        sanitized = sanitized.replace(old, new)
    return sanitized
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class AbaRoutingRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -60,7 +59,7 @@ def __init__(
)

def validate_result(self, pattern_text: str) -> bool: # noqa D102
sanitized_value = ValidationUtils.sanitize_value(
sanitized_value = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
return self.__checksum(sanitized_value)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class AuAbnRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -73,7 +72,7 @@ def validate_result(self, pattern_text: str) -> bool:
:return: A bool indicating whether the validation was successful.
"""
# Pre-processing before validation checks
text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs)
text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs)
abn_list = [int(digit) for digit in text if not digit.isspace()]

# Set weights based on digit position
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class AuAcnRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -70,7 +69,7 @@ def validate_result(self, pattern_text: str) -> bool:
:return: A bool indicating whether the validation was successful.
"""
# Pre-processing before validation checks
text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs)
text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs)
acn_list = [int(digit) for digit in text if not digit.isspace()]

# Set weights based on digit position
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class AuMedicareRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -70,7 +69,7 @@ def validate_result(self, pattern_text: str) -> bool:
:return: A bool indicating whether the validation was successful.
"""
# Pre-processing before validation checks
text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs)
text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs)
medicare_list = [int(digit) for digit in text if not digit.isspace()]

# Set weights based on digit position
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class AuTfnRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -76,7 +75,7 @@ def validate_result(self, pattern_text: str) -> bool:
:return: A bool indicating whether the validation was successful.
"""
# Pre-processing before validation checks
text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs)
text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs)
tfn_list = [int(digit) for digit in text if not digit.isspace()]

# Set weights based on digit position
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class CreditCardRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -60,7 +59,7 @@ def __init__(
)

def validate_result(self, pattern_text: str) -> bool: # noqa D102
sanitized_value = ValidationUtils.sanitize_value(
sanitized_value = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
checksum = self.__luhn_checksum(sanitized_value)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class EsNieRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -55,7 +54,7 @@ def __init__(
def validate_result(self, pattern_text: str) -> bool:
"""Validate the pattern by using the control character."""

pattern_text = ValidationUtils.sanitize_value(
pattern_text = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class EsNifRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -48,7 +47,7 @@ def __init__(
)

def validate_result(self, pattern_text: str) -> bool: # noqa D102
pattern_text = ValidationUtils.sanitize_value(
pattern_text = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
letter = pattern_text[-1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
EOS,
regex_per_country,
)
from presidio_analyzer.validation import ValidationUtils

logger = logging.getLogger("presidio-analyzer")

Expand Down Expand Up @@ -80,7 +79,7 @@ def __init__(

def validate_result(self, pattern_text: str): # noqa D102
try:
pattern_text = ValidationUtils.sanitize_value(
pattern_text = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
is_valid_checksum = (
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class InAadhaarRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -58,7 +57,7 @@ def __init__(

def validate_result(self, pattern_text: str) -> bool:
"""Determine absolute value based on calculation."""
sanitized_value = ValidationUtils.sanitize_value(
sanitized_value = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
return self.__check_aadhaar(sanitized_value)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class InVehicleRegistrationRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -349,7 +348,7 @@ def __init__(

def validate_result(self, pattern_text: str) -> bool:
"""Determine absolute value based on calculation."""
sanitized_value = ValidationUtils.sanitize_value(
sanitized_value = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
return self.__check_vehicle_registration(sanitized_value)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class ItVatCodeRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -63,7 +62,7 @@ def validate_result(self, pattern_text: str) -> bool:
"""

# Pre-processing before validation checks
text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs)
text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs)

# Edge-case that passes the checksum even though it is not a
# valid italian vat code.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer

# https://www.meditec.com/blog/dea-numbers-what-do-they-mean

Expand Down Expand Up @@ -51,7 +50,7 @@ def __init__(
)

def validate_result(self, pattern_text: str) -> bool: # noqa D102
sanitized_value = ValidationUtils.sanitize_value(
sanitized_value = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
checksum = self.__luhn_checksum(sanitized_value)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer.validation import ValidationUtils
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class NhsRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -60,7 +59,7 @@ def validate_result(self, pattern_text: str) -> bool:
Only the part in text that was detected by the regex engine
:return: A bool indicating whether the validation was successful.
"""
text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs)
text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs)
total = sum(
[int(c) * multiplier for c, multiplier in zip(text, reversed(range(11)))]
)
Expand Down
5 changes: 0 additions & 5 deletions presidio-analyzer/presidio_analyzer/validation/__init__.py

This file was deleted.

23 changes: 0 additions & 23 deletions presidio-analyzer/presidio_analyzer/validation/validation_utils.py

This file was deleted.

20 changes: 20 additions & 0 deletions presidio-analyzer/tests/test_entity_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,23 @@ def test_when_remove_duplicates_contained_shorter_length_results_removed():
]
results = EntityRecognizer.remove_duplicates(arr)
assert len(results) == 1

import pytest

# Each case: [input text, replacement pairs, expected sanitized text].
sanitizer_test_set = [
    [" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"],
    # No replacement pairs: the text must come back unchanged.
    ["def", [], "def"],
]


@pytest.mark.parametrize("input_text, params, expected_output", sanitizer_test_set)
def test_sanitize_value(input_text, params, expected_output):
    """
    Test to assert sanitize_value functionality from base class.

    :param input_text: input string
    :param params: List of tuples, indicating what has to be sanitized with which
    :param expected_output: sanitized value
    """
    # sanitize_value is exposed as a static method on EntityRecognizer; the
    # presidio_analyzer.validation package no longer exists, so importing
    # ValidationUtils from it would break collection of this test module.
    assert EntityRecognizer.sanitize_value(input_text, params) == expected_output
19 changes: 0 additions & 19 deletions presidio-analyzer/tests/test_validation_utils.py

This file was deleted.

0 comments on commit 5d5d57d

Please sign in to comment.