From 5d5d57d743a7ae05b883cbf0aac69a7db5e3d6d2 Mon Sep 17 00:00:00 2001 From: Sharon Hart Date: Sun, 26 Jan 2025 15:09:19 +0200 Subject: [PATCH] move to entity_recognizer.py --- .../presidio_analyzer/entity_recognizer.py | 15 +++++++++++- .../aba_routing_recognizer.py | 5 ++-- .../au_abn_recognizer.py | 5 ++-- .../au_acn_recognizer.py | 5 ++-- .../au_medicare_recognizer.py | 5 ++-- .../au_tfn_recognizer.py | 5 ++-- .../credit_card_recognizer.py | 5 ++-- .../es_nie_recognizer.py | 5 ++-- .../es_nif_recognizer.py | 5 ++-- .../predefined_recognizers/iban_recognizer.py | 3 +-- .../in_aadhaar_recognizer.py | 5 ++-- .../in_vehicle_registration_recognizer.py | 5 ++-- .../predefined_recognizers/it_vat_code.py | 5 ++-- .../medical_license_recognizer.py | 5 ++-- .../uk_nhs_recognizer.py | 5 ++-- .../presidio_analyzer/validation/__init__.py | 5 ---- .../validation/validation_utils.py | 23 ------------------- .../tests/test_entity_recognizer.py | 20 ++++++++++++++++ .../tests/test_validation_utils.py | 19 --------------- 19 files changed, 61 insertions(+), 89 deletions(-) delete mode 100644 presidio-analyzer/presidio_analyzer/validation/__init__.py delete mode 100644 presidio-analyzer/presidio_analyzer/validation/validation_utils.py delete mode 100644 presidio-analyzer/tests/test_validation_utils.py diff --git a/presidio-analyzer/presidio_analyzer/entity_recognizer.py b/presidio-analyzer/presidio_analyzer/entity_recognizer.py index fea96dde4..0b7037ad6 100644 --- a/presidio-analyzer/presidio_analyzer/entity_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/entity_recognizer.py @@ -1,6 +1,6 @@ import logging from abc import abstractmethod -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple from presidio_analyzer import RecognizerResult from presidio_analyzer.nlp_engine import NlpArtifacts @@ -196,3 +196,16 @@ def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult] filtered_results.append(result) return 
filtered_results + + @staticmethod + def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str: + """ + Cleanse the input string of the replacement pairs specified as argument. + + :param text: input string + :param replacement_pairs: pairs of what has to be replaced with which value + :return: cleansed string + """ + for search_string, replacement_string in replacement_pairs: + text = text.replace(search_string, replacement_string) + return text diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/aba_routing_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/aba_routing_recognizer.py index bafeddb12..4143991e9 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/aba_routing_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/aba_routing_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class AbaRoutingRecognizer(PatternRecognizer): @@ -60,7 +59,7 @@ def __init__( ) def validate_result(self, pattern_text: str) -> bool: # noqa D102 - sanitized_value = ValidationUtils.sanitize_value( + sanitized_value = EntityRecognizer.sanitize_value( pattern_text, self.replacement_pairs ) return self.__checksum(sanitized_value) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py index 7bf0fc29c..08c4d2a58 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_abn_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import 
ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class AuAbnRecognizer(PatternRecognizer): @@ -73,7 +72,7 @@ def validate_result(self, pattern_text: str) -> bool: :return: A bool indicating whether the validation was successful. """ # Pre-processing before validation checks - text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs) + text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs) abn_list = [int(digit) for digit in text if not digit.isspace()] # Set weights based on digit position diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py index a8679090d..808cb3440 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_acn_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class AuAcnRecognizer(PatternRecognizer): @@ -70,7 +69,7 @@ def validate_result(self, pattern_text: str) -> bool: :return: A bool indicating whether the validation was successful. 
""" # Pre-processing before validation checks - text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs) + text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs) acn_list = [int(digit) for digit in text if not digit.isspace()] # Set weights based on digit position diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py index 35a75b9fa..7492a289d 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_medicare_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class AuMedicareRecognizer(PatternRecognizer): @@ -70,7 +69,7 @@ def validate_result(self, pattern_text: str) -> bool: :return: A bool indicating whether the validation was successful. 
""" # Pre-processing before validation checks - text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs) + text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs) medicare_list = [int(digit) for digit in text if not digit.isspace()] # Set weights based on digit position diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py index 556b38884..f5082e00d 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/au_tfn_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class AuTfnRecognizer(PatternRecognizer): @@ -76,7 +75,7 @@ def validate_result(self, pattern_text: str) -> bool: :return: A bool indicating whether the validation was successful. 
""" # Pre-processing before validation checks - text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs) + text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs) tfn_list = [int(digit) for digit in text if not digit.isspace()] # Set weights based on digit position diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py index 0665c2121..240fb580d 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/credit_card_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class CreditCardRecognizer(PatternRecognizer): @@ -60,7 +59,7 @@ def __init__( ) def validate_result(self, pattern_text: str) -> bool: # noqa D102 - sanitized_value = ValidationUtils.sanitize_value( + sanitized_value = EntityRecognizer.sanitize_value( pattern_text, self.replacement_pairs ) checksum = self.__luhn_checksum(sanitized_value) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/es_nie_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/es_nie_recognizer.py index a409cebe2..647528d24 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/es_nie_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/es_nie_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class EsNieRecognizer(PatternRecognizer): @@ -55,7 +54,7 @@ def __init__( 
def validate_result(self, pattern_text: str) -> bool: """Validate the pattern by using the control character.""" - pattern_text = ValidationUtils.sanitize_value( + pattern_text = EntityRecognizer.sanitize_value( pattern_text, self.replacement_pairs ) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/es_nif_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/es_nif_recognizer.py index da26a6ced..30c370d52 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/es_nif_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/es_nif_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class EsNifRecognizer(PatternRecognizer): @@ -48,7 +47,7 @@ def __init__( ) def validate_result(self, pattern_text: str) -> bool: # noqa D102 - pattern_text = ValidationUtils.sanitize_value( + pattern_text = EntityRecognizer.sanitize_value( pattern_text, self.replacement_pairs ) letter = pattern_text[-1] diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/iban_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/iban_recognizer.py index 905311095..9cf62ea51 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/iban_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/iban_recognizer.py @@ -16,7 +16,6 @@ EOS, regex_per_country, ) -from presidio_analyzer.validation import ValidationUtils logger = logging.getLogger("presidio-analyzer") @@ -80,7 +79,7 @@ def __init__( def validate_result(self, pattern_text: str): # noqa D102 try: - pattern_text = ValidationUtils.sanitize_value( + pattern_text = EntityRecognizer.sanitize_value( pattern_text, self.replacement_pairs ) is_valid_checksum = ( diff --git 
a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py index b4b8a22b7..397db20bc 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class InAadhaarRecognizer(PatternRecognizer): @@ -58,7 +57,7 @@ def __init__( def validate_result(self, pattern_text: str) -> bool: """Determine absolute value based on calculation.""" - sanitized_value = ValidationUtils.sanitize_value( + sanitized_value = EntityRecognizer.sanitize_value( pattern_text, self.replacement_pairs ) return self.__check_aadhaar(sanitized_value) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py index 6af90529f..b090599e1 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class InVehicleRegistrationRecognizer(PatternRecognizer): @@ -349,7 +348,7 @@ def __init__( def validate_result(self, pattern_text: str) -> bool: """Determine absolute value based on calculation.""" - sanitized_value = ValidationUtils.sanitize_value( + sanitized_value = 
EntityRecognizer.sanitize_value( pattern_text, self.replacement_pairs ) return self.__check_vehicle_registration(sanitized_value) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/it_vat_code.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/it_vat_code.py index e12b1c4a9..acb7ba328 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/it_vat_code.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/it_vat_code.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class ItVatCodeRecognizer(PatternRecognizer): @@ -63,7 +62,7 @@ def validate_result(self, pattern_text: str) -> bool: """ # Pre-processing before validation checks - text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs) + text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs) # Edge-case that passes the checksum even though it is not a # valid italian vat code. 
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/medical_license_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/medical_license_recognizer.py index 71d6f73b9..c3ac6f451 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/medical_license_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/medical_license_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer # https://www.meditec.com/blog/dea-numbers-what-do-they-mean @@ -51,7 +50,7 @@ def __init__( ) def validate_result(self, pattern_text: str) -> bool: # noqa D102 - sanitized_value = ValidationUtils.sanitize_value( + sanitized_value = EntityRecognizer.sanitize_value( pattern_text, self.replacement_pairs ) checksum = self.__luhn_checksum(sanitized_value) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/uk_nhs_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/uk_nhs_recognizer.py index 5318d3b72..c274e7fc3 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/uk_nhs_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/uk_nhs_recognizer.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple -from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.validation import ValidationUtils +from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer class NhsRecognizer(PatternRecognizer): @@ -60,7 +59,7 @@ def validate_result(self, pattern_text: str) -> bool: Only the part in text that was detected by the regex engine :return: A bool indicating whether the validation was successful. 
""" - text = ValidationUtils.sanitize_value(pattern_text, self.replacement_pairs) + text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs) total = sum( [int(c) * multiplier for c, multiplier in zip(text, reversed(range(11)))] ) diff --git a/presidio-analyzer/presidio_analyzer/validation/__init__.py b/presidio-analyzer/presidio_analyzer/validation/__init__.py deleted file mode 100644 index 1b63aa11f..000000000 --- a/presidio-analyzer/presidio_analyzer/validation/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Validation.""" - -from .validation_utils import ValidationUtils - -__all__ = ["ValidationUtils"] diff --git a/presidio-analyzer/presidio_analyzer/validation/validation_utils.py b/presidio-analyzer/presidio_analyzer/validation/validation_utils.py deleted file mode 100644 index 91dfbaf16..000000000 --- a/presidio-analyzer/presidio_analyzer/validation/validation_utils.py +++ /dev/null @@ -1,23 +0,0 @@ -from typing import List, Tuple - - -class ValidationUtils: - """ - Utility functions for Presidio Analyzer. - - The class provides a bundle of utility functions that help centralizing the - logic for re-usability and maintainability - """ - - @staticmethod - def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str: - """ - Cleanse the input string of the replacement pairs specified as argument. 
- - :param text: input string - :param replacement_pairs: pairs of what has to be replaced with which value - :return: cleansed string - """ - for search_string, replacement_string in replacement_pairs: - text = text.replace(search_string, replacement_string) - return text diff --git a/presidio-analyzer/tests/test_entity_recognizer.py b/presidio-analyzer/tests/test_entity_recognizer.py index e66231199..e5e43b9db 100644 --- a/presidio-analyzer/tests/test_entity_recognizer.py +++ b/presidio-analyzer/tests/test_entity_recognizer.py @@ -119,3 +119,23 @@ def test_when_remove_duplicates_contained_shorter_length_results_removed(): ] results = EntityRecognizer.remove_duplicates(arr) assert len(results) == 1 + +from presidio_analyzer import EntityRecognizer +import pytest + +sanitizer_test_set = [ + [" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"], + ["def", "", "def"], +] + +@pytest.mark.parametrize("input_text, params, expected_output", sanitizer_test_set) +def test_sanitize_value(input_text, params, expected_output): + """ + Test to assert sanitize_value functionality from base class. 
+ + :param input_text: input string + :param params: List of tuples, indicating what has to be sanitized with which + :param expected_output: sanitized value + :return: True/False + """ + assert EntityRecognizer.sanitize_value(input_text, params) == expected_output diff --git a/presidio-analyzer/tests/test_validation_utils.py b/presidio-analyzer/tests/test_validation_utils.py deleted file mode 100644 index 4ef9b7c1f..000000000 --- a/presidio-analyzer/tests/test_validation_utils.py +++ /dev/null @@ -1,19 +0,0 @@ -from presidio_analyzer.validation import ValidationUtils -import pytest - -sanitizer_test_set = [ - [" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"], - ["def", "", "def"], -] - -@pytest.mark.parametrize("input_text, params, expected_output", sanitizer_test_set) -def test_sanitize_value(input_text, params, expected_output): - """ - Test to assert sanitize_value functionality from base class. - - :param input_text: input string - :param params: List of tuples, indicating what has to be sanitized with which - :param expected_output: sanitized value - :return: True/False - """ - assert ValidationUtils.sanitize_value(input_text, params) == expected_output