From 4b0d97e95cf5b88387671187deb5d9e95ede7fa7 Mon Sep 17 00:00:00 2001 From: Ace Kulshrestha Date: Thu, 4 Jun 2020 11:21:44 -0700 Subject: [PATCH 1/2] Add code sample for string replacement based deidentification. --- dlp/deid.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ dlp/deid_test.py | 10 ++++++ 2 files changed, 103 insertions(+) diff --git a/dlp/deid.py b/dlp/deid.py index b08a341dd82e..b497b4810d85 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -83,8 +83,72 @@ def deidentify_with_mask( # [END dlp_deidentify_masking] +# [START dlp_deidentify_replace] +def deidentify_with_replace( + project, + string, + info_types, + replacement_str=None, +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by replacing matched input values with a value you specify. + Args: + project: The Google Cloud project id to use as a parent resource. + string: The string to deidentify (will be treated as text). + replacement_str: The string to replace all values that match given + info types. + Returns: + None; the response from the API is printed to the terminal. + """ + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "replace_config": { + "new_value": { + "string_value": replacement_str, + } + } + } + } + ] + } + } + + # Construct item + item = {"value": string} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print out the results. 
+ print(response.item.value) + +# [END dlp_deidentify_replace] # [START dlp_deidentify_fpe] + + def deidentify_with_fpe( project, string, @@ -476,6 +540,28 @@ def write_data(data): help="The character to mask matching sensitive data with.", ) + replace_parser = subparsers.add_parser( + "deid_replace", + help="Deidentify sensitive data in a string by replacing it with " + "another string.", + ) + replace_parser.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + replace_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + replace_parser.add_argument("item", help="The string to deidentify.") + replace_parser.add_argument("replacement_str", help="The string to " + "replace all matched values with.") + fpe_parser = subparsers.add_parser( "deid_fpe", help="Deidentify sensitive data in a string using Format Preserving " @@ -636,6 +722,13 @@ def write_data(data): masking_character=args.masking_character, number_to_mask=args.number_to_mask, ) + elif args.content == "deid_replace": + deidentify_with_replace( + args.project, + args.item, + args.info_types, + replacement_str=args.replacement_str, + ) elif args.content == "deid_fpe": deidentify_with_fpe( args.project, diff --git a/dlp/deid_test.py b/dlp/deid_test.py index db14b5758e96..b45de396599f 100644 --- a/dlp/deid_test.py +++ b/dlp/deid_test.py @@ -88,6 +88,16 @@ def test_deidentify_with_mask_masking_number_specified(capsys): assert "My SSN is *******27" in out +def test_deidentify_with_replace(capsys): + deid.deidentify_with_replace( + GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"], + replacement_str="REPLACEMENT_STR" + ) + + out, _ = 
capsys.readouterr() + assert "My SSN is REPLACEMENT_STR" in out + + def test_deidentify_with_fpe(capsys): deid.deidentify_with_fpe( GCLOUD_PROJECT, From debe98e475b51f017b313519f61178d8d8c1a9d5 Mon Sep 17 00:00:00 2001 From: Ace Kulshrestha Date: Fri, 5 Jun 2020 10:51:48 -0700 Subject: [PATCH 2/2] Fix docstring nits --- dlp/deid.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/dlp/deid.py b/dlp/deid.py index b497b4810d85..c57f2890266b 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -21,13 +21,13 @@ # [START dlp_deidentify_masking] def deidentify_with_mask( - project, string, info_types, masking_character=None, number_to_mask=0 + project, input_str, info_types, masking_character=None, number_to_mask=0 ): """Uses the Data Loss Prevention API to deidentify sensitive data in a string by masking it with a character. Args: project: The Google Cloud project id to use as a parent resource. - item: The string to deidentify (will be treated as text). + input_str: The string to deidentify (will be treated as text). masking_character: The character to mask matching sensitive data with. number_to_mask: The maximum number of sensitive characters to mask in a match. If omitted or set to zero, the API will default to no @@ -67,7 +67,7 @@ def deidentify_with_mask( } # Construct item - item = {"value": string} + item = {"value": input_str} # Call the API response = dlp.deidentify_content( @@ -86,15 +86,16 @@ def deidentify_with_mask( # [START dlp_deidentify_replace] def deidentify_with_replace( project, - string, + input_str, info_types, - replacement_str=None, + replacement_str="REPLACEMENT_STR", ): """Uses the Data Loss Prevention API to deidentify sensitive data in a string by replacing matched input values with a value you specify. Args: project: The Google Cloud project id to use as a parent resource. - string: The string to deidentify (will be treated as text). + input_str: The string to deidentify (will be treated as text). 
+ info_types: A list of strings representing info types to look for. replacement_str: The string to replace all values that match given info types. Returns: @@ -131,7 +132,7 @@ def deidentify_with_replace( } # Construct item - item = {"value": string} + item = {"value": input_str} # Call the API response = dlp.deidentify_content( @@ -151,7 +152,7 @@ def deidentify_with_replace( def deidentify_with_fpe( project, - string, + input_str, info_types, alphabet=None, surrogate_type=None, @@ -162,7 +163,7 @@ def deidentify_with_fpe( string using Format Preserving Encryption (FPE). Args: project: The Google Cloud project id to use as a parent resource. - item: The string to deidentify (will be treated as text). + input_str: The string to deidentify (will be treated as text). alphabet: The set of characters to replace sensitive ones with. For more information, see https://cloud.google.com/dlp/docs/reference/ rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet @@ -230,7 +231,7 @@ def deidentify_with_fpe( } # Convert string to item - item = {"value": string} + item = {"value": input_str} # Call the API response = dlp.deidentify_content( @@ -250,7 +251,7 @@ def deidentify_with_fpe( # [START dlp_reidentify_fpe] def reidentify_with_fpe( project, - string, + input_str, alphabet=None, surrogate_type=None, key_name=None, @@ -260,7 +261,7 @@ def reidentify_with_fpe( string that was encrypted by Format Preserving Encryption (FPE). Args: project: The Google Cloud project id to use as a parent resource. - item: The string to deidentify (will be treated as text). + input_str: The string to reidentify (will be treated as text). alphabet: The set of characters to replace sensitive ones with. 
For more information, see https://cloud.google.com/dlp/docs/reference/ rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet @@ -319,7 +320,7 @@ def reidentify_with_fpe( } # Convert string to item - item = {"value": string} + item = {"value": input_str} # Call the API response = dlp.reidentify_content(