Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add code sample for string replacement based deidentification. #3956

Merged
merged 7 commits into from
Jun 9, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 103 additions & 9 deletions dlp/deid.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@

# [START dlp_deidentify_masking]
def deidentify_with_mask(
project, string, info_types, masking_character=None, number_to_mask=0
project, input_str, info_types, masking_character=None, number_to_mask=0
):
"""Uses the Data Loss Prevention API to deidentify sensitive data in a
string by masking it with a character.
Args:
project: The Google Cloud project id to use as a parent resource.
item: The string to deidentify (will be treated as text).
input_str: The string to deidentify (will be treated as text).
masking_character: The character to mask matching sensitive data with.
number_to_mask: The maximum number of sensitive characters to mask in
a match. If omitted or set to zero, the API will default to no
Expand Down Expand Up @@ -67,7 +67,7 @@ def deidentify_with_mask(
}

# Construct item
item = {"value": string}
item = {"value": input_str}

# Call the API
response = dlp.deidentify_content(
Expand All @@ -83,11 +83,76 @@ def deidentify_with_mask(

# [END dlp_deidentify_masking]

# [START dlp_deidentify_replace]
def deidentify_with_replace(
project,
input_str,
info_types,
replacement_str="REPLACEMENT_STR",
):
"""Uses the Data Loss Prevention API to deidentify sensitive data in a
string by replacing matched input values with a value you specify.
Args:
project: The Google Cloud project id to use as a parent resource.
input_str: The string to deidentify (will be treated as text).
info_types: A list of strings representing info types to look for.
replacement_str: The string to replace all values that match given
info types.
Returns:
None; the response from the API is printed to the terminal.
"""
import google.cloud.dlp

# Instantiate a client
dlp = google.cloud.dlp_v2.DlpServiceClient()

# Convert the project id into a full resource id.
parent = dlp.project_path(project)

# Construct inspect configuration dictionary
inspect_config = {
"info_types": [{"name": info_type} for info_type in info_types]
}

# Construct deidentify configuration dictionary
deidentify_config = {
"info_type_transformations": {
"transformations": [
{
"primitive_transformation": {
"replace_config": {
"new_value": {
"string_value": replacement_str,
ackul marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
}
]
}
}
Comment on lines +118 to +132
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's hard for me to wrap my head around this because of how deeply this is nested.

Could we perhaps use two dictionaries?

transformation = {
    "primitive_transformation": {
        "replace_config": {
            "new_value": {
                "string_value": replacement_str,
            }
        }
    }
}

deidentify_config = {
    "info_type_transformations": {
        "transformations": [transformation]
    }
}

Copy link
Contributor Author

@ackul ackul Jun 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review. I was trying to follow surrounding code for consistency. Do you feel strongly about the nesting?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, I didn't notice the existing function. I'm fine with keeping this as is.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 to breaking this out - I think there is significantly more nesting going on here than in the other function, so it feels necessary.


# Construct item
item = {"value": input_str}

# Call the API
response = dlp.deidentify_content(
parent,
inspect_config=inspect_config,
deidentify_config=deidentify_config,
item=item,
)

# Print out the results.
print(response.item.value)

# [END dlp_deidentify_replace]

# [START dlp_deidentify_fpe]


def deidentify_with_fpe(
project,
string,
input_str,
info_types,
alphabet=None,
surrogate_type=None,
Expand All @@ -98,7 +163,7 @@ def deidentify_with_fpe(
string using Format Preserving Encryption (FPE).
Args:
project: The Google Cloud project id to use as a parent resource.
item: The string to deidentify (will be treated as text).
input_str: The string to deidentify (will be treated as text).
alphabet: The set of characters to replace sensitive ones with. For
more information, see https://cloud.google.com/dlp/docs/reference/
rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
Expand Down Expand Up @@ -166,7 +231,7 @@ def deidentify_with_fpe(
}

# Convert string to item
item = {"value": string}
item = {"value": input_str}

# Call the API
response = dlp.deidentify_content(
Expand All @@ -186,7 +251,7 @@ def deidentify_with_fpe(
# [START dlp_reidentify_fpe]
def reidentify_with_fpe(
project,
string,
input_str,
alphabet=None,
surrogate_type=None,
key_name=None,
Expand All @@ -196,7 +261,7 @@ def reidentify_with_fpe(
string that was encrypted by Format Preserving Encryption (FPE).
Args:
project: The Google Cloud project id to use as a parent resource.
item: The string to deidentify (will be treated as text).
input_str: The string to deidentify (will be treated as text).
alphabet: The set of characters to replace sensitive ones with. For
more information, see https://cloud.google.com/dlp/docs/reference/
rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
Expand Down Expand Up @@ -255,7 +320,7 @@ def reidentify_with_fpe(
}

# Convert string to item
item = {"value": string}
item = {"value": input_str}

# Call the API
response = dlp.reidentify_content(
Expand Down Expand Up @@ -531,6 +596,28 @@ def redact_sensitive_data(project, item, info_types):
help="The character to mask matching sensitive data with.",
)

replace_parser = subparsers.add_parser(
"deid_replace",
help="Deidentify sensitive data in a string by replacing it with "
"another string.",
)
replace_parser.add_argument(
"--info_types",
nargs="+",
help="Strings representing info types to look for. A full list of "
"info categories and types is available from the API. Examples "
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
"If unspecified, the three above examples will be used.",
default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
)
replace_parser.add_argument(
"project",
help="The Google Cloud project id to use as a parent resource.",
)
replace_parser.add_argument("item", help="The string to deidentify.")
replace_parser.add_argument("replacement_str", help="The string to "
"replace all matched values with.")

fpe_parser = subparsers.add_parser(
"deid_fpe",
help="Deidentify sensitive data in a string using Format Preserving "
Expand Down Expand Up @@ -715,6 +802,13 @@ def redact_sensitive_data(project, item, info_types):
masking_character=args.masking_character,
number_to_mask=args.number_to_mask,
)
elif args.content == "deid_replace":
deidentify_with_replace(
args.project,
args.item,
args.info_types,
replacement_str=args.replacement_str,
)
elif args.content == "deid_fpe":
deidentify_with_fpe(
args.project,
Expand Down
10 changes: 10 additions & 0 deletions dlp/deid_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ def test_deidentify_with_mask_masking_number_specified(capsys):
assert "My SSN is *******27" in out


def test_deidentify_with_replace(capsys):
    """Check that deidentify_with_replace prints the replaced string.

    Calls the sample with the US_SOCIAL_SECURITY_NUMBER info type and the
    "REPLACEMENT_STR" replacement value, then asserts that the captured
    stdout contains "My SSN is REPLACEMENT_STR".
    """
    info_types = ["US_SOCIAL_SECURITY_NUMBER"]

    deid.deidentify_with_replace(
        GCLOUD_PROJECT,
        HARMFUL_STRING,
        info_types,
        replacement_str="REPLACEMENT_STR",
    )

    # The sample reports its result by printing, so inspect captured stdout.
    captured = capsys.readouterr()
    assert "My SSN is REPLACEMENT_STR" in captured.out


def test_deidentify_with_fpe(capsys):
deid.deidentify_with_fpe(
GCLOUD_PROJECT,
Expand Down