Skip to content

Commit

Permalink
Add text redaction sample using DLP
Browse files Browse the repository at this point in the history
  • Loading branch information
sethmoo committed Jun 5, 2020
1 parent a4277b1 commit b63f932
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 4 deletions.
9 changes: 5 additions & 4 deletions dlp/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -339,13 +339,12 @@ To run this sample:
.. code-block:: bash
$ python deid.py
usage: deid.py [-h] {deid_mask,deid_fpe,reid_fpe,deid_date_shift} ...
usage: deid.py [-h] {deid_mask,deid_fpe,reid_fpe,deid_date_shift,redact} ...
Uses of the Data Loss Prevention API for deidentifying sensitive data.
positional arguments:
{deid_mask,deid_fpe,reid_fpe,deid_date_shift}
{deid_mask,deid_fpe,reid_fpe,deid_date_shift,redact}
Select how to submit content to the API.
deid_mask Deidentify sensitive data in a string by masking it
with a character.
Expand All @@ -355,6 +354,8 @@ To run this sample:
Preserving Encryption (FPE).
deid_date_shift Deidentify dates in a CSV file by pseudorandomly
shifting them.
redact Redact sensitive data in a string by replacing it with
the info type of the data.
optional arguments:
-h, --help show this help message and exit
Expand All @@ -378,4 +379,4 @@ to `browse the source`_ and `report issues`_.
https://github.com/GoogleCloudPlatform/google-cloud-python/issues
.. _Google Cloud SDK: https://cloud.google.com/sdk/
.. _Google Cloud SDK: https://cloud.google.com/sdk/
86 changes: 86 additions & 0 deletions dlp/deid.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,62 @@ def write_data(data):
# [END dlp_deidentify_date_shift]


# [START dlp_redact_sensitive_data]
def redact_sensitive_data(project, string, info_types):
    """Redact sensitive data in a string with the Data Loss Prevention API.

    Every finding that matches one of the requested info types is replaced
    with the name of that info type.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        string: The string to redact (will be treated as text).
        info_types: A list of strings representing the info types to look
            for, e.g. "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS".
    Returns:
        None; the redacted text from the API response is printed to the
        terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client and expand the project id into a full resource id.
    client = google.cloud.dlp_v2.DlpServiceClient()
    parent_path = client.project_path(project)

    # Inspect configuration: which info types the API should look for.
    requested_types = [{"name": type_name} for type_name in info_types]
    inspection_settings = {"info_types": requested_types}

    # Deidentify configuration: a single transformation that swaps each
    # finding for its info type. The empty dict selects that primitive
    # transformation without passing it any options.
    replace_with_info_type = {"replace_with_info_type_config": {}}
    deidentify_settings = {
        "info_type_transformations": {
            "transformations": [
                {"primitive_transformation": replace_with_info_type}
            ]
        }
    }

    # Call the API with the text wrapped as a content item.
    result = client.deidentify_content(
        parent_path,
        inspect_config=inspection_settings,
        deidentify_config=deidentify_settings,
        item={"value": string},
    )

    # Print out the results.
    print(result.item.value)


# [END dlp_redact_sensitive_data]


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
subparsers = parser.add_subparsers(
Expand Down Expand Up @@ -626,6 +682,30 @@ def write_data(data):
"key_name.",
)

# Sub-command "redact": replace sensitive data in a string with the name of
# its info type.
redact_parser = subparsers.add_parser(
    "redact",
    help="Redact sensitive data in a string by replacing it with the "
    "info type of the data.",
)
# NOTE: with action="append" and a non-empty default, argparse appends values
# given on the command line to the default list instead of replacing it, so
# the three defaults stay in effect even when --info_types is passed.
redact_parser.add_argument(
    "--info_types",
    action="append",
    help="Strings representing info types to look for. A full list of "
    "info categories and types is available from the API. Examples "
    'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
    "If unspecified, the three above examples will be used.",
    default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
)
redact_parser.add_argument(
    "project",
    help="The Google Cloud project id to use as a parent resource.",
)
redact_parser.add_argument(
    "item",
    # The trailing space keeps the two adjacent literals from running
    # together as "redact.Example:" in the rendered help output.
    help="The string to redact. "
    "Example: 'My credit card is 4242 4242 4242 4242'",
)

args = parser.parse_args()

if args.content == "deid_mask":
Expand Down Expand Up @@ -667,3 +747,9 @@ def write_data(data):
wrapped_key=args.wrapped_key,
key_name=args.key_name,
)
elif args.content == "redact":
redact_sensitive_data(
args.project,
string=args.item,
info_types=args.info_types,
)
14 changes: 14 additions & 0 deletions dlp/deid_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,17 @@ def test_reidentify_with_fpe(capsys):
out, _ = capsys.readouterr()

assert "731997681" not in out


def test_redact_sensitive_data(capsys):
    """Check that a URL in the input is replaced by its info type name."""
    sensitive_url = "https://cloud.google.com"
    text = "My favorite site is " + sensitive_url

    deid.redact_sensitive_data(GCLOUD_PROJECT, text, ["URL"])

    # The sample prints the redacted text, so inspect captured stdout.
    captured = capsys.readouterr()
    printed = captured.out

    assert sensitive_url not in printed
    assert "My favorite site is [URL]" in printed

0 comments on commit b63f932

Please sign in to comment.