diff --git a/dlp/README.rst b/dlp/README.rst new file mode 100644 index 000000000000..bcfca5d3e564 --- /dev/null +++ b/dlp/README.rst @@ -0,0 +1,183 @@ +.. This file is automatically generated. Do not edit this file directly. + +Google Data Loss Prevention Python Samples +=============================================================================== + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/README.rst + + +This directory contains samples for Google Data Loss Prevention. `Google Data Loss Prevention`_ provides programmatic access to a powerful detection engine for personally identifiable information and other privacy-sensitive data in unstructured data streams. **This api is currently in beta**. + + + + +.. _Google Data Loss Prevention: https://cloud.google.com/dlp/docs/ + +Setup +------------------------------------------------------------------------------- + + +Authentication +++++++++++++++ + +This sample requires you to have authentication setup. Refer to the +`Authentication Getting Started Guide`_ for instructions on setting up +credentials for applications. + +.. _Authentication Getting Started Guide: + https://cloud.google.com/docs/authentication/getting-started + +Install Dependencies +++++++++++++++++++++ + +#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions. + + .. _Python Development Environment Setup Guide: + https://cloud.google.com/python/setup + +#. Create a virtualenv. Samples are compatible with Python 2.7 and 3.4+. + + .. code-block:: bash + + $ virtualenv env + $ source env/bin/activate + +#. Install the dependencies needed to run the samples. + + .. code-block:: bash + + $ pip install -r requirements.txt + +.. 
_pip: https://pip.pypa.io/ +.. _virtualenv: https://virtualenv.pypa.io/ + +Samples +------------------------------------------------------------------------------- + +Quickstart ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/quickstart.py;dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python quickstart.py + + +Inspect Content ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/inspect_content.py;dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python inspect_content.py + + usage: inspect_content.py [-h] {string,file,gcs} ... + + Sample app that uses the Data Loss Prevention API to inspect a string, a local + file or a file on Google Cloud Storage. + + positional arguments: + {string,file,gcs} Select how to submit content to the API. + string Inspect a string. + file Inspect a local file. + gcs Inspect files on Google Cloud Storage. + + optional arguments: + -h, --help show this help message and exit + + + +Redact Content ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/redact.py;dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python redact.py + + usage: redact.py [-h] {string,image} ... 
+ + Sample app that uses the Data Loss Prevent API to redact the contents of a + string or an image file. + + positional arguments: + {string,image} Select how to submit content to the API. + string Inspect a string. + image Inspect an image file. + + optional arguments: + -h, --help show this help message and exit + + + +Display Metadata ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/metadata.py;dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python metadata.py + + usage: metadata.py [-h] {categories,info_types} ... + + Sample app that queries the Data Loss Prevention API for supported categories + and info types. + + positional arguments: + {categories,info_types} + Select which type of metadata to view. + categories Fetch the list of info type categories. + info_types Fetch the list of info types in a specified category. + + optional arguments: + -h, --help show this help message and exit + + + + + +The client library +------------------------------------------------------------------------------- + +This sample uses the `Google Cloud Client Library for Python`_. +You can read the documentation for more details on API usage and use GitHub +to `browse the source`_ and `report issues`_. + +.. _Google Cloud Client Library for Python: + https://googlecloudplatform.github.io/google-cloud-python/ +.. _browse the source: + https://github.com/GoogleCloudPlatform/google-cloud-python +.. _report issues: + https://github.com/GoogleCloudPlatform/google-cloud-python/issues + + +.. 
_Google Cloud SDK: https://cloud.google.com/sdk/ \ No newline at end of file diff --git a/dlp/README.rst.in b/dlp/README.rst.in new file mode 100644 index 000000000000..57c73a743338 --- /dev/null +++ b/dlp/README.rst.in @@ -0,0 +1,32 @@ +# This file is used to generate README.rst + +product: + name: Google Data Loss Prevention + short_name: Data Loss Prevention + url: https://cloud.google.com/dlp/docs/ + description: > + `Google Data Loss Prevention`_ provides programmatic access to a powerful + detection engine for personally identifiable information and other + privacy-sensitive data in unstructured data streams. + **This api is currently in beta**. + +setup: +- auth +- install_deps + +samples: +- name: Quickstart + file: quickstart.py +- name: Inspect Content + file: inspect_content.py + show_help: true +- name: Redact Content + file: redact.py + show_help: true +- name: Display Metadata + file: metadata.py + show_help: true + +cloud_client_library: true + +folder: dlp \ No newline at end of file diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py new file mode 100644 index 000000000000..ae80fc33883b --- /dev/null +++ b/dlp/inspect_content.py @@ -0,0 +1,317 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Sample app that uses the Data Loss Prevention API to inspect a string, a +local file or a file on Google Cloud Storage.""" + +from __future__ import print_function + +import argparse + + +# [START inspect_string] +def inspect_string(item, info_types=None, min_likelihood=None, + max_findings=None, include_quote=True): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + item: The string to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. If + info_types is omitted, the API will use a limited default set. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if info_types is not None: + info_types = [{'name': info_type} for info_type in info_types] + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + 'info_types': info_types, + 'min_likelihood': min_likelihood, + 'max_findings': max_findings, + 'include_quote': include_quote, + } + + # Construct the items list (in this case, only one item, in string form). + items = [{'type': 'text/plain', 'value': item}] + + # Call the API. + response = dlp.inspect_content(inspect_config, items) + + # Print out the results. 
+ if response.results[0].findings: + for finding in response.results[0].findings: + try: + print('Quote: {}'.format(finding.quote)) + except AttributeError: + pass + print('Info type: {}'.format(finding.info_type.name)) + print('Likelihood: {}'.format(finding.likelihood)) + else: + print('No findings.') +# [END inspect_string] + + +# [START inspect_file] +def inspect_file(filename, info_types=None, min_likelihood=None, + max_findings=None, include_quote=True, mime_type=None): + """Uses the Data Loss Prevention API to analyze a file for protected data. + Args: + filename: The path to the file to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. If + info_types is omitted, the API will use a limited default set. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + mime_type: The MIME type of the file. If not specified, the type is + inferred via the Python standard library's mimetypes module. + Returns: + None; the response from the API is printed to the terminal. + """ + + import mimetypes + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if info_types is not None: + info_types = [{'name': info_type} for info_type in info_types] + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. 
+ inspect_config = { + 'info_types': info_types, + 'min_likelihood': min_likelihood, + 'max_findings': max_findings, + 'include_quote': include_quote, + } + + # If mime_type is not specified, guess it from the filename. + if mime_type is None: + mime_guess = mimetypes.MimeTypes().guess_type(filename) + mime_type = mime_guess[0] or 'application/octet-stream' + + # Construct the items list (in this case, only one item, containing the + # file's byte data). + with open(filename, mode='rb') as f: + items = [{'type': mime_type, 'data': f.read()}] + + # Call the API. + response = dlp.inspect_content(inspect_config, items) + + # Print out the results. + if response.results[0].findings: + for finding in response.results[0].findings: + try: + print('Quote: {}'.format(finding.quote)) + except AttributeError: + pass + print('Info type: {}'.format(finding.info_type.name)) + print('Likelihood: {}'.format(finding.likelihood)) + else: + print('No findings.') +# [END inspect_file] + + +# [START inspect_gcs_file] +def inspect_gcs_file(bucket, filename, info_types=None, min_likelihood=None, + max_findings=None): + """Uses the Data Loss Prevention API to analyze a file on GCS. + Args: + bucket: The name of the GCS bucket containing the file, as a string. + filename: The name of the file in the bucket, including the path, as a + string; e.g. 'images/myfile.png'. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. If + info_types is omitted, the API will use a limited default set. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + Returns: + None; the response from the API is printed to the terminal. 
+ """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if info_types is not None: + info_types = [{'name': info_type} for info_type in info_types] + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + 'info_types': info_types, + 'min_likelihood': min_likelihood, + 'max_findings': max_findings, + } + + # Construct a cloud_storage_options dictionary with the file's URL. + url = 'gs://{}/{}'.format(bucket, filename) + storage_config = { + 'cloud_storage_options': { + 'file_set': {'url': url} + } + } + + operation = dlp.create_inspect_operation(inspect_config, storage_config, + None) + + # Get the operation result name, which can be used to look up the full + # results. This call blocks until the operation is complete; to avoid + # blocking, use operation.add_done_callback(fn) instead. + operation_result = operation.result() + + response = dlp.list_inspect_findings(operation_result.name) + + if response.result.findings: + for finding in response.result.findings: + print('Info type: {}'.format(finding.info_type.name)) + print('Likelihood: {}'.format(finding.likelihood)) + else: + print('No findings.') +# [END inspect_gcs_file] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest='content', help='Select how to submit content to the API.') + + parser_string = subparsers.add_parser('string', help='Inspect a string.') + parser_string.add_argument('item', help='The string to inspect.') + parser_string.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. 
Examples ' + 'include "US_MALE_NAME", "US_FEMALE_NAME", "EMAIL_ADDRESS", ' + '"CANADA_SOCIAL_INSURANCE_NUMBER", "JAPAN_PASSPORT". If omitted, ' + 'the API will use a limited default set. Specify this flag ' + 'multiple times to specify multiple info types.') + parser_string.add_argument( + '--min_likelihood', + choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', + 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], + help='A string representing the minimum likelihood threshold that ' + 'constitutes a match.') + parser_string.add_argument( + '--max_findings', type=int, + help='The maximum number of findings to report; 0 = no maximum.') + parser_string.add_argument( + '--include_quote', type=lambda s: s.lower() in ('true', '1', 'yes'), + help='A boolean for whether to display a quote of the detected ' + 'information in the results.') + + parser_file = subparsers.add_parser('file', help='Inspect a local file.') + parser_file.add_argument( + 'filename', help='The path to the file to inspect.') + parser_file.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "US_MALE_NAME", "US_FEMALE_NAME", "EMAIL_ADDRESS", ' + '"CANADA_SOCIAL_INSURANCE_NUMBER", "JAPAN_PASSPORT". If omitted, ' + 'the API will use a limited default set. 
Specify this flag ' + 'multiple times to specify multiple info types.') + parser_file.add_argument( + '--min_likelihood', + choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', + 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], + help='A string representing the minimum likelihood threshold that ' + 'constitutes a match.') + parser_file.add_argument( + '--max_findings', type=int, + help='The maximum number of findings to report; 0 = no maximum.') + parser_file.add_argument( + '--include_quote', type=lambda s: s.lower() in ('true', '1', 'yes'), + help='A boolean for whether to display a quote of the detected ' + 'information in the results.') + parser_file.add_argument( + '--mime_type', + help='The MIME type of the file. If not specified, the type is ' + 'inferred via the Python standard library\'s mimetypes module.') + + parser_gcs = subparsers.add_parser( + 'gcs', help='Inspect files on Google Cloud Storage.') + parser_gcs.add_argument( + 'bucket', help='The name of the GCS bucket containing the file.') + parser_gcs.add_argument( + 'filename', + help='The name of the file in the bucket, including the path, e.g. ' + '"images/myfile.png". Wildcards are permitted.') + parser_gcs.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "US_MALE_NAME", "US_FEMALE_NAME", "EMAIL_ADDRESS", ' + '"CANADA_SOCIAL_INSURANCE_NUMBER", "JAPAN_PASSPORT". If omitted, ' + 'the API will use a limited default set. 
Specify this flag ' + 'multiple times to specify multiple info types.') + parser_gcs.add_argument( + '--min_likelihood', + choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', + 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], + help='A string representing the minimum likelihood threshold that ' + 'constitutes a match.') + parser_gcs.add_argument( + '--max_findings', type=int, + help='The maximum number of findings to report; 0 = no maximum.') + + args = parser.parse_args() + + if args.content == 'string': + inspect_string( + args.item, info_types=args.info_types, + min_likelihood=args.min_likelihood, max_findings=args.max_findings, + include_quote=args.include_quote) + elif args.content == 'file': + inspect_file( + args.filename, info_types=args.info_types, + min_likelihood=args.min_likelihood, max_findings=args.max_findings, + include_quote=args.include_quote, + mime_type=args.mime_type) + elif args.content == 'gcs': + inspect_gcs_file( + args.bucket, args.filename, info_types=args.info_types, + min_likelihood=args.min_likelihood, max_findings=args.max_findings) diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py new file mode 100644 index 000000000000..e6de4245f75d --- /dev/null +++ b/dlp/inspect_content_test.py @@ -0,0 +1,167 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import google.cloud.exceptions +import google.cloud.storage + +import pytest + +import inspect_content + + +GCLOUD_PROJECT = os.getenv('GCLOUD_PROJECT') +TEST_BUCKET_NAME = GCLOUD_PROJECT + '-dlp-python-client-test' +RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), 'resources') +RESOURCE_FILE_NAMES = ['test.txt', 'test.png', 'harmless.txt', 'accounts.txt'] + + +@pytest.fixture(scope='module') +def bucket(request): + # Creates a GCS bucket, uploads files required for the test, and tears down + # the entire bucket afterwards. + + client = google.cloud.storage.Client() + try: + bucket = client.get_bucket(TEST_BUCKET_NAME) + except google.cloud.exceptions.NotFound: + bucket = client.create_bucket(TEST_BUCKET_NAME) + + # Upload the blobs and keep track of them in a list. + blobs = [] + for name in RESOURCE_FILE_NAMES: + path = os.path.join(RESOURCE_DIRECTORY, name) + blob = bucket.blob(name) + blob.upload_from_filename(path) + blobs.append(blob) + + # Yield the object to the test; lines after this execute as a teardown. + yield bucket + + # Delete the files. + for blob in blobs: + blob.delete() + + # Attempt to delete the bucket; this will only work if it is empty. 
+ bucket.delete() + + +def test_inspect_string(capsys): + test_string = 'I am Gary and my email is gary@example.com' + + inspect_content.inspect_string( + test_string, include_quote=True) + + out, _ = capsys.readouterr() + assert 'Info type: EMAIL_ADDRESS' in out + + +def test_inspect_string_with_info_types(capsys): + test_string = 'I am Gary and my email is gary@example.com' + + inspect_content.inspect_string( + test_string, info_types=['US_MALE_NAME'], include_quote=True) + + out, _ = capsys.readouterr() + assert 'Info type: US_MALE_NAME' in out + assert 'Info type: EMAIL_ADDRESS' not in out + + +def test_inspect_string_no_results(capsys): + test_string = 'Nothing to see here' + + inspect_content.inspect_string( + test_string, include_quote=True) + + out, _ = capsys.readouterr() + assert 'No findings' in out + + +def test_inspect_file(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.txt') + + inspect_content.inspect_file( + test_filepath, include_quote=True) + + out, _ = capsys.readouterr() + assert 'Info type: EMAIL_ADDRESS' in out + + +def test_inspect_file_with_info_types(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.txt') + + inspect_content.inspect_file( + test_filepath, ['PHONE_NUMBER'], include_quote=True) + + out, _ = capsys.readouterr() + assert 'Info type: PHONE_NUMBER' in out + assert 'Info type: EMAIL_ADDRESS' not in out + + +def test_inspect_file_no_results(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, 'harmless.txt') + + inspect_content.inspect_file( + test_filepath, include_quote=True) + + out, _ = capsys.readouterr() + assert 'No findings' in out + + +def test_inspect_image_file(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.png') + + inspect_content.inspect_file( + test_filepath, include_quote=True) + + out, _ = capsys.readouterr() + assert 'Info type: PHONE_NUMBER' in out + + +def test_inspect_gcs_file(bucket, capsys): + inspect_content.inspect_gcs_file(bucket.name, 
'test.txt') + + out, _ = capsys.readouterr() + assert 'Info type: EMAIL_ADDRESS' in out + + +def test_inspect_gcs_file_with_info_types(bucket, capsys): + inspect_content.inspect_gcs_file( + bucket.name, 'test.txt', info_types=['EMAIL_ADDRESS']) + + out, _ = capsys.readouterr() + assert 'Info type: EMAIL_ADDRESS' in out + + +def test_inspect_gcs_file_no_results(bucket, capsys): + inspect_content.inspect_gcs_file(bucket.name, 'harmless.txt') + + out, _ = capsys.readouterr() + assert 'No findings' in out + + +def test_inspect_gcs_image_file(bucket, capsys): + inspect_content.inspect_gcs_file(bucket.name, 'test.png') + + out, _ = capsys.readouterr() + assert 'Info type: EMAIL_ADDRESS' in out + + +def test_inspect_gcs_multiple_files(bucket, capsys): + inspect_content.inspect_gcs_file(bucket.name, '*') + + out, _ = capsys.readouterr() + assert 'Info type: PHONE_NUMBER' in out + assert 'Info type: CREDIT_CARD' in out diff --git a/dlp/metadata.py b/dlp/metadata.py new file mode 100644 index 000000000000..fbe88ec6b839 --- /dev/null +++ b/dlp/metadata.py @@ -0,0 +1,99 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that queries the Data Loss Prevention API for supported +categories and info types.""" + +from __future__ import print_function + +import argparse + + +# [START list_info_types] +def list_info_types(category, language_code='en-US'): + """List types of sensitive information within a category. 
+ Args: + category: The category of info types to list; e.g. 'PII'. + language_code: The BCP-47 language code to use, e.g. 'en-US'. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp.DlpServiceClient() + + # Make the API call. + response = dlp.list_info_types(category, language_code) + + # Print the results to the console. + print('Info types in {category}:'.format(category=category)) + for info_type in response.info_types: + print('{name}: {display_name}'.format( + name=info_type.name, display_name=info_type.display_name)) +# [END list_info_types] + + +# [START list_categories] +def list_categories(language_code='en-US'): + """List root categories of sensitive information. + Args: + language_code: The BCP-47 language code to use, e.g. 'en-US'. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp.DlpServiceClient() + + # Make the API call. + response = dlp.list_root_categories(language_code) + + # Print the results to the console. + print('Categories:') + for category in response.categories: + print('{name}: {display_name}'.format( + name=category.name, display_name=category.display_name)) +# [END list_categories] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest='metadata', help='Select which type of metadata to view.') + + parser_categories = subparsers.add_parser( + 'categories', help='Fetch the list of info type categories.') + parser_categories.add_argument( + '--language_code', + help='The BCP-47 language code to use, e.g. 
\'en-US\'.') + + parser_info_types = subparsers.add_parser( + 'info_types', + help='Fetch the list of info types in a specified category.') + parser_info_types.add_argument( + 'category', help='The category of info types to list; e.g. \'PII\'.') + parser_info_types.add_argument( + '--language_code', + help='The BCP-47 language code to use, e.g. \'en-US\'.') + + args = parser.parse_args() + + if args.metadata == 'categories': + list_categories(language_code=args.language_code) + elif args.metadata == 'info_types': + list_info_types(args.category, language_code=args.language_code) diff --git a/dlp/metadata_test.py b/dlp/metadata_test.py new file mode 100644 index 000000000000..816b6f6e4281 --- /dev/null +++ b/dlp/metadata_test.py @@ -0,0 +1,29 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import metadata + + +def test_fetch_categories(capsys): + metadata.list_categories() + + out, _ = capsys.readouterr() + assert 'PII' in out + + +def test_fetch_info_types(capsys): + metadata.list_info_types('PII') + + out, _ = capsys.readouterr() + assert 'EMAIL_ADDRESS' in out diff --git a/dlp/quickstart.py b/dlp/quickstart.py new file mode 100644 index 000000000000..40d731433899 --- /dev/null +++ b/dlp/quickstart.py @@ -0,0 +1,76 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that uses the Data Loss Prevention API to inspect a string for +protected data.""" + +from __future__ import print_function + + +def quickstart(): + """Demonstrates use of the Data Loss Prevention API client library.""" + + # [START quickstart] + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp.DlpServiceClient() + + # The string to inspect + content = 'Robert Frost' + + # Construct the list of content items to inspect; in this case, only one. + items = [{'type': 'text/plain', 'value': content}] + + # The info types to search for in the content. + info_types = [{'name': 'US_MALE_NAME'}, {'name': 'US_FEMALE_NAME'}] + + # The minimum likelihood to constitute a match. Optional. + min_likelihood = 'LIKELIHOOD_UNSPECIFIED' + + # The maximum number of findings to report (0 = server maximum). Optional. + max_findings = 0 + + # Whether to include the matching string in the results. Optional. + include_quote = True + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + 'info_types': info_types, + 'min_likelihood': min_likelihood, + 'max_findings': max_findings, + 'include_quote': include_quote, + } + + # Call the API. + response = dlp.inspect_content(inspect_config, items) + + # Print out the results. 
+ if response.results[0].findings: + for finding in response.results[0].findings: + try: + print('Quote: {}'.format(finding.quote)) + except AttributeError: + pass + print('Info type: {}'.format(finding.info_type.name)) + print('Likelihood: {}'.format(finding.likelihood)) + else: + print('No findings.') + # [END quickstart] + + +if __name__ == '__main__': + quickstart() diff --git a/dlp/quickstart_test.py b/dlp/quickstart_test.py new file mode 100644 index 000000000000..5b8faf88099d --- /dev/null +++ b/dlp/quickstart_test.py @@ -0,0 +1,22 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import quickstart + + +def test_quickstart(capsys): + quickstart.quickstart() + + out, _ = capsys.readouterr() + assert 'US_MALE_NAME' in out diff --git a/dlp/redact.py b/dlp/redact.py new file mode 100644 index 000000000000..8666d761c78f --- /dev/null +++ b/dlp/redact.py @@ -0,0 +1,214 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
"""Sample app that uses the Data Loss Prevention API to redact the contents of
a string or an image file."""

from __future__ import print_function

import argparse
import mimetypes


# [START redact_string]
def redact_string(item, replace_string, info_types=None, min_likelihood=None):
    """Uses the Data Loss Prevention API to redact protected data in a string.
    Args:
        item: The string to inspect.
        replace_string: The string to use to replace protected data; for
            instance, '***' or 'REDACTED'. An empty string is permitted.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API. If
            info_types is omitted, the API will use a limited default set.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
    Returns:
        None; the redacted string returned by the API is printed to the
        terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    if info_types is not None:
        info_types = [{'name': info_type} for info_type in info_types]

    # Prepare replace_configs, a list of dictionaries. Each dictionary contains
    # an info_type and the string to which that info_type will be redacted upon
    # detection. This sample uses the same "replace_string" for all info types,
    # though the API supports using different ones for each type.
    if info_types is not None:
        replace_configs = [
            {'info_type': info_type, 'replace_with': replace_string}
            for info_type in info_types]
    else:
        # If no info_type is specified, prepare a single dictionary with only a
        # replace_string as a catch-all.
        replace_configs = [{'replace_with': replace_string}]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    redact_config = {
        'info_types': info_types,
        'min_likelihood': min_likelihood,
    }

    # Construct the items list (in this case, only one item, in string form).
    items = [{'type': 'text/plain', 'value': item}]

    # Call the API.
    response = dlp.redact_content(redact_config, items, replace_configs)

    # Print out the results.
    print(response.items[0].value)
# [END redact_string]


# [START redact_image]
def redact_image(filename, output_filename,
                 info_types=None, min_likelihood=None, mime_type=None):
    """Uses the Data Loss Prevention API to redact protected data in an image.
    Args:
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API. If
            info_types is omitted, the API will use a limited default set.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.
    Returns:
        None; the redacted image is written to output_filename and a one-line
        summary (byte count and path) is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted). The info_types are not submitted
    # directly in this example, but are used in the construction of
    # image_redaction_configs.
    if info_types is not None:
        info_types = [{'name': info_type} for info_type in info_types]

    # Prepare image_redaction_configs, a list of dictionaries. Each dictionary
    # contains an info_type and optionally the color used for the replacement.
    # The color is omitted in this sample, so the default (black) will be used.
    # The list stays empty when no info types were requested.
    image_redaction_configs = []
    if info_types is not None:
        image_redaction_configs = [
            {'info_type': info_type} for info_type in info_types]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    redact_config = {
        'min_likelihood': min_likelihood,
    }

    # If mime_type is not specified, guess it from the filename, falling back
    # to a generic binary type when the extension is not recognised.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0] or 'application/octet-stream'

    # Construct the items list (in this case, only one item, containing the
    # image file's byte data).
    with open(filename, mode='rb') as f:
        items = [{'type': mime_type, 'data': f.read()}]

    # Call the API. No text replace_configs apply to an image, hence None.
    response = dlp.redact_content(
        redact_config, items, None,
        image_redaction_configs=image_redaction_configs)

    # Write out the results.
    redacted_bytes = response.items[0].data
    with open(output_filename, mode='wb') as f:
        f.write(redacted_bytes)
    print("Wrote {byte_count} bytes to {filename}".format(
        byte_count=len(redacted_bytes), filename=output_filename))
# [END redact_image]


def _add_detection_arguments(subparser):
    """Adds the flags shared by every sub-command to the given parser."""
    subparser.add_argument(
        '--info_types', action='append',
        help='Strings representing info types to look for. A full list of '
             'info categories and types is available from the API. Examples '
             'include "US_MALE_NAME", "US_FEMALE_NAME", "EMAIL_ADDRESS", '
             '"CANADA_SOCIAL_INSURANCE_NUMBER", "JAPAN_PASSPORT". If omitted, '
             'the API will use a limited default set. Specify this flag '
             'multiple times to specify multiple info types.')
    subparser.add_argument(
        '--min_likelihood',
        choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
                 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'],
        help='A string representing the minimum likelihood threshold that '
             'constitutes a match.')


def build_parser():
    """Builds the command-line parser for this sample.

    Returns:
        An argparse.ArgumentParser with a 'string' and an 'image' sub-command;
        the chosen sub-command is stored in the 'content' attribute of the
        parsed arguments.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    subparsers = parser.add_subparsers(
        dest='content', help='Select how to submit content to the API.')
    # Python 3 treats sub-commands as optional by default, which would make a
    # bare invocation exit silently without doing anything. Setting the
    # attribute (rather than passing a keyword) also works on Python 2.7.
    subparsers.required = True

    parser_string = subparsers.add_parser('string', help='Redact a string.')
    parser_string.add_argument('item', help='The string to inspect.')
    parser_string.add_argument(
        'replace_string',
        help='The string to use to replace protected data; for instance, '
             '"***" or "REDACTED".')
    _add_detection_arguments(parser_string)

    parser_file = subparsers.add_parser(
        'image', help='Redact an image file.')
    parser_file.add_argument(
        'filename', help='The path to the file to inspect.')
    parser_file.add_argument(
        'output_filename',
        help='The path to which the redacted image will be written.')
    _add_detection_arguments(parser_file)
    parser_file.add_argument(
        '--mime_type',
        help='The MIME type of the file. If not specified, the type is '
             'inferred via the Python standard library\'s mimetypes module.')

    return parser


if __name__ == '__main__':
    args = build_parser().parse_args()

    if args.content == 'string':
        redact_string(
            args.item, args.replace_string, info_types=args.info_types,
            min_likelihood=args.min_likelihood)
    elif args.content == 'image':
        redact_image(
            args.filename, args.output_filename, info_types=args.info_types,
            min_likelihood=args.min_likelihood, mime_type=args.mime_type)
import os
import shutil
import tempfile

import pytest

import redact

RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), 'resources')


@pytest.fixture(scope='module')
def tempdir():
    """Yields a scratch directory shared by the module; removed afterwards."""
    scratch = tempfile.mkdtemp()
    yield scratch
    shutil.rmtree(scratch)


def test_redact_string(capsys):
    """Redacting with the default info types prints the replacement text."""
    redact.redact_string(
        'I am Gary and my email is gary@example.com', 'REDACTED')

    printed = capsys.readouterr()[0]
    assert 'REDACTED' in printed


def test_redact_string_with_info_types(capsys):
    """Only the requested info type (the phone number) is replaced."""
    text = 'My email is gary@example.com and my number is 206-555-5555'

    redact.redact_string(text, 'REDACTED', info_types=['PHONE_NUMBER'])

    printed = capsys.readouterr()[0]
    assert 'REDACTED' in printed
    assert printed.count('REDACTED') == 1


def test_redact_string_no_findings(capsys):
    """Text without protected data is printed without any replacement."""
    redact.redact_string('Nothing to see here', 'REDACTED')

    printed = capsys.readouterr()[0]
    assert 'REDACTED' not in printed


def test_redact_image_file(tempdir, capsys):
    """Redacting an image reports the path of the written output file."""
    source_path = os.path.join(RESOURCE_DIRECTORY, 'test.png')
    target_path = os.path.join(tempdir, 'redacted.png')

    redact.redact_image(source_path, target_path)

    printed = capsys.readouterr()[0]
    assert target_path in printed


def test_redact_image_file_with_infotype(tempdir, capsys):
    """Redacting an image with explicit info types reports the output path."""
    source_path = os.path.join(RESOURCE_DIRECTORY, 'test.png')
    target_path = os.path.join(tempdir, 'redacted_with_infotype.png')

    redact.redact_image(
        source_path, target_path,
        info_types=['EMAIL_ADDRESS', 'US_MALE_NAME'])

    printed = capsys.readouterr()[0]
    assert target_path in printed
000000000000..2763cd0ab820 --- /dev/null +++ b/dlp/resources/accounts.txt @@ -0,0 +1 @@ +My credit card number is 1234 5678 9012 3456, and my CVV is 789. \ No newline at end of file diff --git a/dlp/resources/harmless.txt b/dlp/resources/harmless.txt new file mode 100644 index 000000000000..5666de37ab23 --- /dev/null +++ b/dlp/resources/harmless.txt @@ -0,0 +1 @@ +This file is mostly harmless. diff --git a/dlp/resources/test.png b/dlp/resources/test.png new file mode 100644 index 000000000000..8f32c8258842 Binary files /dev/null and b/dlp/resources/test.png differ diff --git a/dlp/resources/test.txt b/dlp/resources/test.txt new file mode 100644 index 000000000000..c2ee3815bc9b --- /dev/null +++ b/dlp/resources/test.txt @@ -0,0 +1 @@ +My phone number is (223) 456-7890 and my email address is gary@somedomain.com. \ No newline at end of file