diff --git a/docs/index.rst b/docs/index.rst index 5cee1dbb4f63..03d7a86007b2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -157,6 +157,7 @@ language-usage Client language-document + language-responses .. toctree:: :maxdepth: 0 diff --git a/docs/language-responses.rst b/docs/language-responses.rst new file mode 100644 index 000000000000..ee6b30e9f3b3 --- /dev/null +++ b/docs/language-responses.rst @@ -0,0 +1,9 @@ +Natural Language Response Classes +================================= + +Entity +~~~~~~ + +.. automodule:: gcloud.language.entity + :members: + :show-inheritance: diff --git a/docs/language-usage.rst b/docs/language-usage.rst index 62ecff14fc01..c61076d6df2c 100644 --- a/docs/language-usage.rst +++ b/docs/language-usage.rst @@ -171,25 +171,29 @@ metadata and other properties. >>> entities = document.analyze_entities() >>> for entity in entities: ... print('=' * 20) - ... print(' name: %s' % (entity.name,)) - ... print(' type: %s' % (entity.entity_type,)) - ... print('metadata: %s' % (entity.metadata,)) - ... print('salience: %s' % (entity.salience,)) + ... print(' name: %s' % (entity.name,)) + ... print(' type: %s' % (entity.entity_type,)) + ... print('wikipedia_url: %s' % (entity.wikipedia_url,)) + ... print(' metadata: %s' % (entity.metadata,)) + ... 
print(' salience: %s' % (entity.salience,)) ==================== - name: Michelangelo Caravaggio - type: PERSON - metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/Caravaggio'} - salience: 0.75942981 + name: Michelangelo Caravaggio + type: PERSON + wikipedia_url: http://en.wikipedia.org/wiki/Caravaggio + metadata: {} + salience: 0.7615959 ==================== - name: Italian - type: LOCATION - metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/Italy'} - salience: 0.20193423 + name: Italian + type: LOCATION + wikipedia_url: http://en.wikipedia.org/wiki/Italy + metadata: {} + salience: 0.19960518 ==================== - name: The Calling of Saint Matthew - type: WORK_OF_ART - metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/index.html?curid=2838808'} - salience: 0.03863598 + name: The Calling of Saint Matthew + type: EVENT + wikipedia_url: http://en.wikipedia.org/wiki/The_Calling_of_St_Matthew_(Caravaggio) + metadata: {} + salience: 0.038798928 Analyze Sentiment ----------------- @@ -266,14 +270,16 @@ the response is :data:`None`. >>> # Entities present if include_entities=True >>> for entity in annotations.entities: ... print('=' * 20) - ... print(' name: %s' % (entity.name,)) - ... print(' type: %s' % (entity.entity_type,)) - ... print('metadata: %s' % (entity.metadata,)) - ... print('salience: %s' % (entity.salience,)) + ... print(' name: %s' % (entity.name,)) + ... print(' type: %s' % (entity.entity_type,)) + ... print('wikipedia_url: %s' % (entity.wikipedia_url,)) + ... print(' metadata: %s' % (entity.metadata,)) + ... print(' salience: %s' % (entity.salience,)) ==================== - name: Moon - type: LOCATION - metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/Natural_satellite'} - salience: 0.11793101 + name: Moon + type: LOCATION + wikipedia_url: http://en.wikipedia.org/wiki/Natural_satellite + metadata: {} + salience: 0.11793101 .. 
_Features: https://cloud.google.com/natural-language/reference/rest/v1beta1/documents/annotateText#Features diff --git a/gcloud/language/document.py b/gcloud/language/document.py index f25c85a4f4c7..11b9db5e242a 100644 --- a/gcloud/language/document.py +++ b/gcloud/language/document.py @@ -17,6 +17,8 @@ A document is used to hold text to be analyzed and annotated. """ +from gcloud.language.entity import Entity + DEFAULT_LANGUAGE = 'en' """Default document language, English.""" @@ -101,3 +103,44 @@ def __init__(self, client, content=None, gcs_url=None, doc_type=PLAIN_TEXT, self.doc_type = doc_type self.language = language self.encoding = encoding + + def _to_dict(self): + """Helper to convert the current document into a dictionary. + + To be used when constructing requests. + + :rtype: dict + :returns: The Document value as a JSON dictionary. + """ + info = { + 'type': self.doc_type, + 'language': self.language, + } + if self.content is not None: + info['content'] = self.content + elif self.gcs_url is not None: + info['gcsContentUri'] = self.gcs_url + return info + + def analyze_entities(self): + """Analyze the entities in the current document. + + Finds named entities (currently finds proper names as of August 2016) + in the text, entity types, salience, mentions for each entity, and + other properties. + + See: + https://cloud.google.com/natural-language/reference/\ + rest/v1beta1/documents/analyzeEntities + + :rtype: list + :returns: A list of :class:`~.language.entity.Entity` from the API. + """ + data = { + 'document': self._to_dict(), + 'encodingType': self.encoding, + } + api_response = self.client.connection.api_request( + method='POST', path='analyzeEntities', data=data) + return [Entity.from_api_repr(entity) + for entity in api_response['entities']] diff --git a/gcloud/language/entity.py b/gcloud/language/entity.py new file mode 100644 index 000000000000..0b1c26f92da4 --- /dev/null +++ b/gcloud/language/entity.py @@ -0,0 +1,106 @@ +# Copyright 2016 Google Inc. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Definition for Google Cloud Natural Language API entities. + +An entity is used to describe a proper name extracted from text. +""" + + +class EntityType(object): + """List of possible entity types.""" + + UNKNOWN = 'UNKNOWN' + """Unknown entity type.""" + + PERSON = 'PERSON' + """Person entity type.""" + + LOCATION = 'LOCATION' + """Location entity type.""" + + ORGANIZATION = 'ORGANIZATION' + """Organization entity type.""" + + EVENT = 'EVENT' + """Event entity type.""" + + WORK_OF_ART = 'WORK_OF_ART' + """Work of art entity type.""" + + CONSUMER_GOOD = 'CONSUMER_GOOD' + """Consumer good entity type.""" + + OTHER = 'OTHER' + """Other entity type (i.e. known but not classified).""" + + +class Entity(object): + """A Google Cloud Natural Language API entity. + + Represents a phrase in text that is a known entity, such as a person, + an organization, or location. The API associates information, such as + salience and mentions, with entities. + + The only supported metadata (as of August 2016) is ``wikipedia_url``, + so this value will be removed from the passed in ``metadata`` + and put in its own property. + + See: + https://cloud.google.com/natural-language/reference/rest/v1beta1/Entity + + :type name: str + :param name: The name / phrase identified as the entity. + + :type entity_type: str + :param entity_type: The type of the entity. 
See + https://cloud.google.com/natural-language/\ + reference/rest/v1beta1/Entity#Type + + :type metadata: dict + :param metadata: The metadata associated with the entity. + + :type salience: float + :param salience: The prominence of the entity / phrase within the text + containing it. + + :type mentions: list + :param mentions: List of strings that mention the entity. + """ + + def __init__(self, name, entity_type, metadata, salience, mentions): + self.name = name + self.entity_type = entity_type + self.wikipedia_url = metadata.pop('wikipedia_url', None) + self.metadata = metadata + self.salience = salience + self.mentions = mentions + + @classmethod + def from_api_repr(cls, payload): + """Convert an Entity from the JSON API into an :class:`Entity`. + + :type payload: dict + :param payload: The value from the backend. + + :rtype: :class:`Entity` + :returns: The entity parsed from the API representation. + """ + name = payload['name'] + entity_type = payload['type'] + metadata = payload['metadata'] + salience = payload['salience'] + mentions = [value['text']['content'] + for value in payload['mentions']] + return cls(name, entity_type, metadata, salience, mentions) diff --git a/gcloud/language/test_document.py b/gcloud/language/test_document.py index 2b52f13a7b31..cd8eff3cdb97 100644 --- a/gcloud/language/test_document.py +++ b/gcloud/language/test_document.py @@ -62,3 +62,124 @@ def test_constructor_text_and_gcs(self): with self.assertRaises(ValueError): self._makeOne(None, content='abc', gcs_url='gs://some-bucket/some-obj.txt') + + def test__to_dict_with_content(self): + klass = self._getTargetClass() + content = 'Hello World' + document = self._makeOne(None, content=content) + info = document._to_dict() + self.assertEqual(info, { + 'content': content, + 'language': document.language, + 'type': klass.PLAIN_TEXT, + }) + + def test__to_dict_with_gcs(self): + klass = self._getTargetClass() + gcs_url = 'gs://some-bucket/some-obj.html' + document = 
self._makeOne(None, gcs_url=gcs_url) + info = document._to_dict() + self.assertEqual(info, { + 'gcsContentUri': gcs_url, + 'language': document.language, + 'type': klass.PLAIN_TEXT, + }) + + def test__to_dict_with_no_content(self): + klass = self._getTargetClass() + document = self._makeOne(None, content='') + document.content = None # Manually unset the content. + info = document._to_dict() + self.assertEqual(info, { + 'language': document.language, + 'type': klass.PLAIN_TEXT, + }) + + def test_analyze_entities(self): + from gcloud.language.entity import Entity + from gcloud.language.entity import EntityType + + name1 = 'R-O-C-K' + name2 = 'USA' + content = name1 + ' in the ' + name2 + wiki2 = 'http://en.wikipedia.org/wiki/United_States' + salience1 = 0.91391456 + salience2 = 0.086085409 + response = { + 'entities': [ + { + 'name': name1, + 'type': EntityType.OTHER, + 'metadata': {}, + 'salience': salience1, + 'mentions': [ + { + 'text': { + 'content': name1, + 'beginOffset': -1 + } + } + ] + }, + { + 'name': name2, + 'type': EntityType.LOCATION, + 'metadata': {'wikipedia_url': wiki2}, + 'salience': salience2, + 'mentions': [ + { + 'text': { + 'content': name2, + 'beginOffset': -1, + }, + }, + ], + }, + ], + 'language': 'en', + } + connection = _Connection(response) + client = _Client(connection=connection) + document = self._makeOne(client, content) + + entities = document.analyze_entities() + self.assertEqual(len(entities), 2) + entity1 = entities[0] + self.assertIsInstance(entity1, Entity) + self.assertEqual(entity1.name, name1) + self.assertEqual(entity1.entity_type, EntityType.OTHER) + self.assertEqual(entity1.wikipedia_url, None) + self.assertEqual(entity1.metadata, {}) + self.assertEqual(entity1.salience, salience1) + self.assertEqual(entity1.mentions, [name1]) + entity2 = entities[1] + self.assertIsInstance(entity2, Entity) + self.assertEqual(entity2.name, name2) + self.assertEqual(entity2.entity_type, EntityType.LOCATION) + 
self.assertEqual(entity2.wikipedia_url, wiki2) + self.assertEqual(entity2.metadata, {}) + self.assertEqual(entity2.salience, salience2) + self.assertEqual(entity2.mentions, [name2]) + + # Verify the request. + self.assertEqual(len(connection._requested), 1) + req = connection._requested[0] + self.assertEqual(req['path'], 'analyzeEntities') + self.assertEqual(req['method'], 'POST') + + +class _Connection(object): + + def __init__(self, response): + self._response = response + self._requested = [] + + def api_request(self, **kwargs): + self._requested.append(kwargs) + return self._response + + +class _Client(object): + + def __init__(self, connection=None): + self.connection = connection diff --git a/gcloud/language/test_entity.py b/gcloud/language/test_entity.py new file mode 100644 index 000000000000..34dde32c0ae1 --- /dev/null +++ b/gcloud/language/test_entity.py @@ -0,0 +1,71 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + + +class TestEntity(unittest.TestCase): + + def _getTargetClass(self): + from gcloud.language.entity import Entity + return Entity + + def _makeOne(self, *args, **kw): + return self._getTargetClass()(*args, **kw) + + def test_constructor_defaults(self): + name = 'Italian' + entity_type = 'LOCATION' + wiki_url = 'http://en.wikipedia.org/wiki/Italy' + metadata = {'wikipedia_url': wiki_url} + base_metadata = {'foo': 'bar'} + metadata.update(base_metadata) + salience = 0.19960518 + mentions = ['Italian'] + entity = self._makeOne(name, entity_type, metadata, + salience, mentions) + self.assertEqual(entity.name, name) + self.assertEqual(entity.entity_type, entity_type) + self.assertEqual(entity.wikipedia_url, wiki_url) + self.assertEqual(entity.metadata, base_metadata) + self.assertEqual(entity.salience, salience) + self.assertEqual(entity.mentions, mentions) + + def test_from_api_repr(self): + klass = self._getTargetClass() + name = 'Italy' + entity_type = 'LOCATION' + salience = 0.223 + wiki_url = 'http://en.wikipedia.org/wiki/Italy' + mention1 = 'Italy' + mention2 = 'To Italy' + mention3 = 'From Italy' + payload = { + 'name': name, + 'type': entity_type, + 'salience': salience, + 'metadata': {'wikipedia_url': wiki_url}, + 'mentions': [ + {'text': {'content': mention1}}, + {'text': {'content': mention2}}, + {'text': {'content': mention3}}, + ], + } + entity = klass.from_api_repr(payload) + self.assertEqual(entity.name, name) + self.assertEqual(entity.entity_type, entity_type) + self.assertEqual(entity.salience, salience) + self.assertEqual(entity.wikipedia_url, wiki_url) + self.assertEqual(entity.metadata, {}) + self.assertEqual(entity.mentions, [mention1, mention2, mention3]) diff --git a/system_tests/attempt_system_tests.py b/system_tests/attempt_system_tests.py index 5c53db1c6184..ce97a97fd33c 100644 --- a/system_tests/attempt_system_tests.py +++ b/system_tests/attempt_system_tests.py @@ -35,6 +35,7 @@ 'storage', 'bigquery', 'pubsub', + 
'language', 'logging', 'translate', 'monitoring', diff --git a/system_tests/language.py b/system_tests/language.py new file mode 100644 index 000000000000..c23afc8711dd --- /dev/null +++ b/system_tests/language.py @@ -0,0 +1,68 @@ +# Copyright 2016 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from gcloud import language + + +class Config(object): + """Run-time configuration to be modified at set-up. + + This is a mutable stand-in to allow test set-up to modify + global state. + """ + CLIENT = None + + +def setUpModule(): + Config.CLIENT = language.Client() + + +class TestLanguage(unittest.TestCase): + + def test_analyze_entities(self): + from gcloud.language.entity import EntityType + + text_content = ("Michelangelo Caravaggio, Italian painter, is " + "known for 'The Calling of Saint Matthew'.") + document = Config.CLIENT.document_from_text(text_content) + entities = document.analyze_entities() + self.assertEqual(len(entities), 3) + entity1, entity2, entity3 = entities + # Verify entity 1. + self.assertEqual(entity1.name, 'Michelangelo Caravaggio') + self.assertEqual(entity1.entity_type, EntityType.PERSON) + self.assertTrue(0.7 < entity1.salience < 0.8) + self.assertEqual(entity1.mentions, [entity1.name]) + self.assertEqual(entity1.wikipedia_url, + 'http://en.wikipedia.org/wiki/Caravaggio') + self.assertEqual(entity1.metadata, {}) + # Verify entity 2. 
+ self.assertEqual(entity2.name, 'Italian') + self.assertEqual(entity2.entity_type, EntityType.LOCATION) + self.assertTrue(0.15 < entity2.salience < 0.25) + self.assertEqual(entity2.mentions, [entity2.name]) + self.assertEqual(entity2.wikipedia_url, + 'http://en.wikipedia.org/wiki/Italy') + self.assertEqual(entity2.metadata, {}) + # Verify entity 3. + self.assertEqual(entity3.name, 'The Calling of Saint Matthew') + self.assertEqual(entity3.entity_type, EntityType.EVENT) + self.assertTrue(0 < entity3.salience < 0.1) + self.assertEqual(entity3.mentions, [entity3.name]) + wiki_url = ('http://en.wikipedia.org/wiki/' + 'The_Calling_of_St_Matthew_(Caravaggio)') + self.assertEqual(entity3.wikipedia_url, wiki_url) + self.assertEqual(entity3.metadata, {}) diff --git a/system_tests/run_system_test.py b/system_tests/run_system_test.py index 2fd6ff93fc2d..d3593db8be22 100644 --- a/system_tests/run_system_test.py +++ b/system_tests/run_system_test.py @@ -19,6 +19,7 @@ import bigquery import bigtable import datastore +import language import logging_ import monitoring import pubsub @@ -33,6 +34,7 @@ 'pubsub': pubsub, 'bigquery': bigquery, 'bigtable': bigtable, + 'language': language, 'logging': logging_, 'monitoring': monitoring, 'translate': translate,