Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Document.analyze_entities() in language package #2172

Merged
merged 5 commits into from
Aug 24, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@
language-usage
Client <language-client>
language-document
language-responses

.. toctree::
:maxdepth: 0
Expand Down
9 changes: 9 additions & 0 deletions docs/language-responses.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Natural Language Response Classes
=================================

Entity
~~~~~~

.. automodule:: gcloud.language.entity
:members:
:show-inheritance:
54 changes: 30 additions & 24 deletions docs/language-usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -171,25 +171,29 @@ metadata and other properties.
>>> entities = document.analyze_entities()
>>> for entity in entities:
... print('=' * 20)
... print(' name: %s' % (entity.name,))
... print(' type: %s' % (entity.entity_type,))
... print('metadata: %s' % (entity.metadata,))
... print('salience: %s' % (entity.salience,))
... print(' name: %s' % (entity.name,))
... print(' type: %s' % (entity.entity_type,))
... print('wikipedia_url: %s' % (entity.wikipedia_url,))
... print(' metadata: %s' % (entity.metadata,))
... print(' salience: %s' % (entity.salience,))
====================
name: Michelangelo Caravaggio
type: PERSON
metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/Caravaggio'}
salience: 0.75942981
name: Michelangelo Caravaggio
type: PERSON
wikipedia_url: http://en.wikipedia.org/wiki/Caravaggio
metadata: {}
salience: 0.7615959
====================
name: Italian
type: LOCATION
metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/Italy'}
salience: 0.20193423
name: Italian
type: LOCATION
wikipedia_url: http://en.wikipedia.org/wiki/Italy
metadata: {}
salience: 0.19960518
====================
name: The Calling of Saint Matthew
type: WORK_OF_ART
metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/index.html?curid=2838808'}
salience: 0.03863598
name: The Calling of Saint Matthew
type: EVENT
wikipedia_url: http://en.wikipedia.org/wiki/The_Calling_of_St_Matthew_(Caravaggio)
metadata: {}
salience: 0.038798928

Analyze Sentiment
-----------------
Expand Down Expand Up @@ -266,14 +270,16 @@ the response is :data:`None`.
>>> # Entities present if include_entities=True
>>> for entity in annotations.entities:
... print('=' * 20)
... print(' name: %s' % (entity.name,))
... print(' type: %s' % (entity.entity_type,))
... print('metadata: %s' % (entity.metadata,))
... print('salience: %s' % (entity.salience,))
... print(' name: %s' % (entity.name,))
... print(' type: %s' % (entity.entity_type,))
... print('wikipedia_url: %s' % (entity.wikipedia_url,))
... print(' metadata: %s' % (entity.metadata,))
... print(' salience: %s' % (entity.salience,))
====================
name: Moon
type: LOCATION
metadata: {'wikipedia_url': 'http://en.wikipedia.org/wiki/Natural_satellite'}
salience: 0.11793101
name: Moon
type: LOCATION
wikipedia_url: http://en.wikipedia.org/wiki/Natural_satellite
metadata: {}
salience: 0.11793101

.. _Features: https://cloud.google.com/natural-language/reference/rest/v1beta1/documents/annotateText#Features
43 changes: 43 additions & 0 deletions gcloud/language/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
A document is used to hold text to be analyzed and annotated.
"""

from gcloud.language.entity import Entity


# Language code for English.
# NOTE(review): presumably the fallback used when a document is created
# without an explicit language -- confirm against ``Document.__init__``.
DEFAULT_LANGUAGE = 'en'
"""Default document language, English."""
Expand Down Expand Up @@ -101,3 +103,44 @@ def __init__(self, client, content=None, gcs_url=None, doc_type=PLAIN_TEXT,
self.doc_type = doc_type
self.language = language
self.encoding = encoding

def _to_dict(self):
"""Helper to convert the current document into a dictionary.

To be used when constructing requests.

:rtype: dict
:returns: The Document value as a JSON dictionary.
"""
info = {
'type': self.doc_type,
'language': self.language,
}
if self.content is not None:
info['content'] = self.content
elif self.gcs_url is not None:
info['gcsContentUri'] = self.gcs_url
return info

def analyze_entities(self):
    """Analyze the entities in the current document.

    Finds named entities (currently finds proper names as of August 2016)
    in the text, entity types, salience, mentions for each entity, and
    other properties.

    See:
    https://cloud.google.com/natural-language/reference/\
    rest/v1beta1/documents/analyzeEntities

    :rtype: list
    :returns: A list of :class:`Entity` returned from the API. The list
              is empty when the response has no ``entities`` key.
    """
    data = {
        'document': self._to_dict(),
        'encodingType': self.encoding,
    }
    api_response = self.client.connection.api_request(
        method='POST', path='analyzeEntities', data=data)
    # A response without an ``entities`` key is treated as "nothing
    # found" instead of surfacing a ``KeyError`` to the caller.
    return [Entity.from_api_repr(entity)
            for entity in api_response.get('entities', ())]
106 changes: 106 additions & 0 deletions gcloud/language/entity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Definition for Google Cloud Natural Language API entities.

An entity is used to describe a proper name extracted from text.
"""


class EntityType(object):
    """Namespace of the entity type names used by the API.

    Each attribute is a plain string equal to its own name, so values can
    be compared directly with the ``type`` field of an entity payload.
    """

    UNKNOWN = 'UNKNOWN'
    """The entity type is not known."""

    PERSON = 'PERSON'
    """The entity is a person."""

    LOCATION = 'LOCATION'
    """The entity is a location."""

    ORGANIZATION = 'ORGANIZATION'
    """The entity is an organization."""

    EVENT = 'EVENT'
    """The entity is an event."""

    WORK_OF_ART = 'WORK_OF_ART'
    """The entity is a work of art."""

    CONSUMER_GOOD = 'CONSUMER_GOOD'
    """The entity is a consumer good."""

    OTHER = 'OTHER'
    """The entity is of another type (i.e. known but not classified)."""


class Entity(object):
    """A Google Cloud Natural Language API entity.

    Represents a phrase in text that is a known entity, such as a person,
    an organization, or location. The API associates information, such as
    salience and mentions, with entities.

    The only supported metadata (as of August 2016) is ``wikipedia_url``,
    so this value is split out of ``metadata`` and exposed as its own
    property. The passed in dictionary is copied first, so the caller's
    object is never modified.

    See:
    https://cloud.google.com/natural-language/reference/rest/v1beta1/Entity

    :type name: str
    :param name: The name / phrase identified as the entity.

    :type entity_type: str
    :param entity_type: The type of the entity. See
                        https://cloud.google.com/natural-language/\
                        reference/rest/v1beta1/Entity#Type

    :type metadata: dict
    :param metadata: The metadata associated with the entity.

    :type salience: float
    :param salience: The prominence of the entity / phrase within the text
                     containing it.

    :type mentions: list
    :param mentions: List of strings that mention the entity.
    """

    def __init__(self, name, entity_type, metadata, salience, mentions):
        # Work on a shallow copy: popping from the caller's dictionary
        # would silently strip ``wikipedia_url`` from the original payload.
        metadata = dict(metadata)
        self.name = name
        self.entity_type = entity_type
        self.wikipedia_url = metadata.pop('wikipedia_url', None)
        self.metadata = metadata
        self.salience = salience
        self.mentions = mentions

    @classmethod
    def from_api_repr(cls, payload):
        """Convert an Entity from the JSON API into an :class:`Entity`.

        :type payload: dict
        :param payload: The value from the backend.

        :rtype: :class:`Entity`
        :returns: The entity parsed from the API representation.
        """
        name = payload['name']
        entity_type = payload['type']
        metadata = payload['metadata']
        salience = payload['salience']
        # Only the text of each mention is kept; offsets are dropped.
        mentions = [value['text']['content']
                    for value in payload['mentions']]
        return cls(name, entity_type, metadata, salience, mentions)
121 changes: 121 additions & 0 deletions gcloud/language/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,124 @@ def test_constructor_text_and_gcs(self):
with self.assertRaises(ValueError):
self._makeOne(None, content='abc',
gcs_url='gs://some-bucket/some-obj.txt')

def test__to_dict_with_content(self):
    # Inline text must be serialized under the ``content`` key.
    target_class = self._getTargetClass()
    text = 'Hello World'
    doc = self._makeOne(None, content=text)
    expected = {
        'content': text,
        'language': doc.language,
        'type': target_class.PLAIN_TEXT,
    }
    self.assertEqual(doc._to_dict(), expected)

def test__to_dict_with_gcs(self):
    # A Cloud Storage URL must be serialized under ``gcsContentUri``.
    target_class = self._getTargetClass()
    storage_url = 'gs://some-bucket/some-obj.html'
    doc = self._makeOne(None, gcs_url=storage_url)
    expected = {
        'gcsContentUri': storage_url,
        'language': doc.language,
        'type': target_class.PLAIN_TEXT,
    }
    self.assertEqual(doc._to_dict(), expected)

def test__to_dict_with_no_content(self):
    # With neither content nor a GCS URL, only type and language remain.
    target_class = self._getTargetClass()
    doc = self._makeOne(None, content='')
    doc.content = None  # Manually unset the content.
    expected = {
        'language': doc.language,
        'type': target_class.PLAIN_TEXT,
    }
    self.assertEqual(doc._to_dict(), expected)

def test_analyze_entities(self):
    from gcloud.language.entity import Entity
    from gcloud.language.entity import EntityType

    name1 = 'R-O-C-K'
    name2 = 'USA'
    content = name1 + ' in the ' + name2
    wiki2 = 'http://en.wikipedia.org/wiki/United_States'
    salience1 = 0.91391456
    salience2 = 0.086085409
    # Canned API response: one entity without metadata and one carrying
    # a ``wikipedia_url``.
    response = {
        'entities': [
            {
                'name': name1,
                'type': EntityType.OTHER,
                'metadata': {},
                'salience': salience1,
                'mentions': [
                    {
                        'text': {
                            'content': name1,
                            'beginOffset': -1
                        }
                    }
                ]
            },
            {
                'name': name2,
                'type': EntityType.LOCATION,
                'metadata': {'wikipedia_url': wiki2},
                'salience': salience2,
                'mentions': [
                    {
                        'text': {
                            'content': name2,
                            'beginOffset': -1,
                        },
                    },
                ],
            },
        ],
        'language': 'en',
    }
    connection = _Connection(response)
    client = _Client(connection=connection)
    document = self._makeOne(client, content)

    entities = document.analyze_entities()
    self.assertEqual(len(entities), 2)
    entity1 = entities[0]
    self.assertIsInstance(entity1, Entity)
    self.assertEqual(entity1.name, name1)
    self.assertEqual(entity1.entity_type, EntityType.OTHER)
    self.assertIsNone(entity1.wikipedia_url)
    self.assertEqual(entity1.metadata, {})
    self.assertEqual(entity1.salience, salience1)
    self.assertEqual(entity1.mentions, [name1])
    entity2 = entities[1]
    self.assertIsInstance(entity2, Entity)
    self.assertEqual(entity2.name, name2)
    self.assertEqual(entity2.entity_type, EntityType.LOCATION)
    self.assertEqual(entity2.wikipedia_url, wiki2)
    self.assertEqual(entity2.metadata, {})
    self.assertEqual(entity2.salience, salience2)
    self.assertEqual(entity2.mentions, [name2])

    # Verify the request, including the body sent to the API.
    self.assertEqual(len(connection._requested), 1)
    req = connection._requested[0]
    self.assertEqual(req['path'], 'analyzeEntities')
    self.assertEqual(req['method'], 'POST')
    self.assertEqual(req['data'], {
        'document': document._to_dict(),
        'encodingType': document.encoding,
    })


class _Connection(object):

def __init__(self, response):
self._response = response
self._requested = []

def api_request(self, **kwargs):
self._requested.append(kwargs)
return self._response


class _Client(object):

def __init__(self, connection=None):
self.connection = connection
Loading