Skip to content

Commit

Permalink
Merge pull request #2164 from dhermes/language-impl-2
Browse files Browse the repository at this point in the history
Implement Document factory constructors on language client
  • Loading branch information
dhermes authored Aug 23, 2016
2 parents d64c9c3 + 094c071 commit 9b3a082
Show file tree
Hide file tree
Showing 8 changed files with 377 additions and 5 deletions.
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@

language-usage
Client <language-client>
language-document

.. toctree::
:maxdepth: 0
Expand Down
6 changes: 6 additions & 0 deletions docs/language-document.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Document
~~~~~~~~

.. automodule:: gcloud.language.document
:members:
:show-inheritance:
8 changes: 4 additions & 4 deletions docs/language-usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -127,21 +127,21 @@ to content stored in `Google Cloud Storage`_. We can use the

.. code-block:: python
>>> document = client.document_from_blob(bucket='my-text-bucket',
... blob='sentiment-me.txt')
>>> document = client.document_from_blob('my-text-bucket',
... 'sentiment-me.txt')
>>> document.gcs_url
'gs://my-text-bucket/sentiment-me.txt'
>>> document.doc_type == language.Document.PLAIN_TEXT
True
and the :meth:`~gcloud.language.client.Client.document_from_uri`
and the :meth:`~gcloud.language.client.Client.document_from_url`
method. In either case, the document type can be specified with
the ``doc_type`` argument:

.. code-block:: python
>>> gcs_url = 'gs://my-text-bucket/sentiment-me.txt'
>>> document = client.document_from_uri(
>>> document = client.document_from_url(
... gcs_url, doc_type=language.Document.HTML)
>>> document.gcs_url == gcs_url
True
Expand Down
1 change: 1 addition & 0 deletions gcloud/language/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
"""Client library for Google Cloud Natural Language API."""

from gcloud.language.client import Client
from gcloud.language.document import Document
93 changes: 93 additions & 0 deletions gcloud/language/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from gcloud.client import JSONClient
from gcloud.language.connection import Connection
from gcloud.language.document import Document


class Client(JSONClient):
Expand All @@ -40,3 +41,95 @@ class Client(JSONClient):
"""

_connection_class = Connection

def document_from_text(self, content, **kwargs):
    """Build a plain-text :class:`Document` attached to this client.

    The document type is fixed to :attr:`Document.PLAIN_TEXT`; callers
    may not override it.

    :type content: str
    :param content: The plain text held by the document.

    :type kwargs: dict
    :param kwargs: Any other keyword arguments, forwarded unchanged to
                   the :class:`Document` constructor.

    :rtype: :class:`Document`
    :returns: A plain-text document whose ``client`` is this client.
    :raises: :class:`~exceptions.TypeError` if ``doc_type`` appears among
             the keyword arguments.
    """
    if 'doc_type' in kwargs:
        raise TypeError('Cannot pass doc_type')
    # ``kwargs`` is a fresh dict for this call, so it is safe to extend.
    kwargs['doc_type'] = Document.PLAIN_TEXT
    return Document(self, content=content, **kwargs)

def document_from_html(self, content, **kwargs):
    """Build an HTML :class:`Document` attached to this client.

    The document type is fixed to :attr:`Document.HTML`; callers may
    not override it.

    :type content: str
    :param content: The HTML text held by the document.

    :type kwargs: dict
    :param kwargs: Any other keyword arguments, forwarded unchanged to
                   the :class:`Document` constructor.

    :rtype: :class:`Document`
    :returns: An HTML document whose ``client`` is this client.
    :raises: :class:`~exceptions.TypeError` if ``doc_type`` appears among
             the keyword arguments.
    """
    if 'doc_type' in kwargs:
        raise TypeError('Cannot pass doc_type')
    # ``kwargs`` is a fresh dict for this call, so it is safe to extend.
    kwargs['doc_type'] = Document.HTML
    return Document(self, content=content, **kwargs)

def document_from_url(self, gcs_url,
                      doc_type=Document.PLAIN_TEXT, **kwargs):
    """Build a :class:`Document` that refers to a Cloud Storage object.

    The document stores only the object's URL, not the text itself.

    :type gcs_url: str
    :param gcs_url: URL of the Google Cloud Storage object that holds
                    the content, in the form
                    ``gs://{bucket}/{blob-name}``.

    :type doc_type: str
    :param doc_type: (Optional) The kind of text in the document.
                     Plain text unless given; pass
                     :attr:`~.Document.HTML` for HTML.

    :type kwargs: dict
    :param kwargs: Any other keyword arguments, forwarded unchanged to
                   the :class:`Document` constructor.

    :rtype: :class:`Document`
    :returns: A document whose ``client`` is this client.
    """
    document = Document(
        self, gcs_url=gcs_url, doc_type=doc_type, **kwargs)
    return document

def document_from_blob(self, bucket_name, blob_name,
                       doc_type=Document.PLAIN_TEXT, **kwargs):
    """Build a :class:`Document` from a bucket name and a blob name.

    The two names are joined into a ``gs://{bucket}/{blob-name}`` URL
    and the work is delegated to :meth:`document_from_url`.

    :type bucket_name: str
    :param bucket_name: Name of the bucket containing the document text.

    :type blob_name: str
    :param blob_name: Name of the blob (inside the bucket) containing
                      the document text.

    :type doc_type: str
    :param doc_type: (Optional) The kind of text in the document.
                     Plain text unless given; pass
                     :attr:`~.Document.HTML` for HTML.

    :type kwargs: dict
    :param kwargs: Any other keyword arguments, forwarded unchanged to
                   the :class:`Document` constructor.

    :rtype: :class:`Document`
    :returns: A document whose ``client`` is this client.
    """
    # NOTE: Both names are interpolated verbatim; no URL-encoding is
    #       applied, on the assumption that none is required.
    return self.document_from_url(
        'gs://%s/%s' % (bucket_name, blob_name),
        doc_type=doc_type, **kwargs)
103 changes: 103 additions & 0 deletions gcloud/language/document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Definition for Google Cloud Natural Language API documents.
A document is used to hold text to be analyzed and annotated.
"""


DEFAULT_LANGUAGE = 'en'
"""Default document language, English."""


class Encoding(object):
    """Names of the text encodings a document may declare."""

    #: Unspecified encoding type.
    NONE = 'NONE'

    #: UTF-8 encoding type.
    UTF8 = 'UTF8'

    #: UTF-16 encoding type.
    UTF16 = 'UTF16'

    #: UTF-32 encoding type.
    UTF32 = 'UTF32'


class Document(object):
    """Text to be sent to the Google Cloud Natural Language API.

    A document is either plain text or HTML. Its content is either held
    directly on the instance (``content``) or referenced through a
    Google Cloud Storage URL (``gcs_url``); exactly one of the two must
    be supplied.

    :type client: :class:`~gcloud.language.client.Client`
    :param client: A client which holds credentials and project
                   configuration.

    :type content: str
    :param content: (Optional) The document text content (either plain
                    text or HTML).

    :type gcs_url: str
    :param gcs_url: (Optional) The URL of the Google Cloud Storage object
                    holding the content. Of the form
                    ``gs://{bucket}/{blob-name}``.

    :type doc_type: str
    :param doc_type: (Optional) The type of text in the document.
                     Defaults to plain text. Can be one of
                     :attr:`~.Document.PLAIN_TEXT` or
                     :attr:`~.Document.HTML`.

    :type language: str
    :param language: (Optional) The language of the document text.
                     Defaults to :data:`DEFAULT_LANGUAGE`.

    :type encoding: str
    :param encoding: (Optional) The encoding of the document text.
                     Defaults to UTF-8. Can be one of
                     :attr:`~.Encoding.UTF8`, :attr:`~.Encoding.UTF16`
                     or :attr:`~.Encoding.UTF32`.

    :raises: :class:`~exceptions.ValueError` if both ``content`` and
             ``gcs_url`` are specified, or if neither is specified.
    """

    #: Unspecified document type.
    TYPE_UNSPECIFIED = 'TYPE_UNSPECIFIED'

    #: Plain text document type.
    PLAIN_TEXT = 'PLAIN_TEXT'

    #: HTML document type.
    HTML = 'HTML'

    def __init__(self, client, content=None, gcs_url=None, doc_type=PLAIN_TEXT,
                 language=DEFAULT_LANGUAGE, encoding=Encoding.UTF8):
        has_content = content is not None
        has_gcs_url = gcs_url is not None

        # Exactly one source of text is allowed: inline or Cloud Storage.
        if has_content and has_gcs_url:
            raise ValueError('A Document cannot contain both local text and '
                             'a link to text in a Google Cloud Storage object')
        if not (has_content or has_gcs_url):
            raise ValueError('A Document must contain either local text or a '
                             'link to text in a Google Cloud Storage object')

        self.client = client
        self.content = content
        self.gcs_url = gcs_url
        self.doc_type = doc_type
        self.language = language
        self.encoding = encoding
106 changes: 105 additions & 1 deletion gcloud/language/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,118 @@ def _makeOne(self, *args, **kw):

def test_ctor(self):
    # The client must build a ``Connection`` that carries exactly the
    # credentials and HTTP objects it was constructed with.
    from gcloud.language.connection import Connection

    project = 'PROJECT'
    creds = _Credentials()
    http = object()
    client = self._makeOne(project=project, credentials=creds, http=http)
    # One type check is enough; ``assertIsInstance`` reports the actual
    # type on failure, unlike ``assertTrue(isinstance(...))``.
    self.assertIsInstance(client.connection, Connection)
    # ``assertIs`` checks identity and shows both operands on failure.
    self.assertIs(client.connection.credentials, creds)
    self.assertIs(client.connection.http, http)

def test_document_from_text_factory(self):
    from gcloud.language.document import Document

    client = self._makeOne(project='PROJECT',
                           credentials=_Credentials(), http=object())

    text = 'abc'
    lang_code = 'es'
    doc = client.document_from_text(text, language=lang_code)

    self.assertIsInstance(doc, Document)
    self.assertIs(doc.client, client)
    self.assertEqual(doc.content, text)
    # The factory sets the document type itself.
    self.assertEqual(doc.doc_type, Document.PLAIN_TEXT)
    # Extra keyword arguments reach the ``Document``.
    self.assertEqual(doc.language, lang_code)

def test_document_from_text_factory_failure(self):
    client = self._makeOne(project='PROJECT',
                           credentials=_Credentials(), http=object())

    # Passing ``doc_type`` to the plain-text factory is rejected.
    self.assertRaises(TypeError, client.document_from_text,
                      'abc', doc_type='foo')

def test_document_from_html_factory(self):
    from gcloud.language.document import Document

    client = self._makeOne(project='PROJECT',
                           credentials=_Credentials(), http=object())

    markup = '<html>abc</html>'
    lang_code = 'ja'
    doc = client.document_from_html(markup, language=lang_code)

    self.assertIsInstance(doc, Document)
    self.assertIs(doc.client, client)
    self.assertEqual(doc.content, markup)
    # The factory sets the document type itself.
    self.assertEqual(doc.doc_type, Document.HTML)
    # Extra keyword arguments reach the ``Document``.
    self.assertEqual(doc.language, lang_code)

def test_document_from_html_factory_failure(self):
    client = self._makeOne(project='PROJECT',
                           credentials=_Credentials(), http=object())

    # Passing ``doc_type`` to the HTML factory is rejected.
    self.assertRaises(TypeError, client.document_from_html,
                      'abc', doc_type='foo')

def test_document_from_url_factory(self):
    from gcloud.language.document import Document

    client = self._makeOne(project='PROJECT',
                           credentials=_Credentials(), http=object())

    url = 'gs://my-text-bucket/sentiment-me.txt'
    doc = client.document_from_url(url)

    self.assertIsInstance(doc, Document)
    self.assertIs(doc.client, client)
    # A URL-backed document holds no inline content.
    self.assertIsNone(doc.content)
    self.assertEqual(doc.gcs_url, url)
    # With no explicit ``doc_type`` the document is plain text.
    self.assertEqual(doc.doc_type, Document.PLAIN_TEXT)

def test_document_from_url_factory_explicit(self):
    from gcloud.language.document import Document
    from gcloud.language.document import Encoding

    client = self._makeOne(project='PROJECT',
                           credentials=_Credentials(), http=object())

    utf32 = Encoding.UTF32
    url = 'gs://my-text-bucket/sentiment-me.txt'
    doc = client.document_from_url(
        url, doc_type=Document.HTML, encoding=utf32)

    self.assertIsInstance(doc, Document)
    self.assertIs(doc.client, client)
    # A URL-backed document holds no inline content.
    self.assertIsNone(doc.content)
    self.assertEqual(doc.gcs_url, url)
    # Explicit ``doc_type`` and forwarded ``encoding`` are both honoured.
    self.assertEqual(doc.doc_type, Document.HTML)
    self.assertEqual(doc.encoding, utf32)

def test_document_from_blob_factory(self):
    from gcloud.language.document import Document

    client = self._makeOne(project='PROJECT',
                           credentials=_Credentials(), http=object())

    bucket = 'my-text-bucket'
    blob = 'sentiment-me.txt'
    doc = client.document_from_blob(bucket, blob)

    self.assertIsInstance(doc, Document)
    self.assertIs(doc.client, client)
    # A blob-backed document holds no inline content.
    self.assertIsNone(doc.content)
    # The factory joins the two names into a ``gs://`` URL.
    expected_url = 'gs://%s/%s' % (bucket, blob)
    self.assertEqual(doc.gcs_url, expected_url)
    # With no explicit ``doc_type`` the document is plain text.
    self.assertEqual(doc.doc_type, Document.PLAIN_TEXT)


class _Credentials(object):

Expand Down
Loading

0 comments on commit 9b3a082

Please sign in to comment.