diff --git a/docs/index.rst b/docs/index.rst index fc81fbfdee35..38a12478ae0b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -155,6 +155,7 @@ language-usage Client + language-document .. toctree:: :maxdepth: 0 diff --git a/docs/language-document.rst b/docs/language-document.rst new file mode 100644 index 000000000000..17ebab4e1930 --- /dev/null +++ b/docs/language-document.rst @@ -0,0 +1,6 @@ +Document +~~~~~~~~ + +.. automodule:: gcloud.language.document + :members: + :show-inheritance: diff --git a/docs/language-usage.rst b/docs/language-usage.rst index 83b965bad1b4..62ecff14fc01 100644 --- a/docs/language-usage.rst +++ b/docs/language-usage.rst @@ -127,21 +127,21 @@ to content stored in `Google Cloud Storage`_. We can use the .. code-block:: python - >>> document = client.document_from_blob(bucket='my-text-bucket', - ... blob='sentiment-me.txt') + >>> document = client.document_from_blob('my-text-bucket', + ... 'sentiment-me.txt') >>> document.gcs_url 'gs://my-text-bucket/sentiment-me.txt' >>> document.doc_type == language.Document.PLAIN_TEXT True -and the :meth:`~gcloud.language.client.Client.document_from_uri` +and the :meth:`~gcloud.language.client.Client.document_from_url` method. In either case, the document type can be specified with the ``doc_type`` argument: .. code-block:: python >>> gcs_url = 'gs://my-text-bucket/sentiment-me.txt' - >>> document = client.document_from_uri( + >>> document = client.document_from_url( ... 
gcs_url, doc_type=language.Document.HTML) >>> document.gcs_url == gcs_url True diff --git a/gcloud/language/__init__.py b/gcloud/language/__init__.py index 180c4993a5b2..e4123a035541 100644 --- a/gcloud/language/__init__.py +++ b/gcloud/language/__init__.py @@ -15,3 +15,4 @@ """Client library for Google Cloud Natural Language API.""" from gcloud.language.client import Client +from gcloud.language.document import Document diff --git a/gcloud/language/client.py b/gcloud/language/client.py index 08695e358fff..50b95c56c7cc 100644 --- a/gcloud/language/client.py +++ b/gcloud/language/client.py @@ -17,6 +17,7 @@ from gcloud.client import JSONClient from gcloud.language.connection import Connection +from gcloud.language.document import Document class Client(JSONClient): @@ -40,3 +41,95 @@ class Client(JSONClient): """ _connection_class = Connection + + def document_from_text(self, content, **kwargs): + """Create a plain text document bound to this client. + + :type content: str + :param content: The document plain text content. + + :type kwargs: dict + :param kwargs: Remaining keyword arguments to be passed along to the + :class:`Document` constructor. + + :rtype: :class:`Document` + :returns: A plain-text document bound to this client. + :raises: :class:`~exceptions.TypeError` if ``doc_type`` is passed as a + keyword argument. + """ + if 'doc_type' in kwargs: + raise TypeError('Cannot pass doc_type') + return Document(self, content=content, + doc_type=Document.PLAIN_TEXT, **kwargs) + + def document_from_html(self, content, **kwargs): + """Create an HTML document bound to this client. + + :type content: str + :param content: The document HTML text content. + + :type kwargs: dict + :param kwargs: Remaining keyword arguments to be passed along to the + :class:`Document` constructor. + + :rtype: :class:`Document` + :returns: An HTML document bound to this client. + :raises: :class:`~exceptions.TypeError` if ``doc_type`` is passed as a + keyword argument. 
+ """ + if 'doc_type' in kwargs: + raise TypeError('Cannot pass doc_type') + return Document(self, content=content, + doc_type=Document.HTML, **kwargs) + + def document_from_url(self, gcs_url, + doc_type=Document.PLAIN_TEXT, **kwargs): + """Create a Cloud Storage document bound to this client. + + :type gcs_url: str + :param gcs_url: The URL of the Google Cloud Storage object + holding the content. Of the form + ``gs://{bucket}/{blob-name}``. + + :type doc_type: str + :param doc_type: (Optional) The type of text in the document. + Defaults to plain text. Can also be specified + as HTML via :attr:`~.Document.HTML`. + + :type kwargs: dict + :param kwargs: Remaining keyword arguments to be passed along to the + :class:`Document` constructor. + + :rtype: :class:`Document` + :returns: A document bound to this client. + """ + return Document(self, gcs_url=gcs_url, doc_type=doc_type, **kwargs) + + def document_from_blob(self, bucket_name, blob_name, + doc_type=Document.PLAIN_TEXT, **kwargs): + """Create a Cloud Storage document bound to this client. + + :type bucket_name: str + :param bucket_name: The name of the bucket that contains the + document text. + + :type blob_name: str + :param blob_name: The name of the blob (within the bucket) that + contains document text. + + :type doc_type: str + :param doc_type: (Optional) The type of text in the document. + Defaults to plain text. Can also be specified + as HTML via :attr:`~.Document.HTML`. + + :type kwargs: dict + :param kwargs: Remaining keyword arguments to be passed along to the + :class:`Document` constructor. + + :rtype: :class:`Document` + :returns: A document bound to this client. + """ + # NOTE: We assume that the bucket and blob name don't + # need to be URL-encoded. 
+ gcs_url = 'gs://%s/%s' % (bucket_name, blob_name) + return self.document_from_url(gcs_url, doc_type=doc_type, **kwargs) diff --git a/gcloud/language/document.py b/gcloud/language/document.py new file mode 100644 index 000000000000..f25c85a4f4c7 --- /dev/null +++ b/gcloud/language/document.py @@ -0,0 +1,103 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Definition for Google Cloud Natural Language API documents. + +A document is used to hold text to be analyzed and annotated. +""" + + +DEFAULT_LANGUAGE = 'en' +"""Default document language, English.""" + + +class Encoding(object): + """Document text encoding types.""" + + NONE = 'NONE' + """Unspecified encoding type.""" + + UTF8 = 'UTF8' + """UTF-8 encoding type.""" + + UTF16 = 'UTF16' + """UTF-16 encoding type.""" + + UTF32 = 'UTF32' + """UTF-32 encoding type.""" + + +class Document(object): + """Document to send to Google Cloud Natural Language API. + + Represents either plain text or HTML, and the content is either + stored on the document or referred to in a Google Cloud Storage + object. + + :type client: :class:`~gcloud.language.client.Client` + :param client: A client which holds credentials and project + configuration. + + :type content: str + :param content: (Optional) The document text content (either plain + text or HTML). + + :type gcs_url: str + :param gcs_url: (Optional) The URL of the Google Cloud Storage object + holding the content. 
Of the form + ``gs://{bucket}/{blob-name}``. + + :type doc_type: str + :param doc_type: (Optional) The type of text in the document. + Defaults to plain text. Can be one of + :attr:`~.Document.PLAIN_TEXT` + or :attr:`~.Document.HTML`. + + :type language: str + :param language: (Optional) The language of the document text. + Defaults to :data:`DEFAULT_LANGUAGE`. + + :type encoding: str + :param encoding: (Optional) The encoding of the document text. + Defaults to UTF-8. Can be one of + :attr:`~.Encoding.UTF8`, :attr:`~.Encoding.UTF16` + or :attr:`~.Encoding.UTF32`. + + :raises: :class:`~exceptions.ValueError` if both ``content`` and + ``gcs_url`` are specified or if neither is specified. + """ + + TYPE_UNSPECIFIED = 'TYPE_UNSPECIFIED' + """Unspecified document type.""" + + PLAIN_TEXT = 'PLAIN_TEXT' + """Plain text document type.""" + + HTML = 'HTML' + """HTML document type.""" + + def __init__(self, client, content=None, gcs_url=None, doc_type=PLAIN_TEXT, + language=DEFAULT_LANGUAGE, encoding=Encoding.UTF8): + if content is not None and gcs_url is not None: + raise ValueError('A Document cannot contain both local text and ' + 'a link to text in a Google Cloud Storage object') + if content is None and gcs_url is None: + raise ValueError('A Document must contain either local text or a ' + 'link to text in a Google Cloud Storage object') + self.client = client + self.content = content + self.gcs_url = gcs_url + self.doc_type = doc_type + self.language = language + self.encoding = encoding diff --git a/gcloud/language/test_client.py b/gcloud/language/test_client.py index 5b25a5d083ee..ca3c47ed38d3 100644 --- a/gcloud/language/test_client.py +++ b/gcloud/language/test_client.py @@ -26,14 +26,118 @@ def _makeOne(self, *args, **kw): def test_ctor(self): from gcloud.language.connection import Connection + project = 'PROJECT' creds = _Credentials() http = object() client = self._makeOne(project=project, credentials=creds, http=http) -
self.assertTrue(isinstance(client.connection, Connection)) + self.assertIsInstance(client.connection, Connection) self.assertTrue(client.connection.credentials is creds) self.assertTrue(client.connection.http is http) + def test_document_from_text_factory(self): + from gcloud.language.document import Document + + creds = _Credentials() + client = self._makeOne(project='PROJECT', + credentials=creds, http=object()) + + content = 'abc' + language = 'es' + document = client.document_from_text(content, language=language) + self.assertIsInstance(document, Document) + self.assertIs(document.client, client) + self.assertEqual(document.content, content) + # Test the default arg. + self.assertEqual(document.doc_type, Document.PLAIN_TEXT) + # Test the kwargs as well. + self.assertEqual(document.language, language) + + def test_document_from_text_factory_failure(self): + creds = _Credentials() + client = self._makeOne(project='PROJECT', + credentials=creds, http=object()) + + with self.assertRaises(TypeError): + client.document_from_text('abc', doc_type='foo') + + def test_document_from_html_factory(self): + from gcloud.language.document import Document + + creds = _Credentials() + client = self._makeOne(project='PROJECT', + credentials=creds, http=object()) + + content = 'abc' + language = 'ja' + document = client.document_from_html(content, language=language) + self.assertIsInstance(document, Document) + self.assertIs(document.client, client) + self.assertEqual(document.content, content) + # Test the default arg. + self.assertEqual(document.doc_type, Document.HTML) + # Test the kwargs as well. 
+ self.assertEqual(document.language, language) + + def test_document_from_html_factory_failure(self): + creds = _Credentials() + client = self._makeOne(project='PROJECT', + credentials=creds, http=object()) + + with self.assertRaises(TypeError): + client.document_from_html('abc', doc_type='foo') + + def test_document_from_url_factory(self): + from gcloud.language.document import Document + + creds = _Credentials() + client = self._makeOne(project='PROJECT', + credentials=creds, http=object()) + + gcs_url = 'gs://my-text-bucket/sentiment-me.txt' + document = client.document_from_url(gcs_url) + self.assertIsInstance(document, Document) + self.assertIs(document.client, client) + self.assertIsNone(document.content) + self.assertEqual(document.gcs_url, gcs_url) + self.assertEqual(document.doc_type, Document.PLAIN_TEXT) + + def test_document_from_url_factory_explicit(self): + from gcloud.language.document import Document + from gcloud.language.document import Encoding + + creds = _Credentials() + client = self._makeOne(project='PROJECT', + credentials=creds, http=object()) + + encoding = Encoding.UTF32 + gcs_url = 'gs://my-text-bucket/sentiment-me.txt' + document = client.document_from_url(gcs_url, doc_type=Document.HTML, + encoding=encoding) + self.assertIsInstance(document, Document) + self.assertIs(document.client, client) + self.assertIsNone(document.content) + self.assertEqual(document.gcs_url, gcs_url) + self.assertEqual(document.doc_type, Document.HTML) + self.assertEqual(document.encoding, encoding) + + def test_document_from_blob_factory(self): + from gcloud.language.document import Document + + creds = _Credentials() + client = self._makeOne(project='PROJECT', + credentials=creds, http=object()) + + bucket_name = 'my-text-bucket' + blob_name = 'sentiment-me.txt' + gcs_url = 'gs://%s/%s' % (bucket_name, blob_name) + document = client.document_from_blob(bucket_name, blob_name) + self.assertIsInstance(document, Document) + self.assertIs(document.client, client) + 
self.assertIsNone(document.content) + self.assertEqual(document.gcs_url, gcs_url) + self.assertEqual(document.doc_type, Document.PLAIN_TEXT) + class _Credentials(object): diff --git a/gcloud/language/test_document.py b/gcloud/language/test_document.py new file mode 100644 index 000000000000..2b52f13a7b31 --- /dev/null +++ b/gcloud/language/test_document.py @@ -0,0 +1,64 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + + +class TestDocument(unittest.TestCase): + + def _getTargetClass(self): + from gcloud.language.document import Document + return Document + + def _makeOne(self, *args, **kw): + return self._getTargetClass()(*args, **kw) + + def test_constructor_defaults(self): + import gcloud.language.document as MUT + + client = object() + content = 'abc' + document = self._makeOne(client, content) + self.assertIs(document.client, client) + self.assertEqual(document.content, content) + self.assertIsNone(document.gcs_url) + self.assertEqual(document.doc_type, MUT.Document.PLAIN_TEXT) + self.assertEqual(document.language, MUT.DEFAULT_LANGUAGE) + self.assertEqual(document.encoding, MUT.Encoding.UTF8) + + def test_constructor_explicit(self): + import gcloud.language.document as MUT + + client = object() + gcs_url = 'gs://some-bucket/some-obj.html' + language = 'ja' + document = self._makeOne(client, gcs_url=gcs_url, + doc_type=MUT.Document.HTML, + language=language, + encoding=MUT.Encoding.UTF32) + 
self.assertIs(document.client, client) + self.assertIsNone(document.content) + self.assertEqual(document.gcs_url, gcs_url) + self.assertEqual(document.doc_type, MUT.Document.HTML) + self.assertEqual(document.language, language) + self.assertEqual(document.encoding, MUT.Encoding.UTF32) + + def test_constructor_no_text(self): + with self.assertRaises(ValueError): + self._makeOne(None, content=None, gcs_url=None) + + def test_constructor_text_and_gcs(self): + with self.assertRaises(ValueError): + self._makeOne(None, content='abc', + gcs_url='gs://some-bucket/some-obj.txt')