Skip to content

Commit

Permalink
Adding Document.analyze_entities() in language.
Browse files: browse the repository at this point in the history
  • Loading branch information
dhermes committed Aug 23, 2016
1 parent 7dbdefb commit a34109d
Show file tree
Hide file tree
Showing 2 changed files with 167 additions and 0 deletions.
43 changes: 43 additions & 0 deletions gcloud/language/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
A document is used to hold text to be analyzed and annotated.
"""

from gcloud.language.entity import Entity


# Default language code for documents; 'en' denotes English.
DEFAULT_LANGUAGE = 'en'
"""Default document language, English."""
Expand Down Expand Up @@ -101,3 +103,44 @@ def __init__(self, client, content=None, gcs_url=None, doc_type=PLAIN_TEXT,
self.doc_type = doc_type
self.language = language
self.encoding = encoding

def _to_dict(self):
    """Build the JSON-style mapping that describes this document.

    Intended for use when constructing API requests. The ``type`` and
    ``language`` keys are always present. ``content`` is added when
    inline content is set; otherwise ``gcsContentUri`` is added when a
    GCS URL is set. If neither is set, neither key appears.

    :rtype: dict
    :returns: The Document value as a JSON dictionary.
    """
    payload = dict(type=self.doc_type, language=self.language)
    # Inline content takes precedence over a GCS URL.
    if self.content is None:
        if self.gcs_url is not None:
            payload['gcsContentUri'] = self.gcs_url
    else:
        payload['content'] = self.content
    return payload

def analyze_entities(self):
    """Analyze the entities in the current document.

    Finds named entities (currently finds proper names as of August 2016)
    in the text, entity types, salience, mentions for each entity, and
    other properties.

    See:
    https://cloud.google.com/natural-language/reference/\
    rest/v1beta1/documents/analyzeEntities

    :rtype: list
    :returns: A list of :class:`Entity` returned from the API. The list
              is empty when the response has no ``entities`` key.
    """
    data = {
        'document': self._to_dict(),
        'encodingType': self.encoding,
    }
    api_response = self.client.connection.api_request(
        method='POST', path='analyzeEntities', data=data)
    # A response without an 'entities' key is treated as "no entities
    # found" instead of raising KeyError.
    return [Entity.from_api_repr(entity)
            for entity in api_response.get('entities', ())]
124 changes: 124 additions & 0 deletions gcloud/language/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,127 @@ def test_constructor_text_and_gcs(self):
with self.assertRaises(ValueError):
self._makeOne(None, content='abc',
gcs_url='gs://some-bucket/some-obj.txt')

def test__to_dict_with_content(self):
    # A document built from inline text serializes it under 'content'.
    text = 'Hello World'
    doc = self._makeOne(None, content=text)
    expected = {
        'content': text,
        'language': doc.language,
        'type': self._getTargetClass().PLAIN_TEXT,
    }
    self.assertEqual(doc._to_dict(), expected)

def test__to_dict_with_gcs(self):
    # A document built from a GCS URL serializes it under 'gcsContentUri'.
    url = 'gs://some-bucket/some-obj.html'
    doc = self._makeOne(None, gcs_url=url)
    expected = {
        'gcsContentUri': url,
        'language': doc.language,
        'type': self._getTargetClass().PLAIN_TEXT,
    }
    self.assertEqual(doc._to_dict(), expected)

def test__to_dict_with_no_content(self):
    # With no content set, only the always-present keys are emitted.
    doc = self._makeOne(None, content='')
    # Clear the content after construction so the document has none.
    doc.content = None
    expected = {
        'language': doc.language,
        'type': self._getTargetClass().PLAIN_TEXT,
    }
    self.assertEqual(doc._to_dict(), expected)

def test_analyze_entities(self):
    # End-to-end check of Document.analyze_entities() against a canned
    # API response: verifies both the parsed entities and the request
    # (path, method and payload) sent through the connection.
    from gcloud.language.entity import Entity
    from gcloud.language.entity import EntityType

    name1 = 'R-O-C-K'
    name2 = 'USA'
    content = name1 + ' in the ' + name2
    metadata1 = {
        'wikipedia_url': 'http://en.wikipedia.org/wiki/Rock_music',
    }
    metadata2 = {
        'wikipedia_url': 'http://en.wikipedia.org/wiki/United_States',
    }
    salience1 = 0.91391456
    salience2 = 0.086085409
    response = {
        'entities': [
            {
                'name': name1,
                'type': EntityType.OTHER,
                'metadata': metadata1,
                'salience': salience1,
                'mentions': [
                    {
                        'text': {
                            'content': name1,
                            'beginOffset': -1,
                        },
                    },
                ],
            },
            {
                'name': name2,
                'type': EntityType.LOCATION,
                'metadata': metadata2,
                'salience': salience2,
                'mentions': [
                    {
                        'text': {
                            'content': name2,
                            'beginOffset': -1,
                        },
                    },
                ],
            },
        ],
        'language': 'en',
    }
    connection = _Connection(response)
    client = _Client(connection=connection)
    document = self._makeOne(client, content)

    entities = document.analyze_entities()

    # Each returned entity must mirror its entry in the response, in order.
    expected = [
        (name1, EntityType.OTHER, metadata1, salience1),
        (name2, EntityType.LOCATION, metadata2, salience2),
    ]
    self.assertEqual(len(entities), len(expected))
    for entity, values in zip(entities, expected):
        name, entity_type, metadata, salience = values
        self.assertIsInstance(entity, Entity)
        self.assertEqual(entity.name, name)
        self.assertEqual(entity.entity_type, entity_type)
        self.assertEqual(entity.metadata, metadata)
        self.assertEqual(entity.salience, salience)
        self.assertEqual(entity.mentions, [name])

    # Verify the request.
    self.assertEqual(len(connection._requested), 1)
    req = connection._requested[0]
    self.assertEqual(req['path'], 'analyzeEntities')
    self.assertEqual(req['method'], 'POST')
    # The payload must carry the serialized document and its encoding;
    # without this check a broken request body would go unnoticed.
    self.assertEqual(req['data'], {
        'document': {
            'type': document.doc_type,
            'language': document.language,
            'content': content,
        },
        'encodingType': document.encoding,
    })


class _Connection(object):
    """Stand-in connection that records requests and replays a response."""

    def __init__(self, response):
        # Every call to api_request() is logged here, in call order.
        self._requested = []
        self._response = response

    def api_request(self, **kwargs):
        """Record the keyword arguments and return the canned response."""
        self._requested += [kwargs]
        return self._response


class _Client(object):
    """Minimal client stub exposing only a ``connection`` attribute."""

    def __init__(self, connection=None):
        # Stored as given; defaults to None.
        self.connection = connection

0 comments on commit a34109d

Please sign in to comment.