Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add format_as to extract() methods #101

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 34 additions & 11 deletions parsel/selector.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
XPath selectors based on lxml
"""

import re
import sys

import six
Expand All @@ -16,6 +16,7 @@ def __init__(self, *args, **kwargs):
kwargs.setdefault('resolve_entities', False)
super(SafeXMLParser, self).__init__(*args, **kwargs)


_ctgroup = {
'html': {'_parser': html.HTMLParser,
'_csstranslator': HTMLTranslator(),
Expand Down Expand Up @@ -118,23 +119,29 @@ def re_first(self, regex, default=None, replace_entities=True):
else:
return default

def extract(self):
def extract(self, format_as=None):
"""
Call the ``.extract()`` method for each element is this list and return
their results flattened, as a list of unicode strings.

``format_as`` can take either ``xml`` or ``html`` values to pretty format the output.
"""
return [x.extract() for x in self]
return [x.extract(format_as=format_as) for x in self]

getall = extract

def extract_first(self, default=None):
def extract_first(self, default=None, format_as=None):
"""
Return the result of ``.extract()`` for the first element in this list.
If the list is empty, return the default value.

``format_as`` can take either ``xml`` or ``html`` values to pretty format the output.
"""
for x in self:
return x.extract()
return x.extract(format_as=format_as)
else:
return default

get = extract_first


Expand Down Expand Up @@ -276,23 +283,37 @@ def re_first(self, regex, default=None, replace_entities=True):
"""
return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)

def extract(self):
def extract(self, format_as=None):
"""
Serialize and return the matched nodes in a single unicode string.
Percent encoded content is unquoted.

``format_as`` can take either ``xml`` or ``html`` values to pretty format the output.
"""
if format_as and format_as not in _ctgroup:
raise ValueError('format_as argument has to be one of: {}'.format(list(_ctgroup)))
try:
return etree.tostring(self.root,
method=self._tostring_method,
encoding='unicode',
with_tail=False)
except (AttributeError, TypeError):
body = etree.tostring(
self.root,
method=format_as or self._tostring_method,
encoding='unicode',
with_tail=False,
pretty_print=bool(format_as),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't look right, it's using the boolean value of format_as.
Can you please add tests for the full behavior? :)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eliasdorneles the logic behind this is that if format_as is explicitly specified, the output should be pretty-printed as well. Alternatively, a separate pretty_print argument could be added, I guess.

)
# if xml remove leading and trailing <html> and <body> tags
if format_as == 'xml' and body.strip().startswith('<html>'):
re_padding = re.compile('^\s{0,4}')
body = body.split('\n')[2:-3]
body = '\n'.join(re_padding.sub('', b) for b in body) + '\n'
return body
except (AttributeError, TypeError) as e:
if self.root is True:
return u'1'
elif self.root is False:
return u'0'
else:
return six.text_type(self.root)

get = extract

def getall(self):
Expand Down Expand Up @@ -331,9 +352,11 @@ def __bool__(self):
given by the contents it selects.
"""
return bool(self.extract())

__nonzero__ = __bool__

def __str__(self):
data = repr(self.extract()[:40])
return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)

__repr__ = __str__
39 changes: 37 additions & 2 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,41 @@ def test_extract_first_default(self):

self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing')

def test_extract_format_as(self):
"""Test that extract() honours the ``format_as`` argument.

Exercises ``format_as='xml'``, ``format_as='html'`` and ``format_as=None``
on a single selector and on the list returned by ``xpath()``, and checks
that an unsupported format name raises ``ValueError``.
"""
body = u'<ul><li><span>1</span></li><li><span>2</span></li></ul>'
# html: selector built without an explicit ``type``; the expected strings
# for format_as='html'/None below show the fragment wrapped in <html><body>
sel = self.sscls(text=body)
# 'xml' on the whole document: expected output starts at <ul>, i.e. without
# the surrounding <html>/<body> tags
self.assertEqual(
sel.extract(format_as='xml'),
'<ul>\n <li>\n <span>1</span>\n </li>\n <li>\n <span>2</span>\n </li>\n</ul>\n')
# the list returned by xpath() accepts format_as too and returns a list
self.assertEqual(
sel.xpath('//ul').extract(format_as='xml'),
['<ul>\n <li>\n <span>1</span>\n </li>\n <li>\n <span>2</span>\n </li>\n</ul>\n'])
# format_as=None: expected output is the compact, unformatted markup
self.assertEqual(
sel.xpath('//ul').extract(format_as=None),
['<ul><li><span>1</span></li><li><span>2</span></li></ul>'])
# 'html' on the whole document: the <html>/<body> wrapper is kept
self.assertEqual(
sel.extract(format_as='html'),
'<html><body><ul>\n<li><span>1</span></li>\n<li><span>2</span></li>\n</ul></body></html>\n')
self.assertEqual(
sel.xpath('//ul').extract(format_as='html'),
['<ul>\n<li><span>1</span></li>\n<li><span>2</span></li>\n</ul>\n'])
# format_as=None on the whole document: compact output including the wrapper
self.assertEqual(
sel.extract(format_as=None),
'<html><body><ul><li><span>1</span></li><li><span>2</span></li></ul></body></html>')
# NOTE(review): this assertion repeats the format_as=None list check above
self.assertEqual(
sel.xpath('//ul').extract(format_as=None),
['<ul><li><span>1</span></li><li><span>2</span></li></ul>'])
# xml: same markup, selector built with type='xml'
sel = self.sscls(text=body, type='xml')
self.assertEqual(
sel.extract(format_as='xml'),
'<ul>\n <li>\n <span>1</span>\n </li>\n <li>\n <span>2</span>\n </li>\n</ul>\n')
# errors: a format name other than the supported ones must raise ValueError
with self.assertRaises(ValueError):
sel.extract(format_as='json')

def test_selector_get_alias(self):
"""Test if get() returns extracted value on a Selector"""
body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
Expand Down Expand Up @@ -455,7 +490,7 @@ def test_re(self):
["John", "Paul"])
self.assertEqual(x.xpath("//ul/li").re("Age: (\d+)"),
["10", "20"])

# Test named group, hit and miss
x = self.sscls(text=u'foobar')
self.assertEqual(x.re('(?P<extract>foo)'), ['foo'])
Expand All @@ -468,7 +503,7 @@ def test_re(self):
def test_re_replace_entities(self):
body = u"""<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>"""
x = self.sscls(text=body)

name_re = re.compile('{"foo":(.*)}')

# by default, only &amp; and &lt; are preserved ;
Expand Down