Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add format_as to extract() methods #101

Closed
wants to merge 5 commits into from
Closed
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 24 additions & 9 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def __init__(self, *args, **kwargs):
kwargs.setdefault('resolve_entities', False)
super(SafeXMLParser, self).__init__(*args, **kwargs)


_ctgroup = {
'html': {'_parser': html.HTMLParser,
'_csstranslator': HTMLTranslator(),
Expand Down Expand Up @@ -118,23 +119,29 @@ def re_first(self, regex, default=None, replace_entities=True):
else:
return default

def extract(self):
def extract(self, format_as=None):
"""
Call the ``.extract()`` method for each element in this list and return
their results flattened, as a list of unicode strings.

``format_as`` can take either ``xml`` or ``html`` values to pretty format the output.
"""
return [x.extract() for x in self]
return [x.extract(format_as=format_as) for x in self]

getall = extract

def extract_first(self, default=None):
def extract_first(self, default=None, format_as=None):
"""
Return the result of ``.extract()`` for the first element in this list.
If the list is empty, return the default value.

``format_as`` can take either ``xml`` or ``html`` values to pretty format the output.
"""
for x in self:
return x.extract()
return x.extract(format_as=format_as)
else:
return default

get = extract_first


Expand Down Expand Up @@ -276,23 +283,29 @@ def re_first(self, regex, default=None, replace_entities=True):
"""
return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)

def extract(self):
def extract(self, format_as=None):
"""
Serialize and return the matched nodes in a single unicode string.
Percent encoded content is unquoted.

``format_as`` can take either ``xml`` or ``html`` values to pretty format the output.
"""
try:
return etree.tostring(self.root,
method=self._tostring_method,
encoding='unicode',
with_tail=False)
return etree.tostring(
self.root,
method=format_as or self._tostring_method,
encoding='unicode',
with_tail=False,
pretty_print=bool(format_as),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't look right: it's using the boolean value of `format_as` to decide whether to pretty-print.
Can you please add tests for the full behavior? :)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eliasdorneles The logic behind this is that if `format_as` is explicitly specified, then the output should be pretty-printed as well. Alternatively, a separate `pretty_print` argument could be added, I guess.

)
except (AttributeError, TypeError):
if self.root is True:
return u'1'
elif self.root is False:
return u'0'
else:
return six.text_type(self.root)

get = extract

def getall(self):
Expand Down Expand Up @@ -331,9 +344,11 @@ def __bool__(self):
given by the contents it selects.
"""
return bool(self.extract())

__nonzero__ = __bool__

def __str__(self):
data = repr(self.extract()[:40])
return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)

__repr__ = __str__