scrapy · Granitosaurus · Sep 30, 2017 · Sep 30, 2017 · Oct 17, 2017 · Oct 18, 2017
diff --git a/parsel/selector.py b/parsel/selector.py
@@ -1,7 +1,7 @@
 """
 XPath selectors based on lxml
 """
-
+import re
 import sys
 
 import six
@@ -16,6 +16,7 @@ def __init__(self, *args, **kwargs):
         kwargs.setdefault('resolve_entities', False)
         super(SafeXMLParser, self).__init__(*args, **kwargs)
 
+
 _ctgroup = {
     'html': {'_parser': html.HTMLParser,
              '_csstranslator': HTMLTranslator(),
@@ -118,23 +119,29 @@ def re_first(self, regex, default=None, replace_entities=True):
         else:
             return default
 
-    def extract(self):
+    def extract(self, format_as=None):
         """
         Call the ``.extract()`` method for each element is this list and return
         their results flattened, as a list of unicode strings.
+
+        ``format_as`` may be either ``xml`` or ``html`` to pretty-print the output.
         """
-        return [x.extract() for x in self]
+        return [x.extract(format_as=format_as) for x in self]
+
     getall = extract
 
-    def extract_first(self, default=None):
+    def extract_first(self, default=None, format_as=None):
         """
         Return the result of ``.extract()`` for the first element in this list.
         If the list is empty, return the default value.
+
+        ``format_as`` may be either ``xml`` or ``html`` to pretty-print the output.
         """
         for x in self:
-            return x.extract()
+            return x.extract(format_as=format_as)
         else:
             return default
+
     get = extract_first
 
 
@@ -276,23 +283,37 @@ def re_first(self, regex, default=None, replace_entities=True):
         """
         return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)
 
-    def extract(self):
+    def extract(self, format_as=None):
         """
         Serialize and return the matched nodes in a single unicode string.
         Percent encoded content is unquoted.
+
+        ``format_as`` may be ``xml`` or ``html`` to pretty-print the output; any other truthy value raises ``ValueError``.
         """
+        if format_as and format_as not in _ctgroup:
+            raise ValueError('format_as argument has to be one of: {}'.format(list(_ctgroup)))
         try:
-            return etree.tostring(self.root,
-                                  method=self._tostring_method,
-                                  encoding='unicode',
-                                  with_tail=False)
-        except (AttributeError, TypeError):
+            body = etree.tostring(
+                self.root,
+                method=format_as or self._tostring_method,
+                encoding='unicode',
+                with_tail=False,
+                pretty_print=bool(format_as),
+            )
+            # for xml output, drop the wrapping <html>/<body> lines and dedent the rest by up to 4 spaces
+            if format_as == 'xml' and body.strip().startswith('<html>'):
+                re_padding = re.compile(r'^\s{0,4}')
+                body = body.split('\n')[2:-3]
+                body = '\n'.join(re_padding.sub('', b) for b in body) + '\n'
+            return body
+        except (AttributeError, TypeError):
             if self.root is True:
                 return u'1'
             elif self.root is False:
                 return u'0'
             else:
                 return six.text_type(self.root)
+
     get = extract
 
     def getall(self):
@@ -331,9 +352,11 @@ def __bool__(self):
         given by the contents it selects.
         """
         return bool(self.extract())
+
     __nonzero__ = __bool__
 
     def __str__(self):
         data = repr(self.extract()[:40])
         return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
+
     __repr__ = __str__
diff --git a/tests/test_selector.py b/tests/test_selector.py
@@ -139,6 +139,41 @@ def test_extract_first_default(self):
 
         self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing')
 
+    def test_extract_format_as(self):
+        """Test if extract() pretty-formats its output according to format_as"""
+        body = u'<ul><li><span>1</span></li><li><span>2</span></li></ul>'
+        # html
+        sel = self.sscls(text=body)
+        self.assertEqual(
+            sel.extract(format_as='xml'),
+            '<ul>\n  <li>\n    <span>1</span>\n  </li>\n  <li>\n    <span>2</span>\n  </li>\n</ul>\n')
+        self.assertEqual(
+            sel.xpath('//ul').extract(format_as='xml'),
+            ['<ul>\n  <li>\n    <span>1</span>\n  </li>\n  <li>\n    <span>2</span>\n  </li>\n</ul>\n'])
+        self.assertEqual(
+            sel.xpath('//ul').extract(format_as=None),
+            ['<ul><li><span>1</span></li><li><span>2</span></li></ul>'])
+        self.assertEqual(
+            sel.extract(format_as='html'),
+            '<html><body><ul>\n<li><span>1</span></li>\n<li><span>2</span></li>\n</ul></body></html>\n')
+        self.assertEqual(
+            sel.xpath('//ul').extract(format_as='html'),
+            ['<ul>\n<li><span>1</span></li>\n<li><span>2</span></li>\n</ul>\n'])
+        self.assertEqual(
+            sel.extract(format_as=None),
+            '<html><body><ul><li><span>1</span></li><li><span>2</span></li></ul></body></html>')
+        self.assertEqual(
+            sel.xpath('//ul').extract(format_as=None),
+            ['<ul><li><span>1</span></li><li><span>2</span></li></ul>'])
+        # xml
+        sel = self.sscls(text=body, type='xml')
+        self.assertEqual(
+            sel.extract(format_as='xml'),
+            '<ul>\n  <li>\n    <span>1</span>\n  </li>\n  <li>\n    <span>2</span>\n  </li>\n</ul>\n')
+        # errors
+        with self.assertRaises(ValueError):
+            sel.extract(format_as='json')
+
     def test_selector_get_alias(self):
         """Test if get() returns extracted value on a Selector"""
         body = u'<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>'
@@ -455,7 +490,7 @@ def test_re(self):
                          ["John", "Paul"])
         self.assertEqual(x.xpath("//ul/li").re("Age: (\d+)"),
                          ["10", "20"])
-        
+
         # Test named group, hit and miss
         x = self.sscls(text=u'foobar')
         self.assertEqual(x.re('(?P<extract>foo)'), ['foo'])
@@ -468,7 +503,7 @@ def test_re(self):
     def test_re_replace_entities(self):
         body = u"""<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>"""
         x = self.sscls(text=body)
-        
+
         name_re = re.compile('{"foo":(.*)}')
 
         # by default, only &amp; and &lt; are preserved ;