Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions Lib/test/test_minidom.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,46 @@ def testWriteXML(self):
dom.unlink()
self.confirm(str == domstr)

def test_toxml_quote_text(self):
    # Serializing text nodes: the expected output below has '&', '<' and
    # '>' replaced by entities, while '"' and the whitespace characters
    # (\t, \n, \r) are written out unchanged.
    doc = Document()
    root = doc.appendChild(doc.createElement('elem'))
    root.appendChild(doc.createTextNode('&<>"'))
    # One child element per whitespace combination, in document order.
    for tag, payload in (
        ('cr', '\r'),
        ('crlf', '\r\n'),
        ('lflf', '\n\n'),
        ('ws', '\t\n\r '),
    ):
        child = root.appendChild(doc.createElement(tag))
        child.appendChild(doc.createTextNode(payload))
    serialized = doc.toxml()
    doc.unlink()
    expected = (
        '<?xml version="1.0" ?>'
        '<elem>&amp;&lt;&gt;"'
        '<cr>\r</cr>'
        '<crlf>\r\n</crlf>'
        '<lflf>\n\n</lflf>'
        '<ws>\t\n\r </ws></elem>'
    )
    self.assertEqual(serialized, expected)

def test_toxml_quote_attrib(self):
    # Serializing attribute values: the expected output below has '&',
    # '<', '>' and '"' replaced by entities, and \t, \n, \r replaced by
    # numeric character references; the plain space is left as-is.
    doc = Document()
    root = doc.appendChild(doc.createElement('elem'))
    # Attributes are set in the same order they appear in the expected
    # serialization.
    for attr_name, attr_value in (
        ("a", '&<>"'),
        ("cr", "\r"),
        ("lf", "\n"),
        ("crlf", "\r\n"),
        ("lflf", "\n\n"),
        ("ws", "\t\n\r "),
    ):
        root.setAttribute(attr_name, attr_value)
    serialized = doc.toxml()
    doc.unlink()
    expected = (
        '<?xml version="1.0" ?>'
        '<elem a="&amp;&lt;&gt;&quot;" '
        'cr="&#13;" '
        'lf="&#10;" '
        'crlf="&#13;&#10;" '
        'lflf="&#10;&#10;" '
        'ws="&#09;&#10;&#13; "/>'
    )
    self.assertEqual(serialized, expected)
Comment thread
serhiy-storchaka marked this conversation as resolved.
Outdated

def testAltNewline(self):
str = '<?xml version="1.0" ?>\n<a b="c"/>\n'
dom = parseString(str)
Expand Down
30 changes: 23 additions & 7 deletions Lib/xml/dom/minidom.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,12 +300,28 @@ def _in_document(node):
node = node.parentNode
return False

def _write_data(writer, data):
def _write_data(writer, text, attr):
Comment thread
scoder marked this conversation as resolved.
"Writes datachars to writer."
if data:
data = data.replace("&", "&amp;").replace("<", "&lt;"). \
replace("\"", "&quot;").replace(">", "&gt;")
writer.write(data)
if not text:
return
# See the comments in ElementTree.py for behavior and
# implementation details.
if "&" in text:
text = text.replace("&", "&amp;")
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
text = text.replace(">", "&gt;")
if attr:
if '"' in text:
text = text.replace('"', "&quot;")
if "\r" in text:
text = text.replace("\r", "&#13;")
if "\n" in text:
text = text.replace("\n", "&#10;")
if "\t" in text:
text = text.replace("\t", "&#09;")
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder, why &#09; and not simply &#9;? Is there a reason?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No technical reason. Probably based on Python's hex character spelling or due to consistency with the other two-digit codes above. The XML character spec does not need (or mention) leading zeros.

I'm happy to keep the leading zero. If you need compact data, use compression. That's way more effective than stripping some zeros from rare tab characters.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xml.sax.saxutils.quoteattr() uses &#9;.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which makes me wonder why we need a new implementation here, rather than importing the existing one.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I now looked up the implementation in saxutils.py – it looks fairly slow. minidom will probably not become high-performance by any accident, but it doesn't feel good to slow it down even more. It's probably worth a new implementation.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is sad, but there is a copy of the escaping function in almost every module which outputs XML or HTML. On one hand, it is a trivial function, and we want to avoid unneeded dependencies. On the other hand, an efficient and complete implementation is not so trivial. But the xml.sax.saxutils version is too generalized and far from being efficient.

It was worse in the past. Now much code just uses html.escape().

Comment thread
serhiy-storchaka marked this conversation as resolved.
Outdated
writer.write(text)

def _get_elements_by_tagName_helper(parent, name, rc):
for node in parent.childNodes:
Expand Down Expand Up @@ -883,7 +899,7 @@ def writexml(self, writer, indent="", addindent="", newl=""):

for a_name in attrs.keys():
writer.write(" %s=\"" % a_name)
_write_data(writer, attrs[a_name].value)
_write_data(writer, attrs[a_name].value, True)
writer.write("\"")
if self.childNodes:
writer.write(">")
Expand Down Expand Up @@ -1112,7 +1128,7 @@ def splitText(self, offset):
return newText

def writexml(self, writer, indent="", addindent="", newl=""):
_write_data(writer, "%s%s%s" % (indent, self.data, newl))
_write_data(writer, "%s%s%s" % (indent, self.data, newl), False)

# DOM Level 3 (WD 9 April 2002)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:mod:`xml.dom.minidom` now preserves whitespace in attributes.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:mod:`xml.dom.minidom` now only quotes ``"`` in attributes.