Skip to content

Commit

Permalink
chg: don't use regex to clean up XML following pretty-printing
Browse files Browse the repository at this point in the history
- default xml.dom.minidom pretty-printer treats each node in a mixed-content
  element as a separate node. As a result it adds newlines and indents
  for each text node and output element, which is ugly and wrong. Then
  survey.py cleans that up with some regex.
- this change copies and customises the default writexml implementation
  from minidom, and skips newline/indent for text nodes. It also
  includes conditional whitespace to match the previous processing, in
  case Collect or Enketo expected it to be exactly that way.
- added a test to specifically enumerate various output/text mixture
  permutations that appear in a variety of existing tests.
  • Loading branch information
lindsay-stevens committed Feb 1, 2024
1 parent 0f6b4a5 commit 8f75605
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 18 deletions.
18 changes: 2 additions & 16 deletions pyxform/survey.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@
r"(instance\(.*\)\/root\/item\[.*?(\$\{.*\})\]\/.*?)\s"
)
# Captures the opening of a pulldata call ("pulldata(" plus optional whitespace)
# and, lazily, the text up to the first following comma.
RE_PULLDATA = re.compile(r"(pulldata\s*\(\s*)(.*?),")
# Matches a newline, the rest of that line through an "<output...>" tag (captured),
# then a newline and any run of whitespace pairs.
# NOTE(review): only referenced by the regex-based cleanup in _to_pretty_xml shown
# in this diff; presumably removed by this commit - confirm.
RE_XML_OUTPUT = re.compile(r"\n.*(<output.*>)\n(\s\s)*")
# Matches text sitting on its own line(s) between ">" and a closing "</"; the three
# groups keep the ">", the text (with one leading whitespace character) and the
# "</" (with one leading whitespace character).
# NOTE(review): same as above, presumably removed by this commit - confirm.
RE_XML_TEXT = re.compile(r"(>)\n\s*(\s[^<>\s].*?)\n\s*(\s</)", re.DOTALL)
# Matches a "search(...)" call, lazily up to the first closing parenthesis.
SEARCH_APPEARANCE_REGEX = re.compile(r"search\(.*?\)")


Expand Down Expand Up @@ -965,20 +963,8 @@ def _to_ugly_xml(self):
return '<?xml version="1.0"?>' + self.xml().toxml()

def _to_pretty_xml(self):
# NOTE(review): the +/- markers of this diff were lost, so removed and added
# lines are interleaved below. Presumably everything from the first docstring
# through the first `return` is the removed implementation - confirm.
"""
I want the to_xml method to by default validate the xml we are
producing.
"""
# Hacky way of pretty printing xml without adding extra white
# space to text
# TODO: check out pyxml
# http://ronrothman.com/public/leftbraned/xml-dom-minidom-toprettyxml-and-silly-whitespace/
xml_with_linebreaks = self.xml().toprettyxml(indent=" ")
# Rejoin the three captured groups, dropping the newline/indent runs that were
# matched between them.
pretty_xml = RE_XML_TEXT.sub(
lambda m: "".join(m.group(1, 2, 3)), xml_with_linebreaks
)
# Replace each match with only the captured "<output ...>" tag.
inline_output = RE_XML_OUTPUT.sub(r"\g<1>", pretty_xml)
return '<?xml version="1.0"?>\n' + inline_output
# NOTE(review): presumably the replacement implementation added by this commit,
# which returns the toprettyxml output without regex post-processing - confirm.
"""Get the XForm with human readable formatting."""
return '<?xml version="1.0"?>\n' + self.xml().toprettyxml(indent=" ")

def __repr__(self):
    """Return the same text that ``__unicode__`` produces for this object."""
    text = self.__unicode__()
    return text
Expand Down
37 changes: 36 additions & 1 deletion pyxform/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
from collections import namedtuple
from json.decoder import JSONDecodeError
from typing import Dict, List, Tuple
from xml.dom.minidom import Element, Text, parseString
from xml.dom import Node
from xml.dom.minidom import Element, Text, _write_data, parseString

import openpyxl
import xlrd
Expand All @@ -26,6 +27,7 @@
# Matches "${name}" references; an optional "last-saved#" prefix is captured
# separately from the name.
BRACKETED_TAG_REGEX = re.compile(r"\${(last-saved#)?(.*?)}")
# Matches only "${last-saved#name}" references, capturing the name.
LAST_SAVED_REGEX = re.compile(r"\${last-saved#(.*?)}")
# Matches any "${...}" reference, capturing the text between the braces.
PYXFORM_REFERENCE_REGEX = re.compile(r"\$\{(.*?)\}")
# DOM node types treated as text content: plain text nodes and CDATA sections.
NODE_TYPE_TEXT = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)


NSMAP = {
Expand Down Expand Up @@ -54,6 +56,39 @@ def __init__(self, *args, **kwargs):
Element.__init__(self, *args, **kwargs)
self.ownerDocument = None

def writexml(self, writer, indent="", addindent="", newl=""):
    """
    Write this element, its attributes and its children to ``writer`` as XML.

    Elements with at least one text or CDATA child are written inline: no
    newline or indent is inserted around their children. All other elements
    are written with each child on its own line at one extra indent level.

    :param writer: Object with a ``write`` method that receives the output text.
    :param indent: Current indentation.
    :param addindent: Indentation to add to higher levels.
    :param newl: Newline string.
    """
    writer.write(indent + "<" + self.tagName)

    attrs = self._get_attributes()

    # NOTE(review): _write_data is a private minidom helper; its signature is
    # not a stable API and may differ between Python versions - confirm
    # against the Python versions this project supports.
    for a_name in attrs.keys():
        writer.write(' %s="' % a_name)
        _write_data(writer, attrs[a_name].value)
        writer.write('"')

    # Childless elements are written as a self-closing tag.
    if not self.childNodes:
        writer.write("/>%s" % (newl))
        return

    writer.write(">")
    # For text or mixed content, write without adding indents or newlines.
    if any(c.nodeType in NODE_TYPE_TEXT for c in self.childNodes):
        # Conditions to match old Survey.py regex for remaining whitespace:
        # padding is only ever added when there is more than one child.
        child_count = len(self.childNodes)
        has_siblings = 1 < child_count
        for idx, cnode in enumerate(self.childNodes):
            # Leading space only when the first child is itself text.
            if has_siblings and idx == 0 and cnode.nodeType in NODE_TYPE_TEXT:
                writer.write(" ")
            cnode.writexml(writer, "", "", "")
            # Trailing space after the last child, whatever its type.
            if has_siblings and (idx + 1) == child_count:
                writer.write(" ")
    else:
        writer.write(newl)
        for cnode in self.childNodes:
            cnode.writexml(writer, indent + addindent, addindent, newl)
        writer.write(indent)
    writer.write("</%s>%s" % (self.tagName, newl))


class PatchedText(Text):
def writexml(self, writer, indent="", addindent="", newl=""):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_repeat.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def test_repeat_relative_reference(self):
"""<label> B w <output value=" ../A "/> </label>""",
"""<label> E w <output value=" /test_repeat/Z "/> </label>""",
"""<label> Noted <output value=" ../FF "/> w """
"""<output value=" ../sectionb/H "/> </label></input>""",
"""<output value=" ../sectionb/H "/> </label>""",
],
)

Expand Down
74 changes: 74 additions & 0 deletions tests/test_whitespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,80 @@ def test_over_trim(self):
xml__contains=['<label><output value=" /issue96/var "/> text </label>'],
)

def test_whitespace_output_permutations(self):
    """Labels should keep the expected whitespace around and between output references."""
    md = """
| survey | | |
| | type | name | label |
| | text | A | None |
| | text | B1 | Before {0} |
| | text | C1 | {0} After |
| | text | D1 | Before x2 {0} {0} |
| | text | E1 | {0} {0} After x2 |
| | text | F1 | {0} Between {0} |
| | text | G1 | Wrap {0} in text |
| | text | H1 | Wrap {0} in {0} text |
| | text | I1 | Wrap {0} in {0} |
"""
    label_xpath = "/h:html/h:body/x:input[@ref='/test_name/{}']/x:label"
    for case in ("A", "B1"):
        with self.subTest(msg=case):
            # Serialised form of a single reference to the question under test.
            out = f'<output value=" /test_name/{case} "/>'
            # Question name -> exact label markup expected, in survey order.
            expected_labels = {
                "A": "<label>None</label>",
                "B1": f"<label> Before {out} </label>",
                "C1": f"<label>{out} After </label>",
                "D1": f"<label> Before x2 {out} {out} </label>",
                "E1": f"<label>{out} {out} After x2 </label>",
                "F1": f"<label>{out} Between {out} </label>",
                "G1": f"<label> Wrap {out} in text </label>",
                "H1": f"<label> Wrap {out} in {out} text </label>",
                "I1": f"<label> Wrap {out} in {out} </label>",
            }
            self.assertPyxformXform(
                md=md.format(f"${{{case}}}"),
                xml__xpath_exact=[
                    (label_xpath.format(name), {label})
                    for name, label in expected_labels.items()
                ],
            )

def test_values_without_whitespaces_are_processed_successfully(self):
md = """
| survey | | | |
Expand Down

0 comments on commit 8f75605

Please sign in to comment.