Support more markup transformations, plus more tests

ben-tinc · ben-tinc · commit cb7812fe5ef2 · 2018-10-14T23:37:05.000+02:00
Currently, the following Confluence markup is supported:
* Headings h1. to h4.
* Unordered list items (multilevel)
* Ordered list items (multilevel)
* Text emphasis (italic, bold, and bold italic)

Notable things that are still missing:
* links
* tables
* images
* several text effects such as strikethrough, superscript, etc.
diff --git a/Pipfile b/Pipfile
@@ -9,6 +9,7 @@ anytree = "*"
 
 [dev-packages]
 ipython = "*"
+pytest = "*"
 
 [requires]
 python_version = "3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/README.md b/README.md
@@ -10,3 +10,7 @@ Parse confluence xml export and generate pages with mediawiki markup.
  * Run `python xml2mw.py`.
 
 Now, result files should be in `OUT_PATH`, and a file `sitemap.txt` should be in the base directory.
+
+## Test suite
+
+If you want to run the test suite, just run `python -m unittest discover`. Using `pytest` should work as well, if you prefer that. `pytest` is included in the development dependencies, which you can install with `pipenv install --dev`.
diff --git a/test/test_transform.py b/test/test_transform.py
@@ -9,34 +9,170 @@
 from xml2mw.transform import to_mw
 
 
-class TransformToMW(TestCase):
-    """TestCases for transformation to media wiki markup.
-
-    Note, that `to_mw()` returns a generator, so we need to
-    consume it with e.g. `list()` or `'\n'.join()` to generate
-    an actual result.
+class TransformEdgeCases(TestCase):
+    """Some general tests which do not fit into any 
+    other category.
     """
 
     def test_empty_page(self):
         """A completely empty body should yield an empty generator."""
         page_body = ''
-        result = '\n'.join(to_mw(page_body))
         expected = ''
+        result = '\n'.join(to_mw(page_body))
         self.assertEqual(result, expected)
 
+
+class TransformHeadings(TestCase):
+    """TestCases for transformation of headings, which can only
+    be at the beginning of lines.
+
+    Note, that `to_mw()` returns a generator, so we need to
+    consume it with e.g. `list()` or `'\n'.join()` to generate
+    an actual result.
+    """
+
     def test_various_headings(self):
         """Test transformation of heading markup."""
         page_body = 'h1. H1\nh2. H2\nh3. H3\nh4. H4\n'
-        result = '\n'.join(to_mw(page_body))
         expected = '= H1 =\n== H2 ==\n=== H3 ===\n==== H4 ====\n'
+        result = '\n'.join(to_mw(page_body))
         self.assertEqual(result, expected)
 
-    def test_inline_markup(self):
-        """The markup which is currently implemented must be at the
+    def test_no_inline_headings(self):
+        """Most markup which is currently implemented must be at the
         beginning of lines, i.e. a `h1. ` in the middle of a line
         should not be interpreted as heading markup.
         """
-        page_body = 'h1. A Heading\n\nA line with h1. in it.\n'
+        page_body = 'h1. A Heading\n\nA line with h1. in it.\nA line with h3. in it.\n'
+        expected = '= A Heading =\n\nA line with h1. in it.\nA line with h3. in it.\n'
+        result = '\n'.join(to_mw(page_body))
+        self.assertEqual(result, expected)
+
+
+class TransformLists(TestCase):
+    """Test cases for transformation of unordered and ordered lists.
+
+    Note, that most of the list syntax is identical for confluence
+    and mediawiki markup, so for many of these tests, `to_mw()`
+    should actually be a no-op.
+    Also note that because `to_mw()` returns a generator, we need
+    to consume it with e.g. `list()` or `'\n'.join()` to generate
+    an actual result.
+    """
+
+    def test_unordered_list_a(self):
+        """There are two versions of unordered list items in
+        confluence markup, this tests the `*` version.
+        """
+        page_body = 'Some text.\n* first item.\n* second item.\n'
+        expected = 'Some text.\n* first item.\n* second item.\n'
+        result = '\n'.join(to_mw(page_body))
+        self.assertEqual(result, expected)
+    
+    def test_unordered_list_b(self):
+        """There are two versions of unordered list items in
+        confluence markup, this tests the `-` version.
+        """
+        page_body = 'Some text.\n- first item.\n- second item.\n'
+        expected = 'Some text.\n* first item.\n* second item.\n'
+        result = '\n'.join(to_mw(page_body))
+        self.assertEqual(result, expected)
+    
+    def test_nested_unordered_lists_a(self):
+        """Test nested unordered lists of type `*`, but not mixed ones."""
+        page_body = 'Some text.\n* first level, first item\n** second level, first item\n** second level, second item\n* first level, second item\n'
+        expected = 'Some text.\n* first level, first item\n** second level, first item\n** second level, second item\n* first level, second item\n'
+        result = '\n'.join(to_mw(page_body))
+        self.assertEqual(result, expected)
+    
+    def test_nested_unordered_lists_b(self):
+        """Test nested unordered lists of type `-`, but not mixed ones."""
+        page_body = 'Some text.\n- first level, first item\n-- second level, first item\n-- second level, second item\n- first level, second item\n'
+        expected = 'Some text.\n* first level, first item\n** second level, first item\n** second level, second item\n* first level, second item\n'
+        result = '\n'.join(to_mw(page_body))
+        self.assertEqual(result, expected)
+    
+    def test_ordered_list(self):
+        """Test if ordered list items are transformed correctly."""
+        page_body = 'Some text.\n# first item\n# second item\n'
+        expected = 'Some text.\n# first item\n# second item\n'
+        result = '\n'.join(to_mw(page_body))
+        self.assertEqual(result, expected)
+    
+    def test_nested_ordered_lists(self):
+        """Test nested ordered lists, but not mixed ones."""
+        page_body = 'Some text.\n# first, first\n## second, first\n## second, second\n# first, second\n'
+        expected = 'Some text.\n# first, first\n## second, first\n## second, second\n# first, second\n'
+        result = '\n'.join(to_mw(page_body))
+        self.assertEqual(result, expected)
+
+    def test_mixed_lists(self):
+        """Ordered and unordered lists can be mixed."""
+        page_body = '# first\n#* mixed\n#* list\n# for us\n'
+        expected = '# first\n#* mixed\n#* list\n# for us\n'
         result = '\n'.join(to_mw(page_body))
-        expected = '= A Heading =\n\nA line with h1. in it.\n'
         self.assertEqual(result, expected)
+    
+    def test_no_inline_ordered_lists(self):
+        """Ordered lists must start at the beginning of a line."""
+        page_body = 'Some text with # a pseudo list\nSome more ## fake lists\n'
+        expected = 'Some text with # a pseudo list\nSome more ## fake lists\n'
+        result = '\n'.join(to_mw(page_body))
+        self.assertEqual(result, expected)
+    
+    def test_no_inline_unordered_lists(self):
+        """Unordered lists must start at the beginning of a line."""
+        # Note: we don't test for unordered lists with `*`, because these
+        # would indicate emphasis when not used at the beginning of the line.
+        page_body = 'Some text with - a pseudo-list\nSome more -- fake lists.\n'
+        # Should be a no-op.
+        result = '\n'.join(to_mw(page_body))
+        self.assertEqual(result, page_body)
+
+
+class TransformEmphasis(TestCase):
+    """TestCases for transformations of emphasis, which can occur anywhere
+    in a line.
+
+    Note, that `to_mw()` returns a generator, so we need to
+    consume it with e.g. `list()` or `'\n'.join()` to generate
+    an actual result.
+    """
+
+    def test_light_emphasis(self):
+        """This type of emphasis is normally rendered as cursive text."""
+        known_items = [
+            ("text with _inline emphasis_ and more text", "text with ''inline emphasis'' and more text"),
+            ("_emphasis at the beginning_ and more text", "''emphasis at the beginning'' and more text"),
+            ("emphasis with following _non-white-space_, and text", "emphasis with following ''non-white-space'', and text"),
+        ]
+        for page_body, expected in known_items:
+            result = '\n'.join(to_mw(page_body))
+            self.assertEqual(result, expected)
+    
+    def test_strong_emphasis(self):
+        """This type of emphasis is normally rendered as bold text."""
+        known_items = [
+            ("text with *inline emphasis* and more text", "text with '''inline emphasis''' and more text"),
+            ("*emphasis at the beginning* and more text", "'''emphasis at the beginning''' and more text"),
+            ("emphasis with following *non-white-space*, and text", "emphasis with following '''non-white-space''', and text"),
+        ]
+        for page_body, expected in known_items:
+            result = '\n'.join(to_mw(page_body))
+            self.assertEqual(result, expected)
+    
+    def test_very_strong_emphasis(self):
+        """This type of emphasis is normally rendered as both bold and
+        cursive text.
+        """
+        known_items = [
+            ("text with *_inline emphasis_* and more text", "text with '''''inline emphasis''''' and more text"),
+            ("*_emphasis at the beginning_* and more text", "'''''emphasis at the beginning''''' and more text"),
+            ("emphasis with following *_non-white-space_*, and text", "emphasis with following '''''non-white-space''''', and text"),
+            ("text with _*inline emphasis*_ and more text", "text with '''''inline emphasis''''' and more text"),
+            ("_*emphasis at the beginning*_ and more text", "'''''emphasis at the beginning''''' and more text"),
+            ("emphasis with following _*non-white-space*_, and text", "emphasis with following '''''non-white-space''''', and text"),
+        ]
+        for page_body, expected in known_items:
+            result = '\n'.join(to_mw(page_body))
+            self.assertEqual(result, expected)
diff --git a/xml2mw.py b/xml2mw.py
@@ -23,9 +23,9 @@
 
 from os.path import abspath, basename, dirname, join, normpath
 
-import read_xml
-from sitemap import build_sitemap, write_sitemap
-from write_markup import write_mediawiki
+from xml2mw import read_xml
+from xml2mw.sitemap import build_sitemap, write_sitemap
+from xml2mw.write_markup import write_mediawiki
 
 # Modify this to suit your needs.
 XML_PATH = "./data"           # path to folder which contains your 'entities.xml' export file
diff --git a/xml2mw/transform.py b/xml2mw/transform.py
@@ -7,6 +7,8 @@
 Generate MediaWiki markup out of confluence markup.
 """
 
+import re
+
 # Some basic template strings
 # The confluence -> mediawiki mapping is not a direct one
 # to allow for some logic in an intermediate step.
@@ -18,7 +20,11 @@
     # 'p': '<br />{}',
     'hr': '----{}',
     'uli': '* {}',
+    'ulii': '** {}',
+    'uliii': '*** {}',
     'oli': '# {}',
+    'olii': '## {}',
+    'oliii': '### {}',
 }
 CONFLUENCE_MU = {
     'h1. ': 'h1',
@@ -27,21 +33,57 @@
     'h4. ': 'h4',
     # '\n': 'p',
     '* ': 'uli',
+    '- ': 'uli',   # alternate version
+    '-- ': 'ulii',
+    '--- ': 'uliii',
+}
+EMPHASIS = {
+    '*': "'''",         # strong emphasis
+    '_': "''",          # light emphasis
+    '*_': "'''''",      # very strong emphasis
+    '_*': "'''''",      # very strong emphasis (alternate)
 }
 
 
 def to_mw(body_content):
     """Parse string with confluence markup."""
     for line in body_content.split('\n'):
-        result = ''
-        # Check if the beginning of the line is relevant.
-        for confluence, tag in CONFLUENCE_MU.items():
-            if line.startswith(confluence):
-                rest = line.lstrip(confluence)
-                result = __get_markup(tag, rest)
-                break
+        possibly_changed = _transform_line_start(line)
+        possibly_changed = _transform_inner_line(possibly_changed)
+
         # Either yield the freshly created result, or the original line.
-        yield result or line
+        yield possibly_changed
+
+
+def _transform_line_start(line):
+    result = ''
+    # Check if the beginning of the line is relevant.
+    for confluence, tag in CONFLUENCE_MU.items():
+        if line.startswith(confluence):
+            rest = line.lstrip(confluence)
+            line = __get_markup(tag, rest)
+            result = line
+            break
+    return result or line
+
+
+def _transform_inner_line(line):
+    result = ''
+    emphasis_pattern = re.compile(r'([\*_]+)(\S[^\*_]*)([\*_]+)')
+
+    # Check if there is relevant markup anywhere in the line.
+    for (start, content, end) in re.findall(emphasis_pattern, line):
+        # start and end markup must match
+        if start == end or start == end[::-1]:
+            # If we don't have a replacement defined in EMPHASIS, do nothing.
+            try:
+                markup = EMPHASIS[start]
+                replacement = '{}{}{}'.format(markup, content, markup)
+                line = re.sub(emphasis_pattern, replacement, line, count=1)
+                result = line
+            except KeyError:
+                continue
+    return result or line
 
 
 def __get_markup(tag, content=''):