Skip to content

Commit cb7812f

Browse files
committed
Support more markup transformations, plus more tests
Currently, the following Confluence markup is supported: headings `h1.` to `h4.`; unordered list items (multilevel); ordered list items (multilevel); text emphasis (cursive, bold, and cursive AND bold). Notable things that are still missing: links; tables; images; several text effects like strike-outs, superscript, and so on.
1 parent e1cdc78 commit cb7812f

File tree

6 files changed

+255
-28
lines changed

6 files changed

+255
-28
lines changed

Pipfile

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ anytree = "*"
99

1010
[dev-packages]
1111
ipython = "*"
12+
pytest = "*"
1213

1314
[requires]
1415
python_version = "3.6"

Pipfile.lock

+49-5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

+4
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,7 @@ Parse confluence xml export and generate pages with mediawiki markup.
1010
* Run `python xml2mw.py`.
1111

1212
Now, result files should be in `OUT_PATH`, and a file `sitemap.txt` should be in the base directory.
13+
14+
## Test suite
15+
16+
If you want to run the test suite, just run `python -m unittest discover`. Using `pytest` should work as well, if you prefer that. `pytest` is included in the development dependencies, which you can install with `pipenv install --dev`.

test/test_transform.py

+148-12
Original file line numberDiff line numberDiff line change
@@ -9,34 +9,170 @@
99
from xml2mw.transform import to_mw
1010

1111

12-
class TransformToMW(TestCase):
13-
"""TestCases for transformation to media wiki markup.
14-
15-
Note, that `to_mw()` returns a generator, so we need to
16-
consume it with e.g. `list()` or `'\n'.join()` to generate
17-
an actual result.
12+
class TransformEdgeCases(TestCase):
    """Catch-all tests for inputs that belong to no specific markup category."""

    def test_empty_page(self):
        """An empty page body must come out as an empty string once joined."""
        # `to_mw()` is a generator, so it is consumed via `'\n'.join()`.
        rendered = '\n'.join(to_mw(''))
        self.assertEqual(rendered, '')
2623

24+
25+
class TransformHeadings(TestCase):
    """Heading markup is only recognised at the very start of a line.

    `to_mw()` is a generator; every test consumes it with `'\n'.join()`
    to obtain a string that can be compared.
    """

    def test_various_headings(self):
        """`h1.` to `h4.` become `=` to `====` wrapped headings."""
        source = 'h1. H1\nh2. H2\nh3. H3\nh4. H4\n'
        wanted = '= H1 =\n== H2 ==\n=== H3 ===\n==== H4 ====\n'
        self.assertEqual('\n'.join(to_mw(source)), wanted)

    def test_no_inline_headings(self):
        """A `h1. ` or `h3. ` in the middle of a line is left untouched."""
        source = 'h1. A Heading\n\nA line with h1. in it.\nA line with h3. in it.\n'
        wanted = '= A Heading =\n\nA line with h1. in it.\nA line with h3. in it.\n'
        self.assertEqual('\n'.join(to_mw(source)), wanted)
50+
51+
52+
class TransformLists(TestCase):
    """Transformation of unordered and ordered list items.

    Confluence and MediaWiki share most of their list syntax, so for many
    inputs below `to_mw()` is expected to return the text unchanged.
    `to_mw()` is a generator; the helper consumes it with `'\n'.join()`.
    """

    def _assert_renders(self, source, wanted):
        """Join the output of `to_mw(source)` and compare it to `wanted`."""
        rendered = '\n'.join(to_mw(source))
        self.assertEqual(rendered, wanted)

    def test_unordered_list_a(self):
        """Unordered items written with `*` stay as they are."""
        text = 'Some text.\n* first item.\n* second item.\n'
        self._assert_renders(text, text)

    def test_unordered_list_b(self):
        """Unordered items written with `-` are rewritten to `*`."""
        self._assert_renders(
            'Some text.\n- first item.\n- second item.\n',
            'Some text.\n* first item.\n* second item.\n',
        )

    def test_nested_unordered_lists_a(self):
        """Nested `*` lists (not mixed with other kinds) stay as they are."""
        text = 'Some text.\n* first level, first item\n** second level, first item\n** second level, second item\n* first level, second item\n'
        self._assert_renders(text, text)

    def test_nested_unordered_lists_b(self):
        """Nested `-` lists (not mixed with other kinds) become `*` lists."""
        self._assert_renders(
            'Some text.\n- first level, first item\n-- second level, first item\n-- second level, second item\n- first level, second item\n',
            'Some text.\n* first level, first item\n** second level, first item\n** second level, second item\n* first level, second item\n',
        )

    def test_ordered_list(self):
        """Ordered items written with `#` stay as they are."""
        text = 'Some text.\n# first item\n# second item\n'
        self._assert_renders(text, text)

    def test_nested_ordered_lists(self):
        """Nested `#` lists (not mixed with other kinds) stay as they are."""
        text = 'Some text.\n# first, first\n## second, first\n## second, second\n# first, second\n'
        self._assert_renders(text, text)

    def test_mixed_lists(self):
        """An unordered list nested in an ordered one stays as it is."""
        text = '# first\n#* mixed\n#* list\n# for us\n'
        self._assert_renders(text, text)

    def test_no_inline_ordered_lists(self):
        """`#` characters in the middle of a line are not list markup."""
        text = 'Some text with # a pseudo list\nSome more ## fake lists\n'
        self._assert_renders(text, text)

    def test_no_inline_unordered_lists(self):
        """`-` characters in the middle of a line are not list markup."""
        # `*` is deliberately not covered here: away from the start of a
        # line it denotes emphasis rather than a list item.
        text = 'Some text with - a pseudo-list\nSome more -- fake lists.\n'
        self._assert_renders(text, text)
131+
132+
133+
class TransformEmphasis(TestCase):
    """Emphasis markup, which may appear at any position within a line.

    `to_mw()` is a generator; every test consumes it with `'\n'.join()`
    to obtain a string that can be compared.
    """

    def test_light_emphasis(self):
        """`_text_` becomes `''text''` (usually rendered cursive)."""
        cases = (
            ("text with _inline emphasis_ and more text", "text with ''inline emphasis'' and more text"),
            ("_emphasis at the beginning_ and more text", "''emphasis at the beginning'' and more text"),
            ("emphasis with following _non-white-space_, and text", "emphasis with following ''non-white-space'', and text"),
        )
        for source, wanted in cases:
            self.assertEqual('\n'.join(to_mw(source)), wanted)

    def test_strong_emphasis(self):
        """`*text*` becomes `'''text'''` (usually rendered bold)."""
        cases = (
            ("text with *inline emphasis* and more text", "text with '''inline emphasis''' and more text"),
            ("*emphasis at the beginning* and more text", "'''emphasis at the beginning''' and more text"),
            ("emphasis with following *non-white-space*, and text", "emphasis with following '''non-white-space''', and text"),
        )
        for source, wanted in cases:
            self.assertEqual('\n'.join(to_mw(source)), wanted)

    def test_very_strong_emphasis(self):
        """`*_text_*` and `_*text*_` both become `'''''text'''''`
        (usually rendered bold and cursive).
        """
        cases = (
            ("text with *_inline emphasis_* and more text", "text with '''''inline emphasis''''' and more text"),
            ("*_emphasis at the beginning_* and more text", "'''''emphasis at the beginning''''' and more text"),
            ("emphasis with following *_non-white-space_*, and text", "emphasis with following '''''non-white-space''''', and text"),
            ("text with _*inline emphasis*_ and more text", "text with '''''inline emphasis''''' and more text"),
            ("_*emphasis at the beginning*_ and more text", "'''''emphasis at the beginning''''' and more text"),
            ("emphasis with following _*non-white-space*_, and text", "emphasis with following '''''non-white-space''''', and text"),
        )
        for source, wanted in cases:
            self.assertEqual('\n'.join(to_mw(source)), wanted)

xml2mw.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@
2323

2424
from os.path import abspath, basename, dirname, join, normpath
2525

26-
import read_xml
27-
from sitemap import build_sitemap, write_sitemap
28-
from write_markup import write_mediawiki
26+
from xml2mw import read_xml
27+
from xml2mw.sitemap import build_sitemap, write_sitemap
28+
from xml2mw.write_markup import write_mediawiki
2929

3030
# Modify this to suit your needs.
3131
XML_PATH = "./data" # path to folder which contains your 'entities.xml' export file

xml2mw/transform.py

+50-8
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
Generate MediaWiki markup out of confluence markup.
88
"""
99

10+
import re
11+
1012
# Some basic template strings
1113
# The confluence -> mediawiki mapping is not a direct one
1214
# to allow for some logic in an intermediate step.
@@ -18,7 +20,11 @@
1820
# 'p': '<br />{}',
1921
'hr': '----{}',
2022
'uli': '* {}',
23+
'ulii': '** {}',
24+
'uliii': '*** {}',
2125
'oli': '# {}',
26+
'olii': '## {}',
27+
'oliii': '### {}',
2228
}
2329
CONFLUENCE_MU = {
2430
'h1. ': 'h1',
@@ -27,21 +33,57 @@
2733
'h4. ': 'h4',
2834
# '\n': 'p',
2935
'* ': 'uli',
36+
'- ': 'uli', # alternate version
37+
'-- ': 'ulii',
38+
'--- ': 'uliii',
39+
}
40+
EMPHASIS = {
41+
'*': "'''", # strong emphasis
42+
'_': "''", # light emphasis
43+
'*_': "'''''", # very strong emphasis
44+
'_*': "'''''", # very strong emphasis (alternate)
3045
}
3146

3247

3348
def to_mw(body_content):
    """Lazily convert confluence markup to mediawiki markup, line by line.

    `body_content` is split on `'\n'`; each line first has its leading
    markup handled by `_transform_line_start()` and then its inline markup
    handled by `_transform_inner_line()`. This is a generator: one
    (possibly unchanged) line is yielded per input line.
    """
    for raw_line in body_content.split('\n'):
        # Line-start markup first, inline markup on the result of that.
        yield _transform_inner_line(_transform_line_start(raw_line))
56+
57+
58+
def _transform_line_start(line):
    """Translate confluence markup found at the very beginning of `line`.

    The prefixes known to `CONFLUENCE_MU` are tried in turn. For the first
    prefix `line` starts with, the mapped tag and the remainder of the line
    are handed to `__get_markup()` and its result is returned (presumably
    the remainder wrapped in the mediawiki template for that tag; the
    helper is defined elsewhere in this module).

    If no known prefix matches, `line` is returned unchanged.
    """
    for confluence, tag in CONFLUENCE_MU.items():
        if line.startswith(confluence):
            # Cut off exactly the matched prefix. `str.lstrip()` must not be
            # used here: it treats its argument as a *set of characters*, so
            # 'h1. hello' would lose its leading 'h' and '* *bold*' would
            # lose the opening emphasis marker.
            rest = line[len(confluence):]
            return __get_markup(tag, rest)
    return line
68+
69+
70+
def _transform_inner_line(line):
    """Translate emphasis markup that may occur anywhere in `line`.

    A candidate is a run of `*`/`_` characters, followed by content that
    starts with a non-whitespace character and contains no `*` or `_`,
    followed by another run of `*`/`_` characters. A candidate is replaced
    only if its closing run equals the opening run or is its mirror image
    (e.g. `*_text_*`) AND the opening run is a key of `EMPHASIS`; the
    content is then wrapped in the mapped mediawiki markup on both sides.
    Every other candidate is left exactly as it was.

    Returns the line with all such replacements applied.
    """
    emphasis_pattern = re.compile(r'([\*_]+)(\S[^\*_]*)([\*_]+)')

    def _replace(match):
        """Return the mediawiki form of one candidate, or the text as-is."""
        start, content, end = match.groups()
        # start and end markup must match
        if start != end and start != end[::-1]:
            return match.group(0)
        # If we don't have a replacement defined in EMPHASIS, do nothing.
        markup = EMPHASIS.get(start)
        if markup is None:
            return match.group(0)
        return '{}{}{}'.format(markup, content, markup)

    # One pass with a callback, for two reasons:
    # * each replacement lands on the very match it was computed from, even
    #   when an earlier candidate in the line was skipped;
    # * the returned text is inserted literally, so backslashes in the
    #   content are not interpreted as regex template escapes.
    return emphasis_pattern.sub(_replace, line)
4587

4688

4789
def __get_markup(tag, content=''):

0 commit comments

Comments
 (0)