some-programs · rwmpelstilzchen · May 21, 2013
diff --git a/config.yaml b/config.yaml
@@ -5,7 +5,7 @@ wp_exports: wordpress-xml
 build_dir: build
 
 # Output format: primary choices are html or markdown.
-target_format: markdown
+target_format: html
 
 # The date format of the wikipedia export file.
 # I'm not sure if this ever differs depending on wordpress localization.

diff --git a/exitwp.py b/exitwp.py
@@ -12,6 +12,7 @@
 from urlparse import urlparse, urljoin
 from urllib import urlretrieve
 from html2text import html2text_file
+from linebreaks_wp import linebreaks_wp
 
 '''
 exitwp - Wordpress xml exports to Jekykll blog format conversion
@@ -51,7 +52,7 @@ def html2fmt(html, target_format):
     #   html = html.replace('<pre lang="xml">', '<pre lang="xml"><![CDATA[')
     #   html = html.replace('</pre>', ']]></pre>')
     if target_format == 'html':
-        return html
+        return linebreaks_wp(html);
     else:
         return html2text_file(html, None)
 
@@ -183,7 +184,8 @@ def get_item_uid(item, date_prefix=False, namespace=''):
                 dt = datetime.strptime(item['date'], date_fmt)
                 uid.append(dt.strftime('%Y-%m-%d'))
                 uid.append('-')
-            s_title = item['slug']
+            #s_title = item['slug']
+            s_title = item['wp_id']
             if s_title is None or s_title == '':
                 s_title = item['title']
             if s_title is None or s_title == '':

diff --git a/linebreaks_wp.py b/linebreaks_wp.py
@@ -0,0 +1,75 @@
+import re
+from django import template
+from django.utils.functional import allow_lazy
+from django.template.defaultfilters import stringfilter
+from django.utils.safestring import mark_safe, SafeData
+from django.utils.encoding import force_unicode
+from django.utils.html import escape
+from django.utils.text import normalize_newlines
+register = template.Library()  # template library object; its .filter decorator registers linebreaks_wp_filter below
+
+def linebreaks_wp(pee, autoescape=False):
+    """Port of WordPress wpautop (http://codex.wordpress.org/Function_Reference/wpautop): wrap blank-line-separated chunks of ``pee`` in <p> tags and turn the remaining newlines into <br />. ``autoescape`` is accepted but not used in this body."""
+    if (pee.strip() == ""):  # whitespace-only input yields an empty string
+        return ""
+    pee = normalize_newlines(pee)  # Django helper; stands in for the explicit "\r\n" / "\r" replacements (see note below)
+    pee = pee + "\n"  # pad the end with one newline
+    pee = re.sub(r'<br />\s*<br />', "\n\n", pee)  # two consecutive <br /> tags become a paragraph break
+    allblocks = r'(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|input|p|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
+    pee = re.sub(r'(<' + allblocks + '[^>]*>)', lambda m: "\n"+m.group(1) if m.group(1) else "\n", pee)  # newline before each opening block tag
+    pee = re.sub(r'(</' + allblocks + '>)', lambda m: m.group(1)+"\n\n" if m.group(1) else "\n\n", pee)  # blank line after each closing block tag
+    # No explicit "\r\n" -> "\n" or "\r" -> "\n" replacement is done at this point:
+    # per the original author's note, normalize_newlines above already takes care of them.
+    if (pee.find("<object") != -1):
+        pee = re.sub(r'\s*<param([^>]*)>\s*', lambda m: "<param%s>" % (m.group(1) if m.group(1) else "", ), pee) # no pee inside object/embed
+        pee = re.sub(r'\s*</embed>\s*', '</embed>', pee)
+    pee = re.sub(r"\n\n+", "\n\n", pee) # take care of duplicates
+    pees = re.split(r'\n\s*\n', pee) # split into paragraph candidates; PHP uses PREG_SPLIT_NO_EMPTY here, re.split may still yield empty strings
+    # Empty pieces are not filtered out here; the empty <p></p> they produce is removed two lines below.
+    pee = "".join(["<p>%s</p>\n" % tinkle.strip('\n') for tinkle in pees])  # wrap every piece in <p>...</p>
+    pee = re.sub(r'<p>\s*</p>', '', pee) #under certain strange conditions it could create a P of entirely whitespace
+    pee = re.sub(r'<p>([^<]+)</(div|address|form)>', lambda m: "<p>%s</p></%s>" % ((lambda x: x.group(1) if x.group(1) else "")(m), (lambda x: x.group(2) if x.group(2) else "")(m), ), pee)  # close a <p> left open before </div>, </address> or </form>
+    pee = re.sub(r'<p>\s*(</?' + allblocks + r'[^>]*>)\s*</p>', lambda m: m.group(1) if m.group(1) else "", pee) # don't pee all over a tag
+    pee = re.sub(r"<p>(<li.+?)</p>", lambda m: m.group(1) if m.group(1) else "", pee) # problem with nested lists
+    pee = re.sub(r'<p><blockquote([^>]*)>', lambda m: "<blockquote%s><p>" % (m.group(1) if m.group(1) else "",), pee, flags=re.IGNORECASE)  # move the <p> inside the blockquote
+    pee = pee.replace('</blockquote></p>', '</p></blockquote>')  # ...and the matching </p> as well
+    pee = re.sub(r'<p>\s*(</?' + allblocks + r'[^>]*>)', lambda m: m.group(1) if m.group(1) else "", pee)  # drop a <p> directly in front of a block tag
+    pee = re.sub(r'(</?' + allblocks + '[^>]*>)\s*</p>', lambda m: m.group(1) if m.group(1) else "", pee)  # drop a </p> directly after a block tag
+
+    def _autop_newline_preservation_helper(matches):
+        return matches.group(0).replace("\n", "<WPPreserveNewline />")
+    pee = re.sub(r'<(script|style).*?</\1>', _autop_newline_preservation_helper, pee, flags=re.DOTALL)  # hide newlines inside <script>/<style> behind a placeholder
+    pee = re.sub(r'(?<!<br />)\s*\n', "<br />\n", pee) # make line breaks
+    pee = pee.replace('<WPPreserveNewline />', "\n")  # restore the newlines hidden above
+
+    pee = re.sub(r'(</?' + allblocks + '[^>]*>)\s*<br />', lambda m: m.group(1) if m.group(1) else "", pee)  # no <br /> right after a block tag
+    pee = re.sub(r'<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)', lambda m: m.group(1) if m.group(1) else "", pee)  # no <br /> right before these tags
+    if (pee.find('<pre') != -1):
+        def clean_pre(m):
+            if m.group(1) and m.group(2):  # opening tag and a non-empty body were captured
+                text = m.group(2)
+                text = text.replace('<br />', '')
+                text = text.replace('<p>', "\n")
+                text = text.replace('</p>', '')
+                text = m.group(1)+escape(text)+"</pre>"  # body is passed through django.utils.html.escape
+            else:  # otherwise (e.g. empty body): clean the whole match, without escaping
+                text = m.group(0)
+                text = text.replace('<br />', '')
+                text = text.replace('<p>', "\n")
+                text = text.replace('</p>', '')
+
+            return text
+        pee = re.sub('(?is)(<pre[^>]*>)(.*?)</pre>', clean_pre, pee)  # undo the <p>/<br /> insertions inside every <pre> block
+    pee = re.sub( r"\n</p>$", '</p>', pee)  # remove a newline sitting just before a </p> at the end of the text
+    return pee
+linebreaks_wp = allow_lazy(linebreaks_wp, unicode)  # rebind the name to Django's allow_lazy wrapper (result class: unicode)
+
+@register.filter("linebreaks_wp")
+@stringfilter
+def linebreaks_wp_filter(value, autoescape=None):
+    """Template filter registered under the name "linebreaks_wp": run ``value`` through linebreaks_wp() and return the result wrapped in mark_safe()."""
+    autoescape = autoescape and not isinstance(value, SafeData)  # falsy when no autoescape was requested or value is already SafeData
+    return mark_safe(linebreaks_wp(value, autoescape))
+linebreaks_wp_filter.is_safe = True
+linebreaks_wp_filter.needs_autoescape = True
+linebreaks_wp = stringfilter(linebreaks_wp)  # NOTE(review): second rebinding of the module-level name (after allow_lazy above); callers importing linebreaks_wp get this stringfilter-wrapped version - confirm intended