-
Notifications
You must be signed in to change notification settings - Fork 0
/
DocumentToMarkdownConverter.swift
120 lines (110 loc) · 4.3 KB
/
DocumentToMarkdownConverter.swift
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import Foundation
import SwiftSoup
/**
 Converts HTML documents, HTML fragments, and parsed SwiftSoup elements into Markdown.

 Each supported tag is mapped to a Markdown prefix/postfix pair that is wrapped around
 the converted content of its children. Elements without a dedicated mapping are wrapped
 in `defaultPrefix` / `defaultPostfix`.
 */
public struct DocumentToMarkdownConverter {
    private let defaultPrefix: String
    private let defaultPostfix: String
    private let useMultiLineCodeBlocks: Bool
    private let codeLanguage: String?

    /**
     Creates a converter.

     - Parameters:
       - defaultPrefix: Text placed before the content of elements that have no dedicated mapping.
       - defaultPostfix: Text placed after the content of elements that have no dedicated mapping.
       - useMultiLineCodeBlocks: When `true`, code-like elements (`pre`, `tt`, `code`, `samp`)
         whose content contains a newline are emitted as fenced blocks instead of inline code.
       - codeLanguage: Optional language tag appended to the opening fence of fenced code blocks.
     */
    public init(
        defaultPrefix: String = "",
        defaultPostfix: String = "",
        useMultiLineCodeBlocks: Bool = false,
        codeLanguage: String? = nil
    ) {
        self.defaultPrefix = defaultPrefix
        self.defaultPostfix = defaultPostfix
        self.useMultiLineCodeBlocks = useMultiLineCodeBlocks
        self.codeLanguage = codeLanguage
    }

    /**
     Parses and converts a full HTML document to Markdown.

     - Parameters:
       - htmlDocument: The HTML source of the document.
       - baseURL: Base against which link targets are resolved, if any.
     - Returns: The Markdown rendering of the parsed document.
     - Throws: Any error thrown while parsing or traversing the document.
     */
    public func convert(htmlDocument: String, baseURL: URL? = nil) throws -> String {
        try convert(SwiftSoup.parse(htmlDocument), baseURL: baseURL)
    }

    /**
     Parses and converts an HTML snippet to Markdown.

     - Parameters:
       - htmlFragment: The HTML snippet, parsed as a body fragment.
       - baseURL: Base against which link targets are resolved, if any.
     - Returns: The Markdown rendering of the parsed fragment.
     - Throws: Any error thrown while parsing or traversing the fragment.
     */
    public func convert(htmlFragment: String, baseURL: URL? = nil) throws -> String {
        try convert(SwiftSoup.parseBodyFragment(htmlFragment), baseURL: baseURL)
    }

    /**
     Parses an HTML snippet and returns its text as reported by SwiftSoup's `text()`.

     - Parameter htmlFragment: The HTML snippet, parsed as a body fragment.
     - Returns: The text of the parsed fragment, without any Markdown markers.
     - Throws: Any error thrown while parsing the fragment or extracting its text.
     */
    public func plainTextOf(htmlFragment: String) throws -> String {
        try SwiftSoup.parseBodyFragment(htmlFragment).text()
    }

    /**
     Converts an HTML element, including all of its descendants, to Markdown.

     - Parameters:
       - element: The element to convert.
       - baseURL: Base against which `href` values of anchors are resolved, if any.
       - usedPrefixes: Prefixes already applied by enclosing elements; a prefix found in this
         set is replaced by `defaultPrefix`.
       - usedPostfixes: Postfixes already applied by enclosing elements; a postfix found in this
         set is replaced by `defaultPostfix`.
     - Returns: The Markdown for `element`, or the element's empty-content fallback
       (for example a newline for `br`, the alt text for `img`) when it has no content.
     - Throws: Any error thrown while converting child elements.
     */
    public func convert(
        _ element: Element,
        baseURL: URL? = nil,
        usedPrefixes: Set<String> = [],
        usedPostfixes: Set<String> = []
    ) throws -> String {
        var mdPrefix = defaultPrefix
        var mdPostfix = defaultPostfix
        var mdIfEmpty = ""

        // NOTE(review): children are converted before the switch below picks this element's
        // markers, so the sets handed down only ever gain the default prefix/postfix. The
        // nested-marker suppression further down therefore only triggers for values the
        // caller passes in explicitly. Confirm whether that is intended before reordering.
        let childPrefixes = usedPrefixes.union([mdPrefix])
        let childPostfixes = usedPostfixes.union([mdPostfix])

        var content = try element.getChildNodes().map { node -> String in
            if let childElement = node as? Element {
                return try convert(
                    childElement,
                    baseURL: baseURL,
                    usedPrefixes: childPrefixes,
                    usedPostfixes: childPostfixes
                )
            }
            if let textNode = node as? TextNode {
                return markdownText(from: textNode.getWholeText())
            }
            return ""
        }.joined()

        switch element.tagName() {
        case "a":
            // `attr` yields an empty string for a missing attribute; without the emptiness
            // check an anchor lacking `href` would render as "[text]()".
            if let href = try? element.attr("href"), !href.isEmpty {
                mdPrefix = "["
                mdPostfix = "](\(URL(string: href, relativeTo: baseURL)?.absoluteString ?? href))"
            }
        case "b", "strong", "em":
            // NOTE(review): `em` is rendered like `strong` here — confirm this is intended.
            if usedPrefixes.contains("**") {
                mdPrefix = "*"
                mdPostfix = "*"
            } else {
                mdPrefix = "**"
                mdPostfix = "**"
            }
        case "i":
            mdPrefix = "*"
            mdPostfix = "*"
        case "u":
            mdPrefix = "__"
            mdPostfix = "__"
        case "br":
            mdIfEmpty = "\n"
        case "p":
            mdPrefix = "\n\n"
            mdPostfix = "\n\n"
            mdIfEmpty = "\n\n"
            content = content.trimmingCharacters(in: .whitespacesAndNewlines)
        case "pre", "tt", "code", "samp":
            if useMultiLineCodeBlocks && content.contains("\n") {
                mdPrefix = "```\(codeLanguage ?? "")\n"
                mdPostfix = "\n```"
            } else {
                mdPrefix = "`"
                mdPostfix = "`"
            }
            content = content.trimmingCharacters(in: .whitespacesAndNewlines)
        case "h1", "h2", "h3", "h4", "h5", "h6":
            // Headings are emitted as bold text on their own line.
            mdPrefix = "\n**"
            mdPostfix = "**\n"
        case "img":
            // An image has no child content, so the alt text must also be the
            // empty-content fallback; otherwise it would never be emitted.
            let altText = try? element.attr("alt")
            mdPrefix = altText ?? defaultPrefix
            mdIfEmpty = altText ?? ""
        case "li":
            mdPrefix = "- "
            mdPostfix = "\n"
        default:
            break
        }

        // Do not repeat a marker that an enclosing element has already applied.
        if usedPrefixes.contains(mdPrefix) {
            mdPrefix = defaultPrefix
        }
        if usedPostfixes.contains(mdPostfix) {
            mdPostfix = defaultPostfix
        }

        guard !content.isEmpty else {
            return mdIfEmpty
        }
        return "\(mdPrefix)\(content)\(mdPostfix)"
    }

    /**
     Normalizes the raw text of a text node for Markdown output.

     Surrounding whitespace and newlines are trimmed; a single space is kept on a side where
     the raw text began or ended with a space, so words in adjacent inline elements stay
     separated. Whitespace-only text collapses to at most one space.
     */
    private func markdownText(from rawText: String) -> String {
        let trimmed = rawText.trimmingCharacters(in: .whitespacesAndNewlines)
        let hasLeadingSpace = rawText.hasPrefix(" ")
        let hasTrailingSpace = rawText.hasSuffix(" ")

        if trimmed.isEmpty {
            // Adding both the leading and the trailing space here would emit two spaces.
            return (hasLeadingSpace || hasTrailingSpace) ? " " : ""
        }
        return (hasLeadingSpace ? " " : "") + trimmed + (hasTrailingSpace ? " " : "")
    }
}