diff --git a/cmd/html2text/main.go b/cmd/html2text/main.go
index e287c18..3ac3ad8 100644
--- a/cmd/html2text/main.go
+++ b/cmd/html2text/main.go
@@ -5,7 +5,7 @@ import (
"fmt"
"os"
- "jaytaylor.com/html2text"
+ "github.com/jaytaylor/html2text"
)
func main() {
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..f866373
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,14 @@
+module github.com/jaytaylor/html2text
+
+go 1.18
+
+require (
+ github.com/olekukonko/tablewriter v0.0.5
+ github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf
+ golang.org/x/net v0.8.0
+)
+
+require (
+ github.com/mattn/go-runewidth v0.0.14 // indirect
+ github.com/rivo/uniseg v0.4.4 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..acf09ad
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,12 @@
+github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
+github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
+github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
+github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
+github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
+github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
+github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
+github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo=
+github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM=
+golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
+golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
diff --git a/html2text.go b/html2text.go
index 8fe9000..4d8183b 100644
--- a/html2text.go
+++ b/html2text.go
@@ -18,6 +18,7 @@ type Options struct {
PrettyTables bool // Turns on pretty ASCII rendering for table elements.
PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements.
OmitLinks bool // Turns on omitting links
+ PreserveWhitespace bool // Turns on whitespace preservation.
TextOnly bool // Returns only plain text
}
@@ -408,10 +409,10 @@ func (ctx *textifyTraverseContext) traverse(node *html.Node) error {
return ctx.traverseChildren(node)
case html.TextNode:
- var data string
+ data := node.Data
if ctx.isPre {
data = node.Data
- } else {
+ } else if !ctx.options.PreserveWhitespace {
data = strings.TrimSpace(spacingRe.ReplaceAllString(node.Data, " "))
}
return ctx.emit(data)
@@ -442,19 +443,25 @@ func (ctx *textifyTraverseContext) emit(data string) error {
for _, line := range lines {
runes := []rune(line)
startsWithSpace := unicode.IsSpace(runes[0])
- if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") {
+ missingSpaceSeperator := !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".")
+ if !ctx.options.PreserveWhitespace && missingSpaceSeperator {
if err = ctx.buf.WriteByte(' '); err != nil {
return err
}
ctx.lineLength++
}
ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
- for _, c := range line {
- if _, err = ctx.buf.WriteString(string(c)); err != nil {
+ for _, line := range strings.SplitAfter(line, "\n") {
+ if len(line) == 0 {
+ continue
+ }
+
+ if _, err = ctx.buf.WriteString(line); err != nil {
return err
}
- ctx.lineLength++
- if c == '\n' {
+ ctx.lineLength += len(line)
+
+ if line[len(line)-1] == '\n' {
ctx.lineLength = 0
if ctx.prefix != "" {
if _, err = ctx.buf.WriteString(ctx.prefix); err != nil {
diff --git a/html2text_test.go b/html2text_test.go
index 452b45e..5fd0ddd 100644
--- a/html2text_test.go
+++ b/html2text_test.go
@@ -1,4 +1,4 @@
-package html2text
+package html2text_test
import (
"bytes"
@@ -9,6 +9,8 @@ import (
"regexp"
"strings"
"testing"
+
+ "github.com/jaytaylor/html2text"
)
const destPath = "testdata"
@@ -49,7 +51,7 @@ func TestParseUTF8(t *testing.T) {
if err != nil {
t.Fatal(err)
}
- text, err := FromReader(bytes.NewReader(bs))
+ text, err := html2text.FromReader(bytes.NewReader(bs))
if err != nil {
t.Fatal(err)
}
@@ -98,6 +100,44 @@ func TestStrippingWhitespace(t *testing.T) {
}
}
+func TestPreservingWhitespace(t *testing.T) { // Table-driven: every input is run through wantString with PreserveWhitespace enabled and compared to an exact expected string.
+ testCases := []struct {
+ input string // text handed to wantString as the source to convert
+ output string // exact text the conversion is expected to produce
+ }{
+ { // input and expected text are identical
+ "test text",
+ "test text",
+ },
+ { // the leading " \t" and the final "\n" do not appear in the expected text; the "\n" between the words does
+ " \ttext\ntext\n",
+ "text\ntext",
+ },
+ { // spaces, tabs and newlines mixed around and between two "a" tokens
+ " \na \n\t \n \n a \t",
+ "a \n\t \n\na",
+ },
+ { // input and expected text are identical
+ "test text",
+ "test text",
+ },
+ { // the trailing space of the input does not appear in the expected text
+ "test text ",
+ "test text",
+ },
+ }
+
+ for _, testCase := range testCases {
+ if msg, err := wantString(testCase.input, testCase.output, html2text.Options{ // only PreserveWhitespace is set; all other options keep their zero values
+ PreserveWhitespace: true,
+ }); err != nil {
+ t.Error(err) // marks the test failed but lets the remaining cases run
+ } else if len(msg) > 0 {
+ t.Log(msg) // a non-empty message with a nil error is only logged, not treated as a failure
+ }
+ }
+}
+
func TestParagraphsAndBreaks(t *testing.T) {
testCases := []struct {
input string
@@ -333,9 +373,9 @@ Table 2 Header 1 Table 2 Header 2 Table 2 Footer 1 Table 2 Footer 2 Table 2 Row
}
for _, testCase := range testCases {
- options := Options{
+ options := html2text.Options{
PrettyTables: true,
- PrettyTablesOptions: NewPrettyTablesOptions(),
+ PrettyTablesOptions: html2text.NewPrettyTablesOptions(),
}
// Check pretty tabular ASCII version.
if msg, err := wantString(testCase.input, testCase.tabularOutput, options); err != nil {
@@ -513,7 +553,7 @@ func TestOmitLinks(t *testing.T) {
}
for _, testCase := range testCases {
- if msg, err := wantString(testCase.input, testCase.output, Options{OmitLinks: true}); err != nil {
+ if msg, err := wantString(testCase.input, testCase.output, html2text.Options{OmitLinks: true}); err != nil {
t.Error(err)
} else if len(msg) > 0 {
t.Log(msg)
@@ -904,16 +944,16 @@ func (m ExactStringMatcher) String() string {
return string(m)
}
-func wantRegExp(input string, outputRE string, options ...Options) (string, error) {
+func wantRegExp(input string, outputRE string, options ...html2text.Options) (string, error) { // Test helper: forwards input and options to match, with outputRE converted to a RegexpStringMatcher.
return match(input, RegexpStringMatcher(outputRE), options...)
}
-func wantString(input string, output string, options ...Options) (string, error) {
+func wantString(input string, output string, options ...html2text.Options) (string, error) { // Test helper: forwards input and options to match, with output converted to an ExactStringMatcher.
return match(input, ExactStringMatcher(output), options...)
}
-func match(input string, matcher StringMatcher, options ...Options) (string, error) {
- text, err := FromString(input, options...)
+func match(input string, matcher StringMatcher, options ...html2text.Options) (string, error) {
+ text, err := html2text.FromString(input, options...)
if err != nil {
return "", err
}
@@ -1000,7 +1040,7 @@ func Example() {