diff --git a/cmd/html2text/main.go b/cmd/html2text/main.go index e287c18..3ac3ad8 100644 --- a/cmd/html2text/main.go +++ b/cmd/html2text/main.go @@ -5,7 +5,7 @@ import ( "fmt" "os" - "jaytaylor.com/html2text" + "github.com/jaytaylor/html2text" ) func main() { diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..f866373 --- /dev/null +++ b/go.mod @@ -0,0 +1,14 @@ +module github.com/jaytaylor/html2text + +go 1.18 + +require ( + github.com/olekukonko/tablewriter v0.0.5 + github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf + golang.org/x/net v0.8.0 +) + +require ( + github.com/mattn/go-runewidth v0.0.14 // indirect + github.com/rivo/uniseg v0.4.4 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..acf09ad --- /dev/null +++ b/go.sum @@ -0,0 +1,12 @@ +github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= +github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU= +github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= +github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis= +github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= +github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo= +github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM= +golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ= +golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= diff --git a/html2text.go b/html2text.go index 8fe9000..4d8183b 100644 --- a/html2text.go +++ b/html2text.go @@ -18,6 +18,7 @@ type Options struct { PrettyTables bool // Turns on pretty ASCII rendering for table elements. PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements. OmitLinks bool // Turns on omitting links + PreserveWhitespace bool // Turns on whitespace preservation. TextOnly bool // Returns only plain text } @@ -408,10 +409,10 @@ func (ctx *textifyTraverseContext) traverse(node *html.Node) error { return ctx.traverseChildren(node) case html.TextNode: - var data string + data := node.Data if ctx.isPre { data = node.Data - } else { + } else if !ctx.options.PreserveWhitespace { data = strings.TrimSpace(spacingRe.ReplaceAllString(node.Data, " ")) } return ctx.emit(data) @@ -442,19 +443,25 @@ func (ctx *textifyTraverseContext) emit(data string) error { for _, line := range lines { runes := []rune(line) startsWithSpace := unicode.IsSpace(runes[0]) - if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") { + missingSpaceSeperator := !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") + if !ctx.options.PreserveWhitespace && missingSpaceSeperator { if err = ctx.buf.WriteByte(' '); err != nil { return err } ctx.lineLength++ } ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) - for _, c := range line { - if _, err = ctx.buf.WriteString(string(c)); err != nil { + for _, line := range strings.SplitAfter(line, "\n") { + if len(line) == 0 { + continue + } + + if _, err = ctx.buf.WriteString(line); err != nil { return err } - ctx.lineLength++ - if c == '\n' { + ctx.lineLength += len(line) + + if line[len(line)-1] == '\n' { ctx.lineLength = 0 if ctx.prefix != "" { if _, err = ctx.buf.WriteString(ctx.prefix); err != nil { diff --git a/html2text_test.go b/html2text_test.go index 452b45e..5fd0ddd 100644 --- a/html2text_test.go +++ b/html2text_test.go @@ -1,4 +1,4 @@ -package html2text +package html2text_test import ( "bytes" @@ -9,6 +9,8 @@ import ( "regexp" "strings" "testing" + + "github.com/jaytaylor/html2text" ) const destPath = "testdata" @@ -49,7 +51,7 @@ func TestParseUTF8(t *testing.T) { if err != nil { t.Fatal(err) } - text, err := FromReader(bytes.NewReader(bs)) + text, err := html2text.FromReader(bytes.NewReader(bs)) if err != nil { t.Fatal(err) } @@ -98,6 +100,44 @@ func TestStrippingWhitespace(t *testing.T) { } } +func TestPreservingWhitespace(t *testing.T) { + testCases := []struct { + input string + output string + }{ + { + "test text", + "test text", + }, + { + " \ttext\ntext\n", + "text\ntext", + }, + { + " \na \n\t \n \n a \t", + "a \n\t \n\na", + }, + { + "test text", + "test text", + }, + { + "test    text ", + "test    text", + }, + } + + for _, testCase := range testCases { + if msg, err := wantString(testCase.input, testCase.output, html2text.Options{ + PreserveWhitespace: true, + }); err != nil { + t.Error(err) + } else if len(msg) > 0 { + t.Log(msg) + } + } +} + func TestParagraphsAndBreaks(t *testing.T) { testCases := []struct { input string @@ -333,9 +373,9 @@ Table 2 Header 1 Table 2 Header 2 Table 2 Footer 1 Table 2 Footer 2 Table 2 Row } for _, testCase := range testCases { - options := Options{ + options := html2text.Options{ PrettyTables: true, - PrettyTablesOptions: NewPrettyTablesOptions(), + PrettyTablesOptions: html2text.NewPrettyTablesOptions(), } // Check pretty tabular ASCII version. if msg, err := wantString(testCase.input, testCase.tabularOutput, options); err != nil { @@ -513,7 +553,7 @@ func TestOmitLinks(t *testing.T) { } for _, testCase := range testCases { - if msg, err := wantString(testCase.input, testCase.output, Options{OmitLinks: true}); err != nil { + if msg, err := wantString(testCase.input, testCase.output, html2text.Options{OmitLinks: true}); err != nil { t.Error(err) } else if len(msg) > 0 { t.Log(msg) @@ -904,16 +944,16 @@ func (m ExactStringMatcher) String() string { return string(m) } -func wantRegExp(input string, outputRE string, options ...Options) (string, error) { +func wantRegExp(input string, outputRE string, options ...html2text.Options) (string, error) { return match(input, RegexpStringMatcher(outputRE), options...) } -func wantString(input string, output string, options ...Options) (string, error) { +func wantString(input string, output string, options ...html2text.Options) (string, error) { return match(input, ExactStringMatcher(output), options...) } -func match(input string, matcher StringMatcher, options ...Options) (string, error) { - text, err := FromString(input, options...) +func match(input string, matcher StringMatcher, options ...html2text.Options) (string, error) { + text, err := html2text.FromString(input, options...) if err != nil { return "", err } @@ -1000,7 +1040,7 @@ func Example() { ` - text, err := FromString(inputHTML, Options{PrettyTables: true}) + text, err := html2text.FromString(inputHTML, html2text.Options{PrettyTables: true}) if err != nil { panic(err) }