diff --git a/html/doc.go b/html/doc.go index 3a7e5ab17..885c4c593 100644 --- a/html/doc.go +++ b/html/doc.go @@ -78,16 +78,11 @@ example, to process each anchor node in depth-first order: if err != nil { // ... } - var f func(*html.Node) - f = func(n *html.Node) { + for n := range doc.Descendants() { if n.Type == html.ElementNode && n.Data == "a" { // Do something with n... } - for c := n.FirstChild; c != nil; c = c.NextSibling { - f(c) - } } - f(doc) The relevant specifications include: https://html.spec.whatwg.org/multipage/syntax.html and diff --git a/html/example_test.go b/html/example_test.go index 0b06ed773..830f0b27a 100644 --- a/html/example_test.go +++ b/html/example_test.go @@ -2,6 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build go1.23 + // This example demonstrates parsing HTML data and walking the resulting tree. package html_test @@ -11,6 +13,7 @@ import ( "strings" "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) func ExampleParse() { @@ -19,9 +22,8 @@ func ExampleParse() { if err != nil { log.Fatal(err) } - var f func(*html.Node) - f = func(n *html.Node) { - if n.Type == html.ElementNode && n.Data == "a" { + for n := range doc.Descendants() { + if n.Type == html.ElementNode && n.DataAtom == atom.A { for _, a := range n.Attr { if a.Key == "href" { fmt.Println(a.Val) @@ -29,11 +31,8 @@ func ExampleParse() { } } } - for c := n.FirstChild; c != nil; c = c.NextSibling { - f(c) - } } - f(doc) + // Output: // foo // /bar/baz diff --git a/html/iter.go b/html/iter.go new file mode 100644 index 000000000..54be8fd30 --- /dev/null +++ b/html/iter.go @@ -0,0 +1,56 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.23 + +package html + +import "iter" + +// Ancestors returns an iterator over the ancestors of n, starting with n.Parent. +// +// Mutating a Node or its parents while iterating may have unexpected results. +func (n *Node) Ancestors() iter.Seq[*Node] { + _ = n.Parent // eager nil check + + return func(yield func(*Node) bool) { + for p := n.Parent; p != nil && yield(p); p = p.Parent { + } + } +} + +// ChildNodes returns an iterator over the immediate children of n, +// starting with n.FirstChild. +// +// Mutating a Node or its children while iterating may have unexpected results. +func (n *Node) ChildNodes() iter.Seq[*Node] { + _ = n.FirstChild // eager nil check + + return func(yield func(*Node) bool) { + for c := n.FirstChild; c != nil && yield(c); c = c.NextSibling { + } + } + +} + +// Descendants returns an iterator over all nodes recursively beneath +// n, excluding n itself. Nodes are visited in depth-first preorder. +// +// Mutating a Node or its descendants while iterating may have unexpected results. +func (n *Node) Descendants() iter.Seq[*Node] { + _ = n.FirstChild // eager nil check + + return func(yield func(*Node) bool) { + n.descendants(yield) + } +} + +func (n *Node) descendants(yield func(*Node) bool) bool { + for c := range n.ChildNodes() { + if !yield(c) || !c.descendants(yield) { + return false + } + } + return true +} diff --git a/html/iter_test.go b/html/iter_test.go new file mode 100644 index 000000000..cca7f82f5 --- /dev/null +++ b/html/iter_test.go @@ -0,0 +1,96 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.23 + +package html + +import ( + "strings" + "testing" +) + +func TestNode_ChildNodes(t *testing.T) { + tests := []struct { + in string + want string + }{ + {"", ""}, + {"", "a"}, + {"a", "a"}, + {"", "a b"}, + {"ac", "a b c"}, + {"ad", "a b d"}, + {"cefi", "a f g h"}, + } + for _, test := range tests { + doc, err := Parse(strings.NewReader(test.in)) + if err != nil { + t.Fatal(err) + } + // Drill to + n := doc.FirstChild.FirstChild.NextSibling + var results []string + for c := range n.ChildNodes() { + results = append(results, c.Data) + } + if got := strings.Join(results, " "); got != test.want { + t.Errorf("ChildNodes = %q, want %q", got, test.want) + } + } +} + +func TestNode_Descendants(t *testing.T) { + tests := []struct { + in string + want string + }{ + {"", ""}, + {"", "a"}, + {"", "a b"}, + {"b", "a b"}, + {"", "a b"}, + {"bd", "a b c d"}, + {"be", "a b c d e"}, + {"dfgj", "a b c d e f g h i j"}, + } + for _, test := range tests { + doc, err := Parse(strings.NewReader(test.in)) + if err != nil { + t.Fatal(err) + } + // Drill to + n := doc.FirstChild.FirstChild.NextSibling + var results []string + for c := range n.Descendants() { + results = append(results, c.Data) + } + if got := strings.Join(results, " "); got != test.want { + t.Errorf("Descendants = %q; want: %q", got, test.want) + } + } +} + +func TestNode_Ancestors(t *testing.T) { + for _, size := range []int{0, 1, 2, 10, 100, 10_000} { + n := buildChain(size) + nParents := 0 + for _ = range n.Ancestors() { + nParents++ + } + if nParents != size { + t.Errorf("number of Ancestors = %d; want: %d", nParents, size) + } + } +} + +func buildChain(size int) *Node { + child := new(Node) + for range size { + parent := child + child = new(Node) + parent.AppendChild(child) + } + return child +} diff --git a/html/node.go b/html/node.go index 1350eef22..77741a195 100644 --- a/html/node.go +++ b/html/node.go @@ -38,6 +38,10 @@ var scopeMarker = Node{Type: scopeMarkerNode} // that it looks like "a