Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions c_src/lazy_html.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <functional>
#include <memory>
#include <optional>
#include <set>
#include <stdexcept>
#include <string>
#include <tuple>
Expand Down Expand Up @@ -714,6 +715,25 @@ ExLazyHTML child_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {

FINE_NIF(child_nodes, 0);

ExLazyHTML parent_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
auto nodes = std::vector<lxb_dom_node_t *>();
auto inserted_nodes = std::set<lxb_dom_node_t *>();

for (auto node : ex_lazy_html.resource->nodes) {
auto parent = node->parent;
if (parent != NULL && parent->type == LXB_DOM_NODE_TYPE_ELEMENT) {
auto inserted_node = inserted_nodes.find(parent);
if (inserted_node == inserted_nodes.end()) {
inserted_nodes.insert(parent);
nodes.push_back(parent);
}
}
}
return ExLazyHTML(fine::make_resource<LazyHTML>(
ex_lazy_html.resource->document_ref, nodes, true));
}
FINE_NIF(parent_nodes, ERL_NIF_DIRTY_JOB_CPU_BOUND);

std::string text(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
auto document = ex_lazy_html.resource->document_ref->document;

Expand Down Expand Up @@ -802,6 +822,12 @@ std::uint64_t num_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {

FINE_NIF(num_nodes, 0);

bool equals(ErlNifEnv *env, ExLazyHTML html_a, ExLazyHTML html_b) {
return (html_a.resource->document_ref == html_b.resource->document_ref &&
html_a.resource->nodes == html_b.resource->nodes);
}
FINE_NIF(equals, 0);

std::vector<fine::Term> tag(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
auto values = std::vector<fine::Term>();

Expand Down
77 changes: 77 additions & 0 deletions lib/lazy_html.ex
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,60 @@ defmodule LazyHTML do
LazyHTML.NIF.child_nodes(lazy_html)
end

@doc """
Returns the (unique) parent nodes of the root nodes in `lazy_html`.

## Examples

iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>Hello</span> <span>world</span></div>|)
iex> spans = LazyHTML.query(lazy_html, "span")
iex> LazyHTML.parent_nodes(spans)
#LazyHTML<
1 node (from selector)
#1
<div><span>Hello</span> <span>world</span></div>
>

The root node is always <html>, even if initialized via `from_fragment/1`:

iex> lazy_html = LazyHTML.from_fragment(~S|<div>root</div>|)
iex> LazyHTML.parent_nodes(lazy_html)
#LazyHTML<
1 node (from selector)
#1
<html><div>root</div></html>
>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that should be the case, for the end user we should make it such that the fragment root has no parent.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this was more accidental given my c++ implementation.

I'll check if I find out how to differentiate from_fragment vs from_document in c++.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I looked into this, and I now think the current behavior is correct and preferable.

The reason is that I could no longer write the get_css_path function without knowing if the node I'm passing in is part of a document or a fragment.
With the current behavior I can treat them both the same. The reason is that the css selector for fragments operates as if the fragment was inside an root node. Which makes sense, as the root of a css selector has to be a single node.

If you still think I should change it, then we need to add a new function that allows identifying how a LazyHTML was constructed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jonatanklosko do you agree? If so I think this PR is ready and I'll remove the get_css_path test

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it's correct if LazyHTML.from_fragment("<div></div>") |> LazyHTML.parent_node() returns more content.

If you still think I should change it, then we need to add a new function that allows identifying how a LazyHTML was constructed.

Typically it's something for the API user to track, since they are the one calling either from_fragment or from_document.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, ok. I think I can work around it by checking if the last parent is an html tag.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jonatanklosko I have changed the implementation in my latest commit. You were right, this is better.

I couldn't find a way to identify if a document is a fragment or not in lexbor, so I tracked in manually at creation time. I think this is correct now, see the new tests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jonatanklosko Sorry for the ping. If just haven't gotten around to it yet, no worries, take your time.

I believe this was the last open issue, so I'm waiting on your approval here. After that I'll remove the "nth-child selector" test and then I think this PR is ready for a final review.


"""
@spec parent_nodes(t()) :: t()
def parent_nodes(lazy_html) do
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The convention we follow is that the names are from perspective of a single node, so this should be parent_node. If the given %LazyHTML{} holds multiple nodes, it's just a batched version. We should not deduplicate nodes for the same reason.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, will do 👍 (plus remove the singular helpers)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should not deduplicate nodes for the same reason.

Wait, do you mean if I have multiple elements on the same level and I call parent_node(same_level_nodes) I should get back the same parent node n times?
If so, I would strongly disagree, this seems pretty useless.
Also, if we remove equals? then a use has no was to remove those duplicates.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, do you mean if I have multiple elements on the same level and I call parent_node(same_level_nodes) I should get back the same parent node n times?

Correct.

In your use case it seems you target a specific element, so parent would always return either 1 or 0 elements.

The reason is API consistency, %LazyHTML{} holds a flat list of nodes and it is effectively a batch, so conceptually each operation applies to a single element, but is batched if there are multiple elements in the list.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My mental modal for LazyHTML is a document + a set of selected nodes. Or alternatively, a document plus the result of a CSS selector.

I would argue that a %LazyHTML{} that holds duplicate nodes should be an invalid state.
There is no css selector that returns multiple times the same node.
E.g. for this html:

<div>
  <span>1</span>
  <span>2</span>
</div>

I think that query(html, "div") and query(html, "span") |> parent_node() should be equal.

Also, what about getting siblings?
Without deduplication, query(html, "span") |> parent_node() |> child_nodes() will return 4 elements.
And it gets worse if I want grand-siblings etc.

I don't think this would be inconsistent with the API, other things that operate in a batch do return a list (apart from child_nodes).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fragment = LazyHTML.from_fragment(~S"""
<div>
  <div>1</div>
  <div>2</div>
</div>
""")

fragment |> LazyHTML.query("div") |> LazyHTML.query("div")

Currently this returns %LazyHTML{} that includes "1" and "2" twice. So the current interpretation is not a set. It's a fair argument to say this behaviour is weird, on the other hand it's a contrived example.

@josevalim do you have an opinion here, should we always return a set of nodes?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am fine with treating it as a set, I assume such can be done cheaply?

Copy link
Member

@jonatanklosko jonatanklosko Sep 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whenever building new list (query, child_nodes), we will need an extra unordered_set to keep track of which element we already included in the new list. It only stores pointers, so it seems fine to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ohh. Yeah, that is definitely surprising to me. Especially since it already filters out empty results..

LazyHTML.NIF.parent_nodes(lazy_html)
end

@doc """
Returns the parent nodes of the root nodes in `lazy_html`.
Useful when you're expecting a single, shared parent.
"""
def parent_node(lazy_html) do
parent = LazyHTML.NIF.parent_nodes(lazy_html)

case LazyHTML.NIF.num_nodes(parent) do
0 -> {:ok, nil}
1 -> {:ok, parent}
_ -> {:error, :multiple_parents}
end
end

@doc """
Same as `parent_node/1` but raises on multiple parents
"""
def parent_node!(lazy_html) do
case parent_node(lazy_html) do
{:ok, res} -> res
{:error, :multiple_parents} -> raise "Selected nodes have multiple parents"
end
end

@doc """
Returns the text content of all nodes in `lazy_html`.

Expand Down Expand Up @@ -481,6 +535,29 @@ defmodule LazyHTML do
LazyHTML.NIF.tag(lazy_html)
end

@doc """
Returns true if the lazy_html is selecting the same nodes starting from the same document.

## Examples

iex> lazy_html = LazyHTML.from_fragment(~S|<div><span id=1>Hello</span></div>|)
iex> a = LazyHTML.query(lazy_html, "#1")
iex> b = LazyHTML.query(lazy_html, "div > span")
iex> LazyHTML.equals?(a, b)
true

Note that if the lazy_htmls are created separately, they are never equal:

iex> html_a = LazyHTML.from_fragment(~S|<div>hello</div>|)
iex> html_b = LazyHTML.from_fragment(~S|<div>hello</div>|)
iex> LazyHTML.equals?(html_a, html_b)
false
"""
@spec equals?(t(), t()) :: boolean()
def equals?(html_a, html_b) do
LazyHTML.NIF.equals(html_a, html_b)
end

@doc ~S"""
Escapes the given string to make a valid HTML text.

Expand Down
2 changes: 2 additions & 0 deletions lib/lazy_html/nif.ex
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ defmodule LazyHTML.NIF do
def filter(_lazy_html, _css_selector), do: err!()
def query_by_id(_lazy_html, _id), do: err!()
def child_nodes(_lazy_html), do: err!()
def parent_nodes(_lazy_html), do: err!()
def text(_lazy_html), do: err!()
def attribute(_lazy_html, _name), do: err!()
def attributes(_lazy_html), do: err!()
def tag(_lazy_html), do: err!()
def nodes(_lazy_html), do: err!()
def num_nodes(_lazy_html), do: err!()
def equals(_lazy_html_a, _lazy_html_b), do: err!()

defp err!(), do: :erlang.nif_error(:not_loaded)
end
90 changes: 90 additions & 0 deletions test/lazy_html_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,96 @@ defmodule LazyHTMLTest do
end
end

describe "parent_nodes/1" do
test "from selector of nodes on different levels" do
lazy_html =
LazyHTML.from_fragment("""
<div id=0>
<div id=1>
<span>Hello</span>
</div>
<span>world</span>
</div>
""")

spans = LazyHTML.query(lazy_html, "span")
parents = LazyHTML.parent_nodes(spans)
parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort()
assert parent_ids == ["0", "1"]

# parent of div#id=0 is <html>
grandparents = LazyHTML.parent_nodes(parents)
assert LazyHTML.tag(grandparents) |> Enum.sort() == ["div", "html"]

# parent of <html> is null, so it's filtered out
great_grandparents = LazyHTML.parent_nodes(grandparents)
assert great_grandparents |> Enum.count() == 1

# again, parent of <html> is filtered out
assert LazyHTML.parent_nodes(great_grandparents) |> Enum.count() == 0
end

test "from selector of nodes on same level" do
lazy_html =
LazyHTML.from_fragment("""
<div id=0>
<div id=1>
<span>Hello</span>
</div>
<div id=2>
<span>world</span>
</div>
</div>
""")

spans = LazyHTML.query(lazy_html, "span")
parents = LazyHTML.parent_nodes(spans)
parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort()
assert parent_ids == ["1", "2"]

# since they share the same parent, we now only have one node left
grandparent = LazyHTML.parent_nodes(parents)
assert LazyHTML.attribute(grandparent, "id") == ["0"]
end

defp get_css_path(node, acc) do
parent = LazyHTML.parent_node!(node)

if parent do
siblings =
LazyHTML.child_nodes(parent)
|> Enum.reject(fn n -> LazyHTML.tag(n) == [] end)

[tag] = LazyHTML.tag(node)
i = Enum.find_index(siblings, fn n -> LazyHTML.equals?(n, node) end)
get_css_path(parent, [{tag, i} | acc])
else
acc |> Enum.map_join(" > ", fn {tag, i} -> "#{tag}:nth-child(#{i + 1})" end)
end
end

test "construct nth-child selector by traversing parents" do
lazy_html =
LazyHTML.from_fragment("""
<div>
<div class="wibble">
<span>wibble</span>
</div>
<div class="wobble">
<span>wobble</span>
</div>
</div>
""")

span = LazyHTML.query(lazy_html, ".wobble span")
path = get_css_path(span, [])
assert path == "div:nth-child(1) > div:nth-child(2) > span:nth-child(1)"

span2 = LazyHTML.query(lazy_html, path)
assert LazyHTML.equals?(span, span2)
end
end

describe "query_by_id/2" do
test "raises when an empty id is given" do
assert_raise ArgumentError, ~r/id cannot be empty/, fn ->
Expand Down