Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 63 additions & 4 deletions c_src/lazy_html.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <stdexcept>
#include <string>
#include <tuple>
#include <unordered_set>
#include <variant>

#include <lexbor/html/html.h>
Expand Down Expand Up @@ -43,8 +44,10 @@ auto resource = fine::Atom("resource");

// Owns a parsed lexbor HTML document; the destructor below releases it.
struct DocumentRef {
lxb_html_document_t *document;
// Records whether the document was built as a fragment. Only the
// two-argument constructor assigns it; parent_node reads it to decide
// whether an <html> parent should be reported.
bool is_fragment;

// NOTE(review): this overload leaves is_fragment unassigned. It is probably
// the line the diff removes - confirm against the real file.
DocumentRef(lxb_html_document_t *document) : document(document) {}
DocumentRef(lxb_html_document_t *document, bool is_fragment)
: document(document), is_fragment(is_fragment) {}

// Destroys the wrapped document when this reference goes away.
~DocumentRef() { lxb_html_document_destroy(this->document); }
};
Expand Down Expand Up @@ -97,7 +100,7 @@ ExLazyHTML from_document(ErlNifEnv *env, ErlNifBinary html) {
throw std::runtime_error("failed to parse html document");
}

auto document_ref = std::make_shared<DocumentRef>(document);
auto document_ref = std::make_shared<DocumentRef>(document, false);
document_guard.deactivate();

auto nodes = std::vector<lxb_dom_node_t *>();
Expand Down Expand Up @@ -129,7 +132,7 @@ ExLazyHTML from_fragment(ErlNifEnv *env, ErlNifBinary html) {
throw std::runtime_error("failed to parse html fragment");
}

auto document_ref = std::make_shared<DocumentRef>(document);
auto document_ref = std::make_shared<DocumentRef>(document, true);
document_guard.deactivate();

auto nodes = std::vector<lxb_dom_node_t *>();
Expand Down Expand Up @@ -522,7 +525,12 @@ ExLazyHTML from_tree(ErlNifEnv *env, std::vector<fine::Term> tree) {
nodes.push_back(node);
}

auto document_ref = std::make_shared<DocumentRef>(document);
bool is_fragment = true;
if (!nodes.empty() && lxb_html_tree_node_is(nodes.front(), LXB_TAG_HTML)) {
is_fragment = false;
}

auto document_ref = std::make_shared<DocumentRef>(document, is_fragment);
document_guard.deactivate();

return ExLazyHTML(fine::make_resource<LazyHTML>(document_ref, nodes, false));
Expand Down Expand Up @@ -714,6 +722,57 @@ ExLazyHTML child_nodes(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {

FINE_NIF(child_nodes, 0);

// Collects the distinct element parents of the given nodes, in the order in
// which they are first encountered.
//
// A node contributes nothing when it has no parent or when its parent is not
// an element. For documents flagged as fragments, an <html> parent is not
// reported either.
ExLazyHTML parent_node(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
  const bool from_fragment = ex_lazy_html.resource->document_ref->is_fragment;

  auto parents = std::vector<lxb_dom_node_t *>();
  auto seen = std::unordered_set<lxb_dom_node_t *>();

  for (auto node : ex_lazy_html.resource->nodes) {
    auto parent = lxb_dom_node_parent(node);

    if (parent == NULL || parent->type != LXB_DOM_NODE_TYPE_ELEMENT) {
      continue;
    }

    // Only fragments hide their <html> element from the caller.
    if (from_fragment && lxb_html_tree_node_is(parent, LXB_TAG_HTML)) {
      continue;
    }

    // insert() reports whether the pointer was new, which keeps each parent
    // in the result exactly once.
    if (seen.insert(parent).second) {
      parents.push_back(parent);
    }
  }

  return ExLazyHTML(fine::make_resource<LazyHTML>(
      ex_lazy_html.resource->document_ref, parents, true));
}
FINE_NIF(parent_node, ERL_NIF_DIRTY_JOB_CPU_BOUND);

// Returns, for every element node in the input, its 1-based position among
// the element children of its parent. Non-element input nodes are skipped and
// produce no entry; an element without a parent is reported as position 1.
std::vector<int64_t> nth_child(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
  auto positions = std::vector<int64_t>();

  for (auto node : ex_lazy_html.resource->nodes) {
    if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
      continue;
    }

    // Stays at 1 when there is no parent to enumerate siblings from.
    int64_t position = 1;

    auto parent = lxb_dom_node_parent(node);
    if (parent != NULL) {
      // Count the element siblings that come before this node.
      auto sibling = lxb_dom_node_first_child(parent);
      while (sibling != NULL && sibling != node) {
        if (sibling->type == LXB_DOM_NODE_TYPE_ELEMENT) {
          position++;
        }
        sibling = lxb_dom_node_next(sibling);
      }
    }

    positions.push_back(position);
  }

  return positions;
}
FINE_NIF(nth_child, ERL_NIF_DIRTY_JOB_CPU_BOUND);

std::string text(ErlNifEnv *env, ExLazyHTML ex_lazy_html) {
auto document = ex_lazy_html.resource->document_ref->document;

Expand Down
37 changes: 37 additions & 0 deletions lib/lazy_html.ex
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,43 @@ defmodule LazyHTML do
LazyHTML.NIF.child_nodes(lazy_html)
end

@doc """
Returns the parents of the root nodes in `lazy_html`.

Every parent is listed once, even when several of the given nodes share
it. Nodes whose parent is missing or is not an element contribute nothing.
When `lazy_html` comes from a fragment, the `<html>` element is not
reported as a parent.

## Examples

    iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>Hello</span> <span>world</span></div>|)
    iex> spans = LazyHTML.query(lazy_html, "span")
    iex> LazyHTML.parent_node(spans)
    #LazyHTML<
    1 node (from selector)
    #1
    <div><span>Hello</span> <span>world</span></div>
    >

"""
@spec parent_node(t()) :: t()
def parent_node(lazy_html), do: LazyHTML.NIF.parent_node(lazy_html)

@doc """
Returns the positions of the selected nodes among their siblings.

Positions are 1-based and count element siblings only, so text and comment
nodes are ignored. This matches the numbering used by the `:nth-child` CSS
selector.

Nodes in `lazy_html` that are not elements are skipped and add no entry to
the result, so the returned list can be shorter than the number of nodes.

## Examples

    iex> lazy_html = LazyHTML.from_fragment(~S|<div><span>1</span><span>2</span></div>|)
    iex> spans = LazyHTML.query(lazy_html, "span")
    iex> LazyHTML.nth_child(spans)
    [1, 2]

"""
@spec nth_child(t()) :: [pos_integer()]
def nth_child(lazy_html) do
  LazyHTML.NIF.nth_child(lazy_html)
end

@doc """
Returns the text content of all nodes in `lazy_html`.

Expand Down
2 changes: 2 additions & 0 deletions lib/lazy_html/nif.ex
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ defmodule LazyHTML.NIF do
# Placeholder clauses for the native functions. Each one ignores its arguments
# and only calls err!/0, which is defined outside this view. Presumably the
# loaded NIF library replaces these at runtime - TODO confirm.
def filter(_lazy_html, _css_selector), do: err!()
def query_by_id(_lazy_html, _id), do: err!()
def child_nodes(_lazy_html), do: err!()
def parent_node(_lazy_html), do: err!()
def nth_child(_lazy_html), do: err!()
def text(_lazy_html), do: err!()
def attribute(_lazy_html, _name), do: err!()
def attributes(_lazy_html), do: err!()
Expand Down
128 changes: 128 additions & 0 deletions test/lazy_html_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,134 @@ defmodule LazyHTMLTest do
end
end

describe "parent_node/1" do
test "from selector of nodes on different levels" do
  lazy_html =
    LazyHTML.from_fragment("""
    <div id="a">
    <div id="b">
    <span>Hello</span>
    </div>
    <span>world</span>
    </div>
    """)

  spans = LazyHTML.query(lazy_html, "span")
  parents = LazyHTML.parent_node(spans)
  parent_ids = parents |> Enum.flat_map(&LazyHTML.attribute(&1, "id")) |> Enum.sort()
  assert parent_ids == ["a", "b"]

  # div#b's parent is div#a, while div#a sits at the top of the fragment and
  # has no parent to report, so exactly one div remains.
  grandparents = LazyHTML.parent_node(parents)
  assert LazyHTML.tag(grandparents) == ["div"]

  # One more step up leaves nothing.
  great_grandparents = LazyHTML.parent_node(grandparents)
  assert Enum.empty?(great_grandparents)
end

test "from selector of nodes on same level" do
  html = """
  <div id="a">
  <div id="b">
  <span>Hello</span>
  </div>
  <div id="c">
  <span>world</span>
  </div>
  </div>
  """

  parents =
    html
    |> LazyHTML.from_fragment()
    |> LazyHTML.query("span")
    |> LazyHTML.parent_node()

  parent_ids =
    parents
    |> Enum.flat_map(fn parent -> LazyHTML.attribute(parent, "id") end)
    |> Enum.sort()

  assert parent_ids == ["b", "c"]

  # Both divs hang off the same element, so one step further up collapses
  # the result to a single node.
  shared_parent = LazyHTML.parent_node(parents)
  assert LazyHTML.attribute(shared_parent, "id") == ["a"]
end

# Lists tag names from the outermost reachable ancestor down to `node`
# itself, climbing with `LazyHTML.parent_node/1` until it returns no nodes.
defp parents(node) do
  case Enum.count(node) do
    0 -> []
    _ -> parents(LazyHTML.parent_node(node)) ++ LazyHTML.tag(node)
  end
end

test "last parent node is <html> if instantiated via from_document and similar" do
  full_chain = ["html", "body", "div"]

  # Each entry pairs a way of building the tree with the ancestor chain
  # expected for its div.
  expectations = [
    {LazyHTML.from_document("<html><body><div>root</div></body></html>"), full_chain},
    {LazyHTML.from_fragment("<div>root</div>"), ["div"]},
    {LazyHTML.from_tree([{"div", [], []}]), ["div"]},
    {LazyHTML.from_tree([{"html", [], [{"body", [], [{"div", [], []}]}]}]), full_chain}
  ]

  for {lazy_html, expected} <- expectations do
    assert parents(lazy_html["div"]) == expected
  end
end

# Builds a "tag:nth-child(i) > ..." selector for a single node by walking up
# through its parents. `acc` holds the `{tag, position}` steps gathered so
# far, nearest to the starting node last.
defp get_css_path(node, acc) do
  # The path is only built for exactly one node.
  1 = Enum.count(node)

  [tag] = LazyHTML.tag(node)
  [i] = LazyHTML.nth_child(node)
  path = [{tag, i} | acc]

  parent = LazyHTML.parent_node(node)

  case Enum.count(parent) do
    0 -> Enum.map_join(path, " > ", fn {tag, i} -> "#{tag}:nth-child(#{i})" end)
    _ -> get_css_path(parent, path)
  end
end

test "construct nth-child selector by traversing parents" do
  html = """
  <div>
  <div class="wibble">
  <span>wibble</span>
  </div>
  <div class="wobble">
  <span>wobble</span>
  </div>
  </div>
  """

  lazy_html = LazyHTML.from_fragment(html)

  original = LazyHTML.query(lazy_html, ".wobble span")
  selector = get_css_path(original, [])
  assert selector == "div:nth-child(1) > div:nth-child(2) > span:nth-child(1)"

  # Querying with the generated selector has to land on the same span.
  requeried = LazyHTML.query(lazy_html, selector)
  assert LazyHTML.text(original) == LazyHTML.text(requeried)
end
end

describe "nth_child/1" do
  test "nth_child gives position" do
    html = """
    <div>
    Text isn't counted.
    <span>1</span>
    <!-- neither are comments -->
    <span>2</span>
    </div>
    """

    lazy_html = LazyHTML.from_fragment(html)

    assert LazyHTML.nth_child(lazy_html) == [1]
    assert LazyHTML.nth_child(lazy_html["div"]) == [1]
    assert LazyHTML.nth_child(lazy_html["span"]) == [1, 2]

    # The reported positions have to agree with what `:nth-child` matches.
    assert LazyHTML.text(lazy_html["span:nth-child(1)"]) == "1"
    assert LazyHTML.text(lazy_html["span:nth-child(2)"]) == "2"
  end
end

describe "query_by_id/2" do
test "raises when an empty id is given" do
assert_raise ArgumentError, ~r/id cannot be empty/, fn ->
Expand Down