Skip to content

Add support for parsing attributes as maps #467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions lib/floki.ex
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,14 @@ defmodule Floki do
inside a list.
"""

@type html_declaration :: {:pi, String.t(), [html_attribute()]}
@type html_attribute :: {String.t(), String.t()}
@type html_attributes :: [html_attribute()] | html_attributes_map()
@type html_attributes_map :: %{String.t() => String.t()}
@type html_declaration :: {:pi, String.t(), html_attributes()}
@type html_comment :: {:comment, String.t()}
@type html_doctype :: {:doctype, String.t(), String.t(), String.t()}
@type html_attribute :: {String.t(), String.t()}
@type html_text :: String.t()
@type html_tag :: {String.t(), [html_attribute()], [html_node()]}
@type html_tag :: {String.t(), html_attributes(), [html_node()]}
@type html_node ::
html_tag() | html_comment() | html_doctype() | html_declaration() | html_text()
@type html_tree :: [html_node()]
Expand Down Expand Up @@ -102,13 +104,19 @@ defmodule Floki do

## Options

* `:attributes_as_maps` - Change the behaviour of the parser to return the attributes
as maps, instead of a list of `{"key", "value"}`. Defaults to `false`.

* `:html_parser` - The module of the backend that is responsible for parsing
the HTML string. By default it is set to the built-in parser, and the module
name is equal to `Floki.HTMLParser.Mochiweb`, or from the value of the
application env of the same name.

See https://github.com/philss/floki#alternative-html-parsers for more details.

* `:parser_args` - A list of options to the parser. This can be used to pass options
that are specific for a given parser. Defaults to an empty list.

## Examples

iex> Floki.parse_document("<html><head></head><body>hello</body></html>")
Expand All @@ -117,6 +125,13 @@ defmodule Floki do
iex> Floki.parse_document("<html><head></head><body>hello</body></html>", html_parser: Floki.HTMLParser.Mochiweb)
{:ok, [{"html", [], [{"head", [], []}, {"body", [], ["hello"]}]}]}

iex> Floki.parse_document(
...> "<html><head></head><body class=main>hello</body></html>",
...> attributes_as_maps: true,
...> html_parser: Floki.HTMLParser.Mochiweb
...>)
{:ok, [{"html", %{}, [{"head", %{}, []}, {"body", %{"class" => "main"}, ["hello"]}]}]}

"""

@spec parse_document(binary(), Keyword.t()) :: {:ok, html_tree()} | {:error, String.t()}
Expand Down Expand Up @@ -152,13 +167,20 @@ defmodule Floki do

## Options

* `:attributes_as_maps` - Change the behaviour of the parser to return the attributes
as maps, instead of a list of `{"key", "value"}`. Remember that maps do not guarantee
any key order (since OTP 26 this is visible even for small maps). Defaults to `false`.

* `:html_parser` - The module of the backend that is responsible for parsing
the HTML string. By default it is set to the built-in parser, and the module
name is equal to `Floki.HTMLParser.Mochiweb`, or from the value of the
application env of the same name.

See https://github.com/philss/floki#alternative-html-parsers for more details.

* `:parser_args` - A list of options to the parser. This can be used to pass options
that are specific for a given parser. Defaults to an empty list.

"""

@spec parse_fragment(binary(), Keyword.t()) :: {:ok, html_tree()} | {:error, String.t()}
Expand Down Expand Up @@ -355,7 +377,7 @@ defmodule Floki do
@spec find_and_update(
html_tree(),
css_selector(),
({String.t(), [html_attribute()]} -> {String.t(), [html_attribute()]} | :delete)
({String.t(), html_attributes()} -> {String.t(), html_attributes()} | :delete)
) :: html_tree()
def find_and_update(html_tree, selector, fun) do
{tree, results} = Finder.find(html_tree, selector)
Expand Down
39 changes: 31 additions & 8 deletions lib/floki/html_parser.ex
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,47 @@ defmodule Floki.HTMLParser do
The default parser is Mochiweb, which comes with Floki.
You can also choose between Html5ever or FastHtml.

It is also possible to pass options down to the parsers using
the `:parser_args` option.

This module is also a behaviour that those parsers must implement.
"""

@default_parser Floki.HTMLParser.Mochiweb

@callback parse_document(binary(), list()) :: {:ok, Floki.html_tree()} | {:error, String.t()}
@callback parse_fragment(binary(), list()) :: {:ok, Floki.html_tree()} | {:error, String.t()}
# Shared shape of every parser result: `{:ok, value}` on success or
# `{:error, message}`, where the message is a string.
@typep result(success) :: {:ok, success} | {:error, String.t()}
# The HTML input handed to a parser.
@typep html :: binary()

# Parse the given HTML; the second argument is the keyword list of parser arguments.
@callback parse_document(html(), Keyword.t()) :: result(Floki.html_tree())
@callback parse_fragment(html(), Keyword.t()) :: result(Floki.html_tree())

# Variants that `parse_document/2` and `parse_fragment/2` of this module dispatch to
# when the `:attributes_as_maps` option is truthy.
@callback parse_document_with_attributes_as_maps(html(), Keyword.t()) ::
result(Floki.html_tree())
@callback parse_fragment_with_attributes_as_maps(html(), Keyword.t()) ::
result(Floki.html_tree())

def parse_document(html, opts \\ []) do
parser_args = opts[:parser_args] || []
def parse_document(html, opts \\ []) when is_binary(html) do
{parser_args, opts} = Keyword.pop(opts, :parser_args, [])

parser(opts).parse_document(html, parser_args)
parser = parser(opts)

if opts[:attributes_as_maps] do
parser.parse_document_with_attributes_as_maps(html, parser_args)
else
parser.parse_document(html, parser_args)
end
end

def parse_fragment(html, opts \\ []) do
parser_args = opts[:parser_args] || []
def parse_fragment(html, opts \\ []) when is_binary(html) do
{parser_args, opts} = Keyword.pop(opts, :parser_args, [])

parser = parser(opts)

parser(opts).parse_fragment(html, parser_args)
if opts[:attributes_as_maps] do
parser.parse_fragment_with_attributes_as_maps(html, parser_args)
else
parser.parse_fragment(html, parser_args)
end
end

defp parser(opts) do
Expand Down
10 changes: 10 additions & 0 deletions lib/floki/html_parser/fast_html.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@ defmodule Floki.HTMLParser.FastHtml do
execute_with_module(fn module -> module.decode_fragment(html, args) end)
end

# FastHTML cannot return attributes as maps yet, so this callback always raises
# a `RuntimeError`; both arguments are ignored.
@impl true
def parse_document_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for FastHTML")

# FastHTML cannot return attributes as maps yet, so this callback always raises
# a `RuntimeError`; both arguments are ignored.
@impl true
def parse_fragment_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for FastHTML")

defp execute_with_module(fun) do
case Code.ensure_loaded(:fast_html) do
{:module, module} ->
Expand Down
10 changes: 10 additions & 0 deletions lib/floki/html_parser/html5ever.ex
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,14 @@ defmodule Floki.HTMLParser.Html5ever do
# NOTE: html5ever does not implement parse_fragment yet.
@impl true
def parse_fragment(html, args), do: parse_document(html, args)

# Html5ever cannot return attributes as maps yet, so this callback always raises
# a `RuntimeError`; both arguments are ignored.
@impl true
def parse_document_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for Html5ever")

# Html5ever cannot return attributes as maps yet, so this callback always raises
# a `RuntimeError`; both arguments are ignored.
@impl true
def parse_fragment_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for Html5ever")
end
14 changes: 12 additions & 2 deletions lib/floki/html_parser/mochiweb.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,24 @@ defmodule Floki.HTMLParser.Mochiweb do
@root_node "floki"

@impl true
def parse_document(html, _args) do
def parse_document(html, args) do
html = "<#{@root_node}>#{html}</#{@root_node}>"
{@root_node, [], parsed} = :floki_mochi_html.parse(html)
{@root_node, _, parsed} = :floki_mochi_html.parse(html, args)

{:ok, parsed}
end

# NOTE: mochi_html cannot make a distinction of a fragment and document.
@impl true
def parse_fragment(html, args), do: parse_document(html, args)

# Parses a document with `:attributes_as_maps` forced to `true` in the parser
# arguments, then delegates to `parse_document/2`.
@impl true
def parse_document_with_attributes_as_maps(html, args) do
  args_with_maps = Keyword.put(args, :attributes_as_maps, true)

  parse_document(html, args_with_maps)
end

# Parses a fragment with `:attributes_as_maps` forced to `true` in the parser
# arguments.
#
# Delegates to `parse_fragment/2` (not `parse_document/2`) so the fragment variant
# keeps following whatever `parse_fragment/2` does. Today that function simply
# forwards to `parse_document/2`, so the result is unchanged.
@impl true
def parse_fragment_with_attributes_as_maps(html, args) do
  parse_fragment(html, Keyword.put(args, :attributes_as_maps, true))
end
end
19 changes: 14 additions & 5 deletions lib/floki/selector.ex
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,20 @@ defmodule Floki.Selector do
defp classes_matches?(_node, []), do: true
defp classes_matches?(%HTMLNode{attributes: []}, _), do: false

defp classes_matches?(%HTMLNode{attributes: attributes}, classes) do
case :proplists.get_value("class", attributes, nil) do
nil -> false
class -> classes -- String.split(class, ~r/\s+/) == []
end
# Attributes stored as a list of `{name, value}` tuples: look the "class" value up
# as a proplist entry and compare it against the selector classes.
defp classes_matches?(%HTMLNode{attributes: attrs}, classes) when is_list(attrs) do
  "class"
  |> :proplists.get_value(attrs, nil)
  |> do_classes_matches?(classes)
end

# Attributes stored as a map: read the "class" key (`nil` when it is missing).
defp classes_matches?(%HTMLNode{attributes: attrs}, classes) when is_map(attrs) do
  class_attr_value = attrs["class"]

  do_classes_matches?(class_attr_value, classes)
end

# Checks that every class in `classes` appears in the whitespace-separated
# `class_attr_value`. A `nil` value (no "class" attribute) never matches.
defp do_classes_matches?(class_attr_value, classes) do
  case class_attr_value do
    nil ->
      false

    value ->
      # Removing the node's classes from the wanted ones leaves nothing behind
      # only when all of the wanted classes are present.
      node_classes = String.split(value, ~r/\s+/)
      Enum.empty?(classes -- node_classes)
  end
end

defp attributes_matches?(_node, []), do: true
Expand Down
3 changes: 2 additions & 1 deletion lib/floki/selector/attribute_selector.ex
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ defmodule Floki.Selector.AttributeSelector do
end

# Returns if attributes of a node matches with a given attribute selector.
def match?(attributes, s = %AttributeSelector{match_type: nil, value: nil}) do
def match?(attributes, s = %AttributeSelector{match_type: nil, value: nil})
when is_list(attributes) or is_map(attributes) do
attribute_present?(s.attribute, attributes)
end

Expand Down
29 changes: 12 additions & 17 deletions lib/floki/selector/pseudo_class.ex
Original file line number Diff line number Diff line change
Expand Up @@ -111,35 +111,30 @@ defmodule Floki.Selector.PseudoClass do
end

def match_checked?(%{type: "input"} = html_node) do
case List.keyfind(html_node.attributes, "checked", 0) do
{"checked", _} -> true
_ -> false
end
attribute_is_present?(html_node.attributes, "checked")
end

def match_checked?(%{type: "option"} = html_node) do
case List.keyfind(html_node.attributes, "selected", 0) do
{"selected", _} -> true
_ -> false
end
attribute_is_present?(html_node.attributes, "selected")
end

def match_checked?(_) do
false
def match_checked?(_), do: false

# Returns `true` when `attribute_name` is present in `attributes`, whatever its
# value. Supports both representations: a list of `{name, value}` tuples or a map.
defp attribute_is_present?(attributes, attribute_name) when is_list(attributes) do
  match?({^attribute_name, _}, List.keyfind(attributes, attribute_name, 0))
end

defp attribute_is_present?(attributes, attribute_name) when is_map(attributes) do
  # Test for the key itself rather than for a non-nil value, so a key mapped to
  # `nil` counts as present, just like a `{name, nil}` tuple does in the list clause.
  Map.has_key?(attributes, attribute_name)
end

@disableable_html_nodes ~w[button input select option textarea]

def match_disabled?(%{type: type} = html_node) when type in @disableable_html_nodes do
case List.keyfind(html_node.attributes, "disabled", 0) do
{"disabled", _} -> true
_ -> false
end
attribute_is_present?(html_node.attributes, "disabled")
end

def match_disabled?(_html_node) do
false
end
def match_disabled?(_html_node), do: false

def match_root?(html_node, tree) do
html_node.node_id in tree.root_nodes_ids
Expand Down
77 changes: 49 additions & 28 deletions src/floki_mochi_html.erl
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
-module(floki_mochi_html).
-export([
tokens/1,
parse/1
parse/2
]).
-ifdef(TEST).
-export([destack/1, destack/2, is_singleton/1]).
Expand Down Expand Up @@ -96,17 +96,28 @@

%% External API.

%% @spec parse(string() | binary()) -> html_node()
%% @spec parse(string() | binary(), list()) -> html_node()
%% @doc tokenize and then transform the token stream into a HTML tree.
parse(Input) ->
parse_tokens(tokens(Input)).
%%
%% The following option is supported:
%%
%% <dl>
%% <dt>`attributes_as_maps`</dt>
%% <dd>
%% When `true`, it configures the parser to use maps for the attributes.
%% It is `false` by default, which means attributes are going to be represented
%% as a list of tuples.
%% </dd>
%% </dl>
parse(Input, Opts) ->
parse_tokens(tokens(Input), Opts).

%% @spec parse_tokens([html_token()], list()) -> html_node()
%% @doc Transform the output of tokens(Doc) into a HTML tree.
parse_tokens(Tokens) when is_list(Tokens) ->
parse_tokens(Tokens, Opts) when is_list(Tokens) andalso is_list(Opts) ->
%% Skip over doctype, processing instructions
[{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
{Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
{Tree, _} = tree(Rest, [norm({Tag, Attrs}, Opts)], Opts),
Tree.

find_document(Tokens = [{start_tag, _Tag, _Attrs, false} | _Rest], Mode) ->
Expand Down Expand Up @@ -215,38 +226,48 @@ tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
tree_data(Rest, AllWhitespace, Acc) ->
{iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.

tree([], Stack) ->
tree([], Stack, _Opts) ->
{destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
case destack(norm(Tag), Stack) of
tree([{end_tag, Tag} | Rest], Stack, Opts) ->
case destack(norm(Tag, Opts), Stack) of
S when is_list(S) ->
tree(Rest, S);
tree(Rest, S, Opts);
Result ->
{Result, []}
end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T = {pi, _Tag, _Attrs} | Rest], S) ->
tree(Rest, append_stack_child(T, S));
tree([T = {comment, _Comment} | Rest], S) ->
tree(Rest, append_stack_child(T, S));
tree(L = [{data, _Data, _Whitespace} | _], S) ->
tree([{start_tag, Tag, Attrs, true} | Rest], S, Opts) ->
tree(Rest, append_stack_child(norm({Tag, Attrs}, Opts), S), Opts);
tree([{start_tag, Tag, Attrs, false} | Rest], S, Opts) ->
tree(Rest, stack(norm({Tag, Attrs}, Opts), S), Opts);
tree([T = {pi, _Tag, _Attrs} | Rest], S, Opts) ->
tree(Rest, append_stack_child(T, S), Opts);
tree([T = {comment, _Comment} | Rest], S, Opts) ->
tree(Rest, append_stack_child(T, S), Opts);
tree(L = [{data, _Data, _Whitespace} | _], S, Opts) ->
case tree_data(L, true, []) of
{_, true, Rest} ->
tree(Rest, S);
tree(Rest, S, Opts);
{Data, false, Rest} ->
tree(Rest, append_stack_child(Data, S))
tree(Rest, append_stack_child(Data, S), Opts)
end;
tree([{doctype, _} | Rest], Stack) ->
tree(Rest, Stack).

norm({Tag, Attrs}) ->
{norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
norm(Tag) when is_binary(Tag) ->
tree([{doctype, _} | Rest], Stack, Opts) ->
tree(Rest, Stack, Opts).

%% Normalizes a `{Tag, Attrs}` pair into a `{Tag, Attributes, Children}` node with
%% an empty children list. Attribute keys go through `norm/2` and values are
%% flattened to binaries. When `Opts` contains `{attributes_as_maps, true}` the
%% attributes are returned as a map, otherwise as a list of `{Key, Value}` tuples.
norm({Tag, Attrs}, Opts) ->
    %% Bind the normalized list to a fresh name: `Attrs` is already bound by the
    %% clause head, so `Attrs = ...` would be a match that fails with `badmatch`
    %% whenever normalization changes a key or a value.
    NormAttrs = [{norm(K, Opts), iolist_to_binary(V)} || {K, V} <- Attrs],
    case lists:keyfind(attributes_as_maps, 1, Opts) of
        {attributes_as_maps, true} ->
            % The HTML spec says duplicated attributes must be ignored, keeping the
            % first occurrence of a given key.
            % Since `maps:from_list/1` does the opposite, we need to reverse the attributes.
            % See https://github.com/philss/floki/pull/467#discussion_r1225548333
            {norm(Tag, Opts), maps:from_list(lists:reverse(NormAttrs)), []};
        _ ->
            {norm(Tag, Opts), NormAttrs, []}
    end;
norm(Tag, _Opts) when is_binary(Tag) ->
Tag;
norm(Tag) ->
norm(Tag, _Opts) ->
list_to_binary(string:to_lower(Tag)).

stack(T1 = {TN, _, _}, Stack = [{TN, _, _} | _Rest]) when
Expand Down
Loading