Skip to content

Commit

Permalink
Improve Floki.text/2 by using IO data
Browse files Browse the repository at this point in the history
Building the resulting string as IO data is much faster and cheaper,
because it uses less memory.
  • Loading branch information
philss committed Jun 2, 2023
1 parent dcacade commit c8850a7
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 12 deletions.
14 changes: 8 additions & 6 deletions lib/floki/deep_text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@ defmodule Floki.DeepText do
def get(html_tree, sep \\ "", include_inputs? \\ false)

def get(html_tree, sep, include_inputs?) do
get_text(html_tree, "", sep, include_inputs?)
html_tree
|> get_text([], sep, include_inputs?)
|> IO.iodata_to_binary()
end

defp get_text(text, "", _sep, _) when is_binary(text), do: text
defp get_text(text, acc, sep, _) when is_binary(text), do: Enum.join([acc, text], sep)
defp get_text(text, [], _sep, _) when is_binary(text), do: text
defp get_text(text, acc, sep, _) when is_binary(text), do: [acc, sep, text]

defp get_text(nodes, acc, sep, include_inputs?) when is_list(nodes) do
Enum.reduce(nodes, acc, fn child, istr ->
Expand All @@ -24,14 +26,14 @@ defmodule Floki.DeepText do
end

defp get_text({:comment, _}, acc, _, _), do: acc
defp get_text({"br", _, _}, acc, _, _), do: acc <> "\n"
defp get_text({"br", _, _}, acc, _, _), do: [acc, "\n"]

defp get_text({"input", attrs, _}, acc, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
[acc, Floki.TextExtractor.extract_input_value(attrs)]
end

defp get_text({"textarea", attrs, _}, acc, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
[acc, Floki.TextExtractor.extract_input_value(attrs)]
end

defp get_text({_, _, nodes}, acc, sep, include_inputs?) do
Expand Down
16 changes: 10 additions & 6 deletions lib/floki/flat_text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,25 @@ defmodule Floki.FlatText do
def get(html_nodes, sep \\ "", include_inputs? \\ false)

def get(html_nodes, sep, include_inputs?) when is_list(html_nodes) do
Enum.reduce(html_nodes, "", fn html_node, acc ->
html_nodes
|> Enum.reduce([], fn html_node, acc ->
text_from_node(html_node, acc, 0, sep, include_inputs?)
end)
|> IO.iodata_to_binary()
end

def get(html_node, sep, include_inputs?) do
text_from_node(html_node, "", 0, sep, include_inputs?)
html_node
|> text_from_node([], 0, sep, include_inputs?)
|> IO.iodata_to_binary()
end

defp text_from_node({"input", attrs, []}, acc, _, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
[acc, Floki.TextExtractor.extract_input_value(attrs)]
end

defp text_from_node({"textarea", attrs, []}, acc, _, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
[acc, Floki.TextExtractor.extract_input_value(attrs)]
end

defp text_from_node({_tag, _attrs, html_nodes}, acc, depth, sep, include_inputs?)
Expand All @@ -40,7 +44,7 @@ defmodule Floki.FlatText do
end)
end

defp text_from_node(text, "", _, _sep, _) when is_binary(text), do: text
defp text_from_node(text, acc, _, sep, _) when is_binary(text), do: Enum.join([acc, text], sep)
defp text_from_node(text, [], _, _sep, _) when is_binary(text), do: text
defp text_from_node(text, acc, _, sep, _) when is_binary(text), do: [acc, sep, text]
defp text_from_node(_, acc, _, _, _), do: acc
end

0 comments on commit c8850a7

Please sign in to comment.