diff --git a/pagefind/features/anchors.feature b/pagefind/features/anchors.feature
index 73fc65d5..6434931d 100644
--- a/pagefind/features/anchors.feature
+++ b/pagefind/features/anchors.feature
@@ -5,26 +5,36 @@ Feature: Anchors
| PAGEFIND_SOURCE | public |
Given I have a "public/index.html" file with the body:
"""
-
Nothing
- Nothing
+ Nothing
"""
Given I have a "public/cat/index.html" file with the body:
"""
Outer Heading
-
Hello World, from Pagefind
+
PageOne, from Pagefind
Cats
-
+
- Cheeka
- Ali
- Theodore
- Smudge
Pagefind
- Hello World, again, from Pagefind
+ PageOne, again, from Pagefind
Outer Content
"""
+ Given I have a "public/dog/index.html" file with the body:
+ """
+
+
PageTwo, from Pagefind
+
Words in spans should be extracted
+
+
Text that is bold or italic should be extracted
+
Text containing nested IDs should extract both
+
Divs containing
💀 he he he 💀
divs should only take from the top level
+
+ """
When I run my program
Then I should see "Running Pagefind" in stdout
When I serve the "public" directory
@@ -36,13 +46,13 @@ Feature: Anchors
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
- let searchone = await pagefind.search("hello");
- let searchonedata = await searchone.results[0].data();
- document.querySelector('[data-search-one]').innerText = searchonedata.locations.join(', ');
+ let search = await pagefind.search("pageone");
+ let searchdata = await search.results[0].data();
+ document.querySelector('[data-search]').innerText = searchdata.locations.join(', ');
}
"""
Then There should be no logs
- Then The selector "[data-search-one]" should contain "0, 10"
+ Then The selector "[data-search]" should contain "0, 9"
Scenario: Pagefind returns full content without anchors
When I evaluate:
@@ -50,13 +60,13 @@ Feature: Anchors
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
- let searchone = await pagefind.search("hello");
- let searchonedata = await searchone.results[0].data();
- document.querySelector('[data-search-one]').innerText = searchonedata.content;
+ let search = await pagefind.search("pageone");
+ let searchdata = await search.results[0].data();
+ document.querySelector('[data-search]').innerText = searchdata.content;
}
"""
Then There should be no logs
- Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Cats. Cheeka. Ali. Theodore. Smudge. Pagefind. Hello World, again, from Pagefind."
+ Then The selector "[data-search]" should contain "PageOne, from Pagefind. Cats. Cheeka. Ali. Theodore. Smudge. Pagefind. PageOne, again, from Pagefind."
Scenario: Pagefind returns all page anchors in the fragment
When I evaluate:
@@ -64,10 +74,48 @@ Feature: Anchors
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
- let searchone = await pagefind.search("hello");
- let searchonedata = await searchone.results[0].data();
- document.querySelector('[data-search-one]').innerText = searchonedata.anchors.map(a => `${a.element}#${a.id}: ${a.location}`).join(', ');
+ let search = await pagefind.search("pageone");
+ let searchdata = await search.results[0].data();
+ document.querySelector('[data-search]').innerText = searchdata.anchors.map(a => `${a.element}#${a.id}: ${a.location}`).join(', ');
+ }
+ """
+ Then There should be no logs
+ Then The selector "[data-search]" should contain "h2#cats: 3, ul#list: 4, li#ali: 5, h2#pagefind: 8"
+
+ Scenario: Pagefind returns page anchor content in the fragment
+ When I evaluate:
+ """
+ async function() {
+ let pagefind = await import("/_pagefind/pagefind.js");
+
+ let search = await pagefind.search("pageone");
+ let searchdata = await search.results[0].data();
+ document.querySelector('[data-search]').innerText = searchdata.anchors.map(a => `#${a.id}: '${a.text}'`).join(', ');
+ }
+ """
+ Then There should be no logs
+ Then The selector "[data-search]" should contain "#cats: 'Cats', #list: '', #ali: 'Ali', #pagefind: 'Pagefind'"
+
+ Scenario: Pagefind extracts page anchor text where it makes sense
+ When I evaluate:
+ """
+ async function() {
+ let pagefind = await import("/_pagefind/pagefind.js");
+
+ let search = await pagefind.search("pagetwo");
+ let searchdata = await search.results[0].data();
+ document.querySelector('[data-search]').innerHTML = `
+ <ul>
+ ${searchdata.anchors.map(a => `<li>#${a.id}: '${a.text}'</li>`)}
+ </ul>
+ `;
}
"""
Then There should be no logs
- Then The selector "[data-search-one]" should contain "h2#cats: 4, li#ali: 6, h2#pagefind: 9"
+ Then The selector "[data-search]>ul>li:nth-of-type(1)" should contain "#h1: 'PageTwo, from Pagefind'"
+ Then The selector "[data-search]>ul>li:nth-of-type(2)" should contain "#p_spans: 'Words in spans should be extracted'"
+ Then The selector "[data-search]>ul>li:nth-of-type(3)" should contain "#h2_hrefs: 'Links should be extracted'"
+ Then The selector "[data-search]>ul>li:nth-of-type(4)" should contain "#span_formatted: 'Text that is bold or italic should be extracted'"
+ Then The selector "[data-search]>ul>li:nth-of-type(5)" should contain "#p_nested_ids: 'Text containing nested IDs should extract both'"
+ Then The selector "[data-search]>ul>li:nth-of-type(6)" should contain "#span_nested: 'nested IDs'"
+ Then The selector "[data-search]>ul>li:nth-of-type(7)" should contain "#double_div: 'Divs containing divs should only take from the top level'"
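
Note: these scenarios pin down the anchor contract end to end: each anchor in a fragment carries `element`, `id`, `text`, and a word `location`, where locations count indexed words from zero ("pageone" sits at words 0 and 9, so `h2#cats` lands at word 3). A minimal Rust sketch of that shape, mirroring `PageAnchorData` from `pagefind/src/fragments/mod.rs` below; the `main` and literal values here are illustrative, not part of the patch:

```rust
// Illustrative only: the fragment data the cat-page scenario asserts on.
struct PageAnchorData {
    element: String,
    id: String,
    text: String,
    location: u32,
}

fn main() {
    // Word indices for "PageOne, from Pagefind. Cats. Cheeka. Ali. ..." start at 0.
    let anchors = vec![
        PageAnchorData { element: "h2".into(), id: "cats".into(), text: "Cats".into(), location: 3 },
        PageAnchorData { element: "ul".into(), id: "list".into(), text: String::new(), location: 4 },
        PageAnchorData { element: "li".into(), id: "ali".into(), text: "Ali".into(), location: 5 },
        PageAnchorData { element: "h2".into(), id: "pagefind".into(), text: "Pagefind".into(), location: 8 },
    ];
    // The same formatting the scenario builds in JS: `${a.element}#${a.id}: ${a.location}`.
    let joined = anchors
        .iter()
        .map(|a| format!("{}#{}: {}", a.element, a.id, a.location))
        .collect::<Vec<_>>()
        .join(", ");
    assert_eq!(joined, "h2#cats: 3, ul#list: 4, li#ali: 5, h2#pagefind: 8");
}
```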
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index b107d130..3a771258 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -20,6 +20,9 @@ use parser::DomParser;
use self::parser::DomParserResult;
lazy_static! {
+ static ref NEWLINES: Regex = Regex::new("(\n|\r\n)+").unwrap();
+ static ref TRIM_NEWLINES: Regex = Regex::new("^[\n\r\\s]+|[\n\r\\s]+$").unwrap();
+ static ref EXTRANEOUS_SPACES: Regex = Regex::new("\\s{2,}").unwrap();
// TODO: i18n?
static ref SPECIAL_CHARS: Regex = Regex::new("[^\\w]").unwrap();
}
@@ -186,7 +189,7 @@ impl Fossicker {
) -> (
String,
HashMap<String, Vec<u32>>,
- Vec<(String, String, u32)>,
+ Vec<(String, String, String, u32)>,
) {
let mut map: HashMap<String, Vec<u32>> = HashMap::new();
let mut anchors = Vec::new();
@@ -230,14 +233,23 @@ impl Fossicker {
if word.chars().next() == Some('_') {
if word.starts_with("___PAGEFIND_ANCHOR___") {
- if let Some((element_name, element_id)) =
+ if let Some((element_name, anchor_id)) =
word.replace("___PAGEFIND_ANCHOR___", "").split_once(':')
{
- anchors.push((
- element_name.to_string(),
- element_id.to_string(),
- word_index as u32,
- ));
+ let element_text = data
+ .anchor_content
+ .get(anchor_id)
+ .map(|t| normalize_content(t))
+ .unwrap_or_default();
+
+ if let Some((_, element_id)) = anchor_id.split_once(':') {
+ anchors.push((
+ element_name.to_string(),
+ element_id.to_string(),
+ normalize_content(&element_text),
+ word_index as u32,
+ ));
+ }
}
offset_word_index += 1;
continue;
@@ -367,11 +379,11 @@ impl Fossicker {
word_count: word_data.len(),
anchors: anchors
.into_iter()
- .map(|(element, id, location)| PageAnchorData {
+ .map(|(element, id, text, location)| PageAnchorData {
element,
id,
location,
- text: None,
+ text,
})
.collect(),
},
@@ -404,6 +416,15 @@ fn build_url(page_url: &Path, relative_to: Option<&Path>, options: &SearchOption
format!("/{}", final_url)
}
+fn normalize_content(content: &str) -> String {
+ let content = html_escape::decode_html_entities(content);
+ let content = TRIM_NEWLINES.replace_all(&content, "");
+ let content = NEWLINES.replace_all(&content, " ");
+ let content = EXTRANEOUS_SPACES.replace_all(&content, " ");
+
+ content.to_string()
+}
+
// TODO: These language codes are duplicated with pagefind_web's Cargo.toml
fn get_stemmer(lang: &str) -> Option<Stemmer> {
match lang.split('-').next().unwrap() {
@@ -446,6 +467,14 @@ mod tests {
use super::*;
+ #[test]
+ fn normalizing_content() {
+ let input = "\nHello Wor\n ld? \n \n";
+ let output = normalize_content(input);
+
+ assert_eq!(&output, "Hello Wor ld?");
+ }
+
async fn test_fossick(s: String) -> Fossicker {
std::env::set_var("PAGEFIND_SOURCE", "somewhere");
let config =
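
Note: the anchor token grew a middle segment here, from `___PAGEFIND_ANCHOR___{tag}:{id}` to `___PAGEFIND_ANCHOR___{tag}:{counter}:{id}`, so the two `split_once(':')` calls above do different jobs: the first peels off the tag and leaves `{counter}:{id}`, which is exactly the key used for `anchor_content`; the second peels off the counter to recover the user-visible id. A standalone sketch of that unpacking (the helper name is hypothetical; the real code inlines this in `Fossicker`):

```rust
// Hypothetical helper showing how the widened token is unpacked.
// "___PAGEFIND_ANCHOR___h2:1:cats" -> element "h2", content key "1:cats", id "cats".
fn parse_anchor_token(word: &str) -> Option<(String, String, String)> {
    let (element_name, anchor_id) = word
        .strip_prefix("___PAGEFIND_ANCHOR___")?
        .split_once(':')?;
    let (_counter, element_id) = anchor_id.split_once(':')?;
    Some((
        element_name.to_string(),
        anchor_id.to_string(), // keys the parser's anchor_content map
        element_id.to_string(),
    ))
}

fn main() {
    assert_eq!(
        parse_anchor_token("___PAGEFIND_ANCHOR___h2:1:cats"),
        Some(("h2".to_string(), "1:cats".to_string(), "cats".to_string()))
    );
}
```

The counter keeps two elements that happen to share an id from clobbering each other's collected text.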
diff --git a/pagefind/src/fossick/parser.rs b/pagefind/src/fossick/parser.rs
index 8ae51762..ab1f7a31 100644
--- a/pagefind/src/fossick/parser.rs
+++ b/pagefind/src/fossick/parser.rs
@@ -9,10 +9,9 @@ use std::rc::Rc;
use crate::SearchOptions;
+use super::normalize_content;
+
lazy_static! {
- static ref NEWLINES: Regex = Regex::new("(\n|\r\n)+").unwrap();
- static ref TRIM_NEWLINES: Regex = Regex::new("^[\n\r\\s]+|[\n\r\\s]+$").unwrap();
- static ref EXTRANEOUS_SPACES: Regex = Regex::new("\\s{2,}").unwrap();
static ref ALL_SPACES: Regex = Regex::new("\\s").unwrap();
static ref SENTENCE_CHARS: Regex = Regex::new("[\\w'\"\\)\\$\\*]").unwrap();
}
@@ -24,6 +23,11 @@ lazy_static! {
static ref SENTENCE_SELECTORS: Vec<&'static str> = vec!(
"h1", "h2", "h3", "h4", "h5", "h6", "p", "td", "div", "ul", "li", "article", "section"
);
+ static ref INLINE_SELECTORS: Vec<&'static str> = vec!(
+ "a", "abbr", "acronym", "b", "bdo", "big", "br", "button", "cite", "code", "dfn", "em",
+ "i", "img", "input", "kbd", "label", "map", "object", "output", "q", "samp", "script",
+ "select", "small", "span", "strong", "sub", "sup", "textarea", "time", "tt", "var",
+ );
static ref REMOVE_SELECTORS: Vec<&'static str> = vec!(
"head", "style", "script", "noscript", "label", "form", "svg", "footer", "nav", "iframe",
"template"
@@ -54,6 +58,7 @@ struct DomParserData {
sort: HashMap<String, String>,
meta: HashMap<String, String>,
default_meta: HashMap<String, String>,
+ anchor_content: HashMap<String, String>,
language: Option<String>,
has_html_element: bool,
}
@@ -91,6 +96,7 @@ struct DomParsingNode {
meta: Option<Vec<String>>,
default_meta: Option<Vec<String>>,
weight: Option<String>,
+ anchor_ids: Option<Vec<String>>,
status: NodeStatus,
}
@@ -101,6 +107,7 @@ pub struct DomParserResult {
pub filters: HashMap<String, Vec<String>>,
pub sort: HashMap<String, String>,
pub meta: HashMap<String, String>,
+ pub anchor_content: HashMap<String, String>,
pub has_custom_body: bool,
pub force_inclusion: bool, // Include this page even if there is no body
pub has_html_element: bool,
@@ -129,6 +136,7 @@ impl<'a> DomParser<'a> {
.map(|e| format!("{} {}", options.root_selector, e))
.collect::<Vec<_>>()
.join(", ");
+ let mut anchor_counter = 0;
let rewriter = HtmlRewriter::new(
Settings {
@@ -170,6 +178,7 @@ impl<'a> DomParser<'a> {
NodeStatus::Indexing
};
+ let mut anchor_id = None;
if status != NodeStatus::Excluded && status != NodeStatus::Ignored {
if let Some(element_id) = element_id {
let parent = &data.borrow().current_node;
@@ -178,7 +187,9 @@ impl<'a> DomParser<'a> {
if !(parent.status == NodeStatus::ParentOfBody
&& status != NodeStatus::Body
&& status != NodeStatus::ParentOfBody) {
- parent.current_value.push_str(&format!(" ___PAGEFIND_ANCHOR___{tag_name}:{element_id} "));
+ parent.current_value.push_str(&format!(" ___PAGEFIND_ANCHOR___{tag_name}:{anchor_counter}:{element_id} "));
+ anchor_id = Some(format!("{anchor_counter}:{element_id}"));
+ anchor_counter += 1;
}
}
}
@@ -210,7 +221,22 @@ impl<'a> DomParser<'a> {
let node = {
let mut data = data.borrow_mut();
- let parent_status = data.current_node.borrow().status;
+ let parent_node = data.current_node.borrow();
+ let parent_status = parent_node.status;
+
+ let mut node_anchors = if parent_node.anchor_ids.is_some() && INLINE_SELECTORS.contains(&tag_name.as_str()) {
+ parent_node.anchor_ids.clone()
+ } else {
+ None
+ };
+
+ if let Some(this_node_anchor_id) = anchor_id {
+ if let Some(existing) = node_anchors.as_mut() {
+ existing.push(this_node_anchor_id);
+ } else {
+ node_anchors = Some(vec![this_node_anchor_id]);
+ }
+ }
let node = Rc::new(RefCell::new(DomParsingNode{
parent: Some(Rc::clone(&data.current_node)),
@@ -222,10 +248,12 @@ impl<'a> DomParser<'a> {
meta,
default_meta,
sort,
+ anchor_ids: node_anchors,
current_value: String::default(),
weight,
}));
+ drop(parent_node);
data.current_node = Rc::clone(&node);
node
};
@@ -462,9 +490,22 @@ impl<'a> DomParser<'a> {
})},
// Slap any text we encounter inside the body into the current node's current value
enclose! { (data) text!(&options.root_selector, move |el| {
- let data = data.borrow_mut();
+ let mut data = data.borrow_mut();
let mut node = data.current_node.borrow_mut();
- node.current_value.push_str(el.as_str());
+ let element_text = el.as_str();
+ node.current_value.push_str(element_text);
+
+ if node.anchor_ids.is_some() {
+ let anchor_ids = node.anchor_ids.clone().unwrap();
+ drop(node);
+ for anchor_id in anchor_ids {
+ if let Some(anchor_text) = data.anchor_content.get_mut(&anchor_id) {
+ anchor_text.push_str(element_text);
+ } else {
+ data.anchor_content.insert(anchor_id, element_text.to_string());
+ }
+ }
+ }
Ok(())
})},
],
@@ -543,6 +584,7 @@ impl<'a> DomParser<'a> {
filters: data.filters,
sort: data.sort,
meta: data.default_meta,
+ anchor_content: data.anchor_content,
has_custom_body: node.status == NodeStatus::ParentOfBody,
force_inclusion: false,
has_html_element: data.has_html_element,
@@ -554,15 +596,6 @@ impl<'a> DomParser<'a> {
}
}
-fn normalize_content(content: &str) -> String {
- let content = html_escape::decode_html_entities(content);
- let content = TRIM_NEWLINES.replace_all(&content, "");
- let content = NEWLINES.replace_all(&content, " ");
- let content = EXTRANEOUS_SPACES.replace_all(&content, " ");
-
- content.to_string()
-}
-
fn parse_attr_string(input: String, el: &Element) -> Vec<String> {
if let Some((attrs, literal)) = input.split_once(':') {
let mut attrs = parse_attr_string(attrs.to_owned(), el);
@@ -606,14 +639,6 @@ impl DomParsingNode {
mod tests {
use super::*;
- #[test]
- fn normalizing_content() {
- let input = "\nHello Wor\n ld? \n \n";
- let output = normalize_content(input);
-
- assert_eq!(&output, "Hello Wor ld?");
- }
-
#[test]
fn get_filter_from_node() {
let mut node = DomParsingNode::default();
@@ -685,7 +710,7 @@ mod tests {
assert_eq!(
data.digest,
- "Sentence one. ___PAGEFIND_ANCHOR___br:break ___PAGEFIND_ANCHOR___p:pid Sentence two."
+ "Sentence one. ___PAGEFIND_ANCHOR___br:0:break ___PAGEFIND_ANCHOR___p:1:pid Sentence two."
)
}
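
Note: the propagation rule is the interesting part of this file: a child node inherits its parent's pending `anchor_ids` only when its tag is in `INLINE_SELECTORS`, and then appends its own id if it has one; block-level children start fresh. That is what makes `<span id="span_nested">` feed both its own anchor and the enclosing `<p id="p_nested_ids">`, while the inner `<div>` of `double_div` contributes nothing to the outer anchor. A minimal model of the decision (standalone, not the parser itself; `INLINE` stands in for `INLINE_SELECTORS`):

```rust
// Minimal model of anchor-id inheritance, not the real parser.
const INLINE: &[&str] = &["a", "b", "em", "i", "span", "strong"];

fn child_anchor_ids(
    parent_ids: Option<&Vec<String>>,
    child_tag: &str,
    child_anchor_id: Option<String>, // Some(..) when the child element has an `id`
) -> Option<Vec<String>> {
    // Inline children keep feeding their ancestors' anchors; block children do not.
    let mut ids = if INLINE.contains(&child_tag) {
        parent_ids.cloned()
    } else {
        None
    };
    if let Some(own) = child_anchor_id {
        ids.get_or_insert_with(Vec::new).push(own);
    }
    ids
}

fn main() {
    let parent = Some(vec!["0:p_nested_ids".to_string()]);
    // <span id="span_nested"> inside the p: text collects under both ids.
    assert_eq!(
        child_anchor_ids(parent.as_ref(), "span", Some("1:span_nested".into())),
        Some(vec!["0:p_nested_ids".into(), "1:span_nested".into()])
    );
    // A nested <div> inherits nothing: divs only take from the top level.
    assert_eq!(child_anchor_ids(parent.as_ref(), "div", None), None);
}
```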
diff --git a/pagefind/src/fragments/mod.rs b/pagefind/src/fragments/mod.rs
index f98c1da0..d280c66a 100644
--- a/pagefind/src/fragments/mod.rs
+++ b/pagefind/src/fragments/mod.rs
@@ -6,7 +6,7 @@ use serde::Serialize;
pub struct PageAnchorData {
pub element: String,
pub id: String,
- pub text: Option<String>,
+ pub text: String,
pub location: u32,
}
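
Note: with `text` now a plain `String`, every serialized anchor carries a `text` field, empty rather than absent when an element has no extractable text (the `ul#list` case above). A small sketch of the resulting JSON, assuming `serde_json` alongside the `Serialize` derive the struct already has:

```rust
use serde::Serialize;

#[derive(Serialize)]
pub struct PageAnchorData {
    pub element: String,
    pub id: String,
    pub text: String,
    pub location: u32,
}

fn main() {
    let anchor = PageAnchorData {
        element: "ul".into(),
        id: "list".into(),
        text: String::new(), // empty, never missing, after this change
        location: 4,
    };
    // Prints: {"element":"ul","id":"list","text":"","location":4}
    println!("{}", serde_json::to_string(&anchor).unwrap());
}
```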
diff --git a/pagefind/src/service/mod.rs b/pagefind/src/service/mod.rs
index 90381ed7..6fc23797 100644
--- a/pagefind/src/service/mod.rs
+++ b/pagefind/src/service/mod.rs
@@ -4,6 +4,7 @@ use std::{
};
use base64::{engine::general_purpose, Engine as _};
+use hashbrown::HashMap;
use rust_patch::Patch;
use tokio::sync::mpsc;
@@ -190,6 +191,7 @@ pub async fn run_service() {
filters: filters.unwrap_or_default(),
sort: sort.unwrap_or_default(),
meta: meta.unwrap_or_default(),
+ anchor_content: HashMap::new(),
has_custom_body: false,
force_inclusion: true,
has_html_element: true,