diff --git a/pagefind/features/anchors.feature b/pagefind/features/anchors.feature
index 73fc65d5..6434931d 100644
--- a/pagefind/features/anchors.feature
+++ b/pagefind/features/anchors.feature
@@ -5,26 +5,36 @@ Feature: Anchors
             | PAGEFIND_SOURCE | public |
         Given I have a "public/index.html" file with the body:
             """
-            <p data-search-one>Nothing</p>
-            <p data-search-two>Nothing</p>
+            <p data-search>Nothing</p>
             """
         Given I have a "public/cat/index.html" file with the body:
             """
             <h1 id="outer-heading">Outer Heading</h1>
             <div data-pagefind-body>
-                <p>Hello World, from Pagefind</p>
+                <p>PageOne, from Pagefind</p>
                 <h2 id="cats">Cats</h2>
-                <ul>
+                <ul id="list">
                     <li>Cheeka</li>
                     <li id="ali">Ali</li>
                     <li>Theodore</li>
                     <li>Smudge</li>
                 </ul>
                 <h2 id="pagefind">Pagefind</h2>
-                <p>Hello World, again, from Pagefind</p>
+                <p>PageOne, again, from Pagefind</p>
             </div>
             <p id="outer-content">Outer Content</p>
             """
+        Given I have a "public/dog/index.html" file with the body:
+            """
+            <div data-pagefind-body>
+                <h1 id="h1">PageTwo, from Pagefind</h1>
+                <p id="p_spans">Words <span>in spans</span> should be extracted</p>
+                <h2 id="h2_hrefs">Links <a href="/">should be extracted</a></h2>
+                <span id="span_formatted">Text that is <b>bold</b> or <i>italic</i> should be extracted</span>
+                <p id="p_nested_ids">Text containing <span id="span_nested">nested IDs</span> should extract both</p>
+                <div id="double_div">Divs containing <div>💀 he he he 💀</div> divs should only take from the top level</div>
+            </div>
+            """
         When I run my program
         Then I should see "Running Pagefind" in stdout
         When I serve the "public" directory
@@ -36,13 +46,13 @@ Feature: Anchors
             async function() {
                 let pagefind = await import("/_pagefind/pagefind.js");
 
-                let searchone = await pagefind.search("hello");
-                let searchonedata = await searchone.results[0].data();
-                document.querySelector('[data-search-one]').innerText = searchonedata.locations.join(', ');
+                let search = await pagefind.search("pageone");
+                let searchdata = await search.results[0].data();
+                document.querySelector('[data-search]').innerText = searchdata.locations.join(', ');
             }
             """
         Then There should be no logs
-        Then The selector "[data-search-one]" should contain "0, 10"
+        Then The selector "[data-search]" should contain "0, 9"
 
     Scenario: Pagefind returns full content without anchors
         When I evaluate:
@@ -50,13 +60,13 @@ Feature: Anchors
             async function() {
                 let pagefind = await import("/_pagefind/pagefind.js");
 
-                let searchone = await pagefind.search("hello");
-                let searchonedata = await searchone.results[0].data();
-                document.querySelector('[data-search-one]').innerText = searchonedata.content;
+                let search = await pagefind.search("pageone");
+                let searchdata = await search.results[0].data();
+                document.querySelector('[data-search]').innerText = searchdata.content;
             }
             """
         Then There should be no logs
-        Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Cats. Cheeka. Ali. Theodore. Smudge. Pagefind. Hello World, again, from Pagefind."
+        Then The selector "[data-search]" should contain "PageOne, from Pagefind. Cats. Cheeka. Ali. Theodore. Smudge. Pagefind. PageOne, again, from Pagefind."
 
     Scenario: Pagefind returns all page anchors in the fragment
         When I evaluate:
@@ -64,10 +74,49 @@ Feature: Anchors
             async function() {
                 let pagefind = await import("/_pagefind/pagefind.js");
 
-                let searchone = await pagefind.search("hello");
-                let searchonedata = await searchone.results[0].data();
-                document.querySelector('[data-search-one]').innerText = searchonedata.anchors.map(a => `${a.element}#${a.id}: ${a.location}`).join(', ');
+                let search = await pagefind.search("pageone");
+                let searchdata = await search.results[0].data();
+                document.querySelector('[data-search]').innerText = searchdata.anchors.map(a => `${a.element}#${a.id}: ${a.location}`).join(', ');
+            }
+            """
+        Then There should be no logs
+        Then The selector "[data-search]" should contain "h2#cats: 3, ul#list: 4, li#ali: 5, h2#pagefind: 8"
+
+    Scenario: Pagefind returns page anchor content in the fragment
+        When I evaluate:
+            """
+            async function() {
+                let pagefind = await import("/_pagefind/pagefind.js");
+
+                let search = await pagefind.search("pageone");
+                let searchdata = await search.results[0].data();
+                document.querySelector('[data-search]').innerText = searchdata.anchors.map(a => `#${a.id}: '${a.text}'`).join(', ');
+            }
+            """
+        Then There should be no logs
+        Then The selector "[data-search]" should contain "#cats: 'Cats', #list: '', #ali: 'Ali', #pagefind: 'Pagefind'"
+
+    Scenario: Pagefind extracts page anchor text where it makes sense
+        When I evaluate:
+            """
+            async function() {
+                let pagefind = await import("/_pagefind/pagefind.js");
+
+                let search = await pagefind.search("pagetwo");
+                let searchdata = await search.results[0].data();
+                document.querySelector('[data-search]').innerHTML = `
+                    <ul>
+                        ${searchdata.anchors.map(a => `<li>#${a.id}: '${a.text}'</li>`).join('')}
+                    </ul>
+                `;
             }
             """
         Then There should be no logs
-        Then The selector "[data-search-one]" should contain "h2#cats: 4, li#ali: 6, h2#pagefind: 9"
+        Then The selector "[data-search]>ul>li:nth-of-type(1)" should contain "#h1: 'PageTwo, from Pagefind'"
+        Then The selector "[data-search]>ul>li:nth-of-type(2)" should contain "#p_spans: 'Words in spans should be extracted'"
+        Then The selector "[data-search]>ul>li:nth-of-type(3)" should contain "#h2_hrefs: 'Links should be extracted'"
+        Then The selector "[data-search]>ul>li:nth-of-type(4)" should contain "#span_formatted: 'Text that is bold or italic should be extracted'"
+        Then The selector "[data-search]>ul>li:nth-of-type(5)" should contain "#p_nested_ids: 'Text containing nested IDs should extract both'"
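+        # An element with its own ID nested inside an anchored element also gets its own, narrower anchor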
+ """ When I run my program Then I should see "Running Pagefind" in stdout When I serve the "public" directory @@ -36,13 +46,13 @@ Feature: Anchors async function() { let pagefind = await import("/_pagefind/pagefind.js"); - let searchone = await pagefind.search("hello"); - let searchonedata = await searchone.results[0].data(); - document.querySelector('[data-search-one]').innerText = searchonedata.locations.join(', '); + let search = await pagefind.search("pageone"); + let searchdata = await search.results[0].data(); + document.querySelector('[data-search]').innerText = searchdata.locations.join(', '); } """ Then There should be no logs - Then The selector "[data-search-one]" should contain "0, 10" + Then The selector "[data-search]" should contain "0, 9" Scenario: Pagefind returns full content without anchors When I evaluate: @@ -50,13 +60,13 @@ Feature: Anchors async function() { let pagefind = await import("/_pagefind/pagefind.js"); - let searchone = await pagefind.search("hello"); - let searchonedata = await searchone.results[0].data(); - document.querySelector('[data-search-one]').innerText = searchonedata.content; + let search = await pagefind.search("pageone"); + let searchdata = await search.results[0].data(); + document.querySelector('[data-search]').innerText = searchdata.content; } """ Then There should be no logs - Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Cats. Cheeka. Ali. Theodore. Smudge. Pagefind. Hello World, again, from Pagefind." + Then The selector "[data-search]" should contain "PageOne, from Pagefind. Cats. Cheeka. Ali. Theodore. Smudge. Pagefind. PageOne, again, from Pagefind." Scenario: Pagefind returns all page anchors in the fragment When I evaluate: @@ -64,10 +74,48 @@ Feature: Anchors async function() { let pagefind = await import("/_pagefind/pagefind.js"); - let searchone = await pagefind.search("hello"); - let searchonedata = await searchone.results[0].data(); - document.querySelector('[data-search-one]').innerText = searchonedata.anchors.map(a => `${a.element}#${a.id}: ${a.location}`).join(', '); + let search = await pagefind.search("pageone"); + let searchdata = await search.results[0].data(); + document.querySelector('[data-search]').innerText = searchdata.anchors.map(a => `${a.element}#${a.id}: ${a.location}`).join(', '); + } + """ + Then There should be no logs + Then The selector "[data-search]" should contain "h2#cats: 3, ul#list: 4, li#ali: 5, h2#pagefind: 8" + + Scenario: Pagefind returns page anchor content in the fragment + When I evaluate: + """ + async function() { + let pagefind = await import("/_pagefind/pagefind.js"); + + let search = await pagefind.search("pageone"); + let searchdata = await search.results[0].data(); + document.querySelector('[data-search]').innerText = searchdata.anchors.map(a => `#${a.id}: '${a.text}'`).join(', '); + } + """ + Then There should be no logs + Then The selector "[data-search]" should contain "#cats: 'Cats', #list: '', #ali: 'Ali', #pagefind: 'Pagefind'" + + Scenario: Pagefind extracts page anchor text where it makes sense + When I evaluate: + """ + async function() { + let pagefind = await import("/_pagefind/pagefind.js"); + + let search = await pagefind.search("pagetwo"); + let searchdata = await search.results[0].data(); + document.querySelector('[data-search]').innerHTML = ` + + `; } """ Then There should be no logs - Then The selector "[data-search-one]" should contain "h2#cats: 4, li#ali: 6, h2#pagefind: 9" + Then The selector "[data-search]>ul>li:nth-of-type(1)" 
should contain "#h1: 'PageTwo, from Pagefind'" + Then The selector "[data-search]>ul>li:nth-of-type(2)" should contain "#p_spans: 'Words in spans should be extracted'" + Then The selector "[data-search]>ul>li:nth-of-type(3)" should contain "#h2_hrefs: 'Links should be extracted'" + Then The selector "[data-search]>ul>li:nth-of-type(4)" should contain "#span_formatted: 'Text that is bold or italic should be extracted'" + Then The selector "[data-search]>ul>li:nth-of-type(5)" should contain "#p_nested_ids: 'Text containing nested IDs should extract both'" + Then The selector "[data-search]>ul>li:nth-of-type(6)" should contain "#span_nested: 'nested IDs'" + Then The selector "[data-search]>ul>li:nth-of-type(7)" should contain "#double_div: 'Divs containing divs should only take from the top level'" diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index b107d130..3a771258 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -20,6 +20,9 @@ use parser::DomParser; use self::parser::DomParserResult; lazy_static! { + static ref NEWLINES: Regex = Regex::new("(\n|\r\n)+").unwrap(); + static ref TRIM_NEWLINES: Regex = Regex::new("^[\n\r\\s]+|[\n\r\\s]+$").unwrap(); + static ref EXTRANEOUS_SPACES: Regex = Regex::new("\\s{2,}").unwrap(); // TODO: i18n? static ref SPECIAL_CHARS: Regex = Regex::new("[^\\w]").unwrap(); } @@ -186,7 +189,7 @@ impl Fossicker { ) -> ( String, HashMap>, - Vec<(String, String, u32)>, + Vec<(String, String, String, u32)>, ) { let mut map: HashMap> = HashMap::new(); let mut anchors = Vec::new(); @@ -230,14 +233,23 @@ impl Fossicker { if word.chars().next() == Some('_') { if word.starts_with("___PAGEFIND_ANCHOR___") { - if let Some((element_name, element_id)) = + if let Some((element_name, anchor_id)) = word.replace("___PAGEFIND_ANCHOR___", "").split_once(':') { - anchors.push(( - element_name.to_string(), - element_id.to_string(), - word_index as u32, - )); + let element_text = data + .anchor_content + .get(anchor_id) + .map(|t| normalize_content(t)) + .unwrap_or_default(); + + if let Some((_, element_id)) = anchor_id.split_once(':') { + anchors.push(( + element_name.to_string(), + element_id.to_string(), + normalize_content(&element_text), + word_index as u32, + )); + } } offset_word_index += 1; continue; @@ -367,11 +379,11 @@ impl Fossicker { word_count: word_data.len(), anchors: anchors .into_iter() - .map(|(element, id, location)| PageAnchorData { + .map(|(element, id, text, location)| PageAnchorData { element, id, location, - text: None, + text, }) .collect(), }, @@ -404,6 +416,15 @@ fn build_url(page_url: &Path, relative_to: Option<&Path>, options: &SearchOption format!("/{}", final_url) } +fn normalize_content(content: &str) -> String { + let content = html_escape::decode_html_entities(content); + let content = TRIM_NEWLINES.replace_all(&content, ""); + let content = NEWLINES.replace_all(&content, " "); + let content = EXTRANEOUS_SPACES.replace_all(&content, " "); + + content.to_string() +} + // TODO: These language codes are duplicated with pagefind_web's Cargo.toml fn get_stemmer(lang: &str) -> Option { match lang.split('-').next().unwrap() { @@ -446,6 +467,14 @@ mod tests { use super::*; + #[test] + fn normalizing_content() { + let input = "\nHello Wor\n ld? 
\n \n"; + let output = normalize_content(input); + + assert_eq!(&output, "Hello Wor ld?"); + } + async fn test_fossick(s: String) -> Fossicker { std::env::set_var("PAGEFIND_SOURCE", "somewhere"); let config = diff --git a/pagefind/src/fossick/parser.rs b/pagefind/src/fossick/parser.rs index 8ae51762..ab1f7a31 100644 --- a/pagefind/src/fossick/parser.rs +++ b/pagefind/src/fossick/parser.rs @@ -9,10 +9,9 @@ use std::rc::Rc; use crate::SearchOptions; +use super::normalize_content; + lazy_static! { - static ref NEWLINES: Regex = Regex::new("(\n|\r\n)+").unwrap(); - static ref TRIM_NEWLINES: Regex = Regex::new("^[\n\r\\s]+|[\n\r\\s]+$").unwrap(); - static ref EXTRANEOUS_SPACES: Regex = Regex::new("\\s{2,}").unwrap(); static ref ALL_SPACES: Regex = Regex::new("\\s").unwrap(); static ref SENTENCE_CHARS: Regex = Regex::new("[\\w'\"\\)\\$\\*]").unwrap(); } @@ -24,6 +23,11 @@ lazy_static! { static ref SENTENCE_SELECTORS: Vec<&'static str> = vec!( "h1", "h2", "h3", "h4", "h5", "h6", "p", "td", "div", "ul", "li", "article", "section" ); + static ref INLINE_SELECTORS: Vec<&'static str> = vec!( + "a", "abbr", "acronym", "b", "bdo", "big", "br", "button", "cite", "code", "dfn", "em", + "i", "img", "input", "kbd", "label", "map", "object", "output", "q", "samp", "script", + "select", "small", "span", "strong", "sub", "sup", "textarea", "time", "tt", "var", + ); static ref REMOVE_SELECTORS: Vec<&'static str> = vec!( "head", "style", "script", "noscript", "label", "form", "svg", "footer", "nav", "iframe", "template" @@ -54,6 +58,7 @@ struct DomParserData { sort: HashMap, meta: HashMap, default_meta: HashMap, + anchor_content: HashMap, language: Option, has_html_element: bool, } @@ -91,6 +96,7 @@ struct DomParsingNode { meta: Option>, default_meta: Option>, weight: Option, + anchor_ids: Option>, status: NodeStatus, } @@ -101,6 +107,7 @@ pub struct DomParserResult { pub filters: HashMap>, pub sort: HashMap, pub meta: HashMap, + pub anchor_content: HashMap, pub has_custom_body: bool, pub force_inclusion: bool, // Include this page even if there is no body pub has_html_element: bool, @@ -129,6 +136,7 @@ impl<'a> DomParser<'a> { .map(|e| format!("{} {}", options.root_selector, e)) .collect::>() .join(", "); + let mut anchor_counter = 0; let rewriter = HtmlRewriter::new( Settings { @@ -170,6 +178,7 @@ impl<'a> DomParser<'a> { NodeStatus::Indexing }; + let mut anchor_id = None; if status != NodeStatus::Excluded && status != NodeStatus::Ignored { if let Some(element_id) = element_id { let parent = &data.borrow().current_node; @@ -178,7 +187,9 @@ impl<'a> DomParser<'a> { if !(parent.status == NodeStatus::ParentOfBody && status != NodeStatus::Body && status != NodeStatus::ParentOfBody) { - parent.current_value.push_str(&format!(" ___PAGEFIND_ANCHOR___{tag_name}:{element_id} ")); + parent.current_value.push_str(&format!(" ___PAGEFIND_ANCHOR___{tag_name}:{anchor_counter}:{element_id} ")); + anchor_id = Some(format!("{anchor_counter}:{element_id}")); + anchor_counter += 1; } } } @@ -210,7 +221,22 @@ impl<'a> DomParser<'a> { let node = { let mut data = data.borrow_mut(); - let parent_status = data.current_node.borrow().status; + let parent_node = data.current_node.borrow(); + let parent_status = parent_node.status; + + let mut node_anchors = if parent_node.anchor_ids.is_some() && INLINE_SELECTORS.contains(&tag_name.as_str()) { + parent_node.anchor_ids.clone() + } else { + None + }; + + if let Some(this_node_anchor_id) = anchor_id { + if let Some(existing) = node_anchors.as_mut() { + existing.push(this_node_anchor_id); + 
+    pub text: String,
     pub location: u32,
 }
diff --git a/pagefind/src/service/mod.rs b/pagefind/src/service/mod.rs
index 90381ed7..6fc23797 100644
--- a/pagefind/src/service/mod.rs
+++ b/pagefind/src/service/mod.rs
@@ -4,6 +4,7 @@ use std::{
 };
 
 use base64::{engine::general_purpose, Engine as _};
+use hashbrown::HashMap;
 use rust_patch::Patch;
 use tokio::sync::mpsc;
 
@@ -190,6 +191,7 @@ pub async fn run_service() {
                         filters: filters.unwrap_or_default(),
                         sort: sort.unwrap_or_default(),
                         meta: meta.unwrap_or_default(),
+                        anchor_content: HashMap::new(),
                         has_custom_body: false,
                         force_inclusion: true,
                         has_html_element: true,