CloudCannon · bglw · Jul 28, 2023 · Jul 27, 2023 · Jul 27, 2023 · Jul 27, 2023
diff --git a/pagefind/features/anchors.feature b/pagefind/features/anchors.feature
@@ -5,26 +5,36 @@ Feature: Anchors
             | PAGEFIND_SOURCE | public |
         Given I have a "public/index.html" file with the body:
             """
-            <p data-search-one>Nothing</p>
-            <p data-search-two>Nothing</p>
+            <p data-search>Nothing</p>
             """
         Given I have a "public/cat/index.html" file with the body:
             """
             <h1 id="outer-heading">Outer Heading</h1>
             <div data-pagefind-body>
-                <p>Hello World, from Pagefind</p>
+                <p>PageOne, from Pagefind</p>
                 <h2 id="cats">Cats</h2>
-                <ul>
+                <ul id="list">
                     <li>Cheeka</li>
                     <li id="ali">Ali</li>
                     <li>Theodore</li>
                     <li>Smudge</li>
                 </ul>
                 <h2 id="pagefind">Pagefind</h2>
-                <p>Hello World, again, from Pagefind</p>
+                <p>PageOne, again, from Pagefind</p>
             </div>
             <p id="outer-content">Outer Content</p>
             """
+        Given I have a "public/dog/index.html" file with the body:
+            """
+            <div data-pagefind-body>
+                <h1 id="h1">PageTwo, from Pagefind</h1>
+                <p id="p_spans">Words <span>in</span> <span><span>spans</span></span> should be extracted</p>
+                <h2 id="h2_hrefs">Links <a href="/">should be extracted</a></h2>
+                <span id="span_formatted">Text that is <b>bold</b> or <i>italic</i> should be extracted</span>
+                <p id="p_nested_ids">Text containing <span id="span_nested">nested IDs</span> should extract both</p>
+                <div id="double_div">Divs containing <div>💀 he he he 💀</div> divs should only take from the top level</div>
+            </div>
+            """
         When I run my program
         Then I should see "Running Pagefind" in stdout
         When I serve the "public" directory
@@ -36,38 +46,76 @@ Feature: Anchors
             async function() {
                 let pagefind = await import("/_pagefind/pagefind.js");
 
-                let searchone = await pagefind.search("hello");
-                let searchonedata = await searchone.results[0].data();
-                document.querySelector('[data-search-one]').innerText = searchonedata.locations.join(', ');
+                let search = await pagefind.search("pageone");
+                let searchdata = await search.results[0].data();
+                document.querySelector('[data-search]').innerText = searchdata.locations.join(', ');
             }
             """
         Then There should be no logs
-        Then The selector "[data-search-one]" should contain "0, 10"
+        Then The selector "[data-search]" should contain "0, 9"
 
     Scenario: Pagefind returns full content without anchors
         When I evaluate:
             """
             async function() {
                 let pagefind = await import("/_pagefind/pagefind.js");
 
-                let searchone = await pagefind.search("hello");
-                let searchonedata = await searchone.results[0].data();
-                document.querySelector('[data-search-one]').innerText = searchonedata.content;
+                let search = await pagefind.search("pageone");
+                let searchdata = await search.results[0].data();
+                document.querySelector('[data-search]').innerText = searchdata.content;
             }
             """
         Then There should be no logs
-        Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Cats. Cheeka. Ali. Theodore. Smudge. Pagefind. Hello World, again, from Pagefind."
+        Then The selector "[data-search]" should contain "PageOne, from Pagefind. Cats. Cheeka. Ali. Theodore. Smudge. Pagefind. PageOne, again, from Pagefind."
 
     Scenario: Pagefind returns all page anchors in the fragment
         When I evaluate:
             """
             async function() {
                 let pagefind = await import("/_pagefind/pagefind.js");
 
-                let searchone = await pagefind.search("hello");
-                let searchonedata = await searchone.results[0].data();
-                document.querySelector('[data-search-one]').innerText = searchonedata.anchors.map(a => `${a.element}#${a.id}: ${a.location}`).join(', ');
+                let search = await pagefind.search("pageone");
+                let searchdata = await search.results[0].data();
+                document.querySelector('[data-search]').innerText = searchdata.anchors.map(a => `${a.element}#${a.id}: ${a.location}`).join(', ');
+            }
+            """
+        Then There should be no logs
+        Then The selector "[data-search]" should contain "h2#cats: 3, ul#list: 4, li#ali: 5, h2#pagefind: 8"
+
+    Scenario: Pagefind returns page anchor content in the fragment
+        When I evaluate:
+            """
+            async function() {
+                let pagefind = await import("/_pagefind/pagefind.js");
+
+                let search = await pagefind.search("pageone");
+                let searchdata = await search.results[0].data();
+                document.querySelector('[data-search]').innerText = searchdata.anchors.map(a => `#${a.id}: '${a.text}'`).join(', ');
+            }
+            """
+        Then There should be no logs
+        Then The selector "[data-search]" should contain "#cats: 'Cats', #list: '', #ali: 'Ali', #pagefind: 'Pagefind'"
+
+    Scenario: Pagefind extracts page anchor text where it makes sense
+        When I evaluate:
+            """
+            async function() {
+                let pagefind = await import("/_pagefind/pagefind.js");
+
+                let search = await pagefind.search("pagetwo");
+                let searchdata = await search.results[0].data();
+                document.querySelector('[data-search]').innerHTML = `
+                    <ul>
+                        ${searchdata.anchors.map(a => `<li>#${a.id}: '${a.text}'</li>`)}
+                    </ul>
+                `;
             }
             """
         Then There should be no logs
-        Then The selector "[data-search-one]" should contain "h2#cats: 4, li#ali: 6, h2#pagefind: 9"
+        Then The selector "[data-search]>ul>li:nth-of-type(1)" should contain "#h1: 'PageTwo, from Pagefind'"
+        Then The selector "[data-search]>ul>li:nth-of-type(2)" should contain "#p_spans: 'Words in spans should be extracted'"
+        Then The selector "[data-search]>ul>li:nth-of-type(3)" should contain "#h2_hrefs: 'Links should be extracted'"
+        Then The selector "[data-search]>ul>li:nth-of-type(4)" should contain "#span_formatted: 'Text that is bold or italic should be extracted'"
+        Then The selector "[data-search]>ul>li:nth-of-type(5)" should contain "#p_nested_ids: 'Text containing nested IDs should extract both'"
+        Then The selector "[data-search]>ul>li:nth-of-type(6)" should contain "#span_nested: 'nested IDs'"
+        Then The selector "[data-search]>ul>li:nth-of-type(7)" should contain "#double_div: 'Divs containing divs should only take from the top level'"
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
@@ -20,6 +20,9 @@ use parser::DomParser;
 use self::parser::DomParserResult;
 
 lazy_static! {
+    static ref NEWLINES: Regex = Regex::new("(\n|\r\n)+").unwrap(); // one or more consecutive LF / CRLF line breaks
+    static ref TRIM_NEWLINES: Regex = Regex::new("^[\n\r\\s]+|[\n\r\\s]+$").unwrap(); // whitespace run at the very start or very end of the input
+    static ref EXTRANEOUS_SPACES: Regex = Regex::new("\\s{2,}").unwrap(); // two or more whitespace characters in a row
     // TODO: i18n?
     static ref SPECIAL_CHARS: Regex = Regex::new("[^\\w]").unwrap();
 }
@@ -186,7 +189,7 @@ impl Fossicker {
     ) -> (
         String,
         HashMap<String, Vec<FossickedWord>>,
-        Vec<(String, String, u32)>,
+        Vec<(String, String, String, u32)>,
     ) {
         let mut map: HashMap<String, Vec<FossickedWord>> = HashMap::new();
         let mut anchors = Vec::new();
@@ -230,14 +233,23 @@ impl Fossicker {
 
             if word.chars().next() == Some('_') {
                 if word.starts_with("___PAGEFIND_ANCHOR___") {
-                    if let Some((element_name, element_id)) =
+                    if let Some((element_name, anchor_id)) =
                         word.replace("___PAGEFIND_ANCHOR___", "").split_once(':')
                     {
-                        anchors.push((
-                            element_name.to_string(),
-                            element_id.to_string(),
-                            word_index as u32,
-                        ));
+                        // Normalized once here; a second pass would decode entities twice.
+                        let element_text = data
+                            .anchor_content
+                            .get(anchor_id)
+                            .map(|t| normalize_content(t))
+                            .unwrap_or_default();
+                        if let Some((_, element_id)) = anchor_id.split_once(':') {
+                            anchors.push((
+                                element_name.to_string(),
+                                element_id.to_string(),
+                                element_text,
+                                word_index as u32,
+                            ));
+                        }
                     }
                     offset_word_index += 1;
                     continue;
@@ -367,11 +379,11 @@ impl Fossicker {
                     word_count: word_data.len(),
                     anchors: anchors
                         .into_iter()
-                        .map(|(element, id, location)| PageAnchorData {
+                        .map(|(element, id, text, location)| PageAnchorData {
                             element,
                             id,
                             location,
-                            text: None,
+                            text,
                         })
                         .collect(),
                 },
@@ -404,6 +416,15 @@ fn build_url(page_url: &Path, relative_to: Option<&Path>, options: &SearchOption
     format!("/{}", final_url)
 }
 
+/// Decodes HTML entities, strips leading/trailing whitespace, and collapses
+/// line breaks and runs of whitespace down to single spaces.
+fn normalize_content(content: &str) -> String {
+    let decoded = html_escape::decode_html_entities(content);
+    let trimmed = TRIM_NEWLINES.replace_all(&decoded, "");
+    let single_line = NEWLINES.replace_all(&trimmed, " ");
+    EXTRANEOUS_SPACES.replace_all(&single_line, " ").into_owned()
+}
+
 // TODO: These language codes are duplicated with pagefind_web's Cargo.toml
 fn get_stemmer(lang: &str) -> Option<Stemmer> {
     match lang.split('-').next().unwrap() {
@@ -446,6 +467,14 @@ mod tests {
 
     use super::*;
 
+    #[test]
+    fn normalizing_content() {
+        // Outer whitespace is stripped; inner breaks and space runs become one space.
+        let messy = "\nHello  Wor\n ld? \n \n";
+        let expected = "Hello Wor ld?";
+        assert_eq!(normalize_content(messy), expected);
+    }
+
     async fn test_fossick(s: String) -> Fossicker {
         std::env::set_var("PAGEFIND_SOURCE", "somewhere");
         let config =