Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Return top-level text alongside page anchors in fragments #369

Merged
merged 3 commits into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 65 additions & 17 deletions pagefind/features/anchors.feature
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,36 @@ Feature: Anchors
| PAGEFIND_SOURCE | public |
Given I have a "public/index.html" file with the body:
"""
<p data-search-one>Nothing</p>
<p data-search-two>Nothing</p>
<p data-search>Nothing</p>
"""
Given I have a "public/cat/index.html" file with the body:
"""
<h1 id="outer-heading">Outer Heading</h1>
<div data-pagefind-body>
<p>Hello World, from Pagefind</p>
<p>PageOne, from Pagefind</p>
<h2 id="cats">Cats</h2>
<ul>
<ul id="list">
<li>Cheeka</li>
<li id="ali">Ali</li>
<li>Theodore</li>
<li>Smudge</li>
</ul>
<h2 id="pagefind">Pagefind</h2>
<p>Hello World, again, from Pagefind</p>
<p>PageOne, again, from Pagefind</p>
</div>
<p id="outer-content">Outer Content</p>
"""
Given I have a "public/dog/index.html" file with the body:
"""
<div data-pagefind-body>
<h1 id="h1">PageTwo, from Pagefind</h1>
<p id="p_spans">Words <span>in</span> <span><span>spans</span></span> should be extracted</p>
<h2 id="h2_hrefs">Links <a href="/">should be extracted</a></h2>
<span id="span_formatted">Text that is <b>bold</b> or <i>italic</i> should be extracted</span>
<p id="p_nested_ids">Text containing <span id="span_nested">nested IDs</span> should extract both</p>
<div id="double_div">Divs containing <div>💀 he he he 💀</div> divs should only take from the top level</div>
</div>
"""
When I run my program
Then I should see "Running Pagefind" in stdout
When I serve the "public" directory
Expand All @@ -36,38 +46,76 @@ Feature: Anchors
async function() {
let pagefind = await import("/_pagefind/pagefind.js");

let searchone = await pagefind.search("hello");
let searchonedata = await searchone.results[0].data();
document.querySelector('[data-search-one]').innerText = searchonedata.locations.join(', ');
let search = await pagefind.search("pageone");
let searchdata = await search.results[0].data();
document.querySelector('[data-search]').innerText = searchdata.locations.join(', ');
}
"""
Then There should be no logs
Then The selector "[data-search-one]" should contain "0, 10"
Then The selector "[data-search]" should contain "0, 9"

Scenario: Pagefind returns full content without anchors
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");

let searchone = await pagefind.search("hello");
let searchonedata = await searchone.results[0].data();
document.querySelector('[data-search-one]').innerText = searchonedata.content;
let search = await pagefind.search("pageone");
let searchdata = await search.results[0].data();
document.querySelector('[data-search]').innerText = searchdata.content;
}
"""
Then There should be no logs
Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Cats. Cheeka. Ali. Theodore. Smudge. Pagefind. Hello World, again, from Pagefind."
Then The selector "[data-search]" should contain "PageOne, from Pagefind. Cats. Cheeka. Ali. Theodore. Smudge. Pagefind. PageOne, again, from Pagefind."

Scenario: Pagefind returns all page anchors in the fragment
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");

let searchone = await pagefind.search("hello");
let searchonedata = await searchone.results[0].data();
document.querySelector('[data-search-one]').innerText = searchonedata.anchors.map(a => `${a.element}#${a.id}: ${a.location}`).join(', ');
let search = await pagefind.search("pageone");
let searchdata = await search.results[0].data();
document.querySelector('[data-search]').innerText = searchdata.anchors.map(a => `${a.element}#${a.id}: ${a.location}`).join(', ');
}
"""
Then There should be no logs
Then The selector "[data-search]" should contain "h2#cats: 3, ul#list: 4, li#ali: 5, h2#pagefind: 8"

Scenario: Pagefind returns page anchor content in the fragment
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");

let search = await pagefind.search("pageone");
let searchdata = await search.results[0].data();
document.querySelector('[data-search]').innerText = searchdata.anchors.map(a => `#${a.id}: '${a.text}'`).join(', ');
}
"""
Then There should be no logs
Then The selector "[data-search]" should contain "#cats: 'Cats', #list: '', #ali: 'Ali', #pagefind: 'Pagefind'"

Scenario: Pagefind extracts page anchor text where it makes sense
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");

let search = await pagefind.search("pagetwo");
let searchdata = await search.results[0].data();
document.querySelector('[data-search]').innerHTML = `
<ul>
${searchdata.anchors.map(a => `<li>#${a.id}: '${a.text}'</li>`)}
</ul>
`;
}
"""
Then There should be no logs
Then The selector "[data-search-one]" should contain "h2#cats: 4, li#ali: 6, h2#pagefind: 9"
Then The selector "[data-search]>ul>li:nth-of-type(1)" should contain "#h1: 'PageTwo, from Pagefind'"
Then The selector "[data-search]>ul>li:nth-of-type(2)" should contain "#p_spans: 'Words in spans should be extracted'"
Then The selector "[data-search]>ul>li:nth-of-type(3)" should contain "#h2_hrefs: 'Links should be extracted'"
Then The selector "[data-search]>ul>li:nth-of-type(4)" should contain "#span_formatted: 'Text that is bold or italic should be extracted'"
Then The selector "[data-search]>ul>li:nth-of-type(5)" should contain "#p_nested_ids: 'Text containing nested IDs should extract both'"
Then The selector "[data-search]>ul>li:nth-of-type(6)" should contain "#span_nested: 'nested IDs'"
Then The selector "[data-search]>ul>li:nth-of-type(7)" should contain "#double_div: 'Divs containing divs should only take from the top level'"
47 changes: 38 additions & 9 deletions pagefind/src/fossick/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ use parser::DomParser;
use self::parser::DomParserResult;

// Precompiled regular expressions used by this module's text handling.
lazy_static! {
// One or more consecutive line breaks (`\n` or `\r\n`).
static ref NEWLINES: Regex = Regex::new("(\n|\r\n)+").unwrap();
// A run of whitespace (newlines included) at the very start or very end of the input.
static ref TRIM_NEWLINES: Regex = Regex::new("^[\n\r\\s]+|[\n\r\\s]+$").unwrap();
// Two or more whitespace characters in a row.
static ref EXTRANEOUS_SPACES: Regex = Regex::new("\\s{2,}").unwrap();
// TODO: i18n?
// Any single character that is not a regex word character (`\w`).
static ref SPECIAL_CHARS: Regex = Regex::new("[^\\w]").unwrap();
}
Expand Down Expand Up @@ -186,7 +189,7 @@ impl Fossicker {
) -> (
String,
HashMap<String, Vec<FossickedWord>>,
Vec<(String, String, u32)>,
Vec<(String, String, String, u32)>,
) {
let mut map: HashMap<String, Vec<FossickedWord>> = HashMap::new();
let mut anchors = Vec::new();
Expand Down Expand Up @@ -230,14 +233,23 @@ impl Fossicker {

if word.chars().next() == Some('_') {
if word.starts_with("___PAGEFIND_ANCHOR___") {
if let Some((element_name, element_id)) =
if let Some((element_name, anchor_id)) =
word.replace("___PAGEFIND_ANCHOR___", "").split_once(':')
{
anchors.push((
element_name.to_string(),
element_id.to_string(),
word_index as u32,
));
let element_text = data
.anchor_content
.get(anchor_id)
.map(|t| normalize_content(t))
.unwrap_or_default();

if let Some((_, element_id)) = anchor_id.split_once(':') {
anchors.push((
element_name.to_string(),
element_id.to_string(),
normalize_content(&element_text),
word_index as u32,
));
}
}
offset_word_index += 1;
continue;
Expand Down Expand Up @@ -367,11 +379,11 @@ impl Fossicker {
word_count: word_data.len(),
anchors: anchors
.into_iter()
.map(|(element, id, location)| PageAnchorData {
.map(|(element, id, text, location)| PageAnchorData {
element,
id,
location,
text: None,
text,
})
.collect(),
},
Expand Down Expand Up @@ -404,6 +416,15 @@ fn build_url(page_url: &Path, relative_to: Option<&Path>, options: &SearchOption
format!("/{}", final_url)
}

/// Produces a compact, single-line rendering of `content`.
///
/// Steps, in order:
/// 1. HTML entities are decoded.
/// 2. Whitespace at the start and end of the text is removed (`TRIM_NEWLINES`).
/// 3. Each remaining run of line breaks becomes a single space (`NEWLINES`).
/// 4. Any run of two or more whitespace characters becomes a single space
///    (`EXTRANEOUS_SPACES`).
///
/// Returns the result as an owned `String`.
fn normalize_content(content: &str) -> String {
    let decoded = html_escape::decode_html_entities(content);
    let trimmed = TRIM_NEWLINES.replace_all(&decoded, "");
    let single_line = NEWLINES.replace_all(&trimmed, " ");

    // Collapsing runs last also tidies the double spaces that step 3 can leave behind.
    EXTRANEOUS_SPACES
        .replace_all(&single_line, " ")
        .into_owned()
}


// TODO: These language codes are duplicated with pagefind_web's Cargo.toml
fn get_stemmer(lang: &str) -> Option<Stemmer> {
match lang.split('-').next().unwrap() {
Expand Down Expand Up @@ -446,6 +467,14 @@ mod tests {

use super::*;

#[test]
fn normalizing_content() {
    // Surrounding whitespace must be stripped, and the interior
    // newline-plus-space must collapse to exactly one space.
    let normalized = normalize_content("\nHello Wor\n ld? \n \n");

    assert_eq!(normalized, "Hello Wor ld?");
}

async fn test_fossick(s: String) -> Fossicker {
std::env::set_var("PAGEFIND_SOURCE", "somewhere");
let config =
Expand Down
Loading