From 82de52246758e25498f55ea4cc59bb7570e9bf14 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 17:12:43 -0500 Subject: [PATCH 01/27] move functions --- fire_seq_search_server/src/load_notes/mod.rs | 29 ++----------------- .../src/markdown_parser/markdown_to_text.rs | 7 +++++ .../src/markdown_parser/mod.rs | 25 ++++++++++++++++ 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/fire_seq_search_server/src/load_notes/mod.rs b/fire_seq_search_server/src/load_notes/mod.rs index 0b0fb63..848d565 100644 --- a/fire_seq_search_server/src/load_notes/mod.rs +++ b/fire_seq_search_server/src/load_notes/mod.rs @@ -5,7 +5,7 @@ use std::process; use rayon::prelude::*; -use crate::markdown_parser::parse_to_plain_text; + pub fn read_specific_directory(path: &str) -> Vec<(String, String)> { info!("Try to read {}", &path); @@ -84,33 +84,10 @@ pub fn read_md_file_and_parse(note: &std::fs::DirEntry) -> Option<(String, Strin } }; + let content : String = crate::markdown_parser::parse_logseq_notebook(content); + - // Now we do some parsing for this file - let content: String = exclude_advanced_query(content); - let content: String = parse_to_plain_text(&content); Some((note_title.to_string(),content)) } -// https://docs.rs/regex/latest/regex/#repetitions -// https://stackoverflow.com/a/8303552/1166518 -pub fn exclude_advanced_query(md: String) -> String { - if !md.contains('#') { - return md; - } - - lazy_static! 
{ - static ref RE: Regex = Regex::new( - r"\#\+BEGIN_QUERY[\S\s]+?\#\+END_QUERY") - .unwrap(); - } - let result = RE.replace_all(&md, " "); - String::from(result) - // let mat = RE.find(&md); - // match mat { - // Some(m) => { - // todo!() - // }, - // None => md - // } -} \ No newline at end of file diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index ab49076..ac7581d 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -123,6 +123,13 @@ fn is_strikethrough(tag: &Tag) -> bool { mod tests { use super::convert; + #[test] + fn links_to_pdf() { + let markdown = r#"Refer to ![order.pdf](../assets/buy_00000_0.pdf)"#; + let expected = "Hello"; + assert_eq!(convert(markdown), expected); + } + #[test] fn basic_inline_strong() { let markdown = r#"**Hello**"#; diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index d8b161f..b45f5b4 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -1,4 +1,29 @@ mod markdown_to_text; +use regex::Regex; + +// https://docs.rs/regex/latest/regex/#repetitions +// https://stackoverflow.com/a/8303552/1166518 +fn exclude_advanced_query(md: String) -> String { + if !md.contains('#') { + return md; + } + + lazy_static! 
{ + static ref RE: Regex = Regex::new( + r"\#\+BEGIN_QUERY[\S\s]+?\#\+END_QUERY") + .unwrap(); + } + let result = RE.replace_all(&md, " "); + String::from(result) +} + +pub fn parse_logseq_notebook(md: String) -> String { + + // Now we do some parsing for this file + let content: String = exclude_advanced_query(md); + let content: String = parse_to_plain_text(&content); + content +} pub fn parse_to_plain_text(md: &str) -> String { From 2f4e4af7ff5acc193ea44151540c13acbb432867 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 17:14:12 -0500 Subject: [PATCH 02/27] fix test path --- fire_seq_search_server/src/markdown_parser/mod.rs | 2 +- fire_seq_search_server/tests/unit_test_load_notes.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index b45f5b4..289ba65 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -3,7 +3,7 @@ use regex::Regex; // https://docs.rs/regex/latest/regex/#repetitions // https://stackoverflow.com/a/8303552/1166518 -fn exclude_advanced_query(md: String) -> String { +pub fn exclude_advanced_query(md: String) -> String { if !md.contains('#') { return md; } diff --git a/fire_seq_search_server/tests/unit_test_load_notes.rs b/fire_seq_search_server/tests/unit_test_load_notes.rs index b9ed533..3f040f8 100644 --- a/fire_seq_search_server/tests/unit_test_load_notes.rs +++ b/fire_seq_search_server/tests/unit_test_load_notes.rs @@ -1,5 +1,5 @@ -use fire_seq_search_server::load_notes::{exclude_advanced_query, read_specific_directory}; -use fire_seq_search_server::markdown_parser::parse_to_plain_text; +use fire_seq_search_server::load_notes::read_specific_directory; +use fire_seq_search_server::markdown_parser::{exclude_advanced_query, parse_to_plain_text}; fn load_articles() -> Vec<(String, String)> { From 3e00fe3f7cf216a28697832b579c593cc4d9c5ba Mon 
Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 17:15:01 -0500 Subject: [PATCH 03/27] fix stub test --- fire_seq_search_server/src/load_notes/mod.rs | 1 - fire_seq_search_server/src/markdown_parser/markdown_to_text.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fire_seq_search_server/src/load_notes/mod.rs b/fire_seq_search_server/src/load_notes/mod.rs index 848d565..6211eed 100644 --- a/fire_seq_search_server/src/load_notes/mod.rs +++ b/fire_seq_search_server/src/load_notes/mod.rs @@ -1,6 +1,5 @@ use std::fs::DirEntry; use log::{debug, error, info, warn}; -use regex::Regex; use std::process; use rayon::prelude::*; diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index ac7581d..58dc831 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -126,7 +126,7 @@ mod tests { #[test] fn links_to_pdf() { let markdown = r#"Refer to ![order.pdf](../assets/buy_00000_0.pdf)"#; - let expected = "Hello"; + let expected = "Refer to order.pdf"; assert_eq!(convert(markdown), expected); } From ed41b1a8945751c07958682afd3489c34e8e1ca1 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 17:25:21 -0500 Subject: [PATCH 04/27] try delay the markdown process --- fire_seq_search_server/src/load_notes/mod.rs | 9 +++++---- .../src/markdown_parser/markdown_to_text.rs | 1 + .../markdown_parser/markdown_to_text_fireseqsearch.rs | 0 fire_seq_search_server/src/markdown_parser/mod.rs | 2 ++ 4 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 fire_seq_search_server/src/markdown_parser/markdown_to_text_fireseqsearch.rs diff --git a/fire_seq_search_server/src/load_notes/mod.rs b/fire_seq_search_server/src/load_notes/mod.rs index 6211eed..3275bc3 100644 --- a/fire_seq_search_server/src/load_notes/mod.rs +++ b/fire_seq_search_server/src/load_notes/mod.rs @@ 
-22,7 +22,7 @@ pub fn read_specific_directory(path: &str) -> Vec<(String, String)> { } // debug!("Note titles: {:?}", &note_filenames); let result: Vec<(String,String)> = note_filenames.par_iter() - .map(|note| read_md_file_and_parse(&note)) + .map(|note| read_md_file_wo_parse(&note)) .filter(|x| (&x).is_some()) .map(|x| x.unwrap()) .collect(); @@ -44,11 +44,12 @@ pub fn read_specific_directory(path: &str) -> Vec<(String, String)> { /// returns: Option<(String, String)> /// /// First: title (filename) -/// Second: full text (parsed) +/// Second: full raw text /// +/// I would delay the parsing job, so it could be couples with server info. -Zhenbo Li 2023-02-17 /// If input is a directory or DS_STORE, return None /// -pub fn read_md_file_and_parse(note: &std::fs::DirEntry) -> Option<(String, String)> { +pub fn read_md_file_wo_parse(note: &std::fs::DirEntry) -> Option<(String, String)> { if let Ok(file_type) = note.file_type() { // Now let's show our entry's file type! debug!("{:?}: {:?}", note.path(), file_type); @@ -83,7 +84,7 @@ pub fn read_md_file_and_parse(note: &std::fs::DirEntry) -> Option<(String, Strin } }; - let content : String = crate::markdown_parser::parse_logseq_notebook(content); + // let content : String = crate::markdown_parser::parse_logseq_notebook(content); diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index 58dc831..af8beb7 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -27,6 +27,7 @@ use pulldown_cmark::{Event, Options, Parser, Tag}; +// pub fn convert_from_logseq #[must_use] pub fn convert(markdown: &str) -> String { // GFM tables and tasks lists are not enabled. 
diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text_fireseqsearch.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text_fireseqsearch.rs new file mode 100644 index 0000000..e69de29 diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index 289ba65..719b068 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -1,4 +1,6 @@ mod markdown_to_text; +mod markdown_to_text_fireseqsearch; + use regex::Regex; // https://docs.rs/regex/latest/regex/#repetitions From 35d72ce157221013cd2abd6976a4937732b8dc3e Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 17:33:54 -0500 Subject: [PATCH 05/27] move parser place --- .../src/markdown_parser/markdown_to_text.rs | 4 +++- fire_seq_search_server/src/markdown_parser/mod.rs | 2 +- fire_seq_search_server/src/query_engine/mod.rs | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index af8beb7..e02175e 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -27,7 +27,9 @@ use pulldown_cmark::{Event, Options, Parser, Tag}; -// pub fn convert_from_logseq +pub fn convert_from_logseq(markdown:&str, parse_pdf: bool) -> String { + todo!() +} #[must_use] pub fn convert(markdown: &str) -> String { // GFM tables and tasks lists are not enabled. 
diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index 719b068..6c23ea7 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -19,7 +19,7 @@ pub fn exclude_advanced_query(md: String) -> String { String::from(result) } -pub fn parse_logseq_notebook(md: String) -> String { +pub fn parse_logseq_notebook(md: String, parse_pdf: bool) -> String { // Now we do some parsing for this file let content: String = exclude_advanced_query(md); diff --git a/fire_seq_search_server/src/query_engine/mod.rs b/fire_seq_search_server/src/query_engine/mod.rs index e0f524b..89de778 100644 --- a/fire_seq_search_server/src/query_engine/mod.rs +++ b/fire_seq_search_server/src/query_engine/mod.rs @@ -4,6 +4,8 @@ use log::{info, warn}; use crate::{decode_cjk_str, JiebaTokenizer}; use crate::load_notes::read_specific_directory; use crate::post_query::post_query_wrapper; +use rayon::prelude::*; +use crate::markdown_parser::parse_logseq_notebook; #[derive(Debug, Clone, serde::Serialize)] pub struct ServerInformation { @@ -125,6 +127,12 @@ fn indexing_documents(server_info: &ServerInformation, document_setting: &Docume let title = schema.get_field("title").unwrap(); let body = schema.get_field("body").unwrap(); + let pages: Vec<(String, String)> = read_specific_directory(&pages_path).par_iter() + .map(|(title,md)| { + let content = parse_logseq_notebook(md.to_string(), false); + (title.to_string(), content) + }).collect(); + for (file_name, contents) in read_specific_directory(&pages_path) { // let note_title = process_note_title(file_name, &server_info); index_writer.add_document( From e9f277e23b4a362df5f5cadabf2fdd6eceaa3f06 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 17:44:45 -0500 Subject: [PATCH 06/27] use cow to avoid copy --- fire_seq_search_server/src/markdown_parser/mod.rs | 12 ++++++------ fire_seq_search_server/src/query_engine/mod.rs 
| 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index 6c23ea7..cb470b1 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -5,9 +5,9 @@ use regex::Regex; // https://docs.rs/regex/latest/regex/#repetitions // https://stackoverflow.com/a/8303552/1166518 -pub fn exclude_advanced_query(md: String) -> String { +pub fn exclude_advanced_query<'a>(md: &'a str) -> std::borrow::Cow<'a, str> { if !md.contains('#') { - return md; + return std::borrow::Cow::Borrowed(md); } lazy_static! { @@ -15,14 +15,14 @@ pub fn exclude_advanced_query(md: String) -> String { r"\#\+BEGIN_QUERY[\S\s]+?\#\+END_QUERY") .unwrap(); } - let result = RE.replace_all(&md, " "); - String::from(result) + // return RE.replace_all(&md, " ") + return RE.replace_all(&md, " "); } -pub fn parse_logseq_notebook(md: String, parse_pdf: bool) -> String { +pub fn parse_logseq_notebook(md: &str, parse_pdf: bool) -> String { // Now we do some parsing for this file - let content: String = exclude_advanced_query(md); + let content = exclude_advanced_query(md); let content: String = parse_to_plain_text(&content); content } diff --git a/fire_seq_search_server/src/query_engine/mod.rs b/fire_seq_search_server/src/query_engine/mod.rs index 89de778..ca53538 100644 --- a/fire_seq_search_server/src/query_engine/mod.rs +++ b/fire_seq_search_server/src/query_engine/mod.rs @@ -129,7 +129,7 @@ fn indexing_documents(server_info: &ServerInformation, document_setting: &Docume let pages: Vec<(String, String)> = read_specific_directory(&pages_path).par_iter() .map(|(title,md)| { - let content = parse_logseq_notebook(md.to_string(), false); + let content = parse_logseq_notebook(md, false); (title.to_string(), content) }).collect(); From f5674250a957812ced5f566c1bd39b0c95e7884f Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 
17:45:59 -0500 Subject: [PATCH 07/27] remove lifetime --- fire_seq_search_server/src/markdown_parser/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index cb470b1..603b44d 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -5,7 +5,7 @@ use regex::Regex; // https://docs.rs/regex/latest/regex/#repetitions // https://stackoverflow.com/a/8303552/1166518 -pub fn exclude_advanced_query<'a>(md: &'a str) -> std::borrow::Cow<'a, str> { +pub fn exclude_advanced_query(md: &str) -> std::borrow::Cow<str> { if !md.contains('#') { return std::borrow::Cow::Borrowed(md); } @@ -20,7 +20,6 @@ pub fn exclude_advanced_query<'a>(md: &'a str) -> std::borrow::Cow<'a, str> { } pub fn parse_logseq_notebook(md: &str, parse_pdf: bool) -> String { - // Now we do some parsing for this file let content = exclude_advanced_query(md); let content: String = parse_to_plain_text(&content); From 93bc0fa248a0ba547693ad6d5bb327956f17233d Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 17:47:34 -0500 Subject: [PATCH 08/27] hack char --- fire_seq_search_server/src/markdown_parser/mod.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index 603b44d..d07d653 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -1,13 +1,14 @@ mod markdown_to_text; mod markdown_to_text_fireseqsearch; +use std::borrow::Cow; use regex::Regex; // https://docs.rs/regex/latest/regex/#repetitions // https://stackoverflow.com/a/8303552/1166518 -pub fn exclude_advanced_query(md: &str) -> std::borrow::Cow<str> { +pub fn exclude_advanced_query(md: &str) -> Cow<str> { if !md.contains('#') { - return 
Cow::Borrowed(md); } lazy_static! { @@ -19,17 +20,22 @@ pub fn exclude_advanced_query(md: &str) -> std::borrow::Cow<str> { return RE.replace_all(&md, " "); } +fn hack_specific_chars_cow(text: Cow<str>) -> String { + //https://www.compart.com/en/unicode/U+2022 + let bullet = char::from_u32(0x00002022).unwrap(); + text.replace(bullet, " ") +} + pub fn parse_logseq_notebook(md: &str, parse_pdf: bool) -> String { // Now we do some parsing for this file let content = exclude_advanced_query(md); + let content = hack_specific_chars_cow(content); let content: String = parse_to_plain_text(&content); content } pub fn parse_to_plain_text(md: &str) -> String { - - let plain_text: String = markdown_to_text::convert(&md); let plain_text = hack_specific_chars(plain_text); From 66872d19b40384a185a35c66109c03f295d55095 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 18:02:09 -0500 Subject: [PATCH 09/27] pass server_info --- .../src/markdown_parser/markdown_to_text.rs | 42 ++++++++++++++++++- .../src/markdown_parser/mod.rs | 5 ++- .../src/query_engine/mod.rs | 5 ++- 3 files changed, 46 insertions(+), 6 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index e02175e..a8555d4 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -25,11 +25,46 @@ #![warn(clippy::all, clippy::pedantic)] +use log::warn; use pulldown_cmark::{Event, Options, Parser, Tag}; +use crate::query_engine::ServerInformation; -pub fn convert_from_logseq(markdown:&str, parse_pdf: bool) -> String { - todo!() +pub fn convert_from_logseq(markdown:&str, server_info: &ServerInformation) -> String { + let mut options = Options::empty(); + options.insert(Options::ENABLE_STRIKETHROUGH); + + let parser = Parser::new_ext(&markdown, options); + let mut tags_stack = Vec::new(); + let mut buffer = String::new(); + + // For each 
event we push into the buffer to produce the plain text version. + for event in parser { + println!("{:?}", &event); + match event { + // The start and end events don't contain the text inside the tag. That's handled by the `Event::Text` arm. + // However, pdf is considered as Image, and will be specially handled when parsing end tag + Event::Start(tag) => { + start_tag(&tag, &mut buffer, &mut tags_stack); + tags_stack.push(tag); + } + Event::End(tag) => { + tags_stack.pop(); + end_tag(&tag, &mut buffer, &tags_stack); + + } + Event::Text(content) => { + if !tags_stack.iter().any(is_strikethrough) { + buffer.push_str(&content) + } + } + Event::Code(content) => buffer.push_str(&content), + Event::SoftBreak => buffer.push(' '), + _ => (), + } + } + buffer.trim().to_string() } + #[must_use] pub fn convert(markdown: &str) -> String { // GFM tables and tasks lists are not enabled. @@ -125,12 +160,15 @@ fn is_strikethrough(tag: &Tag) -> bool { #[cfg(test)] mod tests { use super::convert; + use super::convert_from_logseq; #[test] fn links_to_pdf() { let markdown = r#"Refer to ![order.pdf](../assets/buy_00000_0.pdf)"#; let expected = "Refer to order.pdf"; assert_eq!(convert(markdown), expected); + + let _a = convert_from_logseq(markdown, true); } #[test] diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index d07d653..653d553 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -3,6 +3,7 @@ mod markdown_to_text_fireseqsearch; use std::borrow::Cow; use regex::Regex; +use crate::query_engine::ServerInformation; // https://docs.rs/regex/latest/regex/#repetitions // https://stackoverflow.com/a/8303552/1166518 @@ -26,11 +27,11 @@ fn hack_specific_chars_cow(text: Cow<str>) -> String { text.replace(bullet, " ") } -pub fn parse_logseq_notebook(md: &str, parse_pdf: bool) -> String { +pub fn parse_logseq_notebook(md: &str, server_info: &ServerInformation) 
-> String { // Now we do some parsing for this file let content = exclude_advanced_query(md); let content = hack_specific_chars_cow(content); - let content: String = parse_to_plain_text(&content); + let content: String = markdown_to_text::convert_from_logseq(&content, server_info); content } diff --git a/fire_seq_search_server/src/query_engine/mod.rs b/fire_seq_search_server/src/query_engine/mod.rs index ca53538..8792d74 100644 --- a/fire_seq_search_server/src/query_engine/mod.rs +++ b/fire_seq_search_server/src/query_engine/mod.rs @@ -127,13 +127,14 @@ fn indexing_documents(server_info: &ServerInformation, document_setting: &Docume let title = schema.get_field("title").unwrap(); let body = schema.get_field("body").unwrap(); + //TODO always parse pdf let pages: Vec<(String, String)> = read_specific_directory(&pages_path).par_iter() .map(|(title,md)| { - let content = parse_logseq_notebook(md, false); + let content = parse_logseq_notebook(md, server_info); (title.to_string(), content) }).collect(); - for (file_name, contents) in read_specific_directory(&pages_path) { + for (file_name, contents) in pages { // let note_title = process_note_title(file_name, &server_info); index_writer.add_document( tantivy::doc!{ title => file_name, body => contents} From 06c8c8c120a87938472acac24d8dd692d25adf05 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 18:06:24 -0500 Subject: [PATCH 10/27] add parse info to parse_pdf_links --- fire_seq_search_server/src/lib.rs | 1 + fire_seq_search_server/src/main.rs | 1 + .../src/markdown_parser/markdown_to_text.rs | 6 +++--- fire_seq_search_server/src/post_query/hit_parsed.rs | 8 ++++---- fire_seq_search_server/src/query_engine/mod.rs | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fire_seq_search_server/src/lib.rs b/fire_seq_search_server/src/lib.rs index 467efe4..15796d2 100644 --- a/fire_seq_search_server/src/lib.rs +++ b/fire_seq_search_server/src/lib.rs @@ -170,6 +170,7 @@ pub fn 
generate_server_info_for_test() -> ServerInformation { enable_journal_query: false, show_top_hits: 0, show_summary_single_line_chars_limit: 0, + parse_pdf_links: false, obsidian_md: false, convert_underline_hierarchy: true }; diff --git a/fire_seq_search_server/src/main.rs b/fire_seq_search_server/src/main.rs index 55fa927..dca5c61 100644 --- a/fire_seq_search_server/src/main.rs +++ b/fire_seq_search_server/src/main.rs @@ -105,6 +105,7 @@ fn build_server_info(args: Cli) -> ServerInformation { show_top_hits: args.show_top_hits, show_summary_single_line_chars_limit: args.show_summary_single_line_chars_limit, + parse_pdf_links: true, // FIXME hardcode obsidian_md: args.obsidian_md, convert_underline_hierarchy: true, } diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index a8555d4..58ae0d4 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -39,7 +39,7 @@ pub fn convert_from_logseq(markdown:&str, server_info: &ServerInformation) -> St // For each event we push into the buffer to produce the plain text version. for event in parser { - println!("{:?}", &event); + // println!("{:?}", &event); match event { // The start and end events don't contain the text inside the tag. That's handled by the `Event::Text` arm. 
// However, pdf is considered as Image, and will be specially handled when parsing end tag @@ -50,7 +50,7 @@ pub fn convert_from_logseq(markdown:&str, server_info: &ServerInformation) -> St Event::End(tag) => { tags_stack.pop(); end_tag(&tag, &mut buffer, &tags_stack); - + // if server_info } Event::Text(content) => { if !tags_stack.iter().any(is_strikethrough) { @@ -168,7 +168,7 @@ mod tests { let expected = "Refer to order.pdf"; assert_eq!(convert(markdown), expected); - let _a = convert_from_logseq(markdown, true); + // let _a = convert_from_logseq(markdown, true); } #[test] diff --git a/fire_seq_search_server/src/post_query/hit_parsed.rs b/fire_seq_search_server/src/post_query/hit_parsed.rs index 8e4f242..8c47bc5 100644 --- a/fire_seq_search_server/src/post_query/hit_parsed.rs +++ b/fire_seq_search_server/src/post_query/hit_parsed.rs @@ -88,10 +88,10 @@ mod test_serde { metadata: String::from("meta") } } - fn serde(title: &str) -> String { - let h = get_parsed_hit(title); - h.serde_to_string() - } + // fn serde(title: &str) -> String { + // let h = get_parsed_hit(title); + // h.serde_to_string() + // } // TODO: This solution is buggy. Consider PR#100, which might be a better idea. 
-Zli, 2023-Jan // This test disabled on 2023-Feb-02 for PR #112 diff --git a/fire_seq_search_server/src/query_engine/mod.rs b/fire_seq_search_server/src/query_engine/mod.rs index 8792d74..29f3d8e 100644 --- a/fire_seq_search_server/src/query_engine/mod.rs +++ b/fire_seq_search_server/src/query_engine/mod.rs @@ -14,7 +14,7 @@ pub struct ServerInformation { pub enable_journal_query: bool, pub show_top_hits: usize, pub show_summary_single_line_chars_limit: usize, - + pub parse_pdf_links: bool, pub obsidian_md: bool, From 993ffaa1214647fae223471988dc927c00e57cc3 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 18:21:03 -0500 Subject: [PATCH 11/27] trying to locate the pdf file --- .../src/markdown_parser/markdown_to_text.rs | 35 ++++++++++++++++--- .../src/post_query/hit_parsed.rs | 22 ++++++------ 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index 58ae0d4..76aa349 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -25,7 +25,7 @@ #![warn(clippy::all, clippy::pedantic)] -use log::warn; +use log::{debug, warn}; use pulldown_cmark::{Event, Options, Parser, Tag}; use crate::query_engine::ServerInformation; @@ -50,7 +50,13 @@ pub fn convert_from_logseq(markdown:&str, server_info: &ServerInformation) -> St Event::End(tag) => { tags_stack.pop(); end_tag(&tag, &mut buffer, &tags_stack); - // if server_info + if server_info.parse_pdf_links { + let pdf_str = try_parse_pdf(&tag, server_info); + match pdf_str { + Some(s) => buffer.push_str(&s), + None => () + } + } } Event::Text(content) => { if !tags_stack.iter().any(is_strikethrough) { @@ -65,6 +71,22 @@ pub fn convert_from_logseq(markdown:&str, server_info: &ServerInformation) -> St buffer.trim().to_string() } +fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> 
Option<String> { + + let destination_uri = match tag { + Tag::Image(link_type, destination_uri, title) => { + if !destination_uri.ends_with(".pdf") { + return None; + } + debug!("Trying to parse PDF {:?}", tag); + println!("{:?}", &tag); + destination_uri + }, + _ => {return None;} + }; + None +} + #[must_use] pub fn convert(markdown: &str) -> String { // GFM tables and tasks lists are not enabled. @@ -159,16 +181,21 @@ fn is_strikethrough(tag: &Tag) -> bool { #[cfg(test)] mod tests { + use crate::generate_server_info_for_test; use super::convert; use super::convert_from_logseq; #[test] fn links_to_pdf() { - let markdown = r#"Refer to ![order.pdf](../assets/buy_00000_0.pdf)"#; + let markdown = r#"Refer to ![order.pdf](../assets/readings_1634910859348_0.pdf)"#; let expected = "Refer to order.pdf"; assert_eq!(convert(markdown), expected); - // let _a = convert_from_logseq(markdown, true); + let mut info = generate_server_info_for_test(); + info.notebook_path = "C:\\Users\\z2369li\\Nextcloud\\logseq_notebook".to_string(); + info.parse_pdf_links = true; + println!("{:?}", &info); + let _a = convert_from_logseq(markdown, &info); } #[test] diff --git a/fire_seq_search_server/src/post_query/hit_parsed.rs b/fire_seq_search_server/src/post_query/hit_parsed.rs index 8c47bc5..d475d37 100644 --- a/fire_seq_search_server/src/post_query/hit_parsed.rs +++ b/fire_seq_search_server/src/post_query/hit_parsed.rs @@ -77,17 +77,17 @@ mod test_serde { use crate::post_query::logseq_uri::generate_logseq_uri; - fn get_parsed_hit(title: &str) -> FireSeqSearchHitParsed { - let server_info = generate_server_info_for_test(); - let logseq_uri = generate_logseq_uri(title, &true, &server_info); - FireSeqSearchHitParsed{ - title: title.to_owned(), - summary: String::from("summary"), - score: 1.0, - logseq_uri, - metadata: String::from("meta") - } - } + // fn get_parsed_hit(title: &str) -> FireSeqSearchHitParsed { + // let server_info = generate_server_info_for_test(); + // let logseq_uri = 
generate_logseq_uri(title, &true, &server_info); + // FireSeqSearchHitParsed{ + // title: title.to_owned(), + // summary: String::from("summary"), + // score: 1.0, + // logseq_uri, + // metadata: String::from("meta") + // } + // } // fn serde(title: &str) -> String { // let h = get_parsed_hit(title); // h.serde_to_string() From 690f851be1fce1899e1a9c2fbc47493e5b9ea693 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 19:20:28 -0500 Subject: [PATCH 12/27] check file --- .../src/markdown_parser/markdown_to_text.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index 76aa349..2588726 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -25,7 +25,8 @@ #![warn(clippy::all, clippy::pedantic)] -use log::{debug, warn}; +use std::path::Path; +use log::{debug, error, warn}; use pulldown_cmark::{Event, Options, Parser, Tag}; use crate::query_engine::ServerInformation; @@ -80,10 +81,21 @@ fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option<String> { } debug!("Trying to parse PDF {:?}", tag); println!("{:?}", &tag); - destination_uri + destination_uri.replace("../", "") }, _ => {return None;} }; + + let path = Path::new(&server_info.notebook_path); + let pdf_path = path.join(destination_uri); + println!("{:?}, {:?}", &pdf_path, pdf_path.is_file()); + if !pdf_path.is_file() { + error!("pdf_path is not a file, skipping {:?}", &pdf_path); + return None; + } + + + None } From ec403df76148f64a25ae8b5a6a8b8a786a12dfea Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 19:33:38 -0500 Subject: [PATCH 13/27] move to new file --- fire_seq_search_server/Cargo.toml | 15 ++++++--- .../src/markdown_parser/markdown_to_text.rs | 26 +-------------- .../src/markdown_parser/mod.rs | 1 + 
.../src/markdown_parser/pdf_parser.rs | 33 +++++++++++++++++++ 4 files changed, 45 insertions(+), 30 deletions(-) create mode 100644 fire_seq_search_server/src/markdown_parser/pdf_parser.rs diff --git a/fire_seq_search_server/Cargo.toml b/fire_seq_search_server/Cargo.toml index d5abda0..ab7614f 100644 --- a/fire_seq_search_server/Cargo.toml +++ b/fire_seq_search_server/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fire_seq_search_server" -version = "0.1.1" +version = "0.1.2" edition = "2021" license = "MIT" @@ -26,13 +26,18 @@ env_logger = "0.9.0" # Rust clap = { version = "4.0", features = ["derive"] } - +lazy_static = "1.4.0" +rayon = "1.5" urlencoding = "2.1.0" jieba-rs = { version = "0.6.6" } -lazy_static = "1.4.0" -rayon = "1.5" + + stopwords = "0.1.1" -pulldown-cmark = { version = "0.9.2", default-features = false } + regex = "1" lingua = { version = "1.4.0", default-features = false, features = ["chinese", "english"] } + +# Parsing +pulldown-cmark = { version = "0.9.2", default-features = false } +pdf-extract = "0.6.4" diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index 2588726..e4cbfa9 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -28,6 +28,7 @@ use std::path::Path; use log::{debug, error, warn}; use pulldown_cmark::{Event, Options, Parser, Tag}; +use crate::markdown_parser::pdf_parser::try_parse_pdf; use crate::query_engine::ServerInformation; pub fn convert_from_logseq(markdown:&str, server_info: &ServerInformation) -> String { @@ -72,32 +73,7 @@ pub fn convert_from_logseq(markdown:&str, server_info: &ServerInformation) -> St buffer.trim().to_string() } -fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option<String> { - let destination_uri = match tag { - Tag::Image(link_type, destination_uri, title) => { - if !destination_uri.ends_with(".pdf") { - return 
None; - } - debug!("Trying to parse PDF {:?}", tag); - println!("{:?}", &tag); - destination_uri.replace("../", "") - }, - _ => {return None;} - }; - - let path = Path::new(&server_info.notebook_path); - let pdf_path = path.join(destination_uri); - println!("{:?}, {:?}", &pdf_path, pdf_path.is_file()); - if !pdf_path.is_file() { - error!("pdf_path is not a file, skipping {:?}", &pdf_path); - return None; - } - - - - None -} #[must_use] pub fn convert(markdown: &str) -> String { diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index 653d553..d29d3c2 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -1,5 +1,6 @@ mod markdown_to_text; mod markdown_to_text_fireseqsearch; +mod pdf_parser; use std::borrow::Cow; use regex::Regex; diff --git a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs new file mode 100644 index 0000000..2d5e8a5 --- /dev/null +++ b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs @@ -0,0 +1,33 @@ +use std::path::Path; +use log::{debug, error}; +use pulldown_cmark::Tag; +use crate::query_engine::ServerInformation; + +pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option { + + let destination_uri = match tag { + Tag::Image(link_type, destination_uri, title) => { + if !destination_uri.ends_with(".pdf") { + return None; + } + debug!("Trying to parse PDF {:?}", tag); + println!("{:?}", &tag); + destination_uri.replace("../", "") + }, + _ => {return None;} + }; + + let path = Path::new(&server_info.notebook_path); + let pdf_path = path.join(destination_uri); + println!("{:?}, {:?}", &pdf_path, pdf_path.is_file()); + if !pdf_path.is_file() { + error!("pdf_path is not a file, skipping {:?}", &pdf_path); + return None; + } + + // use lopdf::*; + // let doc = pdf_extract::Document::load(pdf_path).unwrap(); + + + None +} \ 
No newline at end of file From 4e33e57c137fa0368c1c12298e0b3c51eef9088f Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 20:14:05 -0500 Subject: [PATCH 14/27] import pdf --- fire_seq_search_server/Cargo.toml | 1 + fire_seq_search_server/src/markdown_parser/pdf_parser.rs | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fire_seq_search_server/Cargo.toml b/fire_seq_search_server/Cargo.toml index ab7614f..cede93a 100644 --- a/fire_seq_search_server/Cargo.toml +++ b/fire_seq_search_server/Cargo.toml @@ -41,3 +41,4 @@ lingua = { version = "1.4.0", default-features = false, features = ["chinese", " # Parsing pulldown-cmark = { version = "0.9.2", default-features = false } pdf-extract = "0.6.4" +lopdf = { version = "0.29", default-features = false, features = [ "pom_parser" ] } \ No newline at end of file diff --git a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs index 2d5e8a5..fc05bf0 100644 --- a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs +++ b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs @@ -3,6 +3,11 @@ use log::{debug, error}; use pulldown_cmark::Tag; use crate::query_engine::ServerInformation; +extern crate pdf_extract; +extern crate lopdf; +use pdf_extract::*; +use lopdf::*; + pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option { let destination_uri = match tag { @@ -25,8 +30,7 @@ pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Optio return None; } - // use lopdf::*; - // let doc = pdf_extract::Document::load(pdf_path).unwrap(); + let doc = Document::load(pdf_path).unwrap(); None From 259fbe88f8999332d3c55ff1af1ce2fa9bc6cc6c Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 20:26:55 -0500 Subject: [PATCH 15/27] I guess it works now --- .../src/markdown_parser/pdf_parser.rs | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git 
a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs index fc05bf0..dd09080 100644 --- a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs +++ b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs @@ -1,12 +1,13 @@ use std::path::Path; use log::{debug, error}; +use pdf_extract::OutputError; use pulldown_cmark::Tag; use crate::query_engine::ServerInformation; extern crate pdf_extract; -extern crate lopdf; -use pdf_extract::*; -use lopdf::*; +// extern crate lopdf; +// use pdf_extract::*; +// use lopdf::*; pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option { @@ -29,9 +30,24 @@ pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Optio error!("pdf_path is not a file, skipping {:?}", &pdf_path); return None; } + // + // let doc = match Document::load(pdf_path) { + // Ok(s) => {s} + // Err(e) => { + // error!("Failed({:?} to load pdf {:?}", e, pdf_path); + // return None; + // } + // }; + // println!("{:?}", &doc); - let doc = Document::load(pdf_path).unwrap(); - + let text = match pdf_extract::extract_text(&pdf_path) { + Ok(s) => {s} + Err(e) => { + error!("Failed({:?} to load pdf {:?}", e, pdf_path); + return None; + } + }; + println!("{:?}", &text); - None + Some(text) } \ No newline at end of file From c4da5fb8d4af85cd233e66bf540210ff10bb531d Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 20:44:20 -0500 Subject: [PATCH 16/27] use it full --- .../src/markdown_parser/markdown_to_text.rs | 11 ++++++---- .../src/markdown_parser/mod.rs | 5 +++-- .../src/markdown_parser/pdf_parser.rs | 20 +++++++++---------- .../src/query_engine/mod.rs | 4 ++-- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index e4cbfa9..253d05b 100644 --- 
a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -31,7 +31,7 @@ use pulldown_cmark::{Event, Options, Parser, Tag}; use crate::markdown_parser::pdf_parser::try_parse_pdf; use crate::query_engine::ServerInformation; -pub fn convert_from_logseq(markdown:&str, server_info: &ServerInformation) -> String { +pub fn convert_from_logseq(markdown:&str, document_title: &str, server_info: &ServerInformation) -> String { let mut options = Options::empty(); options.insert(Options::ENABLE_STRIKETHROUGH); @@ -55,7 +55,10 @@ pub fn convert_from_logseq(markdown:&str, server_info: &ServerInformation) -> St if server_info.parse_pdf_links { let pdf_str = try_parse_pdf(&tag, server_info); match pdf_str { - Some(s) => buffer.push_str(&s), + Some(s) => { + debug!("PDF document {:?} appended to {}", &tag, document_title); + buffer.push_str(&s) + }, None => () } } @@ -182,8 +185,8 @@ mod tests { let mut info = generate_server_info_for_test(); info.notebook_path = "C:\\Users\\z2369li\\Nextcloud\\logseq_notebook".to_string(); info.parse_pdf_links = true; - println!("{:?}", &info); - let _a = convert_from_logseq(markdown, &info); + // println!("{:?}", &info); + let _a = convert_from_logseq(markdown, "title", &info); } #[test] diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index d29d3c2..56cee94 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -28,11 +28,12 @@ fn hack_specific_chars_cow(text: Cow) -> String { text.replace(bullet, " ") } -pub fn parse_logseq_notebook(md: &str, server_info: &ServerInformation) -> String { +pub fn parse_logseq_notebook(md: &str, title: &str, server_info: &ServerInformation) -> String { // Now we do some parsing for this file let content = exclude_advanced_query(md); let content = hack_specific_chars_cow(content); - let content: String = 
markdown_to_text::convert_from_logseq(&content, server_info); + let content: String = markdown_to_text::convert_from_logseq( + &content, title, server_info); content } diff --git a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs index dd09080..d14cf81 100644 --- a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs +++ b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs @@ -1,5 +1,6 @@ +use std::ffi::OsStr; use std::path::Path; -use log::{debug, error}; +use log::{debug, error, info}; use pdf_extract::OutputError; use pulldown_cmark::Tag; use crate::query_engine::ServerInformation; @@ -30,15 +31,7 @@ pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Optio error!("pdf_path is not a file, skipping {:?}", &pdf_path); return None; } - // - // let doc = match Document::load(pdf_path) { - // Ok(s) => {s} - // Err(e) => { - // error!("Failed({:?} to load pdf {:?}", e, pdf_path); - // return None; - // } - // }; - // println!("{:?}", &doc); + let text = match pdf_extract::extract_text(&pdf_path) { Ok(s) => {s} @@ -47,7 +40,12 @@ pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Optio return None; } }; - println!("{:?}", &text); + + match pdf_path.file_name() { + None => {error!("Extracted text len {}, file_name() failed", text.len());} + Some(f) => {info!("Extracted text from {:?} len {}", f, text.len());} + }; + Some(text) } \ No newline at end of file diff --git a/fire_seq_search_server/src/query_engine/mod.rs b/fire_seq_search_server/src/query_engine/mod.rs index 29f3d8e..61d3154 100644 --- a/fire_seq_search_server/src/query_engine/mod.rs +++ b/fire_seq_search_server/src/query_engine/mod.rs @@ -127,10 +127,10 @@ fn indexing_documents(server_info: &ServerInformation, document_setting: &Docume let title = schema.get_field("title").unwrap(); let body = schema.get_field("body").unwrap(); - //TODO always parse pdf + let pages: Vec<(String, 
String)> = read_specific_directory(&pages_path).par_iter() .map(|(title,md)| { - let content = parse_logseq_notebook(md, server_info); + let content = parse_logseq_notebook(md, title, server_info); (title.to_string(), content) }).collect(); From e56d99e9255c3616e58af3372c71212883571a72 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 20:45:25 -0500 Subject: [PATCH 17/27] remove dependency --- fire_seq_search_server/Cargo.toml | 1 - fire_seq_search_server/src/post_query/hit_parsed.rs | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fire_seq_search_server/Cargo.toml b/fire_seq_search_server/Cargo.toml index cede93a..ab7614f 100644 --- a/fire_seq_search_server/Cargo.toml +++ b/fire_seq_search_server/Cargo.toml @@ -41,4 +41,3 @@ lingua = { version = "1.4.0", default-features = false, features = ["chinese", " # Parsing pulldown-cmark = { version = "0.9.2", default-features = false } pdf-extract = "0.6.4" -lopdf = { version = "0.29", default-features = false, features = [ "pom_parser" ] } \ No newline at end of file diff --git a/fire_seq_search_server/src/post_query/hit_parsed.rs b/fire_seq_search_server/src/post_query/hit_parsed.rs index d475d37..fb233f0 100644 --- a/fire_seq_search_server/src/post_query/hit_parsed.rs +++ b/fire_seq_search_server/src/post_query/hit_parsed.rs @@ -72,9 +72,9 @@ impl FireSeqSearchHitParsed { #[cfg(test)] mod test_serde { - use crate::generate_server_info_for_test; - use crate::post_query::hit_parsed::FireSeqSearchHitParsed; - use crate::post_query::logseq_uri::generate_logseq_uri; + // use crate::generate_server_info_for_test; + // use crate::post_query::hit_parsed::FireSeqSearchHitParsed; + // use crate::post_query::logseq_uri::generate_logseq_uri; // fn get_parsed_hit(title: &str) -> FireSeqSearchHitParsed { From 54cbd84b03ecb2d63011b8c1ba678fa7f225a854 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 21:27:29 -0500 Subject: [PATCH 18/27] It works now! 
cool --- fire_seq_search_server/Cargo.toml | 7 ++++++- fire_seq_search_server/src/markdown_parser/pdf_parser.rs | 1 - 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fire_seq_search_server/Cargo.toml b/fire_seq_search_server/Cargo.toml index ab7614f..d3560b4 100644 --- a/fire_seq_search_server/Cargo.toml +++ b/fire_seq_search_server/Cargo.toml @@ -40,4 +40,9 @@ lingua = { version = "1.4.0", default-features = false, features = ["chinese", " # Parsing pulldown-cmark = { version = "0.9.2", default-features = false } -pdf-extract = "0.6.4" +# Error +#at /rustc/897e37553bba8b42751c67658967889d11ecd120\library\core\src/option.rs:775:21 +#4: pdf_extract::show_text +#at C:\Users\z2369li\.cargo\git\checkouts\pdf-extract-c67a6fa67c2d526c\0d8b9d9\src\lib.rs:1262:16 +#pdf-extract = "0.6.4" +pdf-extract = { git = "https://github.com/Endle/pdf-extract", rev="8cf54a37abd79d99d6c82b0df422fe2b06573183" } diff --git a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs index d14cf81..e4734f7 100644 --- a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs +++ b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs @@ -1,7 +1,6 @@ use std::ffi::OsStr; use std::path::Path; use log::{debug, error, info}; -use pdf_extract::OutputError; use pulldown_cmark::Tag; use crate::query_engine::ServerInformation; From 90189162b393bf2959c33882484d11066c02cede Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 21:30:55 -0500 Subject: [PATCH 19/27] code clean --- fire_seq_search_server/src/load_notes/mod.rs | 4 ---- .../src/markdown_parser/markdown_to_text_fireseqsearch.rs | 0 fire_seq_search_server/src/markdown_parser/mod.rs | 1 - 3 files changed, 5 deletions(-) delete mode 100644 fire_seq_search_server/src/markdown_parser/markdown_to_text_fireseqsearch.rs diff --git a/fire_seq_search_server/src/load_notes/mod.rs b/fire_seq_search_server/src/load_notes/mod.rs index 3275bc3..a733e81 100644 --- 
a/fire_seq_search_server/src/load_notes/mod.rs +++ b/fire_seq_search_server/src/load_notes/mod.rs @@ -84,10 +84,6 @@ pub fn read_md_file_wo_parse(note: &std::fs::DirEntry) -> Option<(String, String } }; - // let content : String = crate::markdown_parser::parse_logseq_notebook(content); - - - Some((note_title.to_string(),content)) } diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text_fireseqsearch.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text_fireseqsearch.rs deleted file mode 100644 index e69de29..0000000 diff --git a/fire_seq_search_server/src/markdown_parser/mod.rs b/fire_seq_search_server/src/markdown_parser/mod.rs index 56cee94..26baf8a 100644 --- a/fire_seq_search_server/src/markdown_parser/mod.rs +++ b/fire_seq_search_server/src/markdown_parser/mod.rs @@ -1,5 +1,4 @@ mod markdown_to_text; -mod markdown_to_text_fireseqsearch; mod pdf_parser; use std::borrow::Cow; From fd7fd433e80288bc5a3c45fa1d6aaa4e2df91143 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 21:31:44 -0500 Subject: [PATCH 20/27] turnoff println --- fire_seq_search_server/src/markdown_parser/pdf_parser.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs index e4734f7..3c3083a 100644 --- a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs +++ b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs @@ -5,9 +5,7 @@ use pulldown_cmark::Tag; use crate::query_engine::ServerInformation; extern crate pdf_extract; -// extern crate lopdf; -// use pdf_extract::*; -// use lopdf::*; + pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option { @@ -17,7 +15,7 @@ pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Optio return None; } debug!("Trying to parse PDF {:?}", tag); - println!("{:?}", &tag); + // println!("{:?}", &tag); destination_uri.replace("../", "") 
}, _ => {return None;} @@ -25,7 +23,7 @@ pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Optio let path = Path::new(&server_info.notebook_path); let pdf_path = path.join(destination_uri); - println!("{:?}, {:?}", &pdf_path, pdf_path.is_file()); + // println!("{:?}, {:?}", &pdf_path, pdf_path.is_file()); if !pdf_path.is_file() { error!("pdf_path is not a file, skipping {:?}", &pdf_path); return None; From 72b9444b87481e872f9e3d4778cbb760f0259c49 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 21:46:25 -0500 Subject: [PATCH 21/27] set dfs --- fire_seq_search_server/debug_server.sh | 3 ++- fire_seq_search_server/src/main.rs | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fire_seq_search_server/debug_server.sh b/fire_seq_search_server/debug_server.sh index 134e8b6..be05bd3 100644 --- a/fire_seq_search_server/debug_server.sh +++ b/fire_seq_search_server/debug_server.sh @@ -4,4 +4,5 @@ rm ./fire_seq_search_server -f cp --force target/debug/fire_seq_search_server.exe ./fire_seq_search_server RUST_BACKTRACE=1 RUST_LOG=debug ./fire_seq_search_server \ --notebook_path /c/Users/z2369li/Nextcloud/logseq_notebook \ ---enable-journal-query +--enable-journal-query \ +--parse_pdf_links \ No newline at end of file diff --git a/fire_seq_search_server/src/main.rs b/fire_seq_search_server/src/main.rs index dca5c61..5d88e78 100644 --- a/fire_seq_search_server/src/main.rs +++ b/fire_seq_search_server/src/main.rs @@ -17,6 +17,9 @@ struct Cli{ #[arg(long="notebook_name")] notebook_name: Option, + #[arg(long, default_value_t = false)] + parse_pdf_links: bool, + #[arg(long, default_value_t = false)] obsidian_md: bool, @@ -105,7 +108,7 @@ fn build_server_info(args: Cli) -> ServerInformation { show_top_hits: args.show_top_hits, show_summary_single_line_chars_limit: args.show_summary_single_line_chars_limit, - parse_pdf_links: true, // FIXME hardcode + parse_pdf_links: args.parse_pdf_links, obsidian_md: args.obsidian_md, 
convert_underline_hierarchy: true, } From 294cf637dd4acd80da7ed9fbdb0496a65fa24219 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 21:59:51 -0500 Subject: [PATCH 22/27] load pdf also for journals --- fire_seq_search_server/debug_server.sh | 2 +- fire_seq_search_server/src/query_engine/mod.rs | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fire_seq_search_server/debug_server.sh b/fire_seq_search_server/debug_server.sh index be05bd3..109b80d 100644 --- a/fire_seq_search_server/debug_server.sh +++ b/fire_seq_search_server/debug_server.sh @@ -5,4 +5,4 @@ cp --force target/debug/fire_seq_search_server.exe ./fire_seq_search_server RUST_BACKTRACE=1 RUST_LOG=debug ./fire_seq_search_server \ --notebook_path /c/Users/z2369li/Nextcloud/logseq_notebook \ --enable-journal-query \ ---parse_pdf_links \ No newline at end of file +--parse-pdf-links \ No newline at end of file diff --git a/fire_seq_search_server/src/query_engine/mod.rs b/fire_seq_search_server/src/query_engine/mod.rs index 61d3154..7fc4677 100644 --- a/fire_seq_search_server/src/query_engine/mod.rs +++ b/fire_seq_search_server/src/query_engine/mod.rs @@ -144,7 +144,12 @@ fn indexing_documents(server_info: &ServerInformation, document_setting: &Docume if server_info.enable_journal_query { info!("Loading journals"); let journals_page = path.clone() + "/journals"; - for (note_title, contents) in read_specific_directory(&journals_page) { + let journals: Vec<(String, String)> = read_specific_directory(&journals_page).par_iter() + .map(|(title,md)| { + let content = parse_logseq_notebook(md, title, server_info); + (title.to_string(), content) + }).collect(); + for (note_title, contents) in journals { let tantivy_title = crate::JOURNAL_PREFIX.to_owned() + ¬e_title; index_writer.add_document( tantivy::doc!{ title => tantivy_title, body => contents} From 09dc8fc0552d061072896562ed3af0dd123e3194 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 22:02:44 -0500 Subject: 
[PATCH 23/27] Fix warnings --- .../src/markdown_parser/markdown_to_text.rs | 4 ++-- fire_seq_search_server/src/markdown_parser/pdf_parser.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index 253d05b..c07a1ae 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -25,8 +25,8 @@ #![warn(clippy::all, clippy::pedantic)] -use std::path::Path; -use log::{debug, error, warn}; + +use log::{debug, warn}; use pulldown_cmark::{Event, Options, Parser, Tag}; use crate::markdown_parser::pdf_parser::try_parse_pdf; use crate::query_engine::ServerInformation; diff --git a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs index 3c3083a..3c7df1b 100644 --- a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs +++ b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs @@ -1,4 +1,4 @@ -use std::ffi::OsStr; + use std::path::Path; use log::{debug, error, info}; use pulldown_cmark::Tag; @@ -10,7 +10,7 @@ extern crate pdf_extract; pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option { let destination_uri = match tag { - Tag::Image(link_type, destination_uri, title) => { + Tag::Image(_link_type, destination_uri, _title) => { if !destination_uri.ends_with(".pdf") { return None; } From 151dfcde4623061aa45dc86ef865bc742617ef4d Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 22:04:53 -0500 Subject: [PATCH 24/27] fix test --- fire_seq_search_server/tests/unit_test_load_notes.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fire_seq_search_server/tests/unit_test_load_notes.rs b/fire_seq_search_server/tests/unit_test_load_notes.rs index 3f040f8..612f640 100644 --- 
a/fire_seq_search_server/tests/unit_test_load_notes.rs +++ b/fire_seq_search_server/tests/unit_test_load_notes.rs @@ -39,12 +39,12 @@ fn parse() { #[test] fn exclude_advance_query() { let md = read_file_to_line("advanced_query.md"); - let result = exclude_advanced_query(md); + let result = exclude_advanced_query(&md); assert!(!result.contains("exempli")); assert!(result.contains("In this test page we have")); let md = read_file_to_line("blog_thunderbird_zh.md"); - let result = exclude_advanced_query(md.clone()); + let result = exclude_advanced_query(&md); assert_eq!(md, result); } \ No newline at end of file From 7d337d23c465ba722901fabb2d7573d3ed55dab0 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Fri, 17 Feb 2023 22:07:13 -0500 Subject: [PATCH 25/27] deepsource --- .../src/markdown_parser/markdown_to_text.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs index c07a1ae..0e7acb3 100644 --- a/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs +++ b/fire_seq_search_server/src/markdown_parser/markdown_to_text.rs @@ -37,7 +37,7 @@ pub fn convert_from_logseq(markdown:&str, document_title: &str, server_info: &Se let parser = Parser::new_ext(&markdown, options); let mut tags_stack = Vec::new(); - let mut buffer = String::new(); + let mut buffer = String::default(); // For each event we push into the buffer to produce the plain text version. for event in parser { @@ -86,7 +86,7 @@ pub fn convert(markdown: &str) -> String { let parser = Parser::new_ext(&markdown, options); let mut tags_stack = Vec::new(); - let mut buffer = String::new(); + let mut buffer = String::default(); // For each event we push into the buffer to produce the plain text version. 
for event in parser { From e381047e3224571aa142476f8b977f2e7c55f638 Mon Sep 17 00:00:00 2001 From: Zhenbo Li Date: Sat, 18 Feb 2023 10:37:59 -0500 Subject: [PATCH 26/27] use migitation pdf-extract --- fire_seq_search_server/Cargo.toml | 2 +- fire_seq_search_server/src/markdown_parser/pdf_parser.rs | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fire_seq_search_server/Cargo.toml b/fire_seq_search_server/Cargo.toml index d3560b4..d07a393 100644 --- a/fire_seq_search_server/Cargo.toml +++ b/fire_seq_search_server/Cargo.toml @@ -45,4 +45,4 @@ pulldown-cmark = { version = "0.9.2", default-features = false } #4: pdf_extract::show_text #at C:\Users\z2369li\.cargo\git\checkouts\pdf-extract-c67a6fa67c2d526c\0d8b9d9\src\lib.rs:1262:16 #pdf-extract = "0.6.4" -pdf-extract = { git = "https://github.com/Endle/pdf-extract", rev="8cf54a37abd79d99d6c82b0df422fe2b06573183" } +pdf-extract-temporary-migitation-panic = "0.7.1" \ No newline at end of file diff --git a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs index 3c7df1b..660574c 100644 --- a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs +++ b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs @@ -4,8 +4,9 @@ use log::{debug, error, info}; use pulldown_cmark::Tag; use crate::query_engine::ServerInformation; -extern crate pdf_extract; - +// extern crate pdf_extract; +extern crate pdf_extract_temporary_migitation_panic; +use pdf_extract_temporary_migitation_panic::extract_text; pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option { @@ -30,7 +31,7 @@ pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Optio } - let text = match pdf_extract::extract_text(&pdf_path) { + let text = match extract_text(&pdf_path) { Ok(s) => {s} Err(e) => { error!("Failed({:?} to load pdf {:?}", e, pdf_path); From d8a79f33cd16aa01a4c126b066751010d00e5f0c Mon Sep 17 00:00:00 2001 From: Zhenbo Li 
Date: Sat, 18 Feb 2023 10:42:13 -0500 Subject: [PATCH 27/27] fix typo --- fire_seq_search_server/Cargo.toml | 2 +- fire_seq_search_server/src/markdown_parser/pdf_parser.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fire_seq_search_server/Cargo.toml b/fire_seq_search_server/Cargo.toml index d07a393..9bbc321 100644 --- a/fire_seq_search_server/Cargo.toml +++ b/fire_seq_search_server/Cargo.toml @@ -45,4 +45,4 @@ pulldown-cmark = { version = "0.9.2", default-features = false } #4: pdf_extract::show_text #at C:\Users\z2369li\.cargo\git\checkouts\pdf-extract-c67a6fa67c2d526c\0d8b9d9\src\lib.rs:1262:16 #pdf-extract = "0.6.4" -pdf-extract-temporary-migitation-panic = "0.7.1" \ No newline at end of file +pdf-extract-temporary-mitigation-panic = "0.7.1" \ No newline at end of file diff --git a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs index 660574c..ba49acf 100644 --- a/fire_seq_search_server/src/markdown_parser/pdf_parser.rs +++ b/fire_seq_search_server/src/markdown_parser/pdf_parser.rs @@ -5,8 +5,8 @@ use pulldown_cmark::Tag; use crate::query_engine::ServerInformation; // extern crate pdf_extract; -extern crate pdf_extract_temporary_migitation_panic; -use pdf_extract_temporary_migitation_panic::extract_text; +extern crate pdf_extract_temporary_mitigation_panic; +use pdf_extract_temporary_mitigation_panic::extract_text; pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option {