pdf support #114

Merged
merged 27 commits on Feb 18, 2023
20 changes: 15 additions & 5 deletions fire_seq_search_server/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "fire_seq_search_server"
version = "0.1.1"
version = "0.1.2"
edition = "2021"
license = "MIT"

@@ -26,13 +26,23 @@ env_logger = "0.9.0"

# Rust
clap = { version = "4.0", features = ["derive"] }

lazy_static = "1.4.0"
rayon = "1.5"

urlencoding = "2.1.0"
jieba-rs = { version = "0.6.6" }
lazy_static = "1.4.0"
rayon = "1.5"


stopwords = "0.1.1"
pulldown-cmark = { version = "0.9.2", default-features = false }

regex = "1"
lingua = { version = "1.4.0", default-features = false, features = ["chinese", "english"] }

# Parsing
pulldown-cmark = { version = "0.9.2", default-features = false }
# Error
#at /rustc/897e37553bba8b42751c67658967889d11ecd120\library\core\src/option.rs:775:21
#4: pdf_extract::show_text
#at C:\Users\z2369li\.cargo\git\checkouts\pdf-extract-c67a6fa67c2d526c\0d8b9d9\src\lib.rs:1262:16
#pdf-extract = "0.6.4"
pdf-extract-temporary-mitigation-panic = "0.7.1"
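The commented-out backtrace above is why the PR swaps `pdf-extract` for the `pdf-extract-temporary-mitigation-panic` fork, which (as the rest of this diff shows) keeps the same `extract_text` entry point. A minimal sketch of calling it directly, assuming the fork mirrors the upstream pdf-extract API; the file path is hypothetical:

```rust
// Minimal sketch; the fork is assumed to keep upstream pdf-extract's extract_text signature.
use pdf_extract_temporary_mitigation_panic::extract_text;

fn main() {
    // Hypothetical path: point this at a PDF from your own notebook's assets folder.
    match extract_text("assets/example.pdf") {
        Ok(text) => println!("extracted {} characters", text.len()),
        Err(e) => eprintln!("extraction failed: {:?}", e),
    }
}
```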
3 changes: 2 additions & 1 deletion fire_seq_search_server/debug_server.sh
@@ -4,4 +4,5 @@ rm ./fire_seq_search_server -f
cp --force target/debug/fire_seq_search_server.exe ./fire_seq_search_server
RUST_BACKTRACE=1 RUST_LOG=debug ./fire_seq_search_server \
--notebook_path /c/Users/z2369li/Nextcloud/logseq_notebook \
--enable-journal-query
--enable-journal-query \
--parse-pdf-links
1 change: 1 addition & 0 deletions fire_seq_search_server/src/lib.rs
@@ -170,6 +170,7 @@ pub fn generate_server_info_for_test() -> ServerInformation {
enable_journal_query: false,
show_top_hits: 0,
show_summary_single_line_chars_limit: 0,
parse_pdf_links: false,
obsidian_md: false,
convert_underline_hierarchy: true
};
37 changes: 5 additions & 32 deletions fire_seq_search_server/src/load_notes/mod.rs
@@ -1,11 +1,10 @@
use std::fs::DirEntry;
use log::{debug, error, info, warn};
use regex::Regex;
use std::process;

use rayon::prelude::*;

use crate::markdown_parser::parse_to_plain_text;


pub fn read_specific_directory(path: &str) -> Vec<(String, String)> {
info!("Try to read {}", &path);
@@ -23,7 +22,7 @@ pub fn read_specific_directory(path: &str) -> Vec<(String, String)> {
}
// debug!("Note titles: {:?}", &note_filenames);
let result: Vec<(String,String)> = note_filenames.par_iter()
.map(|note| read_md_file_and_parse(&note))
.map(|note| read_md_file_wo_parse(&note))
.filter(|x| (&x).is_some())
.map(|x| x.unwrap())
.collect();
@@ -45,11 +44,12 @@ pub fn read_specific_directory(path: &str) -> Vec<(String, String)> {
/// returns: Option<(String, String)>
///
/// First: title (filename)
/// Second: full text (parsed)
/// Second: full raw text
///
/// I would delay the parsing job, so it could be coupled with server info. -Zhenbo Li 2023-02-17
/// If input is a directory or DS_STORE, return None
///
pub fn read_md_file_and_parse(note: &std::fs::DirEntry) -> Option<(String, String)> {
pub fn read_md_file_wo_parse(note: &std::fs::DirEntry) -> Option<(String, String)> {
if let Ok(file_type) = note.file_type() {
// Now let's show our entry's file type!
debug!("{:?}: {:?}", note.path(), file_type);
@@ -84,33 +84,6 @@ pub fn read_md_file_and_parse(note: &std::fs::DirEntry) -> Option<(String, Strin
}
};


// Now we do some parsing for this file
let content: String = exclude_advanced_query(content);
let content: String = parse_to_plain_text(&content);

Some((note_title.to_string(),content))
}

// https://docs.rs/regex/latest/regex/#repetitions
// https://stackoverflow.com/a/8303552/1166518
pub fn exclude_advanced_query(md: String) -> String {
if !md.contains('#') {
return md;
}

lazy_static! {
static ref RE: Regex = Regex::new(
r"\#\+BEGIN_QUERY[\S\s]+?\#\+END_QUERY")
.unwrap();
}
let result = RE.replace_all(&md, " ");
String::from(result)
// let mat = RE.find(&md);
// match mat {
// Some(m) => {
// todo!()
// },
// None => md
// }
}
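With this change, `read_specific_directory` now returns `(title, raw markdown)` pairs and leaves parsing to a later stage that has access to `ServerInformation`. A hedged sketch of how a caller might chain the two stages, assuming `load_notes`, `markdown_parser`, and `query_engine` are all reachable from the crate root (the real call site is not part of this diff):

```rust
// Sketch only: the real wiring lives in the query engine, outside this diff.
use fire_seq_search_server::load_notes::read_specific_directory;
use fire_seq_search_server::markdown_parser::parse_logseq_notebook;
use fire_seq_search_server::query_engine::ServerInformation;

fn load_and_parse(server_info: &ServerInformation) -> Vec<(String, String)> {
    read_specific_directory(&server_info.notebook_path)
        .into_iter()
        .map(|(title, raw_md)| {
            // Parsing is deferred to this point so it can honour parse_pdf_links and other settings.
            let plain_text = parse_logseq_notebook(&raw_md, &title, server_info);
            (title, plain_text)
        })
        .collect()
}
```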
4 changes: 4 additions & 0 deletions fire_seq_search_server/src/main.rs
@@ -17,6 +17,9 @@ struct Cli{
#[arg(long="notebook_name")]
notebook_name: Option<String>,

#[arg(long, default_value_t = false)]
parse_pdf_links: bool,

#[arg(long, default_value_t = false)]
obsidian_md: bool,

@@ -105,6 +108,7 @@ fn build_server_info(args: Cli) -> ServerInformation {
show_top_hits: args.show_top_hits,
show_summary_single_line_chars_limit:
args.show_summary_single_line_chars_limit,
parse_pdf_links: args.parse_pdf_links,
obsidian_md: args.obsidian_md,
convert_underline_hierarchy: true,
}
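The new `parse_pdf_links` field becomes a `--parse-pdf-links` flag because clap 4's derive API turns the field name's underscores into hyphens. A reduced, hypothetical sketch with only that field (the real `Cli` struct has many more), shown here just to illustrate the flag handling:

```rust
use clap::Parser;

// Reduced, hypothetical CLI containing only the flag added by this PR.
#[derive(Parser)]
struct Cli {
    #[arg(long, default_value_t = false)]
    parse_pdf_links: bool,
}

fn main() {
    // clap derives the long name `--parse-pdf-links` from the field name.
    let cli = Cli::parse_from(["fire_seq_search_server", "--parse-pdf-links"]);
    assert!(cli.parse_pdf_links);
}
```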
68 changes: 67 additions & 1 deletion fire_seq_search_server/src/markdown_parser/markdown_to_text.rs
@@ -25,7 +25,58 @@

#![warn(clippy::all, clippy::pedantic)]


use log::{debug, warn};
use pulldown_cmark::{Event, Options, Parser, Tag};
use crate::markdown_parser::pdf_parser::try_parse_pdf;
use crate::query_engine::ServerInformation;

pub fn convert_from_logseq(markdown:&str, document_title: &str, server_info: &ServerInformation) -> String {
let mut options = Options::empty();
options.insert(Options::ENABLE_STRIKETHROUGH);

let parser = Parser::new_ext(&markdown, options);
let mut tags_stack = Vec::new();
let mut buffer = String::default();

// For each event we push into the buffer to produce the plain text version.
for event in parser {
// println!("{:?}", &event);
match event {
// The start and end events don't contain the text inside the tag. That's handled by the `Event::Text` arm.
// However, a PDF link is parsed as an Image, so it is handled specially when the end tag is reached
Event::Start(tag) => {
start_tag(&tag, &mut buffer, &mut tags_stack);
tags_stack.push(tag);
}
Event::End(tag) => {
tags_stack.pop();
end_tag(&tag, &mut buffer, &tags_stack);
if server_info.parse_pdf_links {
let pdf_str = try_parse_pdf(&tag, server_info);
match pdf_str {
Some(s) => {
debug!("PDF document {:?} appended to {}", &tag, document_title);
buffer.push_str(&s)
},
None => ()
}
}
}
Event::Text(content) => {
if !tags_stack.iter().any(is_strikethrough) {
buffer.push_str(&content)
}
}
Event::Code(content) => buffer.push_str(&content),
Event::SoftBreak => buffer.push(' '),
_ => (),
}
}
buffer.trim().to_string()
}



#[must_use]
pub fn convert(markdown: &str) -> String {
@@ -35,7 +86,7 @@ pub fn convert(markdown: &str) -> String {

let parser = Parser::new_ext(&markdown, options);
let mut tags_stack = Vec::new();
let mut buffer = String::new();
let mut buffer = String::default();

// For each event we push into the buffer to produce the plain text version.
for event in parser {
@@ -121,7 +172,22 @@ fn is_strikethrough(tag: &Tag) -> bool {

#[cfg(test)]
mod tests {
use crate::generate_server_info_for_test;
use super::convert;
use super::convert_from_logseq;

#[test]
fn links_to_pdf() {
let markdown = r#"Refer to ![order.pdf](../assets/readings_1634910859348_0.pdf)"#;
let expected = "Refer to order.pdf";
assert_eq!(convert(markdown), expected);

let mut info = generate_server_info_for_test();
info.notebook_path = "C:\\Users\\z2369li\\Nextcloud\\logseq_notebook".to_string();
info.parse_pdf_links = true;
// println!("{:?}", &info);
let _a = convert_from_logseq(markdown, "title", &info);
}

#[test]
fn basic_inline_strong() {
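For context on why `convert_from_logseq` hooks PDF handling into `Event::End`: pulldown-cmark reports `![order.pdf](...)` as an `Image` tag, and the alt text arrives as a separate `Text` event between the start and end tags. A small sketch that prints the event stream for such a link, using the pulldown-cmark 0.9 API already imported in this file:

```rust
use pulldown_cmark::{Event, Options, Parser};

fn main() {
    let md = r#"Refer to ![order.pdf](../assets/readings_1634910859348_0.pdf)"#;
    // Expect Start(Image(..)), Text("order.pdf"), End(Image(..)) among the events,
    // which is why try_parse_pdf inspects the tag handed back with Event::End.
    for event in Parser::new_ext(md, Options::empty()) {
        println!("{:?}", event);
    }
}
```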
36 changes: 35 additions & 1 deletion fire_seq_search_server/src/markdown_parser/mod.rs
@@ -1,9 +1,43 @@
mod markdown_to_text;
mod pdf_parser;

use std::borrow::Cow;
use regex::Regex;
use crate::query_engine::ServerInformation;

pub fn parse_to_plain_text(md: &str) -> String {
// https://docs.rs/regex/latest/regex/#repetitions
// https://stackoverflow.com/a/8303552/1166518
pub fn exclude_advanced_query(md: &str) -> Cow<str> {
if !md.contains('#') {
return Cow::Borrowed(md);
}

lazy_static! {
static ref RE: Regex = Regex::new(
r"\#\+BEGIN_QUERY[\S\s]+?\#\+END_QUERY")
.unwrap();
}
// return RE.replace_all(&md, " ")
return RE.replace_all(&md, " ");
}

fn hack_specific_chars_cow(text: Cow<str>) -> String {
//https://www.compart.com/en/unicode/U+2022
let bullet = char::from_u32(0x00002022).unwrap();
text.replace(bullet, " ")
}

pub fn parse_logseq_notebook(md: &str, title: &str, server_info: &ServerInformation) -> String {
// Now we do some parsing for this file
let content = exclude_advanced_query(md);
let content = hack_specific_chars_cow(content);
let content: String = markdown_to_text::convert_from_logseq(
&content, title, server_info);
content
}


pub fn parse_to_plain_text(md: &str) -> String {
let plain_text: String = markdown_to_text::convert(&md);
let plain_text = hack_specific_chars(plain_text);

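`exclude_advanced_query` strips Logseq advanced-query blocks (`#+BEGIN_QUERY` ... `#+END_QUERY`) with a non-greedy regex before markdown conversion, and returns a borrowed `Cow` when the note contains no `#` at all. A behavioural sketch written as a test, assuming the `markdown_parser` module and this function are visible to the test code:

```rust
use fire_seq_search_server::markdown_parser::exclude_advanced_query;

#[test]
fn advanced_query_block_is_dropped() {
    let md = "before\n#+BEGIN_QUERY\n{:title \"TODO\"}\n#+END_QUERY\nafter";
    let cleaned = exclude_advanced_query(md);
    assert!(!cleaned.contains("BEGIN_QUERY"));
    assert!(cleaned.contains("before") && cleaned.contains("after"));

    // No '#' in the input: the borrowed string comes back untouched.
    let plain = "no queries here";
    assert_eq!(exclude_advanced_query(plain), plain);
}
```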
49 changes: 49 additions & 0 deletions fire_seq_search_server/src/markdown_parser/pdf_parser.rs
@@ -0,0 +1,49 @@

use std::path::Path;
use log::{debug, error, info};
use pulldown_cmark::Tag;
use crate::query_engine::ServerInformation;

// extern crate pdf_extract;
extern crate pdf_extract_temporary_mitigation_panic;
use pdf_extract_temporary_mitigation_panic::extract_text;

pub(crate) fn try_parse_pdf(tag: &Tag, server_info: &ServerInformation) -> Option<String> {

let destination_uri = match tag {
Tag::Image(_link_type, destination_uri, _title) => {
if !destination_uri.ends_with(".pdf") {
return None;
}
debug!("Trying to parse PDF {:?}", tag);
// println!("{:?}", &tag);
destination_uri.replace("../", "")
},
_ => {return None;}
};

let path = Path::new(&server_info.notebook_path);
let pdf_path = path.join(destination_uri);
// println!("{:?}, {:?}", &pdf_path, pdf_path.is_file());
if !pdf_path.is_file() {
error!("pdf_path is not a file, skipping {:?}", &pdf_path);
return None;
}


let text = match extract_text(&pdf_path) {
Ok(s) => {s}
Err(e) => {
error!("Failed({:?} to load pdf {:?}", e, pdf_path);
return None;
}
};

match pdf_path.file_name() {
None => {error!("Extracted text len {}, file_name() failed", text.len());}
Some(f) => {info!("Extracted text from {:?} len {}", f, text.len());}
};


Some(text)
}
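The path handling in `try_parse_pdf` boils down to stripping the `../` prefix that Logseq writes into asset links and joining the remainder onto `notebook_path`. A standalone sketch of just that step, with a hypothetical notebook path:

```rust
use std::path::{Path, PathBuf};

// Mirrors the path resolution inside try_parse_pdf; the notebook path below is hypothetical.
fn resolve_asset(notebook_path: &str, destination_uri: &str) -> PathBuf {
    let relative = destination_uri.replace("../", "");
    Path::new(notebook_path).join(relative)
}

fn main() {
    let pdf_path = resolve_asset(
        "/home/user/logseq_notebook",
        "../assets/readings_1634910859348_0.pdf",
    );
    assert_eq!(
        pdf_path,
        PathBuf::from("/home/user/logseq_notebook/assets/readings_1634910859348_0.pdf")
    );
}
```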
40 changes: 20 additions & 20 deletions fire_seq_search_server/src/post_query/hit_parsed.rs
@@ -72,26 +72,26 @@ impl FireSeqSearchHitParsed {

#[cfg(test)]
mod test_serde {
use crate::generate_server_info_for_test;
use crate::post_query::hit_parsed::FireSeqSearchHitParsed;
use crate::post_query::logseq_uri::generate_logseq_uri;


fn get_parsed_hit(title: &str) -> FireSeqSearchHitParsed {
let server_info = generate_server_info_for_test();
let logseq_uri = generate_logseq_uri(title, &true, &server_info);
FireSeqSearchHitParsed{
title: title.to_owned(),
summary: String::from("summary"),
score: 1.0,
logseq_uri,
metadata: String::from("meta")
}
}
fn serde(title: &str) -> String {
let h = get_parsed_hit(title);
h.serde_to_string()
}
// use crate::generate_server_info_for_test;
// use crate::post_query::hit_parsed::FireSeqSearchHitParsed;
// use crate::post_query::logseq_uri::generate_logseq_uri;


// fn get_parsed_hit(title: &str) -> FireSeqSearchHitParsed {
// let server_info = generate_server_info_for_test();
// let logseq_uri = generate_logseq_uri(title, &true, &server_info);
// FireSeqSearchHitParsed{
// title: title.to_owned(),
// summary: String::from("summary"),
// score: 1.0,
// logseq_uri,
// metadata: String::from("meta")
// }
// }
// fn serde(title: &str) -> String {
// let h = get_parsed_hit(title);
// h.serde_to_string()
// }

// TODO: This solution is buggy. Consider PR#100, which might be a better idea. -Zli, 2023-Jan
// This test disabled on 2023-Feb-02 for PR #112