-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
258 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[package] | ||
name = "htmlproc" | ||
version = "0.2.2" | ||
version = "0.3.0" | ||
edition = "2021" | ||
authors = ["nabbisen <[email protected]>"] | ||
license = "Apache-2.0" | ||
|
@@ -14,8 +14,9 @@ rust-version = "1.74.0" | |
[features] | ||
default = [] | ||
# testing: `cargo test --features full` | ||
full = ["path_to_url"] | ||
full = ["omit_attr", "path_to_url"] | ||
# functions | ||
omit_attr = [] | ||
path_to_url = [] | ||
|
||
[package.metadata.docs.rs] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
pub mod consts; | ||
pub mod utils; |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
// #[cfg(feature = "omit_attr")] | ||
pub mod omit_attr; | ||
pub mod omit_enclosure; | ||
// #[cfg(feature = "path_to_url")] | ||
pub mod path_to_url; | ||
|
||
mod consts; | ||
mod utils; | ||
mod core; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
use html5ever::tendril::TendrilSink; | ||
use html5ever::{parse_document, ParseOpts}; | ||
use markup5ever_rcdom::{Handle, NodeData, RcDom}; | ||
|
||
use crate::core::utils::reset_document_outline; | ||
|
||
mod tests; | ||
|
||
/// parsed form of the `omit_attrs` items handed to `manipulate`
///
/// built once per call and read-only while the dom is scanned
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct OmitOptions {
    /// attribute names omitted from every tag (items written as `name` or `*.name`)
    attrs: Vec<String>,
    /// `(tag name, attribute name)` pairs omitted only on the named tag (items written as `tag.name`)
    tag_attrs: Vec<(String, String)>,
}
|
||
/// omits specific attributes of tags in html | ||
/// | ||
/// [feature entry point] | ||
/// | ||
/// ```rust | ||
/// use htmlproc::omit_attr::manipulate; | ||
/// | ||
/// let source: &str = "<div id=\"preserved\"><span style=\"want: omitted;\" class=\"also: wanted;\" z-index=\"1\">Content</span></div>"; | ||
/// let omit_attrs = &["style", "*.class", "span.z-index"]; | ||
/// let expect: &str = "<div id=\"preserved\"><span>Content</span></div>"; | ||
/// | ||
/// let result = manipulate(source, omit_attrs); | ||
/// assert_eq!(result, expect); | ||
/// ``` | ||
/// | ||
pub fn manipulate(html: &str, omit_attrs: &[&str]) -> String { | ||
let dom = parse_document(RcDom::default(), ParseOpts::default()) | ||
.from_utf8() | ||
.read_from(&mut html.as_bytes()) | ||
.unwrap(); | ||
|
||
let options = options(omit_attrs); | ||
|
||
let mut output = String::new(); | ||
scan(&dom.document, &options, &mut output); | ||
|
||
reset_document_outline(output.as_str(), html) | ||
} | ||
|
||
fn options(omit_attrs: &[&str]) -> OmitOptions { | ||
if omit_attrs | ||
.iter() | ||
.any(|x| 1 < x.chars().filter(|y| *y == '.').count()) | ||
{ | ||
panic!("Invalid omit_attrs: each item should have single or none of \".\""); | ||
} | ||
|
||
let attr_options = omit_attrs | ||
.iter() | ||
.filter(|&&x| !x.contains('.') || x.starts_with("*.")) | ||
.map(|&x| { | ||
let ret = if x.starts_with("*.") { &x[2..] } else { x }; | ||
ret.to_owned() | ||
}) | ||
.collect(); | ||
let tag_attr_options = omit_attrs | ||
.iter() | ||
.filter(|&&x| x.contains('.') && !x.starts_with("*.")) | ||
.map(|x| { | ||
let splitted = x.split('.').collect::<Vec<&str>>(); | ||
let tag_name = splitted[0]; | ||
let attr_name = splitted[1]; | ||
if tag_name.is_empty() || attr_name.is_empty() { | ||
panic!("Invalid tag-attr pair found: {} - {}", tag_name, attr_name); | ||
} | ||
(tag_name.to_owned(), attr_name.to_owned()) | ||
}) | ||
.collect(); | ||
let options = OmitOptions { | ||
attrs: attr_options, | ||
tag_attrs: tag_attr_options, | ||
}; | ||
options | ||
} | ||
|
||
/// scan to manipulate dom recursively | ||
fn scan(handle: &Handle, options: &OmitOptions, output: &mut String) { | ||
let node = handle; | ||
|
||
match &node.data { | ||
NodeData::Document => { | ||
for child in node.children.borrow().iter() { | ||
scan(child, options, output); | ||
} | ||
} | ||
NodeData::Element { | ||
ref name, | ||
ref attrs, | ||
.. | ||
} => { | ||
let tag_name = name.local.to_string(); | ||
|
||
// start tag | ||
output.push('<'); | ||
output.push_str(tag_name.as_str()); | ||
|
||
let attrs = attrs | ||
.clone() | ||
.into_inner() | ||
.iter() | ||
.filter(|x| { | ||
let attr_name = x.name.local.to_string(); | ||
!options.attrs.contains(&attr_name) | ||
&& !options | ||
.tag_attrs | ||
.contains(&(tag_name.to_owned(), attr_name)) | ||
}) | ||
.map(|x| format!(" {}=\"{}\"", x.name.local, x.value)) | ||
.collect::<Vec<String>>() | ||
.join(""); | ||
output.push_str(attrs.as_str()); | ||
|
||
output.push('>'); | ||
|
||
for child in node.children.borrow().iter() { | ||
scan(child, options, output); | ||
} | ||
|
||
// end tag | ||
output.push_str("</"); | ||
output.push_str(tag_name.as_str()); | ||
output.push('>'); | ||
} | ||
NodeData::Text { ref contents } => { | ||
output.push_str(&contents.borrow()); | ||
} | ||
_ => {} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
/// run with `cargo test --features omit_attr`
#[cfg(test)]
mod tests {
    use crate::omit_attr::manipulate;

    /// span carrying both a `style` and a `class` attribute, shared by most cases
    const STYLED_SPAN: &str = "<span style=\"remove: me;\" class=\"keep-me\">Content</span>";

    /// runs `manipulate` and compares its output with `expect`
    ///
    /// `#[track_caller]` keeps the reported failure location on the calling test
    #[track_caller]
    fn assert_manipulated(source: &str, omits: &[&str], expect: &str) {
        assert_eq!(manipulate(source, omits), expect);
    }

    // bare attribute name, the only attribute present
    #[test]
    fn manipulate_attr_1() {
        assert_manipulated(
            "<span style=\"remove: me;\">Content</span>",
            &["style"],
            "<span>Content</span>",
        );
    }

    // bare attribute name, the other attribute stays
    #[test]
    fn manipulate_attr_2() {
        assert_manipulated(
            STYLED_SPAN,
            &["style"],
            "<span class=\"keep-me\">Content</span>",
        );
    }

    // wildcard form behaves like the bare name
    #[test]
    fn manipulate_attr_3() {
        assert_manipulated(
            STYLED_SPAN,
            &["*.style"],
            "<span class=\"keep-me\">Content</span>",
        );
    }

    // tag-scoped form, tag matches
    #[test]
    fn manipulate_tag_attr_1() {
        assert_manipulated(
            STYLED_SPAN,
            &["span.style"],
            "<span class=\"keep-me\">Content</span>",
        );
    }

    // tag-scoped form, tag does not match: nothing is removed
    #[test]
    fn manipulate_tag_attr_2() {
        assert_manipulated(
            STYLED_SPAN,
            &["a.style"],
            "<span style=\"remove: me;\" class=\"keep-me\">Content</span>",
        );
    }

    // several tag-scoped items on the same tag
    #[test]
    fn manipulate_tag_attr_3() {
        assert_manipulated(
            STYLED_SPAN,
            &["span.style", "span.class"],
            "<span>Content</span>",
        );
    }

    // wildcard and tag-scoped items combined
    #[test]
    fn manipulate_attr_and_tag_attr_1() {
        assert_manipulated(
            STYLED_SPAN,
            &["*.style", "span.class"],
            "<span>Content</span>",
        );
    }

    // all three item forms, nested elements, untouched attribute on the parent
    #[test]
    fn manipulate_attr_and_tag_attr_2() {
        assert_manipulated(
            "<main id=\"preserved\"><div style=\"remove: me;\" class=\"keep-me\" z-index=\"1\">Content</div></main>",
            &["style", "*.class", "div.z-index"],
            "<main id=\"preserved\"><div>Content</div></main>",
        );
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters