Skip to content

Commit

Permalink
fix #5; fix #6 (#9)
Browse files Browse the repository at this point in the history
  • Loading branch information
nabbisen authored Jun 28, 2024
1 parent b0618c9 commit a630099
Show file tree
Hide file tree
Showing 11 changed files with 258 additions and 19 deletions.
5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "htmlproc"
version = "0.2.2"
version = "0.3.0"
edition = "2021"
authors = ["nabbisen <[email protected]>"]
license = "Apache-2.0"
Expand All @@ -14,8 +14,9 @@ rust-version = "1.74.0"
[features]
default = []
# testing: `cargo test --features full`
full = ["path_to_url"]
full = ["omit_attr", "path_to_url"]
# functions
omit_attr = []
path_to_url = []

[package.metadata.docs.rs]
Expand Down
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,26 @@ cargo add htmlproc --features path_to_url

## Functions (Features)

### omit_attr

Remove specific tag attribute(s) from HTML text.

#### Usage

First, run `cargo add htmlproc --features omit_attr`. Then specify attrs to omit. Three formats are available:

- `attr`: remove the attribute named `attr` from all tags.
- `*.attr`: same as the above.
- `tag.attr`: remove the attribute named `attr` from the specific tag `tag` only. ex) `span.style`

```rust
use htmlproc::omit_attr::manipulate;

let html = "<div id=\"preserved\"><span style=\"want: remove;\" class=\"also: wanted;\" z-index=\"1\">Content</span></div>";
let omit_attrs = &["style", "*.class", "span.z-index"];
let result: String = manipulate(html, omit_attrs);
```

### omit_enclosure

Remove specific tag enclosure(s) from HTML text.
Expand Down
File renamed without changes.
2 changes: 2 additions & 0 deletions src/core/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pub mod consts;
pub mod utils;
File renamed without changes.
5 changes: 3 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// #[cfg(feature = "omit_attr")]
pub mod omit_attr;
pub mod omit_enclosure;
// #[cfg(feature = "path_to_url")]
pub mod path_to_url;

mod consts;
mod utils;
mod core;
132 changes: 132 additions & 0 deletions src/omit_attr/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
use html5ever::tendril::TendrilSink;
use html5ever::{parse_document, ParseOpts};
use markup5ever_rcdom::{Handle, NodeData, RcDom};

use crate::core::utils::reset_document_outline;

mod tests;

/// Attribute-removal rules, built by `options` from the caller's `omit_attrs` entries
/// and consulted by `scan` for every attribute of every element.
struct OmitOptions {
/// Attribute names removed from every tag (from `attr` and `*.attr` entries).
attrs: Vec<String>,
/// `(tag name, attribute name)` pairs: the attribute is removed only on that tag
/// (from `tag.attr` entries).
tag_attrs: Vec<(String, String)>,
}

/// Removes the requested attributes from the tags of an HTML string.
///
/// This is the entry point of the `omit_attr` feature. Each entry of `omit_attrs`
/// is one of:
///
/// - `attr` or `*.attr`: remove the attribute from every tag
/// - `tag.attr`: remove the attribute from that tag only
///
/// The input is parsed into a DOM, re-serialized without the matching attributes,
/// and the serialized text is then handed to `reset_document_outline` together
/// with the original `html`; that function's return value is the result.
///
/// # Panics
///
/// Panics if an entry of `omit_attrs` contains more than one `.`, if a `tag.attr`
/// entry has an empty tag or attribute part, or if reading `html` into the parser
/// returns an error.
///
/// # Examples
///
/// ```rust
/// use htmlproc::omit_attr::manipulate;
///
/// let source: &str = "<div id=\"preserved\"><span style=\"want: omitted;\" class=\"also: wanted;\" z-index=\"1\">Content</span></div>";
/// let omit_attrs = &["style", "*.class", "span.z-index"];
/// let expect: &str = "<div id=\"preserved\"><span>Content</span></div>";
///
/// let result = manipulate(source, omit_attrs);
/// assert_eq!(result, expect);
/// ```
///
pub fn manipulate(html: &str, omit_attrs: &[&str]) -> String {
    // parse first, then validate the rules: same order as before so that
    // nothing observable changes when `omit_attrs` is invalid
    let mut source_bytes = html.as_bytes();
    let parsed = parse_document(RcDom::default(), ParseOpts::default())
        .from_utf8()
        .read_from(&mut source_bytes)
        .unwrap();

    let omit_rules = options(omit_attrs);

    let mut serialized = String::new();
    scan(&parsed.document, &omit_rules, &mut serialized);

    reset_document_outline(serialized.as_str(), html)
}

/// Builds the [`OmitOptions`] used by `scan` from the raw `omit_attrs` entries.
///
/// Entry formats:
///
/// - `attr` or `*.attr`: stored in `attrs` (with the `*.` prefix stripped)
/// - `tag.attr`: stored in `tag_attrs` as a `(tag, attr)` pair
///
/// # Panics
///
/// - if any entry contains more than one `.`
/// - if a `tag.attr` entry has an empty tag part or an empty attribute part
///
/// Note that `*.` entries are not checked for an empty attribute name; such an
/// entry is stored as an empty string.
fn options(omit_attrs: &[&str]) -> OmitOptions {
    // validate every entry up front so that a malformed entry anywhere in the
    // list is reported before any tag-attr pair is inspected
    if omit_attrs
        .iter()
        .any(|entry| entry.matches('.').count() > 1)
    {
        panic!("Invalid omit_attrs: each item should have single or none of \".\"");
    }

    let mut attrs = Vec::new();
    let mut tag_attrs = Vec::new();

    for &entry in omit_attrs {
        // `*.attr` is the explicit spelling of "attr on every tag"
        if let Some(attr_name) = entry.strip_prefix("*.") {
            attrs.push(attr_name.to_owned());
            continue;
        }

        match entry.split_once('.') {
            // no separator: a bare attribute name, applies to every tag
            None => attrs.push(entry.to_owned()),
            Some((tag_name, attr_name)) => {
                if tag_name.is_empty() || attr_name.is_empty() {
                    panic!("Invalid tag-attr pair found: {} - {}", tag_name, attr_name);
                }
                tag_attrs.push((tag_name.to_owned(), attr_name.to_owned()));
            }
        }
    }

    OmitOptions { attrs, tag_attrs }
}

/// Recursively serializes the subtree rooted at `handle` into `output`,
/// leaving out the attributes selected by `options`.
///
/// - Document node: emits nothing itself, recurses into its children.
/// - Element node: emits `<tag`, each kept attribute as ` name="value"`, `>`,
///   then the children, then `</tag>`.
/// - Text node: appends its contents as they are stored in the DOM.
/// - Any other node kind: emits nothing and is not descended into.
///
/// An attribute is left out when its name is listed in `options.attrs`, or when
/// the `(tag, attribute)` pair is listed in `options.tag_attrs`.
fn scan(handle: &Handle, options: &OmitOptions, output: &mut String) {
    match &handle.data {
        NodeData::Document => {
            for child in handle.children.borrow().iter() {
                scan(child, options, output);
            }
        }
        NodeData::Element { name, attrs, .. } => {
            let tag_name: &str = &name.local;

            // start tag
            output.push('<');
            output.push_str(tag_name);

            // borrow the attribute list instead of cloning it, and write each
            // kept attribute straight into `output`
            for attr in attrs.borrow().iter() {
                let attr_name: &str = &attr.name.local;

                let omitted = options
                    .attrs
                    .iter()
                    .any(|omit| omit.as_str() == attr_name)
                    || options.tag_attrs.iter().any(|(omit_tag, omit_attr)| {
                        omit_tag.as_str() == tag_name && omit_attr.as_str() == attr_name
                    });
                if omitted {
                    continue;
                }

                // NOTE(review): the value is written verbatim, with no escaping of
                // `"` or `&` — confirm whether that is intended for values that
                // contain those characters.
                output.push(' ');
                output.push_str(attr_name);
                output.push_str("=\"");
                output.push_str(&attr.value);
                output.push('"');
            }

            output.push('>');

            for child in handle.children.borrow().iter() {
                scan(child, options, output);
            }

            // end tag
            // NOTE(review): emitted for every element; `omit_enclosure` consults
            // SELF_CLOSING_TAGS before writing an end tag — confirm whether the
            // same check is wanted here.
            output.push_str("</");
            output.push_str(tag_name);
            output.push('>');
        }
        NodeData::Text { ref contents } => {
            output.push_str(&contents.borrow());
        }
        _ => {}
    }
}
85 changes: 85 additions & 0 deletions src/omit_attr/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/// `cargo test --features omit_attr`
#[cfg(test)]
mod tests {
    use crate::omit_attr::manipulate;

    /// Fixture shared by most cases: one `span` carrying `style` and `class`.
    const SPAN_STYLE_CLASS: &str = "<span style=\"remove: me;\" class=\"keep-me\">Content</span>";

    /// Runs `manipulate` on `source` with `omits` and compares against `expect`.
    fn assert_manipulated(source: &str, omits: &[&str], expect: &str) {
        let result = manipulate(source, omits);
        assert_eq!(result, expect);
    }

    // bare attribute name, single attribute on the tag
    #[test]
    fn manipulate_attr_1() {
        assert_manipulated(
            "<span style=\"remove: me;\">Content</span>",
            &["style"],
            "<span>Content</span>",
        );
    }

    // bare attribute name, the other attribute is kept
    #[test]
    fn manipulate_attr_2() {
        assert_manipulated(
            SPAN_STYLE_CLASS,
            &["style"],
            "<span class=\"keep-me\">Content</span>",
        );
    }

    // `*.attr` behaves like the bare attribute name
    #[test]
    fn manipulate_attr_3() {
        assert_manipulated(
            SPAN_STYLE_CLASS,
            &["*.style"],
            "<span class=\"keep-me\">Content</span>",
        );
    }

    // `tag.attr` on the matching tag
    #[test]
    fn manipulate_tag_attr_1() {
        assert_manipulated(
            SPAN_STYLE_CLASS,
            &["span.style"],
            "<span class=\"keep-me\">Content</span>",
        );
    }

    // `tag.attr` for a tag that is not present: nothing changes
    #[test]
    fn manipulate_tag_attr_2() {
        assert_manipulated(
            SPAN_STYLE_CLASS,
            &["a.style"],
            "<span style=\"remove: me;\" class=\"keep-me\">Content</span>",
        );
    }

    // several `tag.attr` entries for the same tag
    #[test]
    fn manipulate_tag_attr_3() {
        assert_manipulated(
            SPAN_STYLE_CLASS,
            &["span.style", "span.class"],
            "<span>Content</span>",
        );
    }

    // wildcard entry combined with a tag-specific entry
    #[test]
    fn manipulate_attr_and_tag_attr_1() {
        assert_manipulated(
            SPAN_STYLE_CLASS,
            &["*.style", "span.class"],
            "<span>Content</span>",
        );
    }

    // all three entry formats at once; attributes on the outer tag are preserved
    #[test]
    fn manipulate_attr_and_tag_attr_2() {
        assert_manipulated(
            "<main id=\"preserved\"><div style=\"remove: me;\" class=\"keep-me\" z-index=\"1\">Content</div></main>",
            &["style", "*.class", "div.z-index"],
            "<main id=\"preserved\"><div>Content</div></main>",
        );
    }
}
14 changes: 6 additions & 8 deletions src/omit_enclosure/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@ use html5ever::tendril::TendrilSink;
use html5ever::{parse_document, ParseOpts};
use markup5ever_rcdom::{Handle, NodeData, RcDom};

use std::default::Default;

use crate::consts::SELF_CLOSING_TAGS;
use crate::core::consts::SELF_CLOSING_TAGS;

mod tests;

Expand Down Expand Up @@ -63,11 +61,11 @@ fn scan(handle: &Handle, omit_tags: &[&str], output: &mut String) {
ref attrs,
..
} => {
let name_local = name.local.as_ref();
if !omit_tags.contains(&name_local) {
let tag_name = name.local.as_ref();
if !omit_tags.contains(&tag_name) {
// start tag
output.push('<');
output.push_str(name_local);
output.push_str(tag_name);

let attrs = attrs
.clone()
Expand All @@ -85,10 +83,10 @@ fn scan(handle: &Handle, omit_tags: &[&str], output: &mut String) {
scan(child, omit_tags, output);
}

if !SELF_CLOSING_TAGS.contains(&name_local) && !omit_tags.contains(&name_local) {
if !SELF_CLOSING_TAGS.contains(&tag_name) && !omit_tags.contains(&tag_name) {
// end tag
output.push_str("</");
output.push_str(name_local);
output.push_str(tag_name);
output.push('>');
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/omit_enclosure/tests.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/// `cargo test --features omit_enclosure`
/// `cargo test`
#[cfg(test)]
mod tests {
use crate::omit_enclosure::manipulate;
Expand Down
12 changes: 6 additions & 6 deletions src/path_to_url/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use markup5ever_rcdom::{Handle, NodeData, RcDom};

use std::path::Path;

use crate::utils::reset_document_outline;
use crate::core::utils::reset_document_outline;

mod tests;

Expand Down Expand Up @@ -157,13 +157,13 @@ fn scan(handle: &Handle, options: &ActualConvertOptions, output: &mut String) {
ref attrs,
..
} => {
let name_local = name.local.as_ref();
let tag_name = name.local.as_ref();

let is_convert_tag = options.tags.contains(&name_local);
let is_convert_tag = options.tags.contains(&tag_name);

// start tag
output.push('<');
output.push_str(name_local);
output.push_str(tag_name);

let attrs = attrs
.clone()
Expand All @@ -175,7 +175,7 @@ fn scan(handle: &Handle, options: &ActualConvertOptions, output: &mut String) {
let mut attr_value = x.value.to_string();
// path conversion
if is_convert_tag
&& CONVERT_TAG_ATTRS.contains(&(name_local, attr_name.as_str()))
&& CONVERT_TAG_ATTRS.contains(&(tag_name, attr_name.as_str()))
&& !attr_value.contains("//")
{
// absolute path
Expand Down Expand Up @@ -204,7 +204,7 @@ fn scan(handle: &Handle, options: &ActualConvertOptions, output: &mut String) {

// end tag
output.push_str("</");
output.push_str(name_local);
output.push_str(tag_name);
output.push('>');
}
NodeData::Text { ref contents } => {
Expand Down

0 comments on commit a630099

Please sign in to comment.