Skip to content

Commit

Permalink
Merge idna-v1x to main (#990)
Browse files Browse the repository at this point in the history
* Adjust Punycode overflow checks

* The change made in 1.0.0 incorrectly assumed that the input length
  limit removed the need to do overflow check when decoding. Now the
  internal-caller length limit is taken as a permission to skip
  overflow checks only when encoding.
* The RFC gives overflow checking pre-flight math for languages
  that don't have checked math. Since Rust does, the code now uses
  checked_add and checked_mul instead of pre-flight when overflow
  checks are performed.

* Remove no_std category (crates.io doesn't support it, and it is now rejected), use keywords instead

* Add benches that use the main idna 1.0 entry point in idna and url

* Put the Unicode back end behind an adapter crate

* Split fastest ASCII fast path from the rest

* Bench hyphen in a domain that is otherwise lower-case ASCII

* Adjust MSRV

* Add README remarks about alternative Unicode back ends

* Change the idna_adapter dependency to crates.io

* Address clippy lints

* Increment version number of idna to 1.0.3

* Test MSRV with idna unicode-rs back end and test ICU4X back end with 1.67

* Prepare url crate for publication with idna 1.0.3 (#987)

---------

Co-authored-by: Manish Goregaokar <[email protected]>
Co-authored-by: Valentin Gosu <[email protected]>
  • Loading branch information
3 people authored Nov 4, 2024
1 parent 08a3268 commit 8a683ff
Show file tree
Hide file tree
Showing 22 changed files with 8,539 additions and 30,453 deletions.
13 changes: 9 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
rust: [1.56.0, stable, beta, nightly]
rust: [1.57.0, 1.67.0, stable, beta, nightly]
exclude:
- os: macos-latest
rust: 1.56.0
rust: 1.67.0
- os: windows-latest
rust: 1.56.0
rust: 1.67.0
- os: macos-latest
rust: beta
- os: windows-latest
Expand All @@ -37,6 +37,10 @@ jobs:
toolchain: ${{ matrix.rust }}
# Add toolchain for no_std tests
- run: rustup toolchain install nightly
- name: Downgrade idna_adapter on Rust 1.57.0
if: |
matrix.rust == '1.57.0'
run: cargo update -p idna_adapter --precise 1.1.0
- name: Add `aarch64-unknown-none` toolchain for `no_std` tests
if: |
matrix.os == 'ubuntu-latest' &&
Expand All @@ -54,7 +58,8 @@ jobs:
- name: Run debugger_visualizer tests
if: |
matrix.os == 'windows-latest' &&
matrix.rust != '1.56.0'
matrix.rust != '1.57.0' &&
matrix.rust != '1.67.0'
run: cargo test --test debugger_visualizer --features "url/debugger_visualizer,url_debug_tests/debugger_visualizer" -- --test-threads=1 || echo "debugger test failed"
continue-on-error: true # Fails on GH actions, but not locally.
- name: Test `no_std` support
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,7 @@ URL library for Rust, based on the [URL Standard](https://url.spec.whatwg.org/).
[Documentation](https://docs.rs/url)

Please see [UPGRADING.md](https://github.com/servo/rust-url/blob/main/UPGRADING.md) if you are upgrading from a previous version.

## Alternative Unicode back ends

`url` depends on the `idna` crate. By default, `idna` uses [ICU4X](https://github.com/unicode-org/icu4x/) as its Unicode back end. If you wish to opt for different tradeoffs between correctness, run-time performance, binary size, compile time, and MSRV, please see the [README of the latest version of the `idna_adapter` crate](https://docs.rs/crate/idna_adapter/latest) for how to opt into a different Unicode back end.
19 changes: 12 additions & 7 deletions idna/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
[package]
name = "idna"
version = "0.5.0"
version = "1.0.3"
authors = ["The rust-url developers"]
description = "IDNA (Internationalizing Domain Names in Applications) and Punycode."
categories = ["no_std"]
keywords = ["no_std", "web", "http"]
repository = "https://github.com/servo/rust-url/"
license = "MIT OR Apache-2.0"
autotests = false
edition = "2018"
rust-version = "1.51"
rust-version = "1.57" # For panic in const context

[lib]
doctest = false

[features]
default = ["std"]
std = ["alloc", "unicode-bidi/std", "unicode-normalization/std"]
default = ["std", "compiled_data"]
std = ["alloc"]
alloc = []
compiled_data = ["idna_adapter/compiled_data"]

[[test]]
name = "tests"
Expand All @@ -25,15 +26,19 @@ harness = false
[[test]]
name = "unit"

[[test]]
name = "unitbis"

[dev-dependencies]
assert_matches = "1.3"
bencher = "0.1"
tester = "0.9"
serde_json = "1.0"

[dependencies]
unicode-bidi = { version = "0.3.10", default-features = false, features = ["hardcoded-data"] }
unicode-normalization = { version = "0.1.22", default-features = false }
utf8_iter = "1.0.4"
smallvec = { version = "1.13.1", features = ["const_generics"]}
idna_adapter = "1"

[[bench]]
name = "all"
Expand Down
42 changes: 42 additions & 0 deletions idna/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# `idna`

IDNA library for Rust implementing [UTS 46: Unicode IDNA Compatibility Processing](https://www.unicode.org/reports/tr46/) as parametrized by the [WHATWG URL Standard](https://url.spec.whatwg.org/#idna).

## What it does

* An implementation of UTS 46 is provided, with configurable ASCII deny list (e.g. STD3 or WHATWG rules).
* A callback mechanism is provided for pluggable logic for deciding if a label is deemed potentially too misleading to render as Unicode in a user interface.
* Errors are marked as U+FFFD REPLACEMENT CHARACTERs in Unicode output so that locations of errors may be illustrated to the user.

## What it does not do

* There is no default/sample policy provided for the callback mechanism mentioned above.
* Only UTS 46 is implemented: There is no API to request strictly IDNA 2008 only or strictly IDNA 2003 only.
* There is no API for categorizing errors beyond there being an error.
* Checks that are configurable in UTS 46 but that the WHATWG URL Standard always sets a particular way (regardless of the _beStrict_ flag in the URL Standard) cannot be configured (with the exception of the old deprecated API supporting transitional processing).

## Usage

Apps that need to prepare a hostname for usage in protocols are likely to only need the top-level function `domain_to_ascii_cow` with `AsciiDenyList::URL` as the second argument. Note that this rejects IPv6 addresses, so before this, you need to check if the first byte of the input is `b'['` and, if it is, treat the input as an IPv6 address instead.

Apps that need to display host names to the user should use `uts46::Uts46::to_user_interface`. The _ToUnicode_ operation is rarely appropriate for direct application usage.

## Cargo features

* `alloc` - For future proofing. Currently always required. Currently, the crate internals may allocate on the heap, but for typical inputs they do not (apart from the output `String` when applicable).
* `compiled_data` - For future proofing. Currently always required. (Passed through to ICU4X.)
* `std` - Adds `impl std::error::Error for Errors {}` (and implies `alloc`).
* By default, all of the above are enabled.

## Alternative Unicode back ends

By default, `idna` uses [ICU4X](https://github.com/unicode-org/icu4x/) as its Unicode back end. If you wish to opt for different tradeoffs between correctness, run-time performance, binary size, compile time, and MSRV, please see the [README of the latest version of the `idna_adapter` crate](https://docs.rs/crate/idna_adapter/latest) for how to opt into a different Unicode back end.

## Breaking changes since 0.5.0

* Stricter IDNA 2008 restrictions are no longer supported. Attempting to enable them panics immediately. UTS 46 allows all the names that IDNA 2008 allows, and when transitional processing is disabled, they resolve the same way. There are additional names that IDNA 2008 disallows but UTS 46 maps to names that IDNA 2008 allows (notably, input is mapped to fold-case output). UTS 46 also allows symbols that were allowed in IDNA 2003 as well as newer symbols that are allowed according to the same principle. (Earlier versions of this crate allowed rejecting such symbols. Rejecting characters that UTS 46 maps to IDNA 2008-permitted characters wasn't supported in earlier versions, either.)
* `domain_to_ascii_strict` now performs the _CheckHyphens_ check (matching previous documentation).
* The ContextJ rules are now implemented and always enabled, even when using the old deprecated API, so input that fails those rules is rejected.
* The `Idna::to_ascii_inner` method has been removed. It didn't make sense as a public method, since callers were unable to figure out if there were errors. (A GitHub search found no callers for this method.)
* Punycode labels whose decoding does not yield any non-ASCII characters are now treated as being in error.
* When turning off default cargo features, the cargo feature `compiled_data` needs to be explicitly enabled.
56 changes: 56 additions & 0 deletions idna/benches/all.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#![allow(deprecated)]

#[macro_use]
extern crate bencher;
extern crate idna;
Expand Down Expand Up @@ -47,6 +49,51 @@ fn to_ascii_merged(bench: &mut Bencher) {
bench.iter(|| config.to_ascii(black_box(encoded)));
}

/// Benchmarks `idna::domain_to_ascii_cow` with `AsciiDenyList::URL` on the
/// ASCII-only domain `example.com`.
fn to_ascii_cow_plain(bench: &mut Bencher) {
    let domain = "example.com".as_bytes();
    bench.iter(|| {
        // `black_box` keeps the optimizer from treating the input as a constant.
        let input = black_box(domain);
        idna::domain_to_ascii_cow(input, idna::AsciiDenyList::URL)
    });
}

/// Benchmarks `idna::domain_to_ascii_cow` with `AsciiDenyList::URL` on an
/// ASCII domain that contains a hyphen (`hyphenated-example.com`).
fn to_ascii_cow_hyphen(bench: &mut Bencher) {
    let domain = "hyphenated-example.com".as_bytes();
    bench.iter(|| {
        // `black_box` keeps the optimizer from treating the input as a constant.
        let input = black_box(domain);
        idna::domain_to_ascii_cow(input, idna::AsciiDenyList::URL)
    });
}

/// Benchmarks `idna::domain_to_ascii_cow` with `AsciiDenyList::URL` on an
/// ASCII domain whose first label starts with a digit (`1test.example`).
fn to_ascii_cow_leading_digit(bench: &mut Bencher) {
    let domain = "1test.example".as_bytes();
    bench.iter(|| {
        // `black_box` keeps the optimizer from treating the input as a constant.
        let input = black_box(domain);
        idna::domain_to_ascii_cow(input, idna::AsciiDenyList::URL)
    });
}

/// Benchmarks `idna::domain_to_ascii_cow` with `AsciiDenyList::URL` on a
/// domain that mixes a non-ASCII label with an ASCII label.
fn to_ascii_cow_unicode_mixed(bench: &mut Bencher) {
    let domain = "مثال.example".as_bytes();
    bench.iter(|| {
        // `black_box` keeps the optimizer from treating the input as a constant.
        let input = black_box(domain);
        idna::domain_to_ascii_cow(input, idna::AsciiDenyList::URL)
    });
}

/// Benchmarks `idna::domain_to_ascii_cow` with `AsciiDenyList::URL` on a
/// domain that mixes an `xn--`-prefixed label with a plain ASCII label.
fn to_ascii_cow_punycode_mixed(bench: &mut Bencher) {
    let domain = "xn--mgbh0fb.example".as_bytes();
    bench.iter(|| {
        // `black_box` keeps the optimizer from treating the input as a constant.
        let input = black_box(domain);
        idna::domain_to_ascii_cow(input, idna::AsciiDenyList::URL)
    });
}

/// Benchmarks `idna::domain_to_ascii_cow` with `AsciiDenyList::URL` on a
/// domain whose labels are all non-ASCII (the "ltr" case, going by the
/// function name: presumably a left-to-right script).
fn to_ascii_cow_unicode_ltr(bench: &mut Bencher) {
    let domain = "නම.උදාහරණ".as_bytes();
    bench.iter(|| {
        // `black_box` keeps the optimizer from treating the input as a constant.
        let input = black_box(domain);
        idna::domain_to_ascii_cow(input, idna::AsciiDenyList::URL)
    });
}

/// Benchmarks `idna::domain_to_ascii_cow` with `AsciiDenyList::URL` on a
/// domain whose labels are all `xn--`-prefixed (the "ltr" case, going by the
/// function name: presumably the encoded form of a left-to-right name).
fn to_ascii_cow_punycode_ltr(bench: &mut Bencher) {
    let domain = "xn--r0co.xn--ozc8dl2c3bxd".as_bytes();
    bench.iter(|| {
        // `black_box` keeps the optimizer from treating the input as a constant.
        let input = black_box(domain);
        idna::domain_to_ascii_cow(input, idna::AsciiDenyList::URL)
    });
}

/// Benchmarks `idna::domain_to_ascii_cow` with `AsciiDenyList::URL` on a
/// domain whose labels are all non-ASCII (the "rtl" case, going by the
/// function name: presumably a right-to-left script).
fn to_ascii_cow_unicode_rtl(bench: &mut Bencher) {
    let domain = "الاسم.مثال".as_bytes();
    bench.iter(|| {
        // `black_box` keeps the optimizer from treating the input as a constant.
        let input = black_box(domain);
        idna::domain_to_ascii_cow(input, idna::AsciiDenyList::URL)
    });
}

/// Benchmarks `idna::domain_to_ascii_cow` with `AsciiDenyList::URL` on a
/// domain whose labels are all `xn--`-prefixed (the "rtl" case, going by the
/// function name: presumably the encoded form of a right-to-left name).
fn to_ascii_cow_punycode_rtl(bench: &mut Bencher) {
    let domain = "xn--mgba0b1dh.xn--mgbh0fb".as_bytes();
    bench.iter(|| {
        // `black_box` keeps the optimizer from treating the input as a constant.
        let input = black_box(domain);
        idna::domain_to_ascii_cow(input, idna::AsciiDenyList::URL)
    });
}

benchmark_group!(
benches,
to_unicode_puny_label,
Expand All @@ -56,5 +103,14 @@ benchmark_group!(
to_ascii_already_puny_label,
to_ascii_simple,
to_ascii_merged,
to_ascii_cow_plain,
to_ascii_cow_hyphen,
to_ascii_cow_leading_digit,
to_ascii_cow_unicode_mixed,
to_ascii_cow_punycode_mixed,
to_ascii_cow_unicode_ltr,
to_ascii_cow_punycode_ltr,
to_ascii_cow_unicode_rtl,
to_ascii_cow_punycode_rtl,
);
benchmark_main!(benches);
Loading

0 comments on commit 8a683ff

Please sign in to comment.