Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

net: use icu's punycode implementation #7355

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions benchmark/net/punycode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
'use strict';

const common = require('../common.js');
const icu = process.binding('icu');
const punycode = require('punycode');

const bench = common.createBenchmark(main, {
method: ['punycode', 'icu'],
n: [1024],
val: [
'افغانستا.icom.museum',
'الجزائر.icom.museum',
'österreich.icom.museum',
'বাংলাদেশ.icom.museum',
'беларусь.icom.museum',
'belgië.icom.museum',
'българия.icom.museum',
'تشادر.icom.museum',
'中国.icom.museum',
'القمر.icom.museum',
'κυπρος.icom.museum',
'českárepublika.icom.museum',
'مصر.icom.museum',
'ελλάδα.icom.museum',
'magyarország.icom.museum',
'ísland.icom.museum',
'भारत.icom.museum',
'ايران.icom.museum',
'éire.icom.museum',
'איקו״ם.ישראל.museum',
'日本.icom.museum',
'الأردن.icom.museum'
]
});

function usingPunycode(val) {
punycode.toUnicode(punycode.toASCII(val));
}

function usingICU(val) {
icu.toUnicode(icu.toASCII(val));
}

function runPunycode(n, val) {
common.v8ForceOptimization(usingPunycode, val);
var i = 0;
bench.start();
for (; i < n; i++)
usingPunycode(val);
bench.end(n);
}

function runICU(n, val) {
common.v8ForceOptimization(usingICU, val);
var i = 0;
bench.start();
for (; i < n; i++)
usingICU(val);
bench.end(n);
}

function main(conf) {
const n = +conf.n;
const val = conf.val;
switch (conf.method) {
case 'punycode':
runPunycode(n, val);
break;
case 'icu':
runICU(n, val);
break;
default:
throw new Error('Unexpected method');
}
}
Binary file modified deps/icu-small/source/data/in/icudt57l.dat
Binary file not shown.
12 changes: 10 additions & 2 deletions lib/url.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
'use strict';

const punycode = require('punycode');
function importPunycode() {
try {
return process.binding('icu');
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When would this fail?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

./configure --without-intl

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note, may want to look at that flag. I swear it still builds icu even though it isn't linked.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It shouldn't.. Should be same As --with-intl=none

} catch (e) {
return require('punycode');
}
}

const { toASCII } = importPunycode();

exports.parse = urlParse;
exports.resolve = urlResolve;
Expand Down Expand Up @@ -309,7 +317,7 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) {
// It only converts parts of the domain name that
// have non-ASCII characters, i.e. it doesn't matter if
// you call it with a domain that already is ASCII-only.
this.hostname = punycode.toASCII(this.hostname);
this.hostname = toASCII(this.hostname);
}

var p = this.port ? ':' + this.port : '';
Expand Down
132 changes: 132 additions & 0 deletions src/node_i18n.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,16 @@

#if defined(NODE_HAVE_I18N_SUPPORT)

#include "node.h"
#include "env.h"
#include "env-inl.h"
#include "util.h"
#include "util-inl.h"
#include "v8.h"

#include <unicode/putil.h>
#include <unicode/udata.h>
#include <unicode/uidna.h>

#ifdef NODE_HAVE_SMALL_ICU
/* if this is defined, we have a 'secondary' entry point.
Expand All @@ -43,6 +51,13 @@ extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];

namespace node {

using v8::Context;
using v8::FunctionCallbackInfo;
using v8::Local;
using v8::Object;
using v8::String;
using v8::Value;

bool flag_icu_data_dir = false;

namespace i18n {
Expand All @@ -64,7 +79,124 @@ bool InitializeICUDirectory(const char* icu_data_path) {
}
}

static int32_t ToUnicode(MaybeStackBuffer<char>* buf,
const char* input,
size_t length) {
UErrorCode status = U_ZERO_ERROR;
uint32_t options = UIDNA_DEFAULT;
options |= UIDNA_NONTRANSITIONAL_TO_UNICODE;
UIDNA* uidna = uidna_openUTS46(options, &status);
Copy link
Member

@bnoordhuis bnoordhuis Jun 28, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not super familiar with ICU but shouldn't you check status afterwards? Likewise in ToASCII?

EDIT: Or do all ICU functions bail out if U_FAILURE(*status)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

technically, yeah. I haven't found a case where this line actually fails but it would be good house keeping

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the next use of &status would bail if it isn't U_ZERO_ERROR

if (U_FAILURE(status))
return -1;
UIDNAInfo info = UIDNA_INFO_INITIALIZER;

int32_t len = uidna_nameToUnicodeUTF8(uidna,
input, length,
**buf, buf->length(),
&info,
&status);

if (status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
buf->AllocateSufficientStorage(len);
len = uidna_nameToUnicodeUTF8(uidna,
input, length,
**buf, buf->length(),
&info,
&status);
}

if (U_FAILURE(status))
len = -1;

uidna_close(uidna);
return len;
}

static int32_t ToASCII(MaybeStackBuffer<char>* buf,
const char* input,
size_t length) {
UErrorCode status = U_ZERO_ERROR;
uint32_t options = UIDNA_DEFAULT;
options |= UIDNA_NONTRANSITIONAL_TO_ASCII;
UIDNA* uidna = uidna_openUTS46(options, &status);
if (U_FAILURE(status))
return -1;
UIDNAInfo info = UIDNA_INFO_INITIALIZER;

int32_t len = uidna_nameToASCII_UTF8(uidna,
input, length,
**buf, buf->length(),
&info,
&status);

if (status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
buf->AllocateSufficientStorage(len);
len = uidna_nameToASCII_UTF8(uidna,
input, length,
**buf, buf->length(),
&info,
&status);
}

if (U_FAILURE(status))
len = -1;

uidna_close(uidna);
return len;
}

static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value val(env->isolate(), args[0]);
MaybeStackBuffer<char> buf;
int32_t len = ToUnicode(&buf, *val, val.length());

if (len < 0) {
return env->ThrowError("Cannot convert name to Unicode");
}

args.GetReturnValue().Set(
String::NewFromUtf8(env->isolate(),
*buf,
v8::NewStringType::kNormal,
len).ToLocalChecked());
}

static void ToASCII(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value val(env->isolate(), args[0]);
MaybeStackBuffer<char> buf;
int32_t len = ToASCII(&buf, *val, val.length());

if (len < 0) {
return env->ThrowError("Cannot convert name to ASCII");
}

args.GetReturnValue().Set(
String::NewFromUtf8(env->isolate(),
*buf,
v8::NewStringType::kNormal,
len).ToLocalChecked());
}

void Init(Local<Object> target,
Local<Value> unused,
Local<Context> context,
void* priv) {
Environment* env = Environment::GetCurrent(context);
env->SetMethod(target, "toUnicode", ToUnicode);
env->SetMethod(target, "toASCII", ToASCII);
}

} // namespace i18n
} // namespace node

NODE_MODULE_CONTEXT_AWARE_BUILTIN(icu, node::i18n::Init)

#endif // NODE_HAVE_I18N_SUPPORT
72 changes: 72 additions & 0 deletions test/parallel/test-icu-punycode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
'use strict';

const common = require('../common');
const icu = getPunycode();
const assert = require('assert');

function getPunycode() {
try {
return process.binding('icu');
} catch (err) {
return undefined;
}
}

if (!icu) {
common.skip('icu punycode tests because ICU is not present.');
return;
}

// Credit for list: http://www.i18nguy.com/markup/idna-examples.html
const tests = [
'افغانستا.icom.museum',
'الجزائر.icom.museum',
'österreich.icom.museum',
'বাংলাদেশ.icom.museum',
'беларусь.icom.museum',
'belgië.icom.museum',
'българия.icom.museum',
'تشادر.icom.museum',
'中国.icom.museum',
'القمر.icom.museum',
'κυπρος.icom.museum',
'českárepublika.icom.museum',
'مصر.icom.museum',
'ελλάδα.icom.museum',
'magyarország.icom.museum',
'ísland.icom.museum',
'भारत.icom.museum',
'ايران.icom.museum',
'éire.icom.museum',
'איקו״ם.ישראל.museum',
'日本.icom.museum',
'الأردن.icom.museum',
'қазақстан.icom.museum',
'한국.icom.museum',
'кыргызстан.icom.museum',
'ລາວ.icom.museum',
'لبنان.icom.museum',
'македонија.icom.museum',
'موريتانيا.icom.museum',
'méxico.icom.museum',
'монголулс.icom.museum',
'المغرب.icom.museum',
'नेपाल.icom.museum',
'عمان.icom.museum',
'قطر.icom.museum',
'românia.icom.museum',
'россия.иком.museum',
'србијаицрнагора.иком.museum',
'இலங்கை.icom.museum',
'españa.icom.museum',
'ไทย.icom.museum',
'تونس.icom.museum',
'türkiye.icom.museum',
'украина.icom.museum',
'việtnam.icom.museum'
];

// Testing the roundtrip
tests.forEach((i) => {
assert.strictEqual(i, icu.toUnicode(icu.toASCII(i)));
});
9 changes: 1 addition & 8 deletions tools/icu/icu-generic.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@
'defines': [
# ICU cannot swap the initial data without this.
# http://bugs.icu-project.org/trac/ticket/11046
'UCONFIG_NO_LEGACY_CONVERSION=1',
'UCONFIG_NO_IDNA=1',
'UCONFIG_NO_LEGACY_CONVERSION=1'
],
}],
],
Expand Down Expand Up @@ -428,9 +427,6 @@
#'<(icu_path)/source/common/ubidi_props_data.h',
# and the callers
'<(icu_path)/source/common/ushape.cpp',
'<(icu_path)/source/common/usprep.cpp',
'<(icu_path)/source/common/uts46.cpp',
'<(icu_path)/source/common/uidna.cpp',
]}],
[ 'icu_ver_major == 57', { 'sources!': [
# work around http://bugs.icu-project.org/trac/ticket/12451
Expand All @@ -447,9 +443,6 @@
#'<(icu_path)/source/common/ubidi_props_data.h',
# and the callers
'<(icu_path)/source/common/ushape.cpp',
'<(icu_path)/source/common/usprep.cpp',
'<(icu_path)/source/common/uts46.cpp',
'<(icu_path)/source/common/uidna.cpp',
]}],
[ 'OS == "solaris"', { 'defines': [
'_XOPEN_SOURCE_EXTENDED=0',
Expand Down
3 changes: 1 addition & 2 deletions tools/icu/icu_small.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"region": "none",
"zone": "locales",
"converters": "none",
"stringprep": "none",
"stringprep": "locales",
"translit": "none",
"brkfiles": "none",
"brkdict": "none",
Expand All @@ -34,7 +34,6 @@
"remove": [
"cnvalias.icu",
"postalCodeData.res",
"uts46.nrm",
"genderList.res",
"brkitr/root.res",
"unames.icu"
Expand Down