Skip to content

Commit

Permalink
buffer: optimize Buffer.byteLength
Browse files Browse the repository at this point in the history
Buffer.byteLength is called whenever a new string Buffer is created.
UTF8 is used as the default encoding, and base64 is also popular. These
must be fast and take up a relatively significant part of Buffer
instantiation.

This commit moves the Buffer.byteLength calculations for base64 and UTF8
out of C++ land and entirely into JS-land. It also removes the
ByteLength function from the C++ Buffer binding.

It also adds a benchmark for both encodings; the improvements for UTF8
vary a lot, but base64 is about …
  • Loading branch information
brendanashworth committed May 17, 2015
1 parent 0a48a8b commit f6299c4
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 23 deletions.
55 changes: 55 additions & 0 deletions benchmark/buffers/buffer-bytelength.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Shared benchmark harness helpers.
var common = require('../common');

// Benchmark parameters handed to main() as `conf`:
//   encoding - encoding name passed to Buffer.byteLength
//   len      - how many times each 16-character fixture string is repeated
//   n        - number of timed Buffer.byteLength calls
// NOTE(review): presumably main() is run once per parameter combination;
// confirm against createBenchmark in ../common.
var bench = common.createBenchmark(main, {
encoding: ['utf8', 'base64'],
len: [1, 2, 4, 16, 64], // multiplier; input length is len x 16 characters
n: [5e6]
});

// Fixture strings, 16 characters each. The trailing comment on each entry is
// the number of UTF-8 bytes needed per character of that string.
var chars = [
'hello brendan!!!', // 1 byte
'ΰαβγδεζηθικλμνξο', // 2 bytes
'挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', // 3 bytes
'𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢' // 4 bytes
];

/**
 * Benchmark body: times `conf.n` calls to Buffer.byteLength over a set of
 * fixture strings built from `chars`.
 *
 * @param {Object} conf - benchmark parameters.
 * @param {number} conf.n - number of timed calls.
 * @param {number} conf.len - repetition count for each fixture string.
 * @param {string} conf.encoding - 'utf8' or 'base64'.
 */
function main(conf) {
  var n = conf.n | 0;
  var len = conf.len | 0;
  var encoding = conf.encoding;

  var strings = [];
  for (var string of chars) {
    // Strings must be built differently, depending on encoding
    var data = buildString(string, len);
    if (encoding === 'utf8') {
      strings.push(data);
    } else if (encoding === 'base64') {
      // Base64 strings will be much longer than their UTF8 counterparts
      strings.push(new Buffer(data, 'utf8').toString('base64'));
    }
  }

  // Compute the expected answers up front (outside the timed region) so the
  // timed loop can verify that every call still returns the right value.
  var results = strings.map(function(val) {
    return Buffer.byteLength(val, encoding);
  });

  bench.start();
  for (var i = 0; i < n; i++) {
    // Cycle through every fixture string. Indexing with the loop counter
    // (not the constant `n`) is what makes all byte widths get measured.
    var index = i % strings.length;
    // Go!
    var r = Buffer.byteLength(strings[index], encoding);

    // Abort with a failing exit code if a result ever disagrees.
    if (r !== results[index])
      process.exit(1);
  }
  bench.end(n);
}

/**
 * Returns `str` concatenated with itself `times` times.
 *
 * Built with a bounded loop so that a count below 1 (or a fractional count)
 * cannot recurse without end; a count of 0 or less yields the empty string.
 *
 * @param {string} str - the piece to repeat.
 * @param {number} times - how many copies to concatenate.
 * @return {string} the repeated string.
 */
function buildString(str, times) {
  var out = '';

  for (var i = 0; i < times; i++)
    out += str;

  return out;
}
72 changes: 64 additions & 8 deletions lib/buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -272,30 +272,86 @@ Buffer.concat = function(list, length) {
};


/**
 * Computes how many bytes a base64 string decodes to.
 *
 * @param {string} str - the base64 text.
 * @param {number} len - length of `str` in characters.
 * @return {number} decoded size in bytes.
 */
function base64ByteLength(str, len) {
  // Up to two '=' characters at the tail are padding and carry no data.
  var padding = 0;
  if (str[len - 1] === '=')
    padding++;
  if (len > 2 && str[len - 2] === '=')
    padding++;

  // Every 4 data characters encode 3 bytes; drop any fractional remainder.
  var dataChars = len - padding;
  return Math.floor(dataChars * 3 / 4);
}

/**
 * Computes the number of bytes needed to encode `str` as UTF-8.
 *
 * `charCodeAt` yields UTF-16 code units, so a code point above U+FFFF shows
 * up as a high surrogate followed by a low surrogate. Such a pair encodes to
 * 4 bytes in total; a surrogate that is not part of a valid pair is counted
 * as 3 bytes.
 *
 * @param {string} str - the string to measure.
 * @param {number} len - number of UTF-16 code units of `str` to examine.
 * @return {number} UTF-8 encoded size in bytes.
 */
function utf8ByteLength(str, len) {
  // Start from one byte per code unit and add the extra bytes as needed.
  var bytes = len;

  for (var i = 0; i < len; i++) {
    var code = str.charCodeAt(i);

    // U+0000..U+007F: single byte, nothing to add.
    if (code <= 0x7F)
      continue;

    // U+0080..U+07FF: two bytes.
    if (code <= 0x7FF) {
      bytes += 1;
      continue;
    }

    // Everything else in the BMP, including an unpaired surrogate: three
    // bytes.
    bytes += 2;

    // High surrogate immediately followed by a low surrogate: the pair is one
    // code point taking four bytes. Three are already accounted for on this
    // unit and the trailing unit contributes its base byte, so just skip it.
    if (code >= 0xD800 && code <= 0xDBFF && i + 1 < len) {
      var next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF)
        i++;
    }
  }

  return bytes;
}


// Returns the number of bytes `string` occupies when encoded as `encoding`.
//
// - Non-string input is first coerced to a string; an empty string is 0.
// - 'ascii', 'binary', 'raw', 'raws': one byte per character.
// - 'ucs2', 'ucs-2', 'utf16le', 'utf-16le': two bytes per character.
// - 'hex': half the string length, rounded down.
// - 'utf8', 'utf-8': delegated to utf8ByteLength().
// - 'base64': delegated to base64ByteLength().
// - Anything else: a non-string encoding falls back to UTF8; a string that
//   is not already lower case is retried lower-cased; an unrecognized
//   lower-case name falls back to UTF8.
//
// NOTE(review): this listing appears to be a diff hunk rendered without +/-
// markers, so superseded lines (those using `string.length`, String(string)
// and binding.byteLength) sit next to their replacements (those using `len`).
// Confirm against the actual file before treating it as runnable code.
function byteLength(string, encoding) {
if (typeof(string) !== 'string')
string = String(string);
if (typeof string !== 'string')
string = '' + string;

if (string.length === 0)
var len = string.length;
if (len === 0)
return 0;

switch (encoding) {
case 'ascii':
case 'binary':
// Deprecated
case 'raw':
return string.length;
case 'raws':
return len;

case 'utf8':
case 'utf-8':
return utf8ByteLength(string, len);

case 'ucs2':
case 'ucs-2':
case 'utf16le':
case 'utf-16le':
return string.length * 2;
return len * 2;

case 'hex':
return string.length >>> 1;
}
return len >>> 1;

return binding.byteLength(string, encoding);
case 'base64':
return base64ByteLength(string, len);

default:
// The C++ binding defaulted to UTF8, we should too.
if (typeof encoding !== 'string')
return utf8ByteLength(string, len);

// Handle uppercased encodings
if (encoding !== encoding.toLowerCase())
return byteLength(string, encoding.toLowerCase());
// Lowercased, unrecognized string encoding. Default to UTF8.
else
return utf8ByteLength(string, len);
}
}

Buffer.byteLength = byteLength;
Expand Down
15 changes: 0 additions & 15 deletions src/node_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -541,20 +541,6 @@ void WriteDoubleBE(const FunctionCallbackInfo<Value>& args) {
}


void ByteLength(const FunctionCallbackInfo<Value> &args) {
Environment* env = Environment::GetCurrent(args);

if (!args[0]->IsString())
return env->ThrowTypeError("Argument must be a string");

Local<String> s = args[0]->ToString(env->isolate());
enum encoding e = ParseEncoding(env->isolate(), args[1], UTF8);

uint32_t size = StringBytes::Size(env->isolate(), s, e);
args.GetReturnValue().Set(size);
}


void Compare(const FunctionCallbackInfo<Value> &args) {
Local<Object> obj_a = args[0].As<Object>();
char* obj_a_data =
Expand Down Expand Up @@ -745,7 +731,6 @@ void Initialize(Handle<Object> target,

env->SetMethod(target, "setupBufferJS", SetupBufferJS);

env->SetMethod(target, "byteLength", ByteLength);
env->SetMethod(target, "compare", Compare);
env->SetMethod(target, "fill", Fill);
env->SetMethod(target, "indexOfBuffer", IndexOfBuffer);
Expand Down
4 changes: 4 additions & 0 deletions test/parallel/test-buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,10 @@ assert.equal(14, Buffer.byteLength('Il était tué', 'utf8'));
assert.equal(12, Buffer.byteLength('Il était tué', 'ascii'));
assert.equal(12, Buffer.byteLength('Il était tué', 'binary'));

// should use UTF8 with an unrecognized encoding
assert.equal(11, Buffer.byteLength('hello world', 'abc'));
assert.equal(10, Buffer.byteLength('ßœ∑≈', 'unkn0wn enc0ding'));

// slice(0,0).length === 0
assert.equal(0, Buffer('hello').slice(0, 0).length);

Expand Down

0 comments on commit f6299c4

Please sign in to comment.