buffer: optimize Buffer.byteLength

Buffer.byteLength is called whenever a new Buffer is created from a string. UTF8 is the default encoding, and base64 is also popular. These calculations take up a relatively significant part of Buffer instantiation, so they must be fast. This commit moves the Buffer.byteLength calculations for base64 and UTF8 out of C++ land and entirely into JS-land. It also removes the ByteLength function from the C++ Buffer binding. It also adds a benchmark for both encodings; the improvements for UTF8 vary a lot, while base64 is about [...] (the rest of the commit message is truncated in this view).
nodejs · May 17, 2015 · f6299c4 · f6299c4
1 parent 0a48a8b
commit f6299c4
Show file tree

Hide file tree

Showing 4 changed files with 123 additions and 23 deletions.
diff --git a/benchmark/buffers/buffer-bytelength.js b/benchmark/buffers/buffer-bytelength.js
@@ -0,0 +1,55 @@
+// Benchmark for Buffer.byteLength() with the 'utf8' and 'base64' encodings.
+var common = require('../common');
+
+var bench = common.createBenchmark(main, {
+  encoding: ['utf8', 'base64'],
+  len: [1, 2, 4, 16, 64], // repetitions of a 16-char sample string (x16)
+  n: [5e6]
+});
+
+// Sample strings of 16 chars each, one per UTF8 character width
+var chars = [
+  'hello brendan!!!', // 1 byte per char
+  'ΰαβγδεζηθικλμνξο', // 2 bytes per char
+  '挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', // 3 bytes per char
+  '𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢' // 4 bytes per char
+];
+
+// Benchmark body. Builds one test string per sample in `chars` (converted to
+// base64 when that encoding is requested), then times `n` calls to
+// Buffer.byteLength().
+//   conf.n        - number of timed iterations
+//   conf.len      - how many times each sample string is repeated
+//   conf.encoding - 'utf8' or 'base64'
+function main(conf) {
+  var n = conf.n | 0;
+  var len = conf.len | 0;
+  var encoding = conf.encoding;
+
+  var strings = [];
+  for (var string of chars) {
+    // Strings must be built differently, depending on encoding
+    var data = buildString(string, len);
+    if (encoding === 'utf8') {
+      strings.push(data);
+    } else if (encoding === 'base64') {
+      // Base64 strings will be much longer than their UTF8 counterparts
+      strings.push(new Buffer(data, 'utf8').toString('base64'));
+    }
+  }
+
+  // Precompute the expected results; every timed call is checked against
+  // them to ensure it is *properly* optimized
+  var results = strings.map(function(val) {
+    return Buffer.byteLength(val, encoding);
+  });
+
+  bench.start();
+  for (var i = 0; i < n; i++) {
+    // Cycle through all of the sample strings. The index must depend on the
+    // loop counter `i`; `n % strings.length` is loop-invariant and would
+    // measure the same single string on every iteration.
+    var index = i % strings.length;
+    // Go!
+    var r = Buffer.byteLength(strings[index], encoding);
+
+    if (r !== results[index])
+      process.exit(1);
+  }
+  bench.end(n);
+}
+
+// Returns `str` concatenated with itself `times` times (an empty string when
+// `times` is less than 1). Built iteratively so that a zero, negative or
+// fractional count cannot recurse without bound.
+function buildString(str, times) {
+  var result = '';
+
+  for (var i = 0; i < times; i++)
+    result += str;
+
+  return result;
+}
diff --git a/lib/buffer.js b/lib/buffer.js
@@ -272,30 +272,86 @@ Buffer.concat = function(list, length) {
 };
 
 
+// Computes the decoded byte length of a base64 string `str` that is `len`
+// characters long.
+function base64ByteLength(str, len) {
+  var padding = 0;
+
+  // A '=' in either of the last two positions is padding, not data
+  if (str[len - 1] === '=')
+    padding++;
+  if (len > 2 && str[len - 2] === '=')
+    padding++;
+
+  // Every 4 base64 characters carry 3 bytes; a partial group rounds down
+  return Math.floor(((len - padding) / 4) * 3);
+}
+
+// Computes the number of bytes needed to encode the first `len` UTF-16 code
+// units of `str` as UTF8.
+//
+// charCodeAt() yields UTF-16 code units (never above 0xFFFF), so characters
+// outside the Basic Multilingual Plane show up as a surrogate pair. A valid
+// pair encodes to 4 bytes in total, not 3 bytes per half. An unpaired
+// surrogate is counted as 3 bytes.
+function utf8ByteLength(str, len) {
+  // Start from one byte per code unit and add the extra bytes as needed
+  var bytes = len;
+
+  for (var i = 0; i < len; i++) {
+    var code = str.charCodeAt(i);
+
+    if (code <= 0x7F) {
+      // 1 byte, already counted
+    } else if (code <= 0x7FF) {
+      // 2 bytes
+      bytes += 1;
+    } else if (code >= 0xD800 && code <= 0xDBFF && i + 1 < len) {
+      var next = str.charCodeAt(i + 1);
+
+      if (next >= 0xDC00 && next <= 0xDFFF) {
+        // Surrogate pair: 4 bytes for the two code units already counted
+        bytes += 2;
+        // Skip the trail surrogate, it has been accounted for
+        i++;
+      } else {
+        // Lead surrogate without a trail surrogate: 3 bytes
+        bytes += 2;
+      }
+    } else if (code <= 0xFFFF) {
+      // 3 bytes
+      bytes += 2;
+    }
+  }
+
+  return bytes;
+}
+
+
+// Returns the byte length of `string` when encoded as `encoding`.
+// Non-string values are coerced to strings first; a non-string or
+// unrecognized encoding is measured as UTF8.
 function byteLength(string, encoding) {
-  if (typeof(string) !== 'string')
-    string = String(string);
+  if (typeof string !== 'string')
+    string = '' + string;
 
-  if (string.length === 0)
+  var len = string.length;
+  if (len === 0)
     return 0;
 
   switch (encoding) {
     case 'ascii':
     case 'binary':
+    // Deprecated
     case 'raw':
-      return string.length;
+    case 'raws':
+      return len;
+
+    case 'utf8':
+    case 'utf-8':
+      return utf8ByteLength(string, len);
 
     case 'ucs2':
     case 'ucs-2':
     case 'utf16le':
     case 'utf-16le':
-      return string.length * 2;
+      return len * 2;
 
     case 'hex':
-      return string.length >>> 1;
-  }
+      return len >>> 1;
 
-  return binding.byteLength(string, encoding);
+    case 'base64':
+      return base64ByteLength(string, len);
+
+    default:
+      // The C++ binding defaulted to UTF8, we should too.
+      if (typeof encoding !== 'string')
+        return utf8ByteLength(string, len);
+
+      // Handle encodings containing uppercase characters by retrying with
+      // the lowercased name
+      if (encoding !== encoding.toLowerCase())
+        return byteLength(string, encoding.toLowerCase());
+      // Lowercased, unrecognized string encoding. Default to UTF8.
+      else
+        return utf8ByteLength(string, len);
+  }
 }
 
 Buffer.byteLength = byteLength;

diff --git a/src/node_buffer.cc b/src/node_buffer.cc
@@ -541,20 +541,6 @@ void WriteDoubleBE(const FunctionCallbackInfo<Value>& args) {
 }
 
 
-void ByteLength(const FunctionCallbackInfo<Value> &args) {
-  Environment* env = Environment::GetCurrent(args);
-
-  if (!args[0]->IsString())
-    return env->ThrowTypeError("Argument must be a string");
-
-  Local<String> s = args[0]->ToString(env->isolate());
-  enum encoding e = ParseEncoding(env->isolate(), args[1], UTF8);
-
-  uint32_t size = StringBytes::Size(env->isolate(), s, e);
-  args.GetReturnValue().Set(size);
-}
-
-
 void Compare(const FunctionCallbackInfo<Value> &args) {
   Local<Object> obj_a = args[0].As<Object>();
   char* obj_a_data =
@@ -745,7 +731,6 @@ void Initialize(Handle<Object> target,
 
   env->SetMethod(target, "setupBufferJS", SetupBufferJS);
 
-  env->SetMethod(target, "byteLength", ByteLength);
   env->SetMethod(target, "compare", Compare);
   env->SetMethod(target, "fill", Fill);
   env->SetMethod(target, "indexOfBuffer", IndexOfBuffer);

diff --git a/test/parallel/test-buffer.js b/test/parallel/test-buffer.js
@@ -569,6 +569,10 @@ assert.equal(14, Buffer.byteLength('Il était tué', 'utf8'));
 assert.equal(12, Buffer.byteLength('Il était tué', 'ascii'));
 assert.equal(12, Buffer.byteLength('Il était tué', 'binary'));
 
+// should use UTF8 with an unrecognized encoding
+// ('ßœ∑≈' is 2 + 2 + 3 + 3 bytes in UTF8)
+assert.equal(11, Buffer.byteLength('hello world', 'abc'));
+assert.equal(10, Buffer.byteLength('ßœ∑≈', 'unkn0wn enc0ding'));
+
 // slice(0,0).length === 0
 assert.equal(0, Buffer('hello').slice(0, 0).length);