Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions lib/algos/list/MPU.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
'use strict'; // eslint-disable-line strict

const { inc, checkLimit, FILTER_END, FILTER_ACCEPT } = require('./tools');
const {
inc, checkLimit, utf8Compare,
FILTER_END, FILTER_ACCEPT, UNICODE_MAX,
} = require('./tools');
const DEFAULT_MAX_KEYS = 1000;

function numberDefault(num, defaultNum) {
Expand Down Expand Up @@ -48,11 +51,14 @@ class MultipartUploads {
params.gt = inc(params.gt);
}
if (this.params.prefix) {
if (params.gt === undefined || this.params.prefix > params.gt) {
if (params.gt === undefined
|| utf8Compare(this.params.prefix, params.gt) > 0) {
delete params.gt;
params.gte = this.params.prefix;
}
params.lt = inc(this.params.prefix);
} else {
params.lte = UNICODE_MAX;
}
return params;
}
Expand Down
12 changes: 9 additions & 3 deletions lib/algos/list/delimiter.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
'use strict'; // eslint-disable-line strict

const Extension = require('./Extension').default;
const { inc, FILTER_END, FILTER_ACCEPT, FILTER_SKIP } = require('./tools');
const {
inc, utf8Compare,
FILTER_END, FILTER_ACCEPT, FILTER_SKIP, UNICODE_MAX,
} = require('./tools');

/**
* Find the next delimiter in the path
Expand Down Expand Up @@ -106,10 +109,12 @@ class Delimiter extends Extension {
if (this.prefix) {
params.gte = this.prefix;
params.lt = inc(this.prefix);
} else {
params.lte = UNICODE_MAX;
}
const startVal = this[this.continueMarker] || this[this.startMarker];
if (startVal) {
if (params.gte && params.gte > startVal) {
if (params.gte && utf8Compare(params.gte, startVal) > 0) {
return params;
}
delete params.gte;
Expand Down Expand Up @@ -167,7 +172,8 @@ class Delimiter extends Extension {
if ((this.prefix && !key.startsWith(this.prefix))
|| (this.alphabeticalOrder
&& typeof this[this.nextContinueMarker] === 'string'
&& key <= this[this.nextContinueMarker])) {
&& key <= this[this.nextContinueMarker]
&& utf8Compare(key, this[this.nextContinueMarker]) <= 0)) {
return FILTER_SKIP;
}
if (this.delimiter) {
Expand Down
10 changes: 7 additions & 3 deletions lib/algos/list/delimiterMaster.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
const Delimiter = require('./delimiter').Delimiter;
const Version = require('../../versioning/Version').Version;
const VSConst = require('../../versioning/constants').VersioningConstants;
const { FILTER_ACCEPT, FILTER_SKIP, SKIP_NONE } = require('./tools');
const {
utf8Compare,
FILTER_ACCEPT, FILTER_SKIP, SKIP_NONE,
} = require('./tools');

const VID_SEP = VSConst.VersionId.Separator;

Expand Down Expand Up @@ -49,8 +52,9 @@ class DelimiterMaster extends Delimiter {
/* Skip keys not starting with the prefix or not alphabetically
* ordered. */
if ((this.prefix && !key.startsWith(this.prefix))
|| (typeof this[this.nextContinueMarker] === 'string' &&
key <= this[this.nextContinueMarker])) {
|| (typeof this[this.nextContinueMarker] === 'string'
&& key <= this[this.nextContinueMarker]
&& utf8Compare(key, this[this.nextContinueMarker]) <= 0)) {
return FILTER_SKIP;
}

Expand Down
4 changes: 3 additions & 1 deletion lib/algos/list/delimiterVersions.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
const Delimiter = require('./delimiter').Delimiter;
const Version = require('../../versioning/Version').Version;
const VSConst = require('../../versioning/constants').VersioningConstants;
const { inc, FILTER_END, FILTER_ACCEPT, FILTER_SKIP, SKIP_NONE } =
const { inc, FILTER_END, FILTER_ACCEPT, FILTER_SKIP, SKIP_NONE, UNICODE_MAX } =
require('./tools');

const VID_SEP = VSConst.VersionId.Separator;
Expand Down Expand Up @@ -43,6 +43,8 @@ class DelimiterVersions extends Delimiter {
if (this.parameters.prefix) {
params.gte = this.parameters.prefix;
params.lt = inc(this.parameters.prefix);
} else {
params.lte = UNICODE_MAX;
}
if (this.parameters.keyMarker) {
if (params.gte && params.gte > this.parameters.keyMarker) {
Expand Down
34 changes: 34 additions & 0 deletions lib/algos/list/tools.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ const SKIP_NONE = undefined; // to be inline with the values of NextMarker
const FILTER_ACCEPT = 1;
const FILTER_SKIP = 0;
const FILTER_END = -1;
const UNICODE_MAX = String.fromCodePoint(0x10FFFF);

/**
* This function checks if number is valid
Expand Down Expand Up @@ -31,11 +32,44 @@ function inc(str) {
String.fromCharCode(str.charCodeAt(str.length - 1) + 1)) : str;
}

/**
* Compares two strings, s1 and s2, using UTF-8 lexicographic (binary) ordering.
* @function
* @param {String} s1 the first string to compare
* @param {String} s2 the second string to compare
* @return {number} -1, 0, or 1 if s1 is less than, equal to, or greater than
* s2 respectively
*/
function utf8Compare(s1, s2) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, me again. This seems very similar to what localeCompare probably does, have you tried using that? https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/localeCompare

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I understand it correctly localeCompare does locale sensitive collation. That's not what you want here; it needs to be locale independent UTF-8 binary order.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had another look at the description on MDN and the two pointers to the relevant Ecma specifications. There's a lot of room for 'implementation dependent behaviour' which makes localeCompare unsuitable for this usage. Amazon is fairly vague on this, but https://docs.aws.amazon.com/AmazonS3/latest/dev/ListingKeysUsingAPIs.html does state

List results are always returned in UTF-8 binary order.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like the optional arguments will have this type of comparison but I can not find the exact one as I don't fully understand the problem.

Copy link
Contributor Author

@pepijnve pepijnve Apr 15, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're going to go down a rabbit hole here 😄

Looking at V8 you end up at https://github.com/v8/v8/blob/4b9b23521e6fd42373ebbcb20ebe03bf445494f9/src/builtins/builtins-string.cc#L135
which, assuming ICU support has been compiled in, delegates to
https://github.com/v8/v8/blob/4b9b23521e6fd42373ebbcb20ebe03bf445494f9/src/objects/intl-objects.cc#L947
which in turn creates an ICU collator and then compares using that.

Making some assumptions here, but you're most likely going to get a Unicode collator that uses the system's default locale if you do not specify any options. You can find the details of what that is exactly at https://www.unicode.org/reports/tr10/#Main_Algorithm. In a nutshell this is sorting for humans taking language/culture specific details into account. That's not what you want here.
I had hoped to be able to find a locale code or an option value that says 'I want codepoint order', but as far as I can tell that does not exist.

UTF-8 binary order means encoding both strings as UTF-8 and comparing byte-wise. Thanks to the way UTF-8 is constructed this is identical to the ordering you get when you take the codepoint values of the string and compare those integer-wise.

You might be wondering why the default UTF-16 comparison isn't good enough. Best way I can explain that is with an example.
Take codepoints 48, 65104 and 129648. That's 0,﹐and 🩰 respectively.
Sorting in codepoint order you get them in exactly that order.

If you encode in UTF-16 you get

0x0030
0xFE50
0xD83E 0xDE70

respectively. UTF-16 binary sorting compares each 16-bit value one by one so that would return the strings in the order [0, 🩰, ﹐].

Encode in UTF-8 and you get

0x30
0xEF 0xB9 0x90
0xF0 0x9F 0xA9 0xB0

respectively. UTF-8 binary sorting compares each 8-bit value one by one and then you get [0,﹐,🩰] which is what you want.

This is by design in UTF-8. Binary sort order for UTF-8 is identical to codepoint order.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea that's fair, I had assumed it was something similar to that effect but was hoping there was something built in as this seems like it would be easy and common enough. I am new to scality so I won't be able to merge this but it's got my approval if that means anything.

// Lengths are counted in UTF-16 code units; only the range both strings
// share is compared position by position.
const l1 = s1.length;
const l2 = s2.length;
const l = Math.min(l1, l2);

for (let i = 0; i < l; i++) {
// codePointAt(i) returns the whole code point when index i is the leading
// half of a surrogate pair, otherwise the single code unit's value.
const cp1 = s1.codePointAt(i);
const cp2 = s2.codePointAt(i);

if (cp1 < cp2) {
return -1;
} else if (cp1 > cp2) {
return 1;
}

// Here cp1 === cp2. A code point above 0xFFFF occupies two code units in
// both strings, so step over the trailing half before the loop's own i++.
if (cp1 > 0xFFFF) {
i++;
}
}

// No difference within the shared range: order by remaining length.
return Math.sign(l1 - l2);
}

module.exports = {
checkLimit,
inc,
utf8Compare,
SKIP_NONE,
FILTER_END,
FILTER_SKIP,
FILTER_ACCEPT,
UNICODE_MAX,
};
Loading