Skip to content

Commit dedd8fc

Browse files
committed
Review unescaping.
1 parent 112af5f commit dedd8fc

File tree

3 files changed

+26
-44
lines changed

3 files changed

+26
-44
lines changed

perf/N3Store-perf.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ var prefix = 'http://example.org/#';
88
var TEST, dim, dimSquared, dimCubed, dimQuads, store;
99

1010
/* Test triples */
11-
dim = parseInt(process.argv[2], 10) || 256;
11+
dim = Number.parseInt(process.argv[2], 10) || 256;
1212
dimSquared = dim * dim;
1313
dimCubed = dimSquared * dim;
1414

src/N3Lexer.js

Lines changed: 21 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,9 @@ import namespaces from './IRIs';
33
import queueMicrotask from 'queue-microtask';
44

55
const { xsd } = namespaces;
6-
const { fromCharCode } = String;
76

8-
// Regular expression and replacement string to escape N3 strings.
9-
// Note how we catch invalid unicode sequences separately (they will trigger an error).
10-
var escapeSequence = /\\u([a-fA-F0-9]{4})|\\U([a-fA-F0-9]{8})|\\[uU]|\\(.)/g;
7+
// Regular expression and replacement strings to unescape N3 strings
8+
var escapeSequence = /\\u([a-fA-F0-9]{4})|\\U([a-fA-F0-9]{8})|\\([^])/g;
119
var escapeReplacements = {
1210
'\\': '\\', "'": "'", '"': '"',
1311
'n': '\n', 'r': '\r', 't': '\t', 'f': '\f', 'b': '\b',
@@ -351,29 +349,25 @@ export default class N3Lexer {
351349

352350
// ### `_unescape` replaces N3 escape codes by their corresponding characters
353351
_unescape(item) {
354-
try {
355-
return item.replace(escapeSequence, function (sequence, unicode4, unicode8, escapedChar) {
356-
var charCode;
357-
if (unicode4) {
358-
charCode = parseInt(unicode4, 16);
359-
if (isNaN(charCode)) throw new Error(); // can never happen (regex), but helps performance
360-
return fromCharCode(charCode);
361-
}
362-
else if (unicode8) {
363-
charCode = parseInt(unicode8, 16);
364-
if (isNaN(charCode)) throw new Error(); // can never happen (regex), but helps performance
365-
if (charCode <= 0xFFFF) return fromCharCode(charCode);
366-
return fromCharCode(0xD800 + ((charCode -= 0x10000) / 0x400), 0xDC00 + (charCode & 0x3FF));
367-
}
368-
else {
369-
var replacement = escapeReplacements[escapedChar];
370-
if (!replacement)
371-
throw new Error();
372-
return replacement;
373-
}
374-
});
375-
}
376-
catch (error) { return null; }
352+
let invalid = false;
353+
const replaced = item.replace(escapeSequence, (sequence, unicode4, unicode8, escapedChar) => {
354+
// 4-digit unicode character
355+
if (typeof unicode4 === 'string')
356+
return String.fromCharCode(Number.parseInt(unicode4, 16));
357+
// 8-digit unicode character
358+
if (typeof unicode8 === 'string') {
359+
let charCode = Number.parseInt(unicode8, 16);
360+
return charCode <= 0xFFFF ? String.fromCharCode(Number.parseInt(unicode8, 16)) :
361+
String.fromCharCode(0xD800 + ((charCode -= 0x10000) >> 10), 0xDC00 + (charCode & 0x3FF));
362+
}
363+
// fixed escape sequence
364+
if (escapedChar in escapeReplacements)
365+
return escapeReplacements[escapedChar];
366+
// invalid escape sequence
367+
invalid = true;
368+
return '';
369+
});
370+
return invalid ? null : replaced;
377371
}
378372

379373
// ### `_parseLiteral` parses a literal into an unescaped value

test/N3Lexer-test.js

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -49,35 +49,23 @@ describe('Lexer', function () {
4949
'Unexpected "<http://ex.org/bla\\uXYZZxyzzfoo>" on line 1.'));
5050

5151
it('should not tokenize an IRI with a non-numeric 4-digit unicode escapes', function (done) {
52-
// Replace global isNaN
53-
var isNaN = global.isNaN;
54-
global.isNaN = function () { return true; };
55-
// Try parsing
5652
var stream = new EventEmitter(), lexer = new Lexer();
5753
lexer.tokenize(stream, function (error, token) {
5854
error.should.be.an.instanceof(Error);
59-
error.message.should.equal('Unexpected "<\\u1234>" on line 1.');
55+
error.message.should.equal('Unexpected "<\\uz234>" on line 1.');
6056
done(token);
6157
});
62-
stream.emit('data', '<\\u1234>');
63-
// Restore global isNaN
64-
global.isNaN = isNaN;
58+
stream.emit('data', '<\\uz234>');
6559
});
6660

6761
it('should not tokenize an IRI with a non-numeric 8-digit unicode escapes', function (done) {
68-
// Replace global isNaN
69-
var isNaN = global.isNaN;
70-
global.isNaN = function () { return true; };
71-
// Try parsing
7262
var stream = new EventEmitter(), lexer = new Lexer();
7363
lexer.tokenize(stream, function (error, token) {
7464
error.should.be.an.instanceof(Error);
75-
error.message.should.equal('Unexpected "<\\U12345678>" on line 1.');
65+
error.message.should.equal('Unexpected "<\\Uz2345678>" on line 1.');
7666
done(token);
7767
});
78-
stream.emit('data', '<\\U12345678>');
79-
// Restore global isNaN
80-
global.isNaN = isNaN;
68+
stream.emit('data', '<\\Uz2345678>');
8169
});
8270

8371
it('should tokenize an IRI with four-digit unicode escapes',

0 commit comments

Comments
 (0)