Commit 512d432

Fix encodeTrim* on special strings with repeat tokens (#26)
* Fix tests
* Fix unused variables
1 parent 3c6fcb9 commit 512d432

4 files changed: +48 -14 lines changed


tokenizer_ts/package-lock.json

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default.

tokenizer_ts/package.json

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
   "name": "@microsoft/tiktokenizer",
   "displayName": "tiktokenizer",
   "description": "Tokenizer for OpenAI large language models.",
-  "version": "1.0.3",
+  "version": "1.0.4",
   "author": {
     "name": "Microsoft Corporation"
   },

tokenizer_ts/src/tikTokenizer.ts

Lines changed: 24 additions & 6 deletions
@@ -241,11 +241,15 @@ export class TikTokenizer {
       const piece = match[0];
       if (this.cache.has(piece)) {
         let tokens = this.cache.get(piece);
-        tokenCount += tokens!.length;
-        if (tokenCount <= maxTokenCount) {
+        if (tokenCount + tokens!.length <= maxTokenCount) {
+          tokenCount += tokens!.length;
           encodeLength += piece.length;
           tokenIds.push(...tokens!);
         } else {
+          let remainingTokens = maxTokenCount - tokenCount;
+          tokenCount += remainingTokens;
+          encodeLength += piece.length;
+          tokenIds.push(...tokens!.slice(0, remainingTokens));
           break;
         }
       } else {
@@ -254,8 +258,8 @@
         const token = this.encoder!.get(uint8ArrayToString(bytes));
         if (token !== undefined) {
           this.cache.set(piece, [token]);
-          tokenCount++;
-          if (tokenCount <= maxTokenCount) {
+          if (tokenCount + 1 <= maxTokenCount) {
+            tokenCount++;
             encodeLength += piece.length;
             tokenIds.push(token);
           } else {
@@ -264,11 +268,15 @@
         } else {
           const encodedTokens = bytePairEncode(bytes, this.encoder!);
           this.cache.set(piece, encodedTokens);
-          tokenCount += encodedTokens.length;
-          if (tokenCount <= maxTokenCount) {
+          if (tokenCount + encodedTokens.length <= maxTokenCount) {
+            tokenCount += encodedTokens.length;
             encodeLength += piece.length;
             tokenIds.push(...encodedTokens);
           } else {
+            let remainingTokens = maxTokenCount - tokenCount;
+            tokenCount += remainingTokens;
+            encodeLength += piece.length;
+            tokenIds.push(...encodedTokens.slice(0, remainingTokens));
             break;
           }
         }
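
All three branches of the encode path now follow the same pattern: check whether the whole piece still fits in the remaining token budget before counting it, and on overflow keep only the prefix of the piece that fills the budget instead of dropping the piece entirely, which is what made strings built from repeated tokens come up short. A minimal sketch of that pattern in isolation, using illustrative names rather than the class's actual fields:

// Sketch of the budget-then-slice pattern above; standalone names, not the real class.
function takeWithinBudget(
  pieceTokens: number[],
  tokenIds: number[],
  tokenCount: number,
  maxTokenCount: number
): { tokenCount: number; done: boolean } {
  if (tokenCount + pieceTokens.length <= maxTokenCount) {
    // The whole piece fits: take all of its tokens and keep going.
    tokenIds.push(...pieceTokens);
    return { tokenCount: tokenCount + pieceTokens.length, done: false };
  }
  // Only part of the piece fits: take the prefix that fills the budget, then stop.
  const remainingTokens = maxTokenCount - tokenCount;
  tokenIds.push(...pieceTokens.slice(0, remainingTokens));
  return { tokenCount: maxTokenCount, done: true };
}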
@@ -443,6 +451,16 @@ export class TikTokenizer {
       }
     }
 
+    // Naive approach if chunks are incorrect
+    if (actualPrefixTokenCount > maxTokenCount) {
+      const encodedTokens = this.encode(text, allowedSpecial);
+      const slicedTokens = encodedTokens.slice(encodedTokens.length - maxTokenCount);
+      return {
+        tokenIds: slicedTokens,
+        text: this.decode(slicedTokens)
+      };
+    }
+
     return {
       tokenIds: tokenIds.slice(actualPrefixTokenCount),
       text: text.slice(actualPrefixStrLength)
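
The last hunk adds a safety net to encodeTrimPrefix: if the chunk bookkeeping ends up claiming more prefix tokens than the budget allows, the method falls back to re-encoding the whole text and keeping only the trailing maxTokenCount tokens. Reduced to a standalone sketch, where the encode and decode parameters stand in for the tokenizer's own methods:

// Sketch of the fallback: encode everything, keep only the last maxTokenCount tokens.
function trimPrefixFallback(
  encode: (text: string) => number[],
  decode: (tokenIds: number[]) => string,
  text: string,
  maxTokenCount: number
): { tokenIds: number[]; text: string } {
  const encodedTokens = encode(text);
  const slicedTokens = encodedTokens.slice(encodedTokens.length - maxTokenCount);
  return { tokenIds: slicedTokens, text: decode(slicedTokens) };
}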

tokenizer_ts/test/tikTokenizer.test.ts

Lines changed: 18 additions & 2 deletions
@@ -91,7 +91,7 @@ suite("TikTokenizer Test Suite", function() {
 
   test("encode trim suffix - 2", () => {
     const str = "<|im_start|>Hello TempWorld<|im_end|>";
-    const encodedStr = "<|im_start|>Hello";
+    const encodedStr = "<|im_start|>Hello TempWorld";
     let encoded = tokenizer.encodeTrimSuffix(
       str,
       5,
@@ -125,10 +125,18 @@ suite("TikTokenizer Test Suite", function() {
       3,
       Array.from(specialTokens.keys())
     );
-    assert.deepStrictEqual(encoded.tokenIds, [100264, 9906]);
+    assert.deepStrictEqual(encoded.tokenIds, [100264, 9906, 20539]);
     assert.deepStrictEqual(encoded.text, encodedStr);
   });
 
+  test("encode trim suffix - 3", () => {
+    const str = "t".repeat(4000);
+    const encodedStr = tokenizer.encode(str);
+    let encodedTrimSuffix = tokenizer.encodeTrimSuffix(str, 5, []);
+    assert.deepStrictEqual(encodedTrimSuffix.tokenIds.length, 5);
+    assert.deepStrictEqual(encodedTrimSuffix.tokenIds, encodedStr.slice(0, 5));
+  });
+
   test("encode trim prefix", () => {
     const str = "<|im_start|>Hello World<|im_end|>";
     const encodedStr = "Hello World<|im_end|>";
@@ -197,6 +205,14 @@ suite("TikTokenizer Test Suite", function() {
     assert.deepStrictEqual(encoded.text, encodedStr);
   });
 
+  test("encode trim prefix - 3", () => {
+    const str = "t".repeat(4000);
+    const encodedStr = tokenizer.encode(str);
+    let encodedTrimSuffix = tokenizer.encodeTrimPrefix(str, 5, []);
+    assert.deepStrictEqual(encodedTrimSuffix.tokenIds.length, 5);
+    assert.deepStrictEqual(encodedTrimSuffix.tokenIds, encodedStr.slice(encodedStr.length - 5));
+  });
+
   test("tokenize source code - gpt-3.5", done => {
     const source = fs.readFileSync("test/testdata/lib.rs.txt", "utf8");
     const filePath = "test/testdata/tokens_gpt_3.5_turbo.json";
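
The new "- 3" tests exercise a 4000-character run of "t", which byte-pair encoding collapses into repeated multi-character pieces, so the trim boundary lands inside a piece. A hedged usage sketch of the behavior they pin down follows; the async factory createByModelName and its signature are assumptions not shown in this diff, while encode, encodeTrimSuffix, and encodeTrimPrefix are used exactly as in the tests:

// Usage sketch; the factory name/signature is an assumption, not taken from this diff.
import { createByModelName } from "@microsoft/tiktokenizer";

async function demo(): Promise<void> {
  const tokenizer = await createByModelName("gpt-3.5-turbo");

  const str = "t".repeat(4000); // long run of the same character
  const all = tokenizer.encode(str);

  const suffixTrimmed = tokenizer.encodeTrimSuffix(str, 5, []);
  // After this fix: exactly 5 tokens, equal to all.slice(0, 5).
  console.log(suffixTrimmed.tokenIds);

  const prefixTrimmed = tokenizer.encodeTrimPrefix(str, 5, []);
  // After this fix: exactly 5 tokens, equal to all.slice(all.length - 5).
  console.log(prefixTrimmed.tokenIds);
}

demo().catch(console.error);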
