Skip to content

Commit 554e133

Browse files
authored
feat: add function parseStream (#16)
* wip: parsing stream
* wip: first example with generator
* chore: allow to define CHUNK_SIZE in tests
* chore: fix parseStream chunk size
* wip: allow to join chunks
* wip: try to slice the data
* fix: throw error if no closing tag
* feat: add parseStream
* chore: test with node 18
* fix: only test parseStream with node 18 and greater
1 parent 21e3152 commit 554e133

File tree

10 files changed

+413
-10
lines changed

10 files changed

+413
-10
lines changed

.github/workflows/nodejs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@ jobs:
1010
nodejs:
1111
uses: zakodium/workflows/.github/workflows/nodejs.yml@nodejs-v1
1212
with:
13-
node-version-matrix: '[14, 16]'
13+
node-version-matrix: '[14, 16, 18]'
1414
lint-check-types: true

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,4 +119,6 @@ dist
119119

120120
lib
121121
lib-esm
122-
big.xml
122+
big.xml
123+
124+
script/medline.xml

package.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,18 +39,18 @@
3939
"homepage": "https://github.com/cheminfo/arraybuffer-xml-parser#readme",
4040
"devDependencies": {
4141
"@types/he": "^1.1.2",
42-
"@types/jest": "^27.5.0",
42+
"@types/jest": "^27.5.1",
4343
"cheminfo-build": "^1.1.11",
44-
"eslint": "^8.15.0",
44+
"eslint": "^8.16.0",
4545
"eslint-config-cheminfo-typescript": "^10.4.0",
4646
"he": "^1.2.0",
4747
"iobuffer": "^5.1.0",
4848
"jest": "^28.1.0",
4949
"pako": "^2.0.4",
5050
"prettier": "^2.6.2",
5151
"rimraf": "^3.0.2",
52-
"ts-jest": "^28.0.2",
53-
"typescript": "^4.6.4",
52+
"ts-jest": "^28.0.3",
53+
"typescript": "^4.7.2",
5454
"uint8-base64": "^0.1.1"
5555
},
5656
"dependencies": {

script/medline.mjs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import { parseStream } from '../lib/index.js';
2+
import { open } from 'fs/promises';
3+
4+
/*
5+
In order to test this script you should first build the package: `npm run prepack`
6+
And you also need a (big) file from medline called 'medline.xml'
7+
*/
8+
9+
/**
 * Streams `medline.xml` (expected next to this script) through `parseStream`
 * and logs every `PubmedArticle` entry followed by its running index.
 */
async function doAll() {
  const file = await open(new URL('medline.xml', import.meta.url), 'r');
  try {
    const stream = file.readableWebStream();
    let i = 0;
    for await (const entry of parseStream(stream, 'PubmedArticle')) {
      console.log(entry);
      console.log(i++);
    }
  } finally {
    // readableWebStream() never closes the FileHandle by itself, so release
    // it here whether the parsing succeeded or threw.
    await file.close();
  }
}

// Report a failure explicitly and make the script exit with a non-zero code.
doAll().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

src/__tests__/parseStream.test.ts

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import { open } from 'fs/promises';
2+
import { join } from 'path';
3+
4+
import { parseStream } from '../parseStream';
5+
6+
// parseStream is only tested with Node.js 18 and greater (same version gate as
// before). On older versions the test is now reported as skipped instead of
// passing without running a single assertion.
const nodeMajorVersion = Number(process.versions.node.split('.')[0]);
const itWithNode18 = nodeMajorVersion >= 18 ? it : it.skip;

describe('parseStream', () => {
  itWithNode18('simple case', async () => {
    const file = await open(join(__dirname, 'assets/sample.xml'), 'r');
    try {
      const CHUNK_SIZE = 10;
      // Re-slice whatever the file stream delivers into CHUNK_SIZE-byte pieces
      // so that entries are split across several chunks.
      const transformStream = new TransformStream({
        start: function start() {}, // required.
        transform: async function transform(chunk, controller) {
          if (chunk === null) {
            controller.terminate();
            // Nothing to enqueue once the stream has been terminated.
            return;
          }
          const bytes = new Uint8Array(await chunk);
          for (let i = 0; i < bytes.length; i += CHUNK_SIZE) {
            controller.enqueue(bytes.slice(i, i + CHUNK_SIZE));
          }
        },
      });

      const results = [];
      //@ts-expect-error feature is too new
      const readableStream = file.readableWebStream();
      for await (const entry of parseStream(
        readableStream.pipeThrough(transformStream),
        'address',
      )) {
        results.push(entry);
      }
      expect(results).toMatchInlineSnapshot(`
        Array [
          Object {
            "buildingNo": 1,
            "city": "New York",
            "flatNo": 1,
            "street": "Park Ave",
          },
          Object {
            "buildingNo": 33,
            "city": "Boston",
            "flatNo": 24,
            "street": "Centre St",
          },
          Object {
            "buildingNo": 1,
            "city": "Moscow",
            "flatNo": 2,
            "street": "Kahovka",
          },
          Object {
            "buildingNo": 3,
            "city": "Tula",
            "flatNo": 78,
            "street": "Lenina",
          },
        ]
      `);
    } finally {
      // The web stream does not close the underlying FileHandle.
      await file.close();
    }
  });
});

src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
export * from './parse';
2+
export * from './parseStream';

src/parseStream.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import {
2+
defaultOptions,
3+
StreamParseOptions,
4+
} from './traversable/defaultOptions';
5+
import { getTraversableGenerator } from './traversable/getTraversableGenerator';
6+
import { traversableToJSON } from './traversableToJSON';
7+
8+
/**
 * Parse a web stream representing an XML document and emit objects.
 *
 * Every traversable entry produced by `getTraversableGenerator` for
 * `lookupTagName` is converted with `traversableToJSON` and yielded.
 *
 * @param readableStream - Web stream providing the XML data.
 * @param lookupTagName - Tag name forwarded to `getTraversableGenerator` to select the entries to emit.
 * @param options - Parsing options; values that are not provided fall back to `defaultOptions`.
 */
export async function* parseStream(
  readableStream: ReadableStream,
  lookupTagName: string,
  options: StreamParseOptions = {},
) {
  // Caller-supplied values take precedence over the defaults.
  const mergedOptions: StreamParseOptions = { ...defaultOptions, ...options };
  const entries = getTraversableGenerator(
    readableStream,
    lookupTagName,
    mergedOptions,
  );

  for await (const entry of entries) {
    yield traversableToJSON(entry, mergedOptions);
  }
}

src/traversable/closingIndexForOpeningTag.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
import { decoder } from './utils/utf8Decoder';
22

3+
/**
4+
* Search for the '>' character that closes an opening tag; throws an Error if it cannot be found
5+
* @param data
6+
* @param i
7+
* @returns
8+
*/
39
export function closingIndexForOpeningTag(
410
data: Uint8Array,
511
i: number,
@@ -25,8 +31,5 @@ export function closingIndexForOpeningTag(
2531
}
2632
endIndex++;
2733
}
28-
return {
29-
data: decoder.decode(data.subarray(i, i + endIndex)),
30-
index: 0,
31-
};
34+
throw new Error('Could not find closing tag');
3235
}

src/traversable/defaultOptions.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,20 @@ export const decoder = {
77
return utf8Decoder.decode(array);
88
},
99
};
10+
11+
/**
 * Options accepted by `parseStream`: every `ParseOptions` field plus two
 * optional size limits used when parsing a stream.
 */
export interface StreamParseOptions extends ParseOptions {
12+
/**
13+
* Maximal size (in bytes) of a single entry
14+
* @default 1e7
15+
*/
16+
maxEntrySize?: number;
17+
/**
18+
* Maximal size for the buffer
19+
* (unit is not stated here; presumably bytes like maxEntrySize, to be confirmed)
* @default 2e8
20+
*/
21+
maxBufferSize?: number;
22+
}
23+
1024
export interface ParseOptions {
1125
/**
1226
* should we remove ascii < 32
@@ -92,6 +106,7 @@ export interface ParseOptions {
92106
*/
93107
stopNodes?: string[];
94108
}
109+
95110
export const defaultOptions: ParseOptions = {
96111
trimValues: true,
97112
attributeNamePrefix: '$',

0 commit comments

Comments
 (0)