@@ -37,6 +37,8 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p
 - [page instance](#page-instance)
 - [Crawl interface](#Crawl-interface)
 - [Crawl files](#Crawl-files)
+  - [life cycle](#life-cycle)
+    - [beforeSave](#beforeSave)
 - [Start polling](#Start-polling)
 - [Config priority](#Config-Priority)
 - [Interval time](#Interval-time)
@@ -135,7 +137,7 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {
 
   // Get the URLs of the page's carousel image elements
   const boxHandle = await page.$(elSelectorMap[id - 1])
-  const urls = await boxHandle!.$$eval('picture img', (imgEls) => {
+  const urls = await boxHandle.$$eval('picture img', (imgEls) => {
     return imgEls.map((item) => item.src)
   })
   imgUrls.push(...urls)
@@ -224,7 +226,7 @@ import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl()
 
-myXCrawl.crawlPage('https://xxx.com').then((res) => {
+myXCrawl.crawlPage('https://www.example.com').then((res) => {
   const { browser, page } = res.data
 
   // Close the browser
@@ -253,7 +255,7 @@ import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl()
 
-myXCrawl.crawlPage('https://xxx.com').then(async (res) => {
+myXCrawl.crawlPage('https://www.example.com').then(async (res) => {
   const { browser, page } = res.data
 
   // Get a screenshot of the rendered page
@@ -275,9 +277,9 @@ import xCrawl from 'x-crawl'
 const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } })
 
 const requestConfigs = [
-  'https://xxx.com/xxxx',
-  'https://xxx.com/xxxx',
-  { url: 'https://xxx.com/xxxx', method: 'POST', data: { name: 'coderhxl' } }
+  'https://www.example.com/api-1',
+  'https://www.example.com/api-2',
+  { url: 'https://www.example.com/api-3', method: 'POST', data: { name: 'coderhxl' } }
 ]
 
 myXCrawl.crawlData({ requestConfigs }).then((res) => {
@@ -296,7 +298,7 @@ const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } })
 
 myXCrawl
   .crawlFile({
-    requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'],
+    requestConfigs: ['https://www.example.com/file-1', 'https://www.example.com/file-2'],
     fileConfig: {
       storeDir: './upload' // storage folder
     }
@@ -306,6 +308,42 @@ myXCrawl
   })
 ```
308310
+#### life cycle
+
+The crawlFile API has a lifecycle function:
+
+- beforeSave: executed before saving the file
+
+##### beforeSave
+
+In the beforeSave function you receive the file as a Buffer. You can process it and then return a Promise that resolves to a Buffer.
+
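+As a minimal sketch of this contract (the URL here is only illustrative), a pass-through hook simply resolves the incoming Buffer unchanged:
+
+```js
+import xCrawl from 'x-crawl'
+
+const testXCrawl = xCrawl()
+
+testXCrawl.crawlFile({
+  requestConfigs: ['https://www.example.com/file-1.jpg'],
+  fileConfig: {
+    beforeSave(info) {
+      // info.data is the downloaded file as a Buffer; resolving it
+      // unchanged saves the file as-is
+      return Promise.resolve(info.data)
+    }
+  }
+})
+```
+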
+**Resize picture**
+
+Use the sharp library to resize the images to be crawled:
+
+```js
+import xCrawl from 'x-crawl'
+import sharp from 'sharp'
+
+const testXCrawl = xCrawl()
+
+testXCrawl
+  .crawlFile({
+    requestConfigs: ['https://www.example.com/file-1.jpg', 'https://www.example.com/file-2.jpg'],
+    fileConfig: {
+      beforeSave(info) {
+        return sharp(info.data).resize(200).toBuffer()
+      }
+    }
+  })
+  .then((res) => {
+    res.forEach((item) => {
+      console.log(item.data?.data.isSuccess)
+    })
+  })
+```
+
 ### Start polling
 
 Start a polling crawl with [startPolling()](#startPolling).
@@ -321,7 +359,7 @@ const myXCrawl = xCrawl({
 myXCrawl.startPolling({ h: 2, m: 30 }, async (count, stopPolling) => {
   // will be executed every two and a half hours
   // crawlPage/crawlData/crawlFile
-  const res = await myXCrawl.crawlPage('https://xxx.com')
+  const res = await myXCrawl.crawlPage('https://www.example.com')
   res.data.page.close()
 })
 ```
@@ -356,7 +394,7 @@ const myXCrawl = xCrawl()
 
 myXCrawl
   .crawlData({
-    requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'],
+    requestConfigs: ['https://www.example.com/api-1', 'https://www.example.com/api-2'],
     intervalTime: { max: 2000, min: 1000 }
   })
   .then((res) => {})
@@ -378,7 +416,7 @@ import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl()
 
-myXCrawl.crawlData({ url: 'https://xxx.com/xxxx', maxRetry: 1 }).then((res) => {})
+myXCrawl.crawlData({ url: 'https://www.example.com/api', maxRetry: 1 }).then((res) => {})
 ```
 
 The maxRetry attribute determines how many times a failed request will be retried.
@@ -394,9 +432,9 @@ const myXCrawl = xCrawl()
 
 myXCrawl
   .crawlData([
-    { url: 'https://xxx.com/xxxx', priority: 1 },
-    { url: 'https://xxx.com/xxxx', priority: 10 },
-    { url: 'https://xxx.com/xxxx', priority: 8 }
+    { url: 'https://www.example.com/api-1', priority: 1 },
+    { url: 'https://www.example.com/api-2', priority: 10 },
+    { url: 'https://www.example.com/api-3', priority: 8 }
   ])
   .then((res) => {})
 ```
@@ -439,7 +477,7 @@ import xCrawl from 'x-crawl'
 
 // xCrawl API
 const myXCrawl = xCrawl({
-  baseUrl: 'https://xxx.com',
+  baseUrl: 'https://www.example.com',
   timeout: 10000,
   intervalTime: { max: 2000, min: 1000 }
 })
@@ -472,7 +510,7 @@ import xCrawl from 'x-crawl'
 const myXCrawl = xCrawl()
 
 // crawlPage API
-myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {
+myXCrawl.crawlPage('https://www.example.com').then((res) => {
   const { browser, page } = res.data
 
   // Close the browser
@@ -498,7 +536,7 @@ import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl()
 
-myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => {})
+myXCrawl.crawlPage('https://www.example.com').then((res) => {})
 ```
 
 The res you get will be an object.
@@ -516,7 +554,7 @@ const myXCrawl = xCrawl()
 
 myXCrawl
   .crawlPage({
-    url: 'https://xxx.com/xxxx',
+    url: 'https://www.example.com',
     proxy: 'xxx',
     maxRetry: 1
   })
@@ -537,7 +575,10 @@ import xCrawl from 'x-crawl'
 const myXCrawl = xCrawl()
 
 myXCrawl
-  .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx', maxRetry: 2 }])
+  .crawlPage([
+    'https://www.example.com/page-1',
+    { url: 'https://www.example.com/page-2', maxRetry: 2 }
+  ])
   .then((res) => {})
 ```
543584
@@ -549,20 +590,22 @@ For more configuration options of CrawlPageConfigObject, please refer to [CrawlP
 
 If you want to crawl multiple pages without writing the request configuration (proxy, cookies, retry, etc.) repeatedly, and you need an interval time, you can try this way of writing:
 
-```
+```js
 import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl()
 
-myXCrawl.crawlPage({
-  requestConfigs: [
-    'https://xxx.com/xxxx',
-    { url: 'https://xxx.com/xxxx', maxRetry: 6 }
-  ],
-  intervalTime: { max: 3000, min: 1000 },
-  cookies: 'xxx',
-  maxRetry: 1
-}).then((res) => {})
+myXCrawl
+  .crawlPage({
+    requestConfigs: [
+      'https://www.example.com/page-1',
+      { url: 'https://www.example.com/page-2', maxRetry: 6 }
+    ],
+    intervalTime: { max: 3000, min: 1000 },
+    cookies: 'xxx',
+    maxRetry: 1
+  })
+  .then((res) => {})
 ```
 
 The res you get will be an array of objects.
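 
 As a sketch (assuming each element of the array has the same shape as the single-target result above, carrying its own page instance under data), you can close every page after processing it:
 
 ```js
 import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl()
 
 myXCrawl
   .crawlPage(['https://www.example.com/page-1', 'https://www.example.com/page-2'])
   .then((res) => {
     res.forEach((item) => {
       // Each result carries the page it was crawled with
       item.data.page.close()
     })
   })
 ```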
@@ -598,7 +641,7 @@ const myXCrawl = xCrawl({
 
 myXCrawl
   .crawlData({
-    requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'],
+    requestConfigs: ['https://www.example.com/api-1', 'https://www.example.com/api-2'],
     intervalTime: { max: 3000, min: 1000 },
     cookies: 'xxx',
     maxRetry: 1
@@ -626,7 +669,7 @@ import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl()
 
-myXCrawl.crawlData('https://xxx.com/xxxx').then((res) => {})
+myXCrawl.crawlData('https://www.example.com/api').then((res) => {})
 ```
 
 The res you get will be an object.
@@ -644,7 +687,7 @@ const myXCrawl = xCrawl()
 
 myXCrawl
   .crawlData({
-    url: 'https://xxx.com/xxxx',
+    url: 'https://www.example.com/api',
     proxy: 'xxx',
     maxRetry: 1
   })
@@ -665,7 +708,10 @@ import xCrawl from 'x-crawl'
 const myXCrawl = xCrawl()
 
 myXCrawl
-  .crawlPage(['https://xxx.com/xxxx', { url: 'https://xxx.com/xxxx', maxRetry: 2 }])
+  .crawlData([
+    'https://www.example.com/api-1',
+    { url: 'https://www.example.com/api-2', maxRetry: 2 }
+  ])
   .then((res) => {})
 ```
671717
@@ -677,20 +723,22 @@ For more configuration options of CrawlPageConfigObject, please refer to [CrawlP
 
 If you want to crawl multiple data targets without writing the request configuration (proxy, cookies, retry, etc.) repeatedly, and you need an interval time, you can try this writing method:
 
-```
+```js
 import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl()
 
-myXCrawl.crawlData({
-  requestConfigs: [
-    'https://xxx.com/xxxx',
-    { url: 'https://xxx.com/xxxx', maxRetry: 6 }
-  ],
-  intervalTime: { max: 3000, min: 1000 },
-  cookies: 'xxx',
-  maxRetry: 1
-}).then((res) => {})
+myXCrawl
+  .crawlData({
+    requestConfigs: [
+      'https://www.example.com/api-1',
+      { url: 'https://www.example.com/api-2', maxRetry: 6 }
+    ],
+    intervalTime: { max: 3000, min: 1000 },
+    cookies: 'xxx',
+    maxRetry: 1
+  })
+  .then((res) => {})
 ```
 
 The res you get will be an array of objects.
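 
 As a sketch (assuming each element mirrors the single-target result, exposing the fetched payload under item.data), the array can be consumed like this:
 
 ```js
 import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl()
 
 myXCrawl
   .crawlData(['https://www.example.com/api-1', 'https://www.example.com/api-2'])
   .then((res) => {
     res.forEach((item) => {
       // Log each request's result payload
       console.log(item.data)
     })
   })
 ```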
@@ -727,7 +775,7 @@ const myXCrawl = xCrawl({
 // crawlFile API
 myXCrawl
   .crawlFile({
-    requestConfigs: ['https://xxx.com/xxxx', 'https://xxx.com/xxxx'],
+    requestConfigs: ['https://www.example.com/file-1', 'https://www.example.com/file-2'],
     storeDir: './upload',
     intervalTime: { max: 3000, min: 1000 },
     maxRetry: 1
@@ -757,7 +805,7 @@ const myXCrawl = xCrawl()
 
 myXCrawl
   .crawlFile({
-    url: 'https://xxx.com/xxxx',
+    url: 'https://www.example.com/file',
     proxy: 'xxx',
     maxRetry: 1,
     storeDir: './upload',
@@ -781,8 +829,8 @@ const myXCrawl = xCrawl()
 
 myXCrawl
   .crawlFile([
-    { url: 'https://xxx.com/xxxx', storeDir: './upload' },
-    { url: 'https://xxx.com/xxxx', storeDir: './upload', maxRetry: 2 }
+    { url: 'https://www.example.com/file-1', storeDir: './upload' },
+    { url: 'https://www.example.com/file-2', storeDir: './upload', maxRetry: 2 }
   ])
   .then((res) => {})
 ```
@@ -795,20 +843,22 @@ For more configuration options of CrawlFileConfigObject, please refer to [CrawlF
 
 If you want to crawl multiple files without writing the request configuration (storeDir, proxy, retry, etc.) repeatedly, and you need an interval time, you can try this way of writing:
 
-```
+```js
 import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl()
 
-myXCrawl.crawlFile({
-  requestConfigs: [
-    'https://xxx.com/xxxx',
-    { url: 'https://xxx.com/xxxx', storeDir: './upload/xxx' }
-  ],
-  storeDir: './upload',
-  intervalTime: { max: 3000, min: 1000 },
-  maxRetry: 1
-}).then((res) => {})
+myXCrawl
+  .crawlFile({
+    requestConfigs: [
+      'https://www.example.com/file-1',
+      { url: 'https://www.example.com/file-2', storeDir: './upload/xxx' }
+    ],
+    storeDir: './upload',
+    intervalTime: { max: 3000, min: 1000 },
+    maxRetry: 1
+  })
+  .then((res) => {})
 ```
 
 The res you get will be an array of objects.
@@ -999,7 +1049,7 @@ export interface CrawlFileConfigObject {
       fileName: string
       filePath: string
       data: Buffer
-    }) => Buffer | void
+    }) => Promise<Buffer>
   }
 }
 ```
@@ -1167,3 +1217,5 @@ export interface AnyObject extends Object {
 ## More
 
 If you have **problems, needs, or good suggestions**, please raise **Issues** at https://github.com/coder-hxl/x-crawl/issues.
+
+[#life-cycle]: