用节点骑士刮一批页面:一些例外情况没有抓住

我正在使用nodeman和horseman来search(已知是有效的)url列表,并依次对它们执行相同的scraping操作。 我试图捕捉错误,但有时候,一些exception仍然通过并挂起脚本。 我可以一次运行同一个urls'batch',并且再次发生错误。

到目前为止,我已经尝试过: – 捕捉错误; – 使用promiseRetry模块重试promise – 使用打开和closures每个刮擦操作的新马实例

// users: [ { href: '?id=abc123' }, ... ] // queuePromise: function which returns a chain of promises from an array of functions with callbacks // promiseRetry: used to retry a promise :3, see promiseRetry documentation fetch = function(users, options){ var baseUrl = 'https://www.somedomain.com/Users' var user_queue = [] var items = [] var ci = 0 users.forEach(function (u){ u.url = baseUrl + u.href user_queue.push(function (next){ promiseRetry(options, function (retry, number) { var error = true var horse = new Horseman() horse .open(u.url) .catch(retry) .evaluate(scrapeitems) .then(function (_items){ error = false console.log('Scraped user', ++ci, ':', u.url, ':\n\tfound ' + _items.length + ' items') if (_items && _items.length) { items = items.concat(_items) } }) .finally(function () { horse.close() if (error && number < options.retries) { retry() } else { next() } }) }) }) }) return _t.queuePromise(user_queue).then(function (){ return items }) } 

这里是未被捕获的错误:

 [...] Scraped user 22: https://www.somedomain.com/Users?id=abc123 : found 1 items Unhandled rejection Error: Retrying at createError (/Users/andrea/src/userscraper/node_modules/err-code/index.js:4:44) at /Users/andrea/src/userscraper/node_modules/promise-retry/index.js:34:27 at Horseman.<anonymous> (/Users/andrea/src/userscraper/modules/fetcher.js:179:110) at PassThroughHandlerContext.finallyHandler (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/finally.js:56:23) at PassThroughHandlerContext.tryCatcher (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/util.js:16:23) at Promise._settlePromiseFromHandler (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:512:31) at Promise._settlePromise (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:569:18) at Promise._settlePromise0 (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:614:10) at Promise._settlePromises (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/promise.js:689:18) at Async._drainQueue (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/async.js:133:16) at Async._drainQueues (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/async.js:143:10) at Immediate.Async.drainQueues (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/async.js:17:14) at runCallback (timers.js:570:20) at tryOnImmediate (timers.js:550:5) at processImmediate [as _immediateCallback] (timers.js:529:5) Unhandled rejection Error: Failed to load url at checkStatus (/Users/andrea/src/userscraper/node_modules/node-horseman/lib/index.js:292:16) at tryCatcher (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/util.js:16:23) at Function.Promise.attempt.Promise.try (/Users/andrea/src/userscraper/node_modules/node-horseman/node_modules/bluebird/js/release/method.js:39:29) at Object.loadFinishedSetup [as onLoadFinished] (/Users/andrea/src/userscraper/node_modules/node-horseman/lib/index.js:290:43) at /Users/andrea/src/userscraper/node_modules/node-phantom-simple/node-phantom-simple.js:636:30 at Array.forEach (native) at IncomingMessage.<anonymous> (/Users/andrea/src/userscraper/node_modules/node-phantom-simple/node-phantom-simple.js:617:17) at emitNone (events.js:91:20) at IncomingMessage.emit (events.js:185:7) at endReadableNT (_stream_readable.js:975:12) at _combinedTickCallback (internal/process/next_tick.js:74:11) at process._tickCallback (internal/process/next_tick.js:98:9)