Node.js GET 请求出现 ETIMEDOUT 和 ESOCKETTIMEDOUT

我正在使用 Node.js 的 async 和 request 模块抓取 100 多万个网站,但运行几分钟后就不断遇到 ESOCKETTIMEDOUT 和 ETIMEDOUT 错误。

重新启动脚本后,它又能正常工作。这似乎不是连接数限制的问题,因为出错时 resolve4、resolveNs、resolveMx 仍然可以正常调用,curl 也毫无延迟。

你看出代码有什么问题吗?或者有什么建议?我想把 async.queue() 的并发数提高到至少 1000。谢谢。

 var request = require('request'),
     async = require('async'),
     mysql = require('mysql'),
     dns = require('dns'),
     url = require('url'),
     cheerio = require('cheerio'),
     iconv = require('iconv-lite'),
     charset = require('charset'),
     config = require('./spy.config'),
     pool = mysql.createPool(config.db);

 iconv.skipDecodeWarning = true;

 /**
  * Crawl queue. Each task is `{ id, domain }`; the worker resolves
  * `www.<domain>`, fetches `http://www.<domain>` and, when the fetch fails,
  * gathers NS / A / MX records for the bare domain.
  *
  * At most 200 workers run at once. `cb` is invoked exactly once per task,
  * and only after all of the task's work (including the diagnostic DNS
  * lookups) has finished, so the concurrency limit bounds the real number of
  * outstanding operations.
  */
 var queue = async.queue(function (task, cb) {
     // Report completion on a later event-loop turn instead of re-entering the
     // queue from inside an I/O callback.
     function finish() {
         setImmediate(function () {
             cb();
         });
     }

     dns.resolve4('www.' + task.domain, function (err, addresses) {
         if (err) {
             //
             // Do something
             //
             finish();
             return;
         }

         request({
             url: 'http://www.' + task.domain,
             method: 'GET',
             encoding: 'binary',
             followRedirect: true,
             // `pool` used to be listed twice (`false`, then this object); the
             // later key silently won, so only the effective value is kept.
             pool: { maxSockets: 1000 },
             timeout: 15000 // 15 sec
         }, function (error, response, body) {
             //console.info(task);

             if (!error) {
                 // If ok, do something
                 finish();
                 return;
             }

             // If not ok, do these
             console.log(error);

             // It keeps erroring here after few minutes, resolve4, resolveNs, resolveMx still work here.
             // { [Error: ETIMEDOUT] code: 'ETIMEDOUT' }
             // { [Error: ESOCKETTIMEDOUT] code: 'ESOCKETTIMEDOUT' }

             collectDnsRecords(task.domain, function (records) {
                 // do something with records.ns / records.ip / records.mx
                 finish();
             });
         });
     });
 }, 200);

 /**
  * Looks up the NS, A and MX records of `domain` in parallel.
  *
  * Every lookup is best-effort: a failed lookup leaves its list empty rather
  * than aborting the others.
  *
  * @param {string} domain - Bare domain name (no `www.` prefix).
  * @param {function({ns: string[], ip: string[], mx: string[]})} callback -
  *     Called once, after all three lookups have settled.
  */
 function collectDnsRecords(domain, callback) {
     var records = { ns: [], ip: [], mx: [] };

     async.parallel([
         function (done) {
             // Resolves the domain's name server records
             dns.resolveNs(domain, function (err, addresses) {
                 if (!err) {
                     records.ns = addresses;
                 }
                 done();
             });
         },
         function (done) {
             // Resolves the domain's IPV4 addresses
             dns.resolve4(domain, function (err, addresses) {
                 if (!err) {
                     records.ip = addresses;
                 }
                 done();
             });
         },
         function (done) {
             // Resolves the domain's MX records
             dns.resolveMx(domain, function (err, addresses) {
                 if (!err) {
                     addresses.forEach(function (a) {
                         records.mx.push(a.exchange);
                     });
                 }
                 done();
             });
         }
     ], function (err) {
         // The lookups above never forward an error, so this is purely
         // defensive. (The previous code called an undefined `next` here.)
         if (err) {
             console.log(err);
         }
         callback(records);
     });
 }

 // When the queue is emptied we want to check if we're done
 // NOTE(review): assigning `queue.drain` is the async v1/v2 style; async v3
 // expects `queue.drain(fn)` - confirm against the installed version.
 queue.drain = function () {
     setImmediate(function () {
         checkDone();
     });
 };

 function consoleLog(msg) {
     //console.info(msg);
 }

 /**
  * Fetches the next batch of domains once the queue holds no waiting tasks.
  */
 function checkDone() {
     if (queue.length() === 0) {
         setImmediate(function () {
             crawlQueue();
         });
     } else {
         console.log("checkDone() not zero");
     }
 }

 /**
  * Runs a fire-and-forget SQL statement on a pooled connection.
  *
  * @param {string} sql - Statement to execute; results are discarded.
  */
 function query(sql) {
     pool.getConnection(function (err, connection) {
         if (err) {
             console.log(err);
             return;
         }
         //console.log(sql);
         connection.query(sql, function (err, results) {
             connection.release();
             if (err) {
                 console.log(err);
             }
         });
     });
 }

 /**
  * Loads up to 500 domains whose `last_update` is older than 2592000 seconds
  * (30 days) and pushes them onto the crawl queue. Exits the process when no
  * such rows remain; retries when the connection or the query fails.
  */
 function crawlQueue() {
     pool.getConnection(function (err, connection) {
         if (err) {
             console.log(err);
             setImmediate(function () {
                 crawlQueue();
             });
             return;
         }

         // The WHERE keyword was missing, which made the statement a syntax
         // error and sent this function into an endless retry loop.
         var sql = "SELECT * FROM domain WHERE last_update < (UNIX_TIMESTAMP() - 2592000) LIMIT 500";

         connection.query(sql, function (err, results) {
             // Hand the connection back on every path before doing anything else.
             connection.release();

             if (err) {
                 console.log(err);
                 setImmediate(function () {
                     crawlQueue();
                 });
                 return;
             }

             if (!results.length) {
                 process.exit();
             }

             for (var i = 0, len = results.length; i < len; ++i) {
                 queue.push({ "id": results[i]['id'], "domain": results[i]['domain'] });
             }
         });
     });
 }

 setImmediate(function () {
     crawlQueue();
 });

系统限制相当高。

  Limit Soft Limit Hard Limit Units Max cpu time unlimited unlimited seconds Max file size unlimited unlimited bytes Max data size unlimited unlimited bytes Max stack size 8388608 unlimited bytes Max core file size 0 unlimited bytes Max resident set unlimited unlimited bytes Max processes 257645 257645 processes Max open files 500000 500000 files Max locked memory 65536 65536 bytes Max address space unlimited unlimited bytes Max file locks unlimited unlimited locks Max pending signals 257645 257645 signals Max msgqueue size 819200 819200 bytes Max nice priority 0 0 Max realtime priority 0 0 Max realtime timeout unlimited unlimited us 

sysctl 设置:

 net.ipv4.ip_local_port_range = 10000 61000 

默认情况下,Node 只有 4 个工作线程(worker)用来解析 DNS 查询。如果 DNS 查询耗时很长,请求就会阻塞在 DNS 阶段,表现出来的症状正是 ESOCKETTIMEDOUT 或 ETIMEDOUT。

尝试增加你的uv线程池大小:

 # Enlarge the uv thread pool for the Node process started below.
 export UV_THREADPOOL_SIZE=128
 node ...

或者在 index.js(或你的任何入口文件)中设置:

 #!/usr/bin/env node process.env.UV_THREADPOOL_SIZE = 128; function main() { ... } 

我通过使用tc减慢来自DNS服务器的响应,从而在本地重现了这一点 。

我有同样的问题。 在阅读这个讨论之后,通过在请求选项中使用“agent:false”来解决这个问题 。

10/31/2017:上面的做法看来并不能解决问题。我们找到的最终解决方案是在 Agent 中启用 keepAlive 选项。例如:

 // Shared HTTPS agent created with keepAlive enabled; every options object
 // built below points at this one agent.
 // NOTE(review): `https` is presumably Node's built-in module, required
 // elsewhere in the file - confirm.
 var pool = new https.Agent({ keepAlive: true });

 /**
  * Builds the request options for a JSON GET of the given URL.
  *
  * @param {string} _url - Address to request.
  * @returns {{url: string, method: string, agent: object, json: boolean}}
  *     Options object that uses the shared `pool` agent.
  */
 function getJsonOptions(_url) {
     var options = {};
     options.url = _url;
     options.method = 'GET';
     options.agent = pool;
     options.json = true;
     return options;
 }

Node 的默认连接池似乎默认 keepAlive = false,这导致每个请求都会创建一个新连接。当短时间内创建了太多连接时,就会出现上述错误。我的猜测是,通往目标服务的路径上有一台或多台路由器拦截了连接请求,可能是怀疑遭到了拒绝服务攻击。无论如何,上面的代码示例完全解决了我们的问题。