NightmareJS – Web爬虫需要遍历JSON数据

我正在构建一个涵盖 200 多个网站的网络爬虫。当前的代码基于一个包含十几个站点的外部 JSON 文件运行。示例:

[ { "company": "My Company", "url": "http://example.com/jobs/", "query": "div.job-listings>dt a", "link": "div.job-listings>dt a" }, { "company": "Another Company", "url": "http://anothercompany.com/careers/", "query": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a", "link": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a" } ] 

当我尝试 async.each 时,它会在函数顶部打印出所有的原始对象,然后在尝试进入 Nightmare 实例时返回错误 Nothing responds to "goto"。接着我尝试了 async.eachSeries,它打印出了正确的结果,但在第一次迭代后就停止了。

 var async = require ('async'); var data = require('./input.json') var Nightmare = require('nightmare'); var nightmare = Nightmare({ show: false }) function crawl(data, cb) { console.log(data) // When async.each will iterate all items then error var nightmare = new Nightmare() nightmare .goto(data.url) // go to JSON specified url .wait(data.query) // wait until CSS selector loads .evaluate(function (data) { positionsArr = [] obj = {} obj.company = data.company query = document.querySelectorAll(data.query) link = document.querySelectorAll(data.link) /* Set query and link equal to all elements with selector itearte through appending text (innerText) from each element with job url to obj*/ var i; for (i = 0; i < query.length; i++) { positionsObj = {} positionsObj.title = query[i].innerText.trim() // if each position has individual page if (data.link !== null) { positionsObj.url = link[i].href } else { positionsObj.url = data.url } positionsArr.push(positionsObj) } obj.positions = positionsArr return obj }, data) .end() .then(function (obj) { console.log(obj) console.log('done') }) .catch(function (error) { console.error('error', error); }); } async.eachSeries(data, crawl, function (err){ console.log('done!'); }) 

我怎样才能让它正常工作,而不必为每个网站单独写一个文件?或者有没有更好的方法来爬取这么多网站?

源代码

如果要继续执行下一次迭代,则必须调用回调函数( cb ):

// Fragment of the Nightmare promise chain inside crawl(): invoking the
// async.js callback `cb` is what lets eachSeries advance to the next item.
.end()
.then(function (obj) {
  console.log(obj);
  console.log('done');
  cb(); // success — eachSeries proceeds to the next site
})
.catch(function (error) {
  console.error('error', error);
  cb(error); // failure — reported to the final eachSeries callback
});