
我在这里找到了一个蜘蛛(spider)脚本:http://(原链接在摘录中缺失)

我想做的是使用 RethinkDB 把抓取到的所有链接保存到一张表中。为此,我另外创建了一个名为 args.js 的脚本,它接受一个参数(链接),然后将其存储到数据库中。


// Usage: node args.js -link <url>
// Stores the link given on the command line in the RethinkDB table
// "href" of database "links", then exits.
var r = require('rethinkdb');

// After slice(2) the arguments are ['-link', '<url>'], so the URL itself
// is the second entry.
var args = process.argv.slice(2);
var link = args[1];

var connection = null;
var myDB = r.db("links").table("href");

/**
 * Insert the command-line link into the table and terminate the process
 * once the server has answered.
 *
 * @param {Object} conn - Open RethinkDB connection. Falls back to the
 *     module-level `connection` when omitted, so existing zero-argument
 *     calls keep working.
 */
function insertLink(conn) {
  var activeConn = conn || connection;

  myDB.insert({ "link": link }).run(activeConn, function (err, result) {
    if (err) {
      console.error('insert error: ' + err);
    }
    // Exit only from inside the callback: run() is asynchronous, and
    // exiting right after calling it ends the process before the insert
    // has completed.
    process.exit(err ? 1 : 0);
  });
}

if (!link) {
  // Without a URL there is nothing to store; fail loudly instead of
  // attempting to insert an empty document.
  console.error('usage: node args.js -link <url>');
  process.exit(1);
}

r.connect({ host: 'localhost', port: 28015 }, function (err, conn) {
  if (err) throw err;
  connection = conn;

  // Report connection-level failures; the original handlers referenced
  // functions that are not defined in this script.
  conn.addListener('error', function (e) {
    console.error('connection error: ' + e);
    process.exit(1);
  });

  insertLink(conn);
});




 // Set the start URL
 var startUrl = '';

 // URL variables: pages already opened, and pages queued for opening
 var visitedUrls = [], pendingUrls = [];

 // Create instances
 var casper = require('casper').create({ /*verbose: true, logLevel: 'debug'*/ });
 var utils = require('utils');
 var helpers = require('./helpers');
 var exec = require('child_process').exec, child;

 /**
  * Spider from the given URL: hand the URL to args.js, open the page,
  * print its HTTP status, queue every link not seen before, then recurse
  * into the next queued URL.
  *
  * @param {string} url - Absolute URL of the page to open.
  */
 function spider(url) {

   // NOTE(review): this is the call the surrounding text reports as never
   // starting node. Inside CasperJS, require('child_process') presumably
   // resolves to the PhantomJS module rather than Node's - confirm that
   // exec() is supported there. The URL is also concatenated into a shell
   // command unescaped.
   child = exec('node args.js -link ' + url, function (error, stdout, stderr) {
     console.log(stdout);
     if (error !== null) {
       console.log('exec error: ' + error);
     }
   });

   // Add the URL to the visited stack
   visitedUrls.push(url);

   // Open the URL
   casper.open(url).then(function () {

     // Set the status style based on server status code
     var status = this.status().currentHTTPStatus;
     var statusStyle;
     switch (status) {
       case 200: statusStyle = { fg: 'green', bold: true }; break;
       case 404: statusStyle = { fg: 'red', bold: true }; break;
       default: statusStyle = { fg: 'magenta', bold: true }; break;
     }

     // Display the spidered URL and status
     this.echo(this.colorizer.format(status, statusStyle) + ' ' + url);

     // Find links present on this page (this callback runs in the page context)
     var links = this.evaluate(function () {
       var links = [];
       Array.prototype.forEach.call(__utils__.findAll('a'), function (e) {
         links.push(e.getAttribute('href'));
       });
       return links;
     });

     // Add newly found URLs to the stack
     var baseUrl = this.getGlobal('location').origin;
     Array.prototype.forEach.call(links, function (link) {
       var newUrl = helpers.absoluteUri(baseUrl, link);
       if (pendingUrls.indexOf(newUrl) === -1 && visitedUrls.indexOf(newUrl) === -1) {
         //casper.echo(casper.colorizer.format('-> Pushed ' + newUrl + ' onto the stack', { fg: 'magenta' }));
         pendingUrls.push(newUrl);
       }
     });

     // If there are URLs to be processed
     if (pendingUrls.length > 0) {
       var nextUrl = pendingUrls.shift();
       //this.echo(this.colorizer.format('<- Popped ' + nextUrl + ' from the stack', { fg: 'blue' }));
       spider(nextUrl);
     }

   });

 }

 // Start spidering
 casper.start(startUrl, function () {
   spider(startUrl);
 });

 // Start the run
 casper.run();

这不起作用。蜘蛛本身工作正常,但我的脚本似乎并没有被调用。我又创建了一个名为 test.js 的文件,它基本上只是直接调用 args.js:


 // test.js: run args.js directly from Node, outside the spider, to check
 // that the script can be launched on its own.
 var exec = require('child_process').exec, child;
 var url = "";

 child = exec('node args.js -link ' + url, function (error, stdout, stderr) {
   console.log(stdout);
   if (error !== null) {
     console.log('exec error: ' + error);
     // The child's own diagnostics arrive on stderr; print them so a
     // failure inside args.js is visible here.
     if (stderr) {
       console.log(stderr);
     }
   }
 });


编辑:我把 exec() 代码放进了一个函数,并在其中加了一个 console.log();console.log 有输出,所以我的函数确实被调用了。不过,对 node 的调用似乎并没有执行。

我找到了问题所在。这是我现在的 spider.js:

 // Set the start URL
 var startUrl = '';

 // URL variables: pages already opened, and pages queued for opening
 var visitedUrls = [], pendingUrls = [];

 // Create instances
 var casper = require('casper').create({ /*verbose: true, logLevel: 'debug'*/ });
 var utils = require('utils');
 var helpers = require('./helpers');
 var cp = require('child_process');

 /**
  * Hand one URL to args.js by running `node args.js --link <url>`.
  * The arguments are passed as an array, so the URL never goes through a
  * shell command string.
  *
  * @param {string} myUrl - URL to pass to args.js.
  */
 function addUrl(myUrl) {
   var params = ['args.js', '--link', myUrl];
   cp.execFile('node', params, {}, function (err, stdout, stderr) {
     // Report failures instead of discarding them, so a child that does
     // not start or exits with an error is visible in the spider output.
     if (err) {
       console.log('execFile error: ' + err);
     }
     if (stderr) {
       console.log(stderr);
     }
     console.log(stdout);
   });
 }

 /**
  * Spider from the given URL: record it via addUrl, open the page, print
  * its HTTP status, queue every link not seen before, then recurse into
  * the next queued URL.
  *
  * @param {string} url - Absolute URL of the page to open.
  */
 function spider(url) {

   addUrl(url);

   // Add the URL to the visited stack
   visitedUrls.push(url);

   // Open the URL
   casper.open(url).then(function () {

     // Set the status style based on server status code
     var status = this.status().currentHTTPStatus;
     var statusStyle;
     switch (status) {
       case 200: statusStyle = { fg: 'green', bold: true }; break;
       case 404: statusStyle = { fg: 'red', bold: true }; break;
       default: statusStyle = { fg: 'magenta', bold: true }; break;
     }

     // Display the spidered URL and status
     this.echo(this.colorizer.format(status, statusStyle) + ' ' + url);

     // Find links present on this page (this callback runs in the page context)
     var links = this.evaluate(function () {
       var links = [];
       Array.prototype.forEach.call(__utils__.findAll('a'), function (e) {
         links.push(e.getAttribute('href'));
       });
       return links;
     });

     // Add newly found URLs to the stack
     var baseUrl = this.getGlobal('location').origin;
     Array.prototype.forEach.call(links, function (link) {
       var newUrl = helpers.absoluteUri(baseUrl, link);
       if (pendingUrls.indexOf(newUrl) === -1 && visitedUrls.indexOf(newUrl) === -1) {
         //casper.echo(casper.colorizer.format('-> Pushed ' + newUrl + ' onto the stack', { fg: 'magenta' }));
         pendingUrls.push(newUrl);
       }
     });

     // If there are URLs to be processed
     if (pendingUrls.length > 0) {
       var nextUrl = pendingUrls.shift();
       //this.echo(this.colorizer.format('<- Popped ' + nextUrl + ' from the stack', { fg: 'blue' }));
       spider(nextUrl);
     }

   });

 }

 // Start spidering
 casper.start(startUrl, function () {
   spider(startUrl);
 });

 // Start the run
 casper.run();

我添加了一个函数 addUrl(myUrl),它通过 child_process 的 execFile(带回调函数)直接调用 node,并以数组 ['file-name', 'argument prefix', 'argument'] 的形式把参数传给 args.js 文件。