如何在Node.js中从XML中获取url?

我的最终目标是让我的应用程序显示给定用户的500px.com帐户(这是一个摄影网站)的X最新图像的缩略图。 据我所知,该网站没有API,但它确实有个人用户的rss feed,即https://500px.com/janedoe/rss ,吐出xml。

使用xml2js,我可以将XML解析为一个JS对象,并导航到包含我想要的url的html描述容器,就像这样(这只是使用rss提要中第一个项目的概念验证):

const express = require('express');
const request = require('request');
const { parseString } = require('xml2js');

const router = express.Router();

/*
 * GET home page.
 *
 * Downloads the RSS feed, parses it with xml2js and renders the `index`
 * view with the HTML description of the first feed item.
 *
 * The feed is handled directly inside the request callback. A module-level
 * EventEmitter is deliberately not used here: a listener registered on a
 * shared emitter for every request is never removed, so it would fire again
 * on each later request and try to render into a response that was already
 * sent.
 */
router.get('/', (req, res, next) => {
  request('https://500px.com/janedoe/rss', (error, response, data) => {
    if (error) {
      // Network/HTTP failure: hand it to Express' error handling.
      return next(error);
    }

    parseString(data, (err, result) => {
      if (err) {
        // The body was not well-formed XML.
        return next(err);
      }

      let photoLink;
      try {
        // xml2js wraps child elements in arrays, hence the [0] at each level.
        photoLink = result.rss.channel[0].item[0].description[0];
      } catch (shapeError) {
        // Empty feed or a document that is not an RSS feed.
        return next(new Error('Unexpected RSS feed structure'));
      }

      // The original passed an undeclared `photoName` variable, which threw a
      // ReferenceError. The key name is kept so the view receives the same
      // property. NOTE(review): confirm the `index` view reads `photoName`.
      res.render('index', { title: 'Express', photoName: photoLink });
    });
  });
});

这会将“!CDATA”标记的整个html内容放入photoLink变量中。 我想要做的是定位到该HTML内的img src,这样我就可以把url作为一个字符串传给页面进行呈现。

我可以设想使用字符串方法寻找第一个“img src”标签,然后一直读取到地址的结尾,但有没有更优雅和简单的方法来做到这一点?

试试这个:在这个例子中,我找到了所有的图片url

 // Reads feed.xml, extracts every //item/description value with camaro and
 // prints, for each description, the list of <img> src attributes it contains.
 const transform = require('camaro');
 const cheerio = require('cheerio');

 // Returns the src attribute of each <img> element found in an HTML fragment.
 function imageSourcesOf(html) {
   const $ = cheerio.load(html);
   const sources = [];
   $('img').each((index, img) => {
     sources.push($(img).attr('src'));
   });
   return sources;
 }

 const xml = require('fs').readFileSync('feed.xml', 'utf-8');
 // camaro template: `data` becomes one entry per //item/description node.
 const template = { data: ['//item/description', '.'] };
 const result = transform(xml, template);
 // One array of URLs per feed item.
 const links = result.data.map((description) => imageSourcesOf(description));
 console.log(links);

输出:

 [ [ 'https://drscdn.500px.org/photo/629350/m%3D900/v2?webp=true&sig=4a9fa5788049efb196917cc3f1a55601af901c7157b59ec86c8aa3378c6ee557' ], [ 'https://drscdn.500px.org/photo/625259/m%3D900/v2?webp=true&sig=55eab44535f05625ad25dae3e805b2559c1caeb4c97570d04ee0a77c52c7fb19' ], [ 'https://drscdn.500px.org/photo/625253/m%3D900/v2?webp=true&sig=174d1b27e6f87e0a98192cf6ae051301681a51beb7297df9733956d2763af163' ], [ 'https://drscdn.500px.org/photo/509064/m%3D900/v2?webp=true&sig=698e56114e1d8b67ad11823390f8456ae723d3a389191c43192718f18213caa8' ], [ 'https://drscdn.500px.org/photo/509061/m%3D900/v2?webp=true&sig=2998212f82a1c3428cebb873830a99b908f463474045d4e5ebba3257808685dd' ], [ 'https://drscdn.500px.org/photo/509060/m%3D900/v2?webp=true&sig=8082904fe1935c51fc301a0d10529475ee15124d3797f69cbaeac3fd6c5f0dcb' ], [ 'https://drscdn.500px.org/photo/509056/m%3D900/v2?webp=true&sig=4b85086a7bf55709e77febb202636b0e09415c8ca3fc3657bfb889ad827b3cab' ] ] 

你不需要一个完整的解析器,只要用正则表达式就可以了:

 // Collects the src value of every <img> tag in `body`.
 // `body` is expected to hold the HTML text to scan; it is not defined in
 // this snippet.
 const links = [];
 // How the pattern stays inside one tag:
 //  - (?:[^>"']|"[^"]*"|'[^']*')*? walks over the tag's other attributes
 //    without crossing the closing ">" (quoted values are skipped whole, so
 //    a ">" or "src=" inside them is ignored); unlike ".", it spans newlines.
 //  - \ssrc requires whitespace before the name, so "data-src" is not taken.
 //  - \1 requires the closing quote to be the same character as the opening
 //    one, so a value such as it's.png is not cut short.
 const re = /<img\b(?:[^>"']|"[^"]*"|'[^']*')*?\ssrc\s*=\s*(["'])(.*?)\1/gi;
 let res;
 while ((res = re.exec(body)) !== null) {
   // Group 1 is the quote character, group 2 is the URL.
   links.push(res[2]);
 }

例如:

  /**
   * Returns the src attribute value of every <img> tag in an HTML string.
   *
   * The pattern is confined to a single tag:
   *  - (?:[^>"']|"[^"]*"|'[^']*')*? walks over the tag's other attributes
   *    without crossing the closing ">" (quoted values are skipped whole);
   *    unlike ".", it also spans newlines, so multi-line tags are found.
   *  - \ssrc requires whitespace before the name, so "data-src" is ignored.
   *  - \1 requires the closing quote to match the opening one.
   *
   * @param {string} html - HTML text to scan.
   * @returns {string[]} The src values, in the order the tags appear.
   */
  function extractImgSrcs(html) {
    // A fresh regex per call: a shared /g regex keeps lastIndex between uses.
    const re = /<img\b(?:[^>"']|"[^"]*"|'[^']*')*?\ssrc\s*=\s*(["'])(.*?)\1/gi;
    const found = [];
    let match;
    while ((match = re.exec(html)) !== null) {
      // Group 1 is the quote character, group 2 is the URL.
      found.push(match[2]);
    }
    return found;
  }

  const a = '<div class="quote"><div class="quote-profile"><img alt="voyages-sncf.com logo" class="img-responsive img-circle" style="height: 80px" src="/img/app_website/index/logo.jpg"> </div><!--//profile--><img alt="voyages-sncf.com logo" class="img-responsive img-circle" style="height: 80px" src="/img/app_website/index/logo2.jpg" data-attr = "lkjlk"/>';
  const links = extractImgSrcs(a);
  //["/img/app_website/index/logo.jpg", "/img/app_website/index/logo2.jpg"]