JavaScript:使用正则表达式将所有相对URL转换为绝对URL

我目前正在创建一个Node.js网页抓取器/代理,但我无法解析源代码脚本部分中的相对URL。我想正则表达式可以解决这个问题,不过我不知道该如何实现。

无论如何,我可以去做这件事吗?

此外,我打开这样做的一个更简单的方法,因为我很困惑如何其他代理parsing网站。 我认为大多数只是荣耀的网站刮板,可以读取网站的来源中继所有链接/forms回代理。

高级HTMLstringreplacefunction

OP请注意(因为他要求的正是这样一个函数):把base_url改成你的代理的基础URL,即可得到预期的结果。

两个function将在下面显示(使用指南包含在代码中)。 确保你不要跳过这个答案的任何部分的解释,以充分理解函数的行为。

  • rel_to_abs(url) – 此函数返回绝对URL。 当传入一个使用常见受信任协议的绝对URL时,它会立即返回该URL。 否则,根据base_url和函数参数生成绝对URL。 相对URL会被正确解析( ../ ; ./ ; . ; // )。
  • replace_all_rel_by_abs – 该函数将parsing所有在HTML中具有重要意义的URL,例如CSS url() ,链接和外部资源。 请参阅代码以获取分析的实例的完整列表。 看到这个答案的调整实施,从外部来源消毒HTMLstring (embedded文档)。
  • testing案例(在答案的底部):为了testingfunction的有效性,只需将小书签粘贴到位置栏即可。

rel_to_absparsing相对URL

 /**
  * Resolves a (possibly relative) URL against a base URL.
  *
  * URLs that already start with a commonly trusted scheme (http, https, file,
  * ftp, ftps, mailto, javascript) or that are data:image URLs are returned
  * untouched. Empty / whitespace-only input yields "".
  *
  * @param {String} url   URL to resolve
  * @param {String} [base] Absolute URL to resolve against. Defaults to
  *                        location.href, so the one-argument browser usage
  *                        keeps working; pass it explicitly where no
  *                        `location` global exists (e.g. Node.js).
  * @return {String} Absolute URL; relative results have ", ', < and >
  *                  percent-encoded.
  */
 function rel_to_abs(url, base){
     /* Only accept commonly trusted protocols. Exotic flavours (escaped slash,
      * html-entitied characters) are not supported to keep the function fast.
      * NOTE(review): "javascript:" is passed through unchanged, as before;
      * strip it here if the output is shown to untrusted users. */
     if(/^(?:(?:https?|file|ftps?|mailto|javascript):|data:image\/[^;]{2,9};)/i.test(url))
         return url; /* Already absolute */
     if(/^\s*$/.test(url))
         return ""; /* Empty = return nothing */

     /* Percent-encode the characters that could break out of an attribute */
     function escape_markup(str){
         return str.replace(/"/g,"%22").replace(/'/g,"%27")
                   .replace(/</g,"%3C").replace(/>/g,"%3E");
     }

     /* Split the base into scheme, authority, path and query. Query and
      * fragment must not take part in path resolution. */
     var href = typeof base === "string" ? base : location.href;
     var parts = /^([a-z][a-z0-9+.\-]*:)\/\/([^\/?#]*)([^?#]*)(\?[^#]*)?/i.exec(href);
     if(!parts)
         return escape_markup(url); /* Non-hierarchical base: nothing to resolve against */

     var protocol = parts[1];
     var origin = protocol + "//" + parts[2];
     var path = parts[3] || "/";
     var query = parts[4] || "";

     if(url.substring(0,2) === "//")
         return protocol + url; /* Protocol-relative */
     if(url.charAt(0) === "/")
         return origin + url; /* Root-relative */

     var resolved;
     if(url.charAt(0) === "#"){
         /* Fragment-only reference: same document, new fragment */
         resolved = origin + path + query + url;
     } else if(url.charAt(0) === "?"){
         /* Query-only reference: same path, new query */
         resolved = origin + path + url;
     } else {
         /* Keep the reference's own query/fragment out of dot-segment removal */
         var cut = url.search(/[?#]/);
         var rel_path = cut === -1 ? url : url.substring(0, cut);
         var suffix = cut === -1 ? "" : url.substring(cut);
         /* Directory of the base document: everything up to the last slash */
         var dir = path.substring(0, path.lastIndexOf("/") + 1);
         var segments = (dir + rel_path).split("/");
         var kept = [];
         for(var i=0; i<segments.length; i++){
             var segment = segments[i];
             var is_last = i === segments.length - 1;
             if(segment === ".."){
                 /* kept[0] is the empty string before the leading slash: never pop it */
                 if(kept.length > 1) kept.pop();
                 if(is_last) kept.push(""); /* "dir/.." names a directory: keep the trailing slash */
             } else if(segment === "."){
                 if(is_last) kept.push("");
             } else {
                 kept.push(segment);
             }
         }
         resolved = origin + kept.join("/") + suffix;
     }
     /* Escape certain characters to prevent XSS */
     return escape_markup(resolved);
 }

案例/例子:

  • http://foo.bar 。 已经是一个绝对的URL,因此立即返回。
  • /doo相对于root:返回当前根目录+提供的相对URL。
  • ./meh相对于当前目录。
  • ../booh相对于父目录。

该函数将相对路径转换为 ../ 形式,然后执行查找和替换( http://domain/sub/anything-but-a-slash/../me → http://domain/sub/me )。


replace_all_rel_by_abs转换所有相关的URL的出现
脚本中的URL( <script> 元素、事件处理程序)不会被替换,因为几乎不可能创建一个既快速又安全的过滤器来解析JavaScript。

这个脚本里面有一些注释。 正则expression式是dynamic创build的,因为单个RE可以具有3000个字符的大小。 可以用各种方式混淆<meta http-equiv=refresh content=.. > ,因此RE的大小。

 /**
  * Rewrites the URLs found in an HTML string by passing each of them through
  * rel_to_abs(). Covered: <meta http-equiv=refresh content=...>, href, src,
  * <object data>, <applet codebase>, <param name=movie value>, url() inside
  * <style> elements and url() inside style attributes. Nothing is done for
  * URLs inside <script> elements or event handler attributes.
  *
  * The regular expressions are assembled at run time because the patterns
  * that tolerate HTML-entity obfuscation are very long.
  *
  * @param {String} html HTML source
  * @return {String} HTML with the matched URLs replaced
  */
 function replace_all_rel_by_abs(html){
     /* HTML/XML attribute names may not be prefixed by these characters
      * (common attribute chars). This list is not complete, but will be
      * sufficient for this function
      * (see http://www.w3.org/TR/REC-xml/#NT-NameChar). */
     var att = "[^-a-z0-9:._]";

     /* A numeric entity ends with ";" or is simply not followed by a digit */
     var entityEnd = "(?:;|(?!\\d))";
     /* Placeholders to filter obfuscations; also used by ae() as its cache */
     var ents = {
         " ": "(?:\\s|&nbsp;?|&#0*32"+entityEnd+"|&#x0*20"+entityEnd+")",
         "(": "(?:\\(|&#0*40"+entityEnd+"|&#x0*28"+entityEnd+")",
         ")": "(?:\\)|&#0*41"+entityEnd+"|&#x0*29"+entityEnd+")",
         ".": "(?:\\.|&#0*46"+entityEnd+"|&#x0*2e"+entityEnd+")"
     };
     var charMap = {}; /* Per-character cache for ae() */
     var s = ents[" "]+"*"; /* Short-hand for common use */
     /* Important: must be pre- and postfixed by < and >.
      * This RE should match anything within a tag! */
     var any = "(?:[^>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^>]*";

     /* @name ae
        @description Converts a given string in a sequence of the original
                     input and the HTML entity
        @param String string String to convert */
     function ae(string){
         var all_chars_lowercase = string.toLowerCase();
         if(ents[string]) return ents[string];
         var all_chars_uppercase = string.toUpperCase();
         var RE_res = "";
         for(var i=0; i<string.length; i++){
             var char_lowercase = all_chars_lowercase.charAt(i);
             if(charMap[char_lowercase]){
                 RE_res += charMap[char_lowercase];
                 continue;
             }
             var char_uppercase = all_chars_uppercase.charAt(i);
             var RE_sub = [char_lowercase];
             RE_sub.push("&#0*" + char_lowercase.charCodeAt(0) + entityEnd);
             RE_sub.push("&#x0*" + char_lowercase.charCodeAt(0).toString(16) + entityEnd);
             if(char_lowercase != char_uppercase){
                 /* The literal character is covered by the ignorecase flag;
                  * only the upper-case entities have to be added */
                 RE_sub.push("&#0*" + char_uppercase.charCodeAt(0) + entityEnd);
                 RE_sub.push("&#x0*" + char_uppercase.charCodeAt(0).toString(16) + entityEnd);
             }
             RE_sub = "(?:" + RE_sub.join("|") + ")";
             RE_res += (charMap[char_lowercase] = RE_sub);
         }
         return(ents[string] = RE_res);
     }

     /* @name by
        @description 2nd argument for replace().
        Note that this function can also be used to remove links:
        return group1 + "javascript://" + group3; */
     function by(match, group1, group2, group3){
         return group1 + rel_to_abs(group2) + group3;
     }

     /* @name by2
        @description 2nd argument for replace(). Parses relevant HTML entities
                     (entity-encoded "/" and ".") before resolving the URL */
     var slashRE = new RegExp(ae("/"), 'g');
     var dotRE = new RegExp(ae("."), 'g');
     function by2(match, group1, group2, group3){
         group2 = group2.replace(slashRE, "/").replace(dotRE, ".");
         return group1 + rel_to_abs(group2) + group3;
     }

     /* @name cr
        @description Selects a HTML element and performs a
                     search-and-replace on attributes
        @param String selector  HTML substring to match
        @param String attribute RegExp-escaped; HTML element attribute to match
        @param String marker    Optional RegExp-escaped; marks the prefix
        @param String delimiter Optional RegExp escaped; non-quote delimiters
        @param String end       Optional RegExp-escaped; forces the match to
                                end before an occurence of <end> */
     function cr(selector, attribute, marker, delimiter, end){
         if(typeof selector == "string") selector = new RegExp(selector, "gi");
         attribute = att + attribute;
         marker = typeof marker == "string" ? marker : "\\s*=\\s*";
         delimiter = typeof delimiter == "string" ? delimiter : "";
         /* With an end pattern the value is matched lazily ("?") up to it */
         end = typeof end == "string" ? "?)("+end : ")(";
         /* Double-quoted, single-quoted and unquoted value respectively */
         var re1 = new RegExp('('+attribute+marker+'")([^"'+delimiter+']+'+end+')', 'gi');
         var re2 = new RegExp("("+attribute+marker+"')([^'"+delimiter+"]+"+end+")", 'gi');
         var re3 = new RegExp('('+attribute+marker+')([^"\'][^\\s>'+delimiter+']*'+end+')', 'gi');
         html = html.replace(selector, function(match){
             return match.replace(re1, by).replace(re2, by).replace(re3, by);
         });
     }

     /* @name cri
        @description Selects an attribute of a HTML element, and performs a
                     search-and-replace on certain values
        @param String selector  HTML element to match
        @param String attribute RegExp-escaped; HTML element attribute to match
        @param String front     RegExp-escaped; attribute value, prefix to match
        @param String flags     Optional RegExp flags, default "gi"
        @param String delimiter Optional RegExp pattern; an unquoted value
                                stops where this pattern would match
        @param String end       Optional RegExp-escaped; forces the match to
                                end before an occurence of <end> */
     function cri(selector, attribute, front, flags, delimiter, end){
         if(typeof selector == "string") selector = new RegExp(selector, "gi");
         attribute = att + attribute;
         flags = typeof flags == "string" ? flags : "gi";
         var re1 = new RegExp('('+attribute+'\\s*=\\s*")([^"]*)', 'gi');
         var re2 = new RegExp("("+attribute+"\\s*=\\s*')([^']+)", 'gi');
         var at1 = new RegExp('('+front+')([^"]+)(")', flags);
         var at2 = new RegExp("("+front+")([^']+)(')", flags);
         var handleAttr;
         if(typeof delimiter == "string"){
             end = typeof end == "string" ? end : "";
             /* The delimiter is a full pattern (it contains groups and
              * entities), so it cannot be pasted into a character class:
              * doing so excluded every letter/digit occurring in the pattern
              * source. A negative look-ahead in front of each consumed
              * character gives the intended "stop at the delimiter". */
             var not_delimiter = "(?!" + delimiter + ")";
             var at3 = new RegExp("("+front+")("+not_delimiter+"[^\"'](?:"+not_delimiter+"[\\s\\S])*" + (end?"?)("+end+")":")()"), flags);
             handleAttr = function(match, g1, g2){return g1+g2.replace(at1, by2).replace(at2, by2).replace(at3, by2);};
         } else {
             handleAttr = function(match, g1, g2){return g1+g2.replace(at1, by2).replace(at2, by2);};
         }
         html = html.replace(selector, function(match){
             return match.replace(re1, handleAttr).replace(re2, handleAttr);
         });
     }

     /* <meta http-equiv=refresh content="  ; url= " > */
     cri("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+">|'"+ae("refresh")+"'"+any+">|"+ae("refresh")+"(?:"+ae(" ")+any+">|>))", "content", ae("url")+s+ae("=")+s, "i");

     cr("<"+any+att+"href\\s*="+any+">", "href"); /* Linked elements */
     cr("<"+any+att+"src\\s*="+any+">", "src"); /* Embedded elements */

     cr("<object"+any+att+"data\\s*="+any+">", "data"); /* <object data= > */
     cr("<applet"+any+att+"codebase\\s*="+any+">", "codebase"); /* <applet codebase= > */

     /* <param name=movie value= > */
     cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+">|'"+ae("movie")+"'"+any+">|"+ae("movie")+"(?:"+ae(" ")+any+">|>))", "value");

     cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)"); /* <style> */
     /* < style=" url(...) " >; optional whitespace before ")" belongs to the terminator */
     cri("<"+any+att+"style\\s*="+any+">", "style", ae("url")+s+ae("(")+s, 0, s+ae(")"), s+ae(")"));
     return html;
 }

私人function的简短摘要:

  • rel_to_abs(url) – 将相对/未知url转换为绝对url
  • replace_all_rel_by_abs(html) – 用绝对URLreplaceHTMLstring中所有相关URL的出现。
    1. ae一个实体 – 返回一个RE模式来处理HTML实体。
    2. by – replace by – 这个简短的函数请求实际的urlreplace( rel_to_abs )。 这个function可能被称为数百次,如果不是千次。 注意不要在这个函数中添加一个慢速algorithm(自定义)。
    3. crC reate R eplace – 创build并执行search和replace。
      例如: href="..." (在任何HTML标签内)。
    4. criC reate R eplace I nline – 创build并执行search和replace。
      例如:HTML标签中的all style属性中的url(..)

testing用例

打开任何页面,并在地址栏中粘贴以下书签:

 javascript:void function(){var tag=document.createElement("script");tag.src="http://rob.lekensteyn.nl/rel_to_abs.js";document.body.appendChild(tag)}();

注入的代码包含上面定义的两个函数,以及下面显示的测试用例。 注意:测试用例不会修改页面的HTML,但会在textarea中显示解析的结果(可选)。

 /* Times replace_all_rel_by_abs() on the current page's HTML and, on
  * confirmation, shows the result in a full-page textarea. The page's own
  * HTML is not modified. */
 var t = (new Date).getTime();
 var result = replace_all_rel_by_abs(document.documentElement.innerHTML);
 if(confirm((new Date).getTime()-t+" milliseconds to execute\n\nPut results in new textarea?")){
     var txt = document.createElement("textarea");
     txt.style.cssText = "position:fixed;top:0;left:0;width:100%;height:99%";
     /* Double-click removes the textarea again */
     txt.ondblclick = function(){ this.parentNode.removeChild(this); };
     txt.value = result;
     document.body.appendChild(txt);
 }

也可以看看:

  • 答案:parsing和清理HTMLstring

将相对URL转换为绝对URL的可靠方法是使用内置的url模块。

例:

 /* url.resolve(from, to) is a legacy API of the built-in url module. The
  * WHATWG URL class (exported by that module and also available as a global)
  * performs the same resolution via new URL(relative, base). */
 var resolved = new URL("../baz/qux.html", "http://www.example.org/foo/bar/").href;
 /* resolved is 'http://www.example.org/foo/baz/qux.html' */

这是Rob W在当前线程中回答“高级HTMLstringreplace函数” ,再加上一些代码,从而使JSLint很高兴。

我应该把它作为答复的评论,但我没有足够的声望点。

 /*jslint browser: true */
 /*jslint regexp: true */
 /*jslint unparam: true*/
 /*jshint strict: false */

 /**
  * convertRelToAbsUrl
  *
  * Resolves a (possibly relative) URL against a base URL. URLs that already
  * start with a commonly trusted scheme (http, https, file, ftp, ftps,
  * mailto, javascript) or that are data:image URLs are returned untouched;
  * empty or whitespace-only input yields ''.
  *
  * https://stackoverflow.com/a/7544757/1983903
  *
  * @param {String} url
  * @param {String} [base] absolute URL to resolve against; defaults to
  *                        location.href so existing one-argument calls keep
  *                        working in a browser
  * @return {String} updated url; relative results have ", ', < and >
  *                  percent-encoded
  */
 function convertRelToAbsUrl(url, base) {
     var parts = null,
         protocol = null,
         origin = null,
         path = null,
         query = null,
         resolved = null,
         cut = null,
         relPath = null,
         suffix = null,
         segments = null,
         kept = [],
         isLast = false,
         i = 0;

     // percent-encode the characters that could break out of an attribute
     function escapeMarkup(str) {
         return str.replace(/"/g, '%22').replace(/'/g, '%27')
             .replace(/</g, '%3C').replace(/>/g, '%3E');
     }

     // NOTE(review): "javascript:" is passed through unchanged, as before
     if (/^(?:(?:https?|file|ftps?|mailto|javascript):|data:image\/[^;]{2,9};)/i.test(url)) {
         return url; // url is already absolute
     }
     if (/^\s*$/.test(url)) {
         return ''; // empty = return nothing
     }

     // split the base into scheme, authority, path and query; query and
     // fragment must not take part in path resolution
     parts = /^([a-z][a-z0-9+.\-]*:)\/\/([^\/?#]*)([^?#]*)(\?[^#]*)?/i
         .exec(typeof base === 'string' ? base : location.href);
     if (!parts) {
         return escapeMarkup(url); // non-hierarchical base: nothing to resolve against
     }
     protocol = parts[1];
     origin = protocol + '//' + parts[2];
     path = parts[3] || '/';
     query = parts[4] || '';

     if (url.substring(0, 2) === '//') {
         return protocol + url; // protocol-relative
     }
     if (url.charAt(0) === '/') {
         return origin + url; // root-relative
     }

     if (url.charAt(0) === '#') {
         // fragment-only reference: same document, new fragment
         resolved = origin + path + query + url;
     } else if (url.charAt(0) === '?') {
         // query-only reference: same path, new query
         resolved = origin + path + url;
     } else {
         // keep the reference's own query/fragment out of dot-segment removal
         cut = url.search(/[?#]/);
         relPath = cut === -1 ? url : url.substring(0, cut);
         suffix = cut === -1 ? '' : url.substring(cut);
         // directory of the base document: everything up to the last slash
         segments = (path.substring(0, path.lastIndexOf('/') + 1) + relPath).split('/');
         for (i = 0; i < segments.length; i += 1) {
             isLast = i === segments.length - 1;
             if (segments[i] === '..') {
                 // kept[0] is the empty string before the leading slash: never pop it
                 if (kept.length > 1) {
                     kept.pop();
                 }
                 if (isLast) {
                     kept.push(''); // "dir/.." names a directory: keep the trailing slash
                 }
             } else if (segments[i] === '.') {
                 if (isLast) {
                     kept.push('');
                 }
             } else {
                 kept.push(segments[i]);
             }
         }
         resolved = origin + kept.join('/') + suffix;
     }
     return escapeMarkup(resolved);
 }

 /**
  * convertAllRelativeToAbsoluteUrls
  *
  * Rewrites the URLs found in an HTML string through convertRelToAbsUrl.
  * Covered: <meta http-equiv=refresh content=...>, href, src, <object data>,
  * <applet codebase>, <param name=movie value>, url() inside <style>
  * elements and url() inside style attributes.
  *
  * https://stackoverflow.com/a/7544757/1983903
  *
  * @param {String} html
  * @param {String} [base] absolute URL handed to convertRelToAbsUrl;
  *                        defaults to location.href there
  * @return {String} updated html
  */
 function convertAllRelativeToAbsoluteUrls(html, base) {
     var me = this,
         // attribute names may not be prefixed by these characters
         att = '[^-a-z0-9:._]',
         // a numeric entity ends with ";" or is simply not followed by a digit
         entityEnd = '(?:;|(?!\\d))',
         // placeholders to filter obfuscations; also used by ae() as its cache
         ents = {
             ' ' : '(?:\\s|&nbsp;?|&#0*32' + entityEnd + '|&#x0*20' + entityEnd + ')',
             '(' : '(?:\\(|&#0*40' + entityEnd + '|&#x0*28' + entityEnd + ')',
             ')' : '(?:\\)|&#0*41' + entityEnd + '|&#x0*29' + entityEnd + ')',
             '.' : '(?:\\.|&#0*46' + entityEnd + '|&#x0*2e' + entityEnd + ')'
         },
         charMap = {},
         s = ents[' '] + '*', // short-hand for common use
         // must be pre- and postfixed by < and >: matches anything within a tag
         any = '(?:[^>\"\']*(?:\"[^\"]*\"|\'[^\']*\'))*?[^>]*',
         slashRE = null,
         dotRE = null;

     // Resolve through the receiver when the function is invoked as a method
     // of an object that provides convertRelToAbsUrl; otherwise (plain call,
     // strict/module code where `this` is undefined) use the sibling function.
     function resolveUrl(value) {
         if (me && typeof me.convertRelToAbsUrl === 'function') {
             return me.convertRelToAbsUrl(value, base);
         }
         return convertRelToAbsUrl(value, base);
     }

     // builds a pattern matching `string` literally or as HTML entities
     function ae(string) {
         var allCharsLowerCase = string.toLowerCase(),
             allCharsUpperCase = string.toUpperCase(),
             reRes = '',
             charLowerCase = null,
             charUpperCase = null,
             reSub = null,
             i = null;

         if (ents[string]) {
             return ents[string];
         }
         for (i = 0; i < string.length; i += 1) {
             charLowerCase = allCharsLowerCase.charAt(i);
             if (charMap[charLowerCase]) {
                 reRes += charMap[charLowerCase];
             } else {
                 charUpperCase = allCharsUpperCase.charAt(i);
                 reSub = [charLowerCase];
                 reSub.push('&#0*' + charLowerCase.charCodeAt(0) + entityEnd);
                 reSub.push('&#x0*' + charLowerCase.charCodeAt(0).toString(16) + entityEnd);
                 if (charLowerCase !== charUpperCase) {
                     reSub.push('&#0*' + charUpperCase.charCodeAt(0) + entityEnd);
                     reSub.push('&#x0*' + charUpperCase.charCodeAt(0).toString(16) + entityEnd);
                 }
                 reSub = '(?:' + reSub.join('|') + ')';
                 reRes += (charMap[charLowerCase] = reSub);
             }
         }
         return (ents[string] = reRes);
     }

     // replace() callback: prefix + resolved url + suffix
     function by(match, group1, group2, group3) {
         return group1 + resolveUrl(group2) + group3;
     }

     slashRE = new RegExp(ae('/'), 'g');
     dotRE = new RegExp(ae('.'), 'g');

     // like by(), but decodes entity-encoded "/" and "." first
     function by2(match, group1, group2, group3) {
         group2 = group2.replace(slashRE, '/').replace(dotRE, '.');
         return group1 + resolveUrl(group2) + group3;
     }

     // selects an element and rewrites the value of one attribute
     function cr(selector, attribute, marker, delimiter, end) {
         var re1 = null,
             re2 = null,
             re3 = null;

         if (typeof selector === 'string') {
             selector = new RegExp(selector, 'gi');
         }
         attribute = att + attribute;
         marker = typeof marker === 'string' ? marker : '\\s*=\\s*';
         delimiter = typeof delimiter === 'string' ? delimiter : '';
         // with an end pattern the value is matched lazily ("?") up to it
         end = typeof end === 'string' ? '?)(' + end : ')(';
         // double-quoted, single-quoted and unquoted value respectively
         re1 = new RegExp('(' + attribute + marker + '")([^"' + delimiter + ']+' + end + ')', 'gi');
         re2 = new RegExp('(' + attribute + marker + '\')([^\'' + delimiter + ']+' + end + ')', 'gi');
         re3 = new RegExp('(' + attribute + marker + ')([^"\'][^\\s>' + delimiter + ']*' + end + ')', 'gi');
         html = html.replace(selector, function (match) {
             return match.replace(re1, by).replace(re2, by).replace(re3, by);
         });
     }

     // selects an attribute and rewrites certain values inside of it
     function cri(selector, attribute, front, flags, delimiter, end) {
         var re1 = null,
             re2 = null,
             at1 = null,
             at2 = null,
             at3 = null,
             notDelimiter = null,
             handleAttr = null;

         if (typeof selector === 'string') {
             selector = new RegExp(selector, 'gi');
         }
         attribute = att + attribute;
         flags = typeof flags === 'string' ? flags : 'gi';
         re1 = new RegExp('(' + attribute + '\\s*=\\s*")([^"]*)', 'gi');
         re2 = new RegExp("(" + attribute + "\\s*=\\s*')([^']+)", 'gi');
         at1 = new RegExp('(' + front + ')([^"]+)(")', flags);
         at2 = new RegExp("(" + front + ")([^']+)(')", flags);
         if (typeof delimiter === 'string') {
             end = typeof end === 'string' ? end : '';
             // The delimiter is a full pattern (groups, entities), so it
             // cannot be pasted into a character class: that excluded every
             // letter/digit occurring in the pattern source. A negative
             // look-ahead before each consumed character stops at the
             // delimiter instead.
             notDelimiter = '(?!' + delimiter + ')';
             at3 = new RegExp('(' + front + ')(' + notDelimiter + '[^\"\'](?:' + notDelimiter + '[\\s\\S])*' + (end ? '?)(' + end + ')' : ')()'), flags);
             handleAttr = function (match, g1, g2) {
                 return g1 + g2.replace(at1, by2).replace(at2, by2).replace(at3, by2);
             };
         } else {
             handleAttr = function (match, g1, g2) {
                 return g1 + g2.replace(at1, by2).replace(at2, by2);
             };
         }
         html = html.replace(selector, function (match) {
             return match.replace(re1, handleAttr).replace(re2, handleAttr);
         });
     }

     /* <meta http-equiv=refresh content="  ; url= " > */
     cri('<meta' + any + att + 'http-equiv\\s*=\\s*(?:\"' + ae('refresh') + '\"' + any + '>|\'' + ae('refresh') + '\'' + any + '>|' + ae('refresh') + '(?:' + ae(' ') + any + '>|>))', 'content', ae('url') + s + ae('=') + s, 'i');
     cr('<' + any + att + 'href\\s*=' + any + '>', 'href'); /* Linked elements */
     cr('<' + any + att + 'src\\s*=' + any + '>', 'src'); /* Embedded elements */
     cr('<object' + any + att + 'data\\s*=' + any + '>', 'data'); /* <object data= > */
     cr('<applet' + any + att + 'codebase\\s*=' + any + '>', 'codebase'); /* <applet codebase= > */
     /* <param name=movie value= > */
     cr('<param' + any + att + 'name\\s*=\\s*(?:\"' + ae('movie') + '\"' + any + '>|\'' + ae('movie') + '\'' + any + '>|' + ae('movie') + '(?:' + ae(' ') + any + '>|>))', 'value');
     cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, 'url', '\\s*\\(\\s*', '', '\\s*\\)'); /* <style> */
     /* < style=" url(...) " >; optional whitespace before ")" belongs to the terminator */
     cri('<' + any + att + 'style\\s*=' + any + '>', 'style', ae('url') + s + ae('(') + s, 0, s + ae(')'), s + ae(')'));
     return html;
 }

从上面关于基本标签的Rob W的评论中,我写了一个注入函数:

 /**
  * Replaces the <base> element(s) of a document's <head> with the given one.
  *
  * Existing <base> tags inside <head>...</head> are removed and `base` is
  * inserted just before </head>; all other head content is kept. Input
  * without a complete <head>...</head> section is returned unchanged.
  *
  * @param {String} html HTML source
  * @param {String} base Markup to insert, e.g. '<base href="http://host/">'
  * @return {String} updated html
  */
 function injectBase(html, base) {
     /* \b keeps <header>/<thead> from being mistaken for <head>. A function
      * replacer is used so "$" sequences inside `base` are inserted
      * literally instead of being read as replacement patterns. */
     return html.replace(/(<head\b[^>]*>)([\s\S]*?)(<\/head\s*>)/gi, function (match, open, content, close) {
         /* Remove any <base> elements inside <head>, and nothing else */
         var cleaned = content.replace(/<base\b[^>]*>/gi, "");
         /* Add <base> just before </head> */
         return open + cleaned + " " + base + " " + close;
     });
 }

如果你使用正则expression式来查找所有的非绝对URL,那么你可以在它们前面加上当前的URL,这就是它。

您需要修复的是那些既不以 / 开头、也不以 http(s):// 开头的URL(如果您在意的话,也包括其他协议标记)。

举一个例子,假设你在抓取 http://www.example.com/ 。 如果你遇到一个相对URL,比方说 foo/bar ,你只需要在它前面加上正在抓取的URL作为前缀即可: http://www.example.com/foo/bar

对于一个正则expression式从网页上刮去的url,可能有很多好的可用,如果你有一点谷歌,所以我不会开始发明一个可怜的人在这里:)