/*! * resanitize - Regular expression-based HTML sanitizer and ad remover, geared toward RSS feed descriptions * Copyright(c) 2012 Dan MacTough * All rights reserved. */ /** * Dependencies */ var validator = require('validator'); /** * Remove unsafe parts and ads from HTML * * Example: * * var resanitize = require('resanitize'); * resanitize('
Headline
'); * // => '
Headline
' * * References: * - http://en.wikipedia.org/wiki/C0_and_C1_control_codes * - http://en.wikipedia.org/wiki/Unicode_control_characters * - http://www.utf8-chartable.de/unicode-utf8-table.pl * * @param {String|Buffer} HTML string to sanitize * @return {String} sanitized HTML * @api public */ function resanitize (str) { if ('string' !== typeof str) { if (Buffer.isBuffer(str)) { str = str.toString(); } else { throw new TypeError('Invalid argument: must be String or Buffer'); } } str = stripAsciiCtrlChars(str); str = stripExtendedCtrlChars(str); str = fixSpace(str); str = stripComments(str); str = stripAds(str); // It's important that this comes before the remainder // because it matches on certain attribute values that // get stripped below. str = validator.sanitize(str).xss().replace(/\[removed\]/g, '') str = fixImages(str); str = stripUnsafeTags(str); str = stripUnsafeAttrs(str); return str; } module.exports = resanitize; /** * Replace UTF-8 non-breaking space with a regular space and strip null bytes */ function fixSpace (str) { return str.replace(/\u00A0/g, ' ') // Unicode non-breaking space .replace(/[\u2028\u2029]/g, '') // UCS newline characters .replace(/\0/g, ''); } module.exports.fixSpace = fixSpace; /** * Strip superfluous whitespace */ function stripHtmlExtraSpace (str) { return str.replace(/<(div|p)[^>]*?>\s*?(?:]*?>)*?\s*?<\/\1>/gi, '') .replace(/<(div|span)[^>]*?>\s*?<\/\1>/gi, ''); } module.exports.stripHtmlExtraSpace = stripHtmlExtraSpace; /** * Strip ASCII control characters */ function stripAsciiCtrlChars (str) { return str.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/g, ''); } module.exports.stripAsciiCtrlChars = stripAsciiCtrlChars; /** * Strip ISO 6429 control characters */ function stripExtendedCtrlChars (str) { return str.replace(/[\u0080-\u009F]+/g, ''); } module.exports.stripExtendedCtrlChars = stripExtendedCtrlChars; /** * Strip HTML comments */ function stripComments (str) { return str.replace(//g, ''); } module.exports.stripComments = stripComments; /** * Permit only the provided attributes to remain in the tag */ function filterAttrs () { var allowed = []; if (Array.isArray(arguments[0])) { allowed = arguments[0]; } else { allowed = Array.prototype.slice.call(arguments); } return function (attr, name) { if ( ~allowed.indexOf(name && name.toLowerCase()) ) { return attr; } else { return ''; } }; } module.exports.filterAttrs = filterAttrs; /** * Strip the provided attributes from the tag */ function stripAttrs () { var banned = [] , regexes = []; if (Array.isArray(arguments[0])) { banned = arguments[0].filter(function (attr) { if ('string' === typeof attr) { return true; } else if (attr.constructor && 'RegExp' === attr.constructor.name) { regexes.push(attr); } else { } }); } else { banned = Array.prototype.slice.call(arguments).filter(function (attr) { if ('string' === typeof attr) { return true; } else if (attr.constructor && 'RegExp' === attr.constructor.name) { regexes.push(attr); } }); } return function (attr, name) { if ( ~banned.indexOf(name && name.toLowerCase()) || regexes.some(function (re) { return re.test(name); }) ) { return ''; } else { return attr; } }; } module.exports.stripAttrs = stripAttrs; /** * Filter an HTML opening or self-closing tag */ function filterTag (nextFilter) { return function (rematch) { if ('function' === typeof nextFilter) { rematch = rematch.replace(/([^\s"']+?)=("|')[^>]+?\2/g, nextFilter); } // Cleanup extra whitespace return rematch.replace(/\s+/g, ' ') .replace(/ (\/)?>/, '$1>'); }; } module.exports.filterTag = filterTag; function fixImages (str) { return str.replace(/(]*?>)/g, filterTag(filterAttrs('src', 'alt', 'title', 'height', 'width')) ); } module.exports.fixImages = fixImages; function stripUnsafeAttrs (str) { var unsafe = [ 'id' , 'class' , 'style' , 'accesskey' , 'action' , 'autocomplete' , 'autofocus' , 'clear' , 'contextmenu' , 'contenteditable' , 'draggable' , 'dropzone' , 'method' , 'tabindex' , 'target' , /on\w+/i , /data-\w+/i ]; return str.replace(/<([^ >]+?) [^>]*?>/g, filterTag(stripAttrs(unsafe))); } module.exports.stripUnsafeAttrs = stripUnsafeAttrs; function stripUnsafeTags (str) { var el = /<(?:wbr|form|input|font|blink|script|style|comment|plaintext|xmp|link|listing|meta|body|frame|frameset)\b/; var ct = 0, max = 2; // We'll repeatedly try to strip any maliciously nested elements up to [max] times while (el.test(str) && ct++ < max) { str = str.replace(/]*?>[\s\S]*?<\/form>/gi, '') .replace(/]*?>[\s\S]*?<\/input>/gi, '') .replace(/<\/?(?:form|input|font|blink)[^>]*?>/gi, '') // These are XSS/security risks .replace(/]*?>[\s\S]*?<\/script>/gi, '') .replace(/<(\/)*wbr[^>]*?>/gi, '') .replace(/]*?>[\s\S]*?<\/style>/gi, '') // shouldn't work anyway... .replace(/]*?>[\s\S]*?<\/comment>/gi, '') .replace(/]*?>[\s\S]*?<\/plaintext>/gi, '') .replace(/]*?>[\s\S]*?<\/xmp>/gi, '') .replace(/<\/?(?:link|listing|meta|body|frame|frameset)[^>]*?>/gi, '') // Delete iframes, except those inserted by Google in lieu of video embeds .replace(/]*?src=("|')\S+?reader.googleusercontent.com\/reader\/embediframe.+?\1)[^>]*?>[\s\S]*?<\/iframe>/gi, '') ; } if (el.test(str)) { // We couldn't safely strip the HTML, so we return an empty string return ''; } return str; } module.exports.stripUnsafeTags = stripUnsafeTags; function stripAds (str) { return str.replace(/]*?class=("|')snap_preview\1[^>]*?>(?:]*?>)?([\s\S]*?)<\/div>/gi, '$2') .replace(/]*?class=("|')(?:feedflare|zemanta-pixie)\1[^>]*?>[\s\S]*?<\/div>/gi, '') .replace(/[\s\S]*?\s*/gi, '') .replace(/]*?>[\s\S]*?<\/table>\s*?]*?>[\s\S]*?Ads by Pheedo[\s\S]*?<\/div>/gi, '') .replace(/]*?>.*?]*?src=("|')[^>]*?advertisement[^>]*?\1.*?>.*?<\/table>/gi, '') .replace(/]*?>\s*?]*?>\s*?]*?class=("|')advertisement\1[^>]*?>[\s\S]*?<\/span>[\s\S]*]*?>[\s\S]*?Ads by Pheedo[\s\S]*?<\/div>/gi, '') .replace(/]*?>\s*?]*?>\s*?(?:<[^>]+>\s*)*?]*?>\s*?]*?>(?:Featured Advertiser|Presented By:)<\/div>[\s\S]*]*?>[\s\S]*?(?:Ads by Pheedo|ads\.pheedo\.com)[\s\S]*?<\/div>/gi, '') .replace(/]*?>\s*?]*?>\s*?]*?href=("|')http:\/\/[^>]*?\.?(?:pheedo|pheedcontent)\.com\/[^>]*?\1[\s\S]*?<\/a>[\s\S]*$/gim, '') .replace(/]*?class=("|')cbw snap_nopreview\1[^>]*?>[\s\S]*$/gim, '') .replace(/
*/gi, '') .replace(/<(p|div)[^>]*?>\s*?]*?href=("|')[^>]+?\2[^>]*?>]*?src=("|')http:\/\/(?:feedads\.googleadservices|feedproxy\.google|feeds2?\.feedburner)\.com\/(?:~|%7e)[^>]*?\/[^>]+?\3[^>]*?>[\s\S]*?<\/\1>/gi, '') .replace(/<(p|div)[^>]*?>\s*?]*?href=("|')[^>]+?\2[^>]*?>]*?src=("|')http:\/\/feeds\.[^>]+?\.[^>]+?\/(?:~|%7e)[^>]\/[^>]+?\3[^>]*?>[\s\S]*?<\/\1>/gi, '') .replace(/]*?href=("|')http:\/\/feeds\.[^>]+?\.[^>]+?\/(?:~|%7e)[a-qs-z]\/[^>]+?\1[\s\S]*?<\/a>/gi, '') .replace(/]*?href=("|')http:\/\/[^>]*?\.?addtoany\.com\/[^>]*?\1[\s\S]*?<\/a>/gi, '') .replace(/]*?href=("|')http:\/\/feeds\.wordpress\.com\/[\.\d]+?\/(?:comments|go[\s\S]*)\/[^>]+?\1[\s\S]*?<\/a> ?/gi, '') .replace(/]*?href=("|')http:\/\/[^>]*?\.?doubleclick\.net\/[^>]*?\1[\s\S]*?<\/a>/gi, '') .replace(/]*?href=("|')http:\/\/[^>]*?\.?fmpub\.net\/adserver\/[^>]*?\1[\s\S]*?<\/a>/gi, '') .replace(/]*?href=("|')http:\/\/[^>]*?\.?eyewonderlabs\.com\/[^>]*?\1[\s\S]*?<\/a>/gi, '') .replace(/]*?href=("|')http:\/\/[^>]*?\.?pheedo\.com\/[^>]*?\1[\s\S]*?<\/a>/gi, '') .replace(/]*?href=("|')http:\/\/api\.tweetmeme\.com\/share\?[^>]*?\1[\s\S]*?<\/a>/gi, '') .replace(/

]*?href=("|')http:\/\/rss\.cnn\.com\/+?(?:~|%7e)a\/[^>]*?\1[\s\S]*?<\/p>/gi, '') .replace(/]*?src=("|')http:\/\/feeds\.[^>]+?\.[^>]+?\/(?:~|%7e)r\/[^>]+?\1[\s\S]*?>/gi, '') .replace(/]*?src=("|')http:\/\/rss\.nytimes\.com\/c\/[^>]*?\1.*?>.*$/gim, '') .replace(/]*?src=("|')http:\/\/feeds\.washingtonpost\.com\/c\/[^>]*?\1.*?>.*$/gim, '') .replace(/]*?src=("|')http:\/\/[^>]*?\.?feedsportal\.com\/c\/[^>]*?\1.*?>.*$/gim, '') .replace(/]*?src=("|')http:\/\/(?:feedads\.googleadservices|feedproxy\.google|feeds2\.feedburner)\.com\/(?:~|%7e)r\/[^>]+?\1[\s\S]*?>/gi, '') .replace(/]*?src=("|')http:\/\/rss\.cnn\.com\/~r\/[^>]*?\1[\s\S]*?>/gi, '') .replace(/]*?src=("|')http:\/\/[^>]*?\.?fmpub\.net\/adserver\/[^>]*?\1[\s\S]*?>/gi, '') .replace(/]*?src=("|')http:\/\/[^>]*?\.?pheedo\.com\/[^>]*?\1[\s\S]*?>/gi, '') .replace(/]*?src=("|')http:\/\/stats\.wordpress\.com\/[\w]\.gif\?[^>]*?\1[\s\S]*?>/gi, '') .replace(/]*?src=("|')http:\/\/feeds\.wired\.com\/c\/[^>]*?\1.*?>.*$/gim, '') .replace(/

Crunch Network[\s\S]*?<\/p>/gi, '') .replace(/]*?castfire_player[\s\S]*?> *?(<\/embed>)?/gi, '') .replace(/]*?src=("|')[^>]*?castfire\.com[^>]+?\1[\s\S]*?> *?(<\/embed>)?/gi, '') .replace(/

Sponsor<\/em>[\s\S]*?<\/p>/gi, '') .replace(/

]*?src=(?:"|')[^>]*?\/share-buttons\/[\s\S]*?<\/div>[\s]*/gi, '') // This is that annoying footer in every delicious item .replace(/]*?>\s*?]*?href=("|')[^\1]+?src=feed_newsgator\1[^>]*?>[\s\S]*<\/span>/gi, '') // This is the annoying footer from ATL .replace(/]*?>]*?href=("|')[^>]*?abovethelaw\.com\/[\s\S]+?\1[^>]*?>Continue reading[\s\S]*<\/p>/gi, '') // This is the annoying link at the end of WaPo articles .replace(/]*?>Read full article[\s\S]*?<\/a>/gi, '') // These ads go... .replace(/]*?>]*?href=("|')[^>]*?crunchbase\.com\/company\/[\s\S]+?\1[^>]*?>[\s\S]*?Loading information about[\s\S]*?<\/div>/gi, '') .replace(/]*?class=("|')cb_widget_[^>]+?\1[\s\S]*?><\/div>/gi, '') .replace(/]*?class=("|')cb_widget_[^>]+?\1[\s\S]*?>[\s\S]*?<\/div>/gi, '') // Before these .replace(/]*?href=("|')[^>]*?crunchbase\.com\/\1[\s\S]*?<\/a>\s*/gi, '') .replace(/]*?class=("|')cb_widget\1[^>]*?>[\s\S]*?<\/div>/gi, '') // Clean up some empty things //.replace(/<(div|p|span)[^>]*?>(\s|
|(?R))*?<\/\1>/gi, '') .replace(/(\s|]*?\/?>)*$/gim, '') ; } module.exports.stripAds = stripAds; /** * Dumbly strip angle brackets */ function stripHtml (str) { return str.replace(/<.*?>/g, ''); } module.exports.stripHtml = stripHtml;