/*!
* resanitize - Regular expression-based HTML sanitizer and ad remover, geared toward RSS feed descriptions
* Copyright(c) 2012 Dan MacTough
* All rights reserved.
*/
/**
* Dependencies
*/
var validator = require('validator');
/**
 * Sanitize an HTML fragment by removing control characters, comments, ads,
 * XSS vectors, unsafe tags and unsafe attributes, in that order.
 *
 * Example (illustrative; assumes the XSS filter leaves plain markup alone):
 *
 *     var resanitize = require('resanitize');
 *     resanitize('<div style="border: 400px solid pink;">Headline</div>');
 *     // => '<div>Headline</div>'
 *
 * References:
 * - http://en.wikipedia.org/wiki/C0_and_C1_control_codes
 * - http://en.wikipedia.org/wiki/Unicode_control_characters
 * - http://www.utf8-chartable.de/unicode-utf8-table.pl
 *
 * @param {String|Buffer} str HTML to sanitize; a Buffer is converted with toString()
 * @return {String} sanitized HTML
 * @throws {TypeError} when `str` is neither a String nor a Buffer
 * @api public
 */
function resanitize (str) {
  if (typeof str !== 'string') {
    if (!Buffer.isBuffer(str)) {
      throw new TypeError('Invalid argument: must be String or Buffer');
    }
    str = str.toString();
  }

  // Character-level cleanup first: C0 controls, C1 controls, then spacing.
  str = fixSpace(stripExtendedCtrlChars(stripAsciiCtrlChars(str)));
  str = stripComments(str);

  // Ad removal has to run before the steps below: it matches on certain
  // attribute values that those later steps strip out.
  str = stripAds(str);

  // Run the validator XSS filter and drop the "[removed]" markers it leaves.
  str = validator.sanitize(str).xss().replace(/\[removed\]/g, '');

  // Tag-level cleanup: images, then unsafe tags, then unsafe attributes.
  str = stripUnsafeAttrs(stripUnsafeTags(fixImages(str)));
  return str;
}
module.exports = resanitize;
/**
 * Normalize troublesome characters: non-breaking spaces (U+00A0) become
 * regular spaces; line/paragraph separators (U+2028, U+2029) and null
 * characters are removed.
 *
 * @param {String} str
 * @return {String}
 */
function fixSpace (str) {
  // Unicode non-breaking space -> plain space
  var withPlainSpaces = str.replace(/\u00A0/g, ' ');
  // UCS newline characters are dropped entirely
  var withoutSeparators = withPlainSpaces.replace(/[\u2028\u2029]/g, '');
  // Null characters are dropped entirely
  return withoutSeparators.replace(/\0/g, '');
}
module.exports.fixSpace = fixSpace;
/**
 * Strip superfluous whitespace by removing empty container elements.
 *
 * Two passes:
 *  1. <div> / <p> elements that contain only whitespace and/or <br> tags
 *  2. <div> / <span> elements that contain only whitespace
 *
 * Matching is case-insensitive and the closing tag must match the opening
 * tag name (backreference).
 *
 * @param {String} str
 * @return {String}
 */
function stripHtmlExtraSpace (str) {
  // NOTE: the first pattern had been split across two lines with its
  // "<br[^>" fragment missing, which left an unterminated regex literal.
  // Restored so an optional run of <br> tags is allowed inside the element.
  return str.replace(/<(div|p)[^>]*?>\s*?(?:<br[^>]*?>)*?\s*?<\/\1>/gi, '')
            .replace(/<(div|span)[^>]*?>\s*?<\/\1>/gi, '');
}
module.exports.stripHtmlExtraSpace = stripHtmlExtraSpace;
/**
 * Remove ASCII (C0) control characters and DEL.
 *
 * Tab (0x09), line feed (0x0A) and carriage return (0x0D) are not in the
 * character class and are therefore kept.
 *
 * @param {String} str
 * @return {String}
 */
function stripAsciiCtrlChars (str) {
  var asciiControls = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/g;
  return str.replace(asciiControls, '');
}
module.exports.stripAsciiCtrlChars = stripAsciiCtrlChars;
/**
 * Remove ISO 6429 (C1) control characters, i.e. U+0080 through U+009F.
 *
 * @param {String} str
 * @return {String}
 */
function stripExtendedCtrlChars (str) {
  var c1Controls = /[\u0080-\u009F]+/g;
  return str.replace(c1Controls, '');
}
module.exports.stripExtendedCtrlChars = stripExtendedCtrlChars;
/**
 * Strip HTML comments, including comments that span multiple lines.
 *
 * @param {String} str
 * @return {String}
 */
function stripComments (str) {
  // NOTE: the pattern had been lost, leaving `str.replace(//g, '')`, which
  // parses as a line comment and breaks the function. Restored as a lazy
  // match from the comment opener to the nearest comment closer; [\s\S]
  // lets the match cross newlines.
  return str.replace(/<!--[\s\S]*?-->/g, '');
}
module.exports.stripComments = stripComments;
/**
 * Build an attribute filter that keeps only whitelisted attributes.
 *
 * The whitelist is given either as a single array or as individual
 * arguments. The returned function has the shape of a String#replace
 * callback: it receives the full attribute text and the attribute name,
 * and returns the attribute unchanged when the lower-cased name is in the
 * whitelist, or an empty string otherwise.
 *
 * @param {Array|...String} attribute names to permit
 * @return {Function} function (attr, name) -> String
 */
function filterAttrs () {
  var allowed = Array.isArray(arguments[0])
    ? arguments[0]
    : Array.prototype.slice.call(arguments);

  return function (attr, name) {
    // The name is lower-cased before lookup; whitelist entries are used as given.
    var key = name && name.toLowerCase();
    return allowed.indexOf(key) !== -1 ? attr : '';
  };
}
module.exports.filterAttrs = filterAttrs;
/**
 * Build an attribute filter that removes blacklisted attributes.
 *
 * The blacklist is given either as a single array or as individual
 * arguments. String entries are compared against the lower-cased attribute
 * name; RegExp entries are tested against the attribute name as given.
 * Entries of any other type (including null/undefined) are ignored.
 *
 * The returned function has the shape of a String#replace callback: it
 * receives the full attribute text and the attribute name, and returns an
 * empty string when the attribute is banned, or the attribute unchanged
 * otherwise.
 *
 * @param {Array|...(String|RegExp)} attribute names / patterns to strip
 * @return {Function} function (attr, name) -> String
 */
function stripAttrs () {
  var specs = Array.isArray(arguments[0])
    ? arguments[0]
    : Array.prototype.slice.call(arguments);
  var banned = [];
  var regexes = [];

  specs.forEach(function (spec) {
    if ('string' === typeof spec) {
      banned.push(spec);
    }
    // Guard against null/undefined before touching `.constructor`.
    else if (spec && spec.constructor && 'RegExp' === spec.constructor.name) {
      regexes.push(spec);
    }
  });

  return function (attr, name) {
    var key = name && name.toLowerCase();
    var isBanned = banned.indexOf(key) !== -1 || regexes.some(function (re) {
      // A global or sticky RegExp remembers lastIndex between test() calls;
      // reset it so a previous match cannot cause the next name to be missed.
      re.lastIndex = 0;
      return re.test(name);
    });
    return isBanned ? '' : attr;
  };
}
module.exports.stripAttrs = stripAttrs;
/**
 * Build a String#replace callback that filters one HTML opening or
 * self-closing tag.
 *
 * When `nextFilter` is a function, every quoted `name=value` attribute
 * found in the tag is passed through it (as a replace callback receiving
 * the attribute text and its name). Afterwards, runs of whitespace are
 * collapsed to a single space and the space before the first `>` or `/>`
 * is removed.
 *
 * @param {Function} [nextFilter] attribute filter, e.g. from filterAttrs/stripAttrs
 * @return {Function} function (rematch) -> String
 */
function filterTag (nextFilter) {
  var hasFilter = 'function' === typeof nextFilter;

  return function (rematch) {
    var tag = hasFilter
      ? rematch.replace(/([^\s"']+?)=("|')[^>]+?\2/g, nextFilter)
      : rematch;

    // Cleanup extra whitespace left behind by removed attributes
    var collapsed = tag.replace(/\s+/g, ' ');
    return collapsed.replace(/ (\/)?>/, '$1>');
  };
}
module.exports.filterTag = filterTag;
/**
 * Reduce every <img> tag to a whitelisted set of attributes:
 * src, alt, title, height and width.
 *
 * @param {String} str
 * @return {String}
 */
function fixImages (str) {
  // NOTE: the pattern had been split across two lines with its "<img[^>"
  // fragment missing, which left an unterminated regex literal. Restored so
  // each whole <img ...> tag is handed to the tag filter.
  return str.replace(/(<img[^>]*?>)/g, filterTag(filterAttrs('src', 'alt', 'title', 'height', 'width')) );
}
module.exports.fixImages = fixImages;
/**
 * Remove unsafe attributes from every tag that has a space after its name
 * (i.e. every tag that may carry attributes).
 *
 * Plain names are matched exactly after lower-casing. The two patterns are
 * unanchored, so they match anywhere inside an attribute name.
 *
 * @param {String} str
 * @return {String}
 */
function stripUnsafeAttrs (str) {
  var unsafe = [
    'id',
    'class',
    'style',
    'accesskey',
    'action',
    'autocomplete',
    'autofocus',
    'clear',
    'contextmenu',
    'contenteditable',
    'draggable',
    'dropzone',
    'method',
    'tabindex',
    'target',
    /on\w+/i,     // event handler attributes
    /data-\w+/i   // data-* attributes
  ];
  var tagWithAttrs = /<([^ >]+?) [^>]*?>/g;
  var cleanTag = filterTag(stripAttrs(unsafe));

  return str.replace(tagWithAttrs, cleanTag);
}
module.exports.stripUnsafeAttrs = stripUnsafeAttrs;
function stripUnsafeTags (str) {
var el = /<(?:wbr|form|input|font|blink|script|style|comment|plaintext|xmp|link|listing|meta|body|frame|frameset)\b/;
var ct = 0, max = 2;
// We'll repeatedly try to strip any maliciously nested elements up to [max] times
while (el.test(str) && ct++ < max) {
str = str.replace(/