/*!
* resanitize - Regular expression-based HTML sanitizer and ad remover, geared toward RSS feed descriptions
* Copyright(c) 2012 Dan MacTough
* All rights reserved.
*/
/**
* Dependencies
*/
var validator = require('validator');
/**
 * Sanitize an HTML fragment: remove control characters, comments, ads,
 * XSS vectors, unsafe tags and unsafe attributes.
 *
 * Example:
 *
 *     var resanitize = require('resanitize');
 *     resanitize('<div style="border: 400px solid pink;">Headline</div>');
 *     // => '<div>Headline</div>'
 *
 * References:
 *   - http://en.wikipedia.org/wiki/C0_and_C1_control_codes
 *   - http://en.wikipedia.org/wiki/Unicode_control_characters
 *   - http://www.utf8-chartable.de/unicode-utf8-table.pl
 *
 * @param {String|Buffer} str HTML to sanitize; a Buffer is decoded with its
 *   default `toString()` encoding
 * @return {String} sanitized HTML
 * @throws {TypeError} when `str` is neither a String nor a Buffer
 * @api public
 */
function resanitize (str) {
  var isString = 'string' === typeof str;
  if (!isString && !Buffer.isBuffer(str)) {
    throw new TypeError('Invalid argument: must be String or Buffer');
  }
  var html = isString ? str : str.toString();

  // Character-level and comment cleanup, then ad removal. stripAds has to run
  // before the remaining steps because it matches on certain attribute values
  // that those later steps strip.
  var preFilters = [
    stripAsciiCtrlChars,
    stripExtendedCtrlChars,
    fixSpace,
    stripComments,
    stripAds
  ];
  html = preFilters.reduce(function (acc, filter) {
    return filter(acc);
  }, html);

  // Run the validator's XSS filter and drop the "[removed]" placeholders it
  // leaves behind.
  html = validator.sanitize(html).xss().replace(/\[removed\]/g, '');

  html = fixImages(html);
  html = stripUnsafeTags(html);
  return stripUnsafeAttrs(html);
}
module.exports = resanitize;
/**
 * Normalize problematic space-like characters:
 *   - U+00A0 (non-breaking space) becomes a regular space
 *   - U+2028 / U+2029 (line / paragraph separator) are removed
 *   - null characters are removed
 *
 * @param {String} str
 * @return {String}
 */
function fixSpace (str) {
  var nonBreakingSpace = /\u00A0/g;
  var ucsNewlines = /[\u2028\u2029]/g;
  var nullChars = /\0/g;

  var result = str.replace(nonBreakingSpace, ' ');
  result = result.replace(ucsNewlines, '');
  result = result.replace(nullChars, '');
  return result;
}
module.exports.fixSpace = fixSpace;
/**
* Strip superfluous whitespace
*/
function stripHtmlExtraSpace (str) {
return str.replace(/<(div|p)[^>]*?>\s*?(?: ]*?>)*?\s*?<\/\1>/gi, '')
.replace(/<(div|span)[^>]*?>\s*?<\/\1>/gi, '');
}
module.exports.stripHtmlExtraSpace = stripHtmlExtraSpace;
/**
 * Remove ASCII control characters (0x00-0x1F and 0x7F), keeping tab (0x09),
 * line feed (0x0A) and carriage return (0x0D).
 *
 * @param {String} str
 * @return {String}
 */
function stripAsciiCtrlChars (str) {
  var asciiControls = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/g;
  var cleaned = str.replace(asciiControls, '');
  return cleaned;
}
module.exports.stripAsciiCtrlChars = stripAsciiCtrlChars;
/**
 * Remove the ISO 6429 (C1) control characters, U+0080 through U+009F.
 *
 * @param {String} str
 * @return {String}
 */
function stripExtendedCtrlChars (str) {
  var c1Controls = /[\u0080-\u009F]+/g;
  var cleaned = str.replace(c1Controls, '');
  return cleaned;
}
module.exports.stripExtendedCtrlChars = stripExtendedCtrlChars;
/**
 * Strip HTML comments (`<!-- ... -->`), including comments that span
 * multiple lines.
 *
 * @param {String} str
 * @return {String}
 */
function stripComments (str) {
  // [\s\S] instead of `.` so that newlines inside a comment are matched; the
  // lazy quantifier stops at the first closing `-->`.
  return str.replace(/<!--[\s\S]*?-->/g, '');
}
module.exports.stripComments = stripComments;
/**
 * Build an attribute filter that permits only the provided attributes to
 * remain in a tag.
 *
 * The allowed names are given either as a single array or as individual
 * arguments. The returned function takes the full attribute text and the
 * attribute name; it returns the attribute text unchanged when the
 * lower-cased name is in the allowed list, and an empty string otherwise.
 *
 * @param {Array|...String} names allowed attribute names
 * @return {Function} function (attr, name) -> String
 */
function filterAttrs () {
  var first = arguments[0];
  var allowed = Array.isArray(first)
    ? first
    : Array.prototype.slice.call(arguments);

  return function (attr, name) {
    var key = name && name.toLowerCase();
    return allowed.indexOf(key) === -1 ? '' : attr;
  };
}
module.exports.filterAttrs = filterAttrs;
/**
 * Build an attribute filter that strips the provided attributes from a tag.
 *
 * Banned attributes are given either as a single array or as individual
 * arguments. String entries are compared against the lower-cased attribute
 * name; RegExp entries are tested against the attribute name as given.
 * Entries of any other type (including null and undefined) are ignored.
 *
 * The returned function takes the full attribute text and the attribute
 * name; it returns an empty string when the name is banned and the attribute
 * text unchanged otherwise.
 *
 * @param {Array|...(String|RegExp)} names banned attribute names / patterns
 * @return {Function} function (attr, name) -> String
 */
function stripAttrs () {
  var entries = Array.isArray(arguments[0])
    ? arguments[0]
    : Array.prototype.slice.call(arguments);
  var banned = [];
  var regexes = [];

  entries.forEach(function (entry) {
    if ('string' === typeof entry) {
      banned.push(entry);
    }
    else if (Object.prototype.toString.call(entry) === '[object RegExp]') {
      regexes.push(entry);
    }
    // Anything else (null, undefined, numbers, ...) is silently ignored.
  });

  return function (attr, name) {
    var isBannedName = banned.indexOf(name && name.toLowerCase()) !== -1;
    var matchesPattern = regexes.some(function (re) {
      // A regex with the `g` or `y` flag remembers lastIndex between test()
      // calls; reset it so every attribute name is tested from the start.
      re.lastIndex = 0;
      return re.test(name);
    });
    return (isBannedName || matchesPattern) ? '' : attr;
  };
}
module.exports.stripAttrs = stripAttrs;
/**
 * Build a filter for a single HTML opening or self-closing tag.
 *
 * The returned function takes the text of one tag. When `nextFilter` is a
 * function, every quoted attribute (`name="value"` or `name='value'`) in the
 * tag is passed through it as a String#replace callback, i.e. it is called
 * with (attributeText, name, quote, ...) and its return value replaces the
 * attribute. Afterwards runs of whitespace are collapsed to a single space
 * and the space before the closing `>` or `/>` is removed.
 *
 * @param {Function} [nextFilter] attribute filter, e.g. from filterAttrs()
 *   or stripAttrs()
 * @return {Function} function (tagText) -> String
 */
function filterTag (nextFilter) {
  return function (rematch) {
    var tag = rematch;
    if ('function' === typeof nextFilter) {
      // The value part uses `*?` so that an empty value (alt="") is matched
      // as a complete attribute. With `+?` the closing quote was consumed as
      // the value and the match ran on into the next attribute, which was
      // then dropped or let through together with the empty one.
      tag = tag.replace(/([^\s"']+?)=("|')[^>]*?\2/g, nextFilter);
    }
    // Cleanup extra whitespace
    return tag.replace(/\s+/g, ' ')
              .replace(/ (\/)?>/, '$1>');
  };
}
module.exports.filterTag = filterTag;
/**
 * Reduce every <img> tag to a whitelist of attributes: src, alt, title,
 * height and width. All other quoted attributes are removed from the tag.
 *
 * @param {String} str
 * @return {String}
 */
function fixImages (str) {
  // The pattern must be anchored on `<img`; without that prefix it matched
  // every bare `>` and no image attribute was ever filtered. `\b` keeps it
  // from matching other tag names that merely start with "img", and the `i`
  // flag covers upper-case markup such as <IMG ...>.
  var keepImageAttrs = filterTag(filterAttrs('src', 'alt', 'title', 'height', 'width'));
  return str.replace(/<img\b[^>]*?>/gi, keepImageAttrs);
}
module.exports.fixImages = fixImages;
/**
 * Remove unsafe attributes from every tag that has a space after its tag
 * name (i.e. every tag that can carry attributes).
 *
 * String entries in the list are matched against the lower-cased attribute
 * name; the two patterns are unanchored, so they match event-handler
 * (`on...`) and `data-...` names anywhere within the attribute name.
 *
 * @param {String} str
 * @return {String}
 */
function stripUnsafeAttrs (str) {
  var bannedAttrs = [
    'id',
    'class',
    'style',
    'accesskey',
    'action',
    'autocomplete',
    'autofocus',
    'clear',
    'contextmenu',
    'contenteditable',
    'draggable',
    'dropzone',
    'method',
    'tabindex',
    'target',
    /on\w+/i,
    /data-\w+/i
  ];
  var tagWithAttrs = /<([^ >]+?) [^>]*?>/g;
  var removeBanned = filterTag(stripAttrs(bannedAttrs));

  return str.replace(tagWithAttrs, removeBanned);
}
module.exports.stripUnsafeAttrs = stripUnsafeAttrs;
function stripUnsafeTags (str) {
// var el = /<(?:wbr|form|input|font|blink|script|style|comment|plaintext|xmp|link|listing|meta|body|frame|frameset)\b/;
var ct = 0, max = 2;
// Prohibited elements
var otherTags = ['wbr','style', 'comment', 'plaintext', 'xmp', 'listing',
// 以下是evernote禁止的
'applet','base','basefont','bgsound','blink','body','button','dir','embed','fieldset','frameset','head',
'html','iframe','ilayer','input','isindex','label','layer','legend','link','marquee','menu','meta','noframes',
'noscript','object','optgroup','option','param','plaintext','script','select','style','textarea','xml'];
var patterReplace1 = '';
var patterReplace2 = '';
var pattern = '<(?:';
for (var i = 0; i < otherTags.length; ++i) {
pattern += otherTags[i] + '|';
patterReplace2 += otherTags[i] + '|';
}
pattern += 'body)\\b';
patterReplace2 += 'body'
var reg = new RegExp(pattern);
// 单个自闭合
var replageReg = new RegExp('<\\/?(?:' + patterReplace2 + ')[^>]*?>', 'gi');
// We'll repeatedly try to strip any maliciously nested elements up to [max] times
while (reg.test(str) && ct++ < max) {
for (var i = 0; i < otherTags.length; ++i) {
var tag = otherTags[i];
// 双闭合
str = str.replace(new RegExp('<' + tag + '[^>]*?>[\\s\\S]*?<\\/' + tag + '>', 'gi'), '')
}
// 单个自闭合
str = str.replace(replageReg, '');
// str = str.replace(/