Fastest method to escape HTML tags as HTML entities?

JavascriptHtmlRegexPerformanceString

Javascript Problem Overview


I'm writing a Chrome extension that involves doing a lot of the following job: sanitizing strings that might contain HTML tags, by converting <, > and & to &lt;, &gt; and &amp;, respectively.

(In other words, the same as PHP's htmlspecialchars(str, ENT_NOQUOTES) – I don't think there's any real need to convert double-quote characters.)

This is the fastest function I have found so far:

function safe_tags(str) {
    return str.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;') ;
}

But there's still a big lag when I have to run a few thousand strings through it in one go.

Can anyone improve on this? It's mostly for strings between 10 and 150 characters, if that makes a difference.

(One idea I had was not to bother encoding the greater-than sign – would there be any real danger with that?)

Javascript Solutions


Solution 1 - Javascript

Here's one way you can do this:

var escape = document.createElement('textarea');
function escapeHTML(html) {
	escape.textContent = html;
	return escape.innerHTML;
}

function unescapeHTML(html) {
	escape.innerHTML = html;
	return escape.textContent;
}

Here's a demo.

Solution 2 - Javascript

You could try passing a callback function to perform the replacement:

var tagsToReplace = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;'
};
 
function replaceTag(tag) {
    return tagsToReplace[tag] || tag;
}
 
function safe_tags_replace(str) {
    return str.replace(/[&<>]/g, replaceTag);
}

Here is a performance test: http://jsperf.com/encode-html-entities to compare with calling the replace function repeatedly, and using the DOM method proposed by Dmitrij.

Your way seems to be faster...

Why do you need it, though?

Solution 3 - Javascript

Martijn's method as a prototype function:

String.prototype.escape = function() {
    var tagsToReplace = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;'
    };
    return this.replace(/[&<>]/g, function(tag) {
        return tagsToReplace[tag] || tag;
    });
};

var a = "<abc>";
var b = a.escape(); // "&lt;abc&gt;"

Solution 4 - Javascript

An even quicker/shorter solution is:

escaped = new Option(html).innerHTML

This is related to some weird vestige of JavaScript whereby the Option element retains a constructor that does this sort of escaping automatically.

Credit to https://github.com/jasonmoo/t.js/blob/master/t.js

Solution 5 - Javascript

The fastest method is:

function escapeHTML(html) {
    return document.createElement('div').appendChild(document.createTextNode(html)).parentNode.innerHTML;
}

This method is about twice faster than the methods based on 'replace', see http://jsperf.com/htmlencoderegex/35 .

Source: https://stackoverflow.com/a/17546215/698168

Solution 6 - Javascript

The AngularJS source code also has a version inside of angular-sanitize.js.

var SURROGATE_PAIR_REGEXP = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g,
    // Match everything outside of normal chars and " (quote character)
    NON_ALPHANUMERIC_REGEXP = /([^\#-~| |!])/g;
/**
 * Escapes all potentially dangerous characters, so that the
 * resulting string can be safely inserted into attribute or
 * element text.
 * @param value
 * @returns {string} escaped text
 */
function encodeEntities(value) {
  return value.
    replace(/&/g, '&amp;').
    replace(SURROGATE_PAIR_REGEXP, function(value) {
      var hi = value.charCodeAt(0);
      var low = value.charCodeAt(1);
      return '&#' + (((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000) + ';';
    }).
    replace(NON_ALPHANUMERIC_REGEXP, function(value) {
      return '&#' + value.charCodeAt(0) + ';';
    }).
    replace(/</g, '&lt;').
    replace(/>/g, '&gt;');
}

Solution 7 - Javascript

All-in-one script:

// HTML entities Encode/Decode

function htmlspecialchars(str) {
	var map = {
		"&": "&amp;",
		"<": "&lt;",
		">": "&gt;",
		"\"": "&quot;",
		"'": "&#39;" // ' -> &apos; for XML only
	};
    return str.replace(/[&<>"']/g, function(m) { return map[m]; });
}
function htmlspecialchars_decode(str) {
	var map = {
		"&amp;": "&",
		"&lt;": "<",
		"&gt;": ">",
		"&quot;": "\"",
		"&#39;": "'"
	};
    return str.replace(/(&amp;|&lt;|&gt;|&quot;|&#39;)/g, function(m) { return map[m]; });
}
function htmlentities(str) {
	var textarea = document.createElement("textarea");
	textarea.innerHTML = str;
	return textarea.innerHTML;
}
function htmlentities_decode(str) {
	var textarea = document.createElement("textarea");
	textarea.innerHTML = str;
	return textarea.value;
}

http://pastebin.com/JGCVs0Ts

Solution 8 - Javascript

function encode(r) {
  return r.replace(/[\x26\x0A\x3c\x3e\x22\x27]/g, function(r) {
	return "&#" + r.charCodeAt(0) + ";";
  });
}

test.value=encode('How to encode\nonly html tags &<>\'" nice & fast!');

/*
 \x26 is &ampersand (it has to be first),
 \x0A is newline,
 \x22 is ",
 \x27 is ',
 \x3c is <,
 \x3e is >
*/

<textarea id=test rows=11 cols=55>www.WHAK.com</textarea>

Solution 9 - Javascript

Martijn's method as single function with handling " mark (using in javascript) :

function escapeHTML(html) {
    var fn=function(tag) {
        var charsToReplace = {
            '&': '&amp;',
            '<': '&lt;',
            '>': '&gt;',
            '"': '&#34;'
        };
        return charsToReplace[tag] || tag;
    }
    return html.replace(/[&<>"]/g, fn);
}

Solution 10 - Javascript

I'm not entirely sure about speed, but if you are looking for simplicity I would suggest using the lodash/underscore escape function.

Solution 11 - Javascript

I'll add XMLSerializer to the pile. It provides the fastest result without using any object caching (not on the serializer, nor on the Text node).

function serializeTextNode(text) {
  return new XMLSerializer().serializeToString(document.createTextNode(text));
}

The added bonus is that it supports attributes which is serialized differently than text nodes:

function serializeAttributeValue(value) {
  const attr = document.createAttribute('a');
  attr.value = value;
  return new XMLSerializer().serializeToString(attr);
}

You can see what it's actually replacing by checking the spec, both for text nodes and for attribute values. The full documentation has more node types, but the concept is the same.

As for performance, it's the fastest when not cached. When you do allow caching, then calling innerHTML on an HTMLElement with a child Text node is fastest. Regex would be slowest (as proven by other comments). Of course, XMLSerializer could be faster on other browsers, but in my (limited) testing, a innerHTML is fastest.


Fastest single line:

new XMLSerializer().serializeToString(document.createTextNode(text));

Fastest with caching:

const cachedElementParent = document.createElement('div');
const cachedChildTextNode = document.createTextNode('');
cachedElementParent.appendChild(cachedChildTextNode);

function serializeTextNode(text) {
  cachedChildTextNode.nodeValue = text;
  return cachedElementParent.innerHTML;
}

https://jsperf.com/htmlentityencode/1

Solution 12 - Javascript

A bit late to the show, but what's wrong with using encodeURIComponent() and decodeURIComponent()?

Attributions

All content for this solution is sourced from the original question on Stackoverflow.

The content on this page is licensed under the Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license.

Content TypeOriginal AuthorOriginal Content on Stackoverflow
QuestioncallumView Question on Stackoverflow
Solution 1 - JavascriptWeb_DesignerView Answer on Stackoverflow
Solution 2 - JavascriptMartijnView Answer on Stackoverflow
Solution 3 - JavascriptAram KocharyanView Answer on Stackoverflow
Solution 4 - JavascriptToddView Answer on Stackoverflow
Solution 5 - JavascriptJulien KroneggView Answer on Stackoverflow
Solution 6 - JavascriptKevin HakansonView Answer on Stackoverflow
Solution 7 - JavascriptbaptxView Answer on Stackoverflow
Solution 8 - JavascriptDave BrownView Answer on Stackoverflow
Solution 9 - JavascriptimanView Answer on Stackoverflow
Solution 10 - JavascriptgilmaticView Answer on Stackoverflow
Solution 11 - JavascriptShortFuseView Answer on Stackoverflow
Solution 12 - Javascriptsuncat100View Answer on Stackoverflow