How to convert characters to HTML entities using plain JavaScript

Javascript Html Escaping Symbols Html Entities

Javascript Problem Overview

I have the following:

var text = "Übergroße Äpfel mit Würmern";

I'm searching for a Javascript function to transform the text so that every special letter is represented by its HTML entity sequence like this:

var newText = magicFunction(text);
...
newText = "&Uuml;bergro&szlig;e &Auml;pfel mit W&uuml;rmern";

The function should not only escape the letters of this example but also all of these.

How would you achieve that? Is there any existing function out there? (Plain, because a solution without a framework is preferred)

Btw: Yes, I've seen this question but it doesn't address my need.

Javascript Solutions

Solution 1 - Javascript

With the help of bucabay and the advice to create my own function i created this one which works for me. What do you guys think, is there a better solution somewhere?

if(typeof escapeHtmlEntities == 'undefined') {
        escapeHtmlEntities = function (text) {
            return text.replace(/[\u00A0-\u2666<>\&]/g, function(c) {
                return '&' + 
                (escapeHtmlEntities.entityTable[c.charCodeAt(0)] || '#'+c.charCodeAt(0)) + ';';
            });
        };

        // all HTML4 entities as defined here: http://www.w3.org/TR/html4/sgml/entities.html
        // added: amp, lt, gt, quot and apos
        escapeHtmlEntities.entityTable = {
            34 : 'quot', 
            38 : 'amp', 
            39 : 'apos', 
            60 : 'lt', 
            62 : 'gt', 
            160 : 'nbsp', 
            161 : 'iexcl', 
            162 : 'cent', 
            163 : 'pound', 
            164 : 'curren', 
            165 : 'yen', 
            166 : 'brvbar', 
            167 : 'sect', 
            168 : 'uml', 
            169 : 'copy', 
            170 : 'ordf', 
            171 : 'laquo', 
            172 : 'not', 
            173 : 'shy', 
            174 : 'reg', 
            175 : 'macr', 
            176 : 'deg', 
            177 : 'plusmn', 
            178 : 'sup2', 
            179 : 'sup3', 
            180 : 'acute', 
            181 : 'micro', 
            182 : 'para', 
            183 : 'middot', 
            184 : 'cedil', 
            185 : 'sup1', 
            186 : 'ordm', 
            187 : 'raquo', 
            188 : 'frac14', 
            189 : 'frac12', 
            190 : 'frac34', 
            191 : 'iquest', 
            192 : 'Agrave', 
            193 : 'Aacute', 
            194 : 'Acirc', 
            195 : 'Atilde', 
            196 : 'Auml', 
            197 : 'Aring', 
            198 : 'AElig', 
            199 : 'Ccedil', 
            200 : 'Egrave', 
            201 : 'Eacute', 
            202 : 'Ecirc', 
            203 : 'Euml', 
            204 : 'Igrave', 
            205 : 'Iacute', 
            206 : 'Icirc', 
            207 : 'Iuml', 
            208 : 'ETH', 
            209 : 'Ntilde', 
            210 : 'Ograve', 
            211 : 'Oacute', 
            212 : 'Ocirc', 
            213 : 'Otilde', 
            214 : 'Ouml', 
            215 : 'times', 
            216 : 'Oslash', 
            217 : 'Ugrave', 
            218 : 'Uacute', 
            219 : 'Ucirc', 
            220 : 'Uuml', 
            221 : 'Yacute', 
            222 : 'THORN', 
            223 : 'szlig', 
            224 : 'agrave', 
            225 : 'aacute', 
            226 : 'acirc', 
            227 : 'atilde', 
            228 : 'auml', 
            229 : 'aring', 
            230 : 'aelig', 
            231 : 'ccedil', 
            232 : 'egrave', 
            233 : 'eacute', 
            234 : 'ecirc', 
            235 : 'euml', 
            236 : 'igrave', 
            237 : 'iacute', 
            238 : 'icirc', 
            239 : 'iuml', 
            240 : 'eth', 
            241 : 'ntilde', 
            242 : 'ograve', 
            243 : 'oacute', 
            244 : 'ocirc', 
            245 : 'otilde', 
            246 : 'ouml', 
            247 : 'divide', 
            248 : 'oslash', 
            249 : 'ugrave', 
            250 : 'uacute', 
            251 : 'ucirc', 
            252 : 'uuml', 
            253 : 'yacute', 
            254 : 'thorn', 
            255 : 'yuml', 
            402 : 'fnof', 
            913 : 'Alpha', 
            914 : 'Beta', 
            915 : 'Gamma', 
            916 : 'Delta', 
            917 : 'Epsilon', 
            918 : 'Zeta', 
            919 : 'Eta', 
            920 : 'Theta', 
            921 : 'Iota', 
            922 : 'Kappa', 
            923 : 'Lambda', 
            924 : 'Mu', 
            925 : 'Nu', 
            926 : 'Xi', 
            927 : 'Omicron', 
            928 : 'Pi', 
            929 : 'Rho', 
            931 : 'Sigma', 
            932 : 'Tau', 
            933 : 'Upsilon', 
            934 : 'Phi', 
            935 : 'Chi', 
            936 : 'Psi', 
            937 : 'Omega', 
            945 : 'alpha', 
            946 : 'beta', 
            947 : 'gamma', 
            948 : 'delta', 
            949 : 'epsilon', 
            950 : 'zeta', 
            951 : 'eta', 
            952 : 'theta', 
            953 : 'iota', 
            954 : 'kappa', 
            955 : 'lambda', 
            956 : 'mu', 
            957 : 'nu', 
            958 : 'xi', 
            959 : 'omicron', 
            960 : 'pi', 
            961 : 'rho', 
            962 : 'sigmaf', 
            963 : 'sigma', 
            964 : 'tau', 
            965 : 'upsilon', 
            966 : 'phi', 
            967 : 'chi', 
            968 : 'psi', 
            969 : 'omega', 
            977 : 'thetasym', 
            978 : 'upsih', 
            982 : 'piv', 
            8226 : 'bull', 
            8230 : 'hellip', 
            8242 : 'prime', 
            8243 : 'Prime', 
            8254 : 'oline', 
            8260 : 'frasl', 
            8472 : 'weierp', 
            8465 : 'image', 
            8476 : 'real', 
            8482 : 'trade', 
            8501 : 'alefsym', 
            8592 : 'larr', 
            8593 : 'uarr', 
            8594 : 'rarr', 
            8595 : 'darr', 
            8596 : 'harr', 
            8629 : 'crarr', 
            8656 : 'lArr', 
            8657 : 'uArr', 
            8658 : 'rArr', 
            8659 : 'dArr', 
            8660 : 'hArr', 
            8704 : 'forall', 
            8706 : 'part', 
            8707 : 'exist', 
            8709 : 'empty', 
            8711 : 'nabla', 
            8712 : 'isin', 
            8713 : 'notin', 
            8715 : 'ni', 
            8719 : 'prod', 
            8721 : 'sum', 
            8722 : 'minus', 
            8727 : 'lowast', 
            8730 : 'radic', 
            8733 : 'prop', 
            8734 : 'infin', 
            8736 : 'ang', 
            8743 : 'and', 
            8744 : 'or', 
            8745 : 'cap', 
            8746 : 'cup', 
            8747 : 'int', 
            8756 : 'there4', 
            8764 : 'sim', 
            8773 : 'cong', 
            8776 : 'asymp', 
            8800 : 'ne', 
            8801 : 'equiv', 
            8804 : 'le', 
            8805 : 'ge', 
            8834 : 'sub', 
            8835 : 'sup', 
            8836 : 'nsub', 
            8838 : 'sube', 
            8839 : 'supe', 
            8853 : 'oplus', 
            8855 : 'otimes', 
            8869 : 'perp', 
            8901 : 'sdot', 
            8968 : 'lceil', 
            8969 : 'rceil', 
            8970 : 'lfloor', 
            8971 : 'rfloor', 
            9001 : 'lang', 
            9002 : 'rang', 
            9674 : 'loz', 
            9824 : 'spades', 
            9827 : 'clubs', 
            9829 : 'hearts', 
            9830 : 'diams', 
            338 : 'OElig', 
            339 : 'oelig', 
            352 : 'Scaron', 
            353 : 'scaron', 
            376 : 'Yuml', 
            710 : 'circ', 
            732 : 'tilde', 
            8194 : 'ensp', 
            8195 : 'emsp', 
            8201 : 'thinsp', 
            8204 : 'zwnj', 
            8205 : 'zwj', 
            8206 : 'lrm', 
            8207 : 'rlm', 
            8211 : 'ndash', 
            8212 : 'mdash', 
            8216 : 'lsquo', 
            8217 : 'rsquo', 
            8218 : 'sbquo', 
            8220 : 'ldquo', 
            8221 : 'rdquo', 
            8222 : 'bdquo', 
            8224 : 'dagger', 
            8225 : 'Dagger', 
            8240 : 'permil', 
            8249 : 'lsaquo', 
            8250 : 'rsaquo', 
            8364 : 'euro'
        };
    }

usage example:

var text = "Übergroße Äpfel mit Würmern";
alert(escapeHtmlEntities (text));

result:

&Uuml;bergro&szlig;e &Auml;pfel mit W&uuml;rmern

Update1: Thanks bucabay again for the || - hint
Update2: Updated entity table with amp,lt,gt,apos,quot, thanks richardtallent for the hint
Update3(in 2014): Mathias Bynens created a lib called 'he', maybe it serves your need.

Solution 2 - Javascript

All the other solutions suggested here, as well as most other JavaScript libraries that do HTML entity encoding/decoding, make several mistakes:

They don’t implement the full list of named character references that browsers support. For example, htmlDecode('&PrecedesSlantEqual;') should return '≼' (i.e. '\u227C').
They don’t support encoding astral symbols correctly. For example, htmlEncode('𝌆') should return something like 𝌆 or 𝌆. If an implementation returns two separate entities instead (e.g. &#xD834;&#xDF06; or &#55348;&#57094;), it is broken.
They don’t support decoding astral symbols correctly. htmlDecode('𝌆') should return '𝌆' and not '팆' (i.e. '\uD306').
They don’t implement the character reference overrides table listed in the HTML Standard. For example, htmlDecode('') should return '€' (i.e. '\u20AC').
They should perform decoding in a single pass. For example, htmlDecode('&amp;') should return '&', not &.

For a robust solution that avoids all these issues, use a library I wrote called he for this. From its README:

> he (for “HTML entities”) is a robust HTML entity encoder/decoder written in JavaScript. It supports all standardized named character references as per HTML, handles ambiguous ampersands and other edge cases just like a browser would, has an extensive test suite, and — contrary to many other JavaScript solutions — he handles astral Unicode symbols just fine. An online demo is available.

Solution 3 - Javascript

Using escape() should work with the character code range 0x00 to 0xFF (UTF-8 range).

If you go beyond 0xFF (255), such as 0x100 (256) then escape() will not work:

escape("\u0100"); // %u0100

and:

text = "\u0100"; // Ā
html = escape(text).replace(/%(..)/g,"&#x$1;"); // &#xu0;100

So, if you want to cover all Unicode charachacters as defined on http://www.w3.org/TR/html4/sgml/entities.html , then you could use something like:

var html = text.replace(/[\u00A0-\u00FF]/g, function(c) {
   return '&#'+c.charCodeAt(0)+';';
});

Note here the range is between: \u00A0-\u00FF.

Thats the first character code range defined in http://www.w3.org/TR/html4/sgml/entities.html which is the same as what escape() covers.

You'll need to add the other ranges you want to cover as well, or all of them.

Example: UTF-8 range with general punctuations (\u00A0-\u00FF and \u2022-\u2135)

var html = text.replace(/[\u00A0-\u00FF\u2022-\u2135]/g, function(c) {
   return '&#'+c.charCodeAt(0)+';';
});

Edit:

BTW: \u00A0-\u2666 should convert every Unicode character code not within ASCII range to HTML entities blindly:

var html = text.replace(/[\u00A0-\u2666]/g, function(c) {
   return '&#'+c.charCodeAt(0)+';';
});

Solution 4 - Javascript

You can use:

function encodeHTML(str){
 var aStr = str.split(''),
     i = aStr.length,
     aRet = [];

   while (i--) {
    var iC = aStr[i].charCodeAt();
    if (iC < 65 || iC > 127 || (iC>90 && iC<97)) {
      aRet.push('&#'+iC+';');
    } else {
      aRet.push(aStr[i]);
    }
  }
 return aRet.reverse().join('');
}

This function HTMLEncodes everything that is not a-z/A-Z.

[Edit] A rather old answer. Let's add a simpler String extension to encode all extended characters:

String.prototype.encodeHTML = function () {
  return this.replace(/[\u0080-\u024F]/g, 
          function (v) {return '&#'+v.charCodeAt()+';';}
         );
}
// usage
log('Übergroße Äpfel mit Würmern'.encodeHTML());
//=> '&#220;bergro&#223;e &#196;pfel mit W&#252;rmern'

Solution 5 - Javascript

The he library is the only 100% reliable solution that I know of!

He is written by Mathias Bynens - one of the world's most renowned JavaScript gurus - and has the following features :

Support for all standardized named character references
Support for unicode
Works fine with ambiguous ampersands

Example use

he.encode('foo © bar ≠ baz 𝌆 qux'); 
// Output : 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'

he.decode('foo &copy; bar &ne; baz &#x1D306; qux');
// Output : 'foo © bar ≠ baz 𝌆 qux'

Solution 6 - Javascript

Having a lookup table with a bazillion replace() calls is slow and not maintainable.

Fortunately, the build-in escape() function also encodes most of the same characters, and puts them in a consistent format (%XX, where XX is the hex value of the character).

So, you can let escape() method do most of the work for you and just change its answer to be HTML entities instead of URL-escaped characters:

htmlescaped = escape(mystring).replace(/%(..)/g,"&#x$1;");

This uses the hex format for escaping values rather than the named entities, but for storing and displaying the values, it works just as well as named entities.

Of course, escape also escapes characters you don't need to escape in HTML (spaces, for instance), but you can unescape them with a few replace calls.

Edit: I like bucabay's answer better than my own... handles a larger range of characters, and requires no hacking afterward to get spaces, slashes, etc. unescaped.

Solution 7 - Javascript

I fixed my problem by using encodeURIComponent() instead of escape().

This might be the fix for you if the problem happens when sending your string in a URL.

Try this with the phrase ("hi & % ‘")

escape() returns

"hi%20%26%20%25%20%u2018"

Notice the %u2018 isn't very url friendly and can break the rest of the query string.

encodeURI() returns

"hi%20&%20%25%20%E2%80%98"

Notice the ampersand is still there.

encodeURIComponent() returns

"hi%20%26%20%25%20%E2%80%98"

Finally, all of our characters are properly encoded.

Solution 8 - Javascript

Demo on JSFiddle

here's a tiny stand alone method that:

attempts to consolidate the answers on this page, without using a library
works in older browsers
supports surrogate pairs (like emojis)
applies character overrides (what's that? not sure exactly)

i don't know too much about unicode, but it seems to be working well.

// escape a string for display in html
// see also: 
// polyfill for String.prototype.codePointAt
//   https://raw.githubusercontent.com/mathiasbynens/String.prototype.codePointAt/master/codepointat.js
// how to convert characters to html entities
//     http://stackoverflow.com/a/1354491/347508
// html overrides from 
//   https://html.spec.whatwg.org/multipage/syntax.html#table-charref-overrides / http://stackoverflow.com/questions/1354064/how-to-convert-characters-to-html-entities-using-plain-javascript/23831239#comment36668052_1354098

var _escape_overrides = { 0x00:'\uFFFD',0x80:'\u20AC',0x82:'\u201A',0x83:'\u0192',0x84:'\u201E',0x85:'\u2026',0x86:'\u2020',0x87:'\u2021',0x88:'\u02C6',0x89:'\u2030',0x8A:'\u0160',0x8B:'\u2039',0x8C:'\u0152',0x8E:'\u017D',0x91:'\u2018',0x92:'\u2019',0x93:'\u201C',0x94:'\u201D',0x95:'\u2022',0x96:'\u2013',0x97:'\u2014',0x98:'\u02DC',0x99:'\u2122',0x9A:'\u0161',0x9B:'\u203A',0x9C:'\u0153',0x9E:'\u017E',0x9F:'\u0178' }; 

function escapeHtml(str){
	return str.replace(/([\u0000-\uD799]|[\uD800-\uDBFF][\uDC00-\uFFFF])/g, function(c) {
		var c1 = c.charCodeAt(0);
		// ascii character, use override or escape
        if( c1 <= 0xFF ) return (c1=_escape_overrides[c1])?c1:escape(c).replace(/%(..)/g,"&#x$1;");
		// utf8/16 character
		else if( c.length == 1 ) return "&#" + c1 + ";"; 
		// surrogate pair
		else if( c.length == 2 && c1 >= 0xD800 && c1 <= 0xDBFF ) return "&#" + ((c1-0xD800)*0x400 + c.charCodeAt(1) - 0xDC00 + 0x10000) + ";"
		// no clue .. 
		else return "";
	});
}

Solution 9 - Javascript

Just reposting @bucababy's answer as a "bookmarklet", as it's sometimes easier than using those lookup pages:

alert(prompt('Enter characters to htmlEncode', '').replace(/[\u00A0-\u2666]/g, function(c) {
   return '&#'+c.charCodeAt(0)+';';
}));

Solution 10 - Javascript

I recommend to use the JS library entities. Using the library is quite simple. See the examples from docs:

const entities = require("entities");
//encoding
entities.escape("&#38;"); // "&#x26;#38;"
entities.encodeXML("&#38;"); // "&amp;#38;"
entities.encodeHTML("&#38;"); // "&amp;&num;38&semi;"
//decoding
entities.decodeXML("asdf &amp; &#xFF; &#xFC; &apos;"); // "asdf & ÿ ü '"
entities.decodeHTML("asdf &amp; &yuml; &uuml; &apos;"); // "asdf & ÿ ü '"

Solution 11 - Javascript

Best solution is posted at phpjs.org implementation of PHP function htmlentities

The format is htmlentities(string, quote_style, charset, double_encode) Full documentation on the PHP function which is identical can be read here

Solution 12 - Javascript

I adapted one of the answers from the referenced question, but added the ability to define an explicit mapping for character names.

var char_names = {
	160:'nbsp',
	161:'iexcl',
	220:'Uuml',
	223:'szlig',
	196:'Auml',
	252:'uuml',
	};

function HTMLEncode(str){
	 var aStr = str.split(''),
		 i = aStr.length,
		 aRet = [];

	 while (--i >= 0) {
	  var iC = aStr[i].charCodeAt();
	   if (iC < 32 || (iC > 32 && iC < 65) || iC > 127 || (iC>90 && iC<97)) {
		if(char_names[iC]!=undefined) {
		 aRet.push('&'+char_names[iC]+';');
		}
		else {
		 aRet.push('&#'+iC+';');
		}
	   } else {
		aRet.push(aStr[i]);
	   }
	}
	return aRet.reverse().join('');
   }

var text = "Übergroße Äpfel mit Würmer";

alert(HTMLEncode(text));

Content Type	Original Author	Original Content on Stackoverflow
Question	Chris	View Question on Stackoverflow
Solution 1 - Javascript	Chris	View Answer on Stackoverflow
Solution 2 - Javascript	Mathias Bynens	View Answer on Stackoverflow
Solution 3 - Javascript	bucabay	View Answer on Stackoverflow
Solution 4 - Javascript	KooiInc	View Answer on Stackoverflow
Solution 5 - Javascript	John Slegers	View Answer on Stackoverflow
Solution 6 - Javascript	richardtallent	View Answer on Stackoverflow
Solution 7 - Javascript	Adam Grant	View Answer on Stackoverflow
Solution 8 - Javascript	kritzikratzi	View Answer on Stackoverflow
Solution 9 - Javascript	drzaus	View Answer on Stackoverflow
Solution 10 - Javascript	Antonín Slejška	View Answer on Stackoverflow
Solution 11 - Javascript	Ronny Sherer	View Answer on Stackoverflow
Solution 12 - Javascript	Jason R. Coombs	View Answer on Stackoverflow