Sort an array by the "Levenshtein Distance" with best performance in Javascript
JavascriptJquerySortingLevenshtein DistanceJavascript Problem Overview
So I have a random javascript array of names...
[@larry,@nicholas,@notch] etc.
They all start with the @ symbol. I'd like to sort them by the Levenshtein Distance so that the the ones at the top of the list are closest to the search term. At the moment, I have some javascript that uses jQuery's .grep()
on it using javascript .match()
method around the entered search term on key press:
(code edited since first publish)
limitArr = $.grep(imTheCallback, function(n){
return n.match(searchy.toLowerCase())
});
modArr = limitArr.sort(levenshtein(searchy.toLowerCase(), 50))
if (modArr[0].substr(0, 1) == '@') {
if (atRes.childred('div').length < 6) {
modArr.forEach(function(i){
atRes.append('<div class="oneResult">' + i + '</div>');
});
}
} else if (modArr[0].substr(0, 1) == '#') {
if (tagRes.children('div').length < 6) {
modArr.forEach(function(i){
tagRes.append('<div class="oneResult">' + i + '</div>');
});
}
}
$('.oneResult:first-child').addClass('active');
$('.oneResult').click(function(){
window.location.href = 'http://hashtag.ly/' + $(this).html();
});
It also has some if statements detecting if the array contains hashtags (#) or mentions (@). Ignore that. The imTheCallback
is the array of names, either hashtags or mentions, then modArr
is the array sorted. Then the .atResults
and .tagResults
elements are the elements that it appends each time in the array to, this forms a list of names based on the entered search terms.
I also have the Levenshtein Distance algorithm:
var levenshtein = function(min, split) {
// Levenshtein Algorithm Revisited - WebReflection
try {
split = !("0")[0]
} catch(i) {
split = true
};
return function(a, b) {
if (a == b)
return 0;
if (!a.length || !b.length)
return b.length || a.length;
if (split) {
a = a.split("");
b = b.split("")
};
var len1 = a.length + 1,
len2 = b.length + 1,
I = 0,
i = 0,
d = [[0]],
c, j, J;
while (++i < len2)
d[0][i] = i;
i = 0;
while (++i < len1) {
J = j = 0;
c = a[I];
d[i] = [i];
while(++j < len2) {
d[i][j] = min(d[I][j] + 1, d[i][J] + 1, d[I][J] + (c != b[J]));
++J;
};
++I;
};
return d[len1 - 1][len2 - 1];
}
}(Math.min, false);
How can I work with algorithm (or a similar one) into my current code to sort it without bad performance?
UPDATE:
So I'm now using James Westgate's Lev Dist function. Works WAYYYY fast. So performance is solved, the issue now is using it with source...
modArr = limitArr.sort(function(a, b){
levDist(a, searchy)
levDist(b, searchy)
});
My problem now is general understanding on using the .sort()
method. Help is appreciated, thanks.
Thanks!
Javascript Solutions
Solution 1 - Javascript
I wrote an inline spell checker a few years ago and implemented a Levenshtein algorithm - since it was inline and for IE8 I did quite a lot of performance optimisation.
var levDist = function(s, t) {
var d = []; //2d matrix
// Step 1
var n = s.length;
var m = t.length;
if (n == 0) return m;
if (m == 0) return n;
//Create an array of arrays in javascript (a descending loop is quicker)
for (var i = n; i >= 0; i--) d[i] = [];
// Step 2
for (var i = n; i >= 0; i--) d[i][0] = i;
for (var j = m; j >= 0; j--) d[0][j] = j;
// Step 3
for (var i = 1; i <= n; i++) {
var s_i = s.charAt(i - 1);
// Step 4
for (var j = 1; j <= m; j++) {
//Check the jagged ld total so far
if (i == j && d[i][j] > 4) return n;
var t_j = t.charAt(j - 1);
var cost = (s_i == t_j) ? 0 : 1; // Step 5
//Calculate the minimum
var mi = d[i - 1][j] + 1;
var b = d[i][j - 1] + 1;
var c = d[i - 1][j - 1] + cost;
if (b < mi) mi = b;
if (c < mi) mi = c;
d[i][j] = mi; // Step 6
//Damerau transposition
if (i > 1 && j > 1 && s_i == t.charAt(j - 2) && s.charAt(i - 2) == t_j) {
d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
}
}
}
// Step 7
return d[n][m];
}
Solution 2 - Javascript
I came to this solution:
var levenshtein = (function() {
var row2 = [];
return function(s1, s2) {
if (s1 === s2) {
return 0;
} else {
var s1_len = s1.length, s2_len = s2.length;
if (s1_len && s2_len) {
var i1 = 0, i2 = 0, a, b, c, c2, row = row2;
while (i1 < s1_len)
row[i1] = ++i1;
while (i2 < s2_len) {
c2 = s2.charCodeAt(i2);
a = i2;
++i2;
b = i2;
for (i1 = 0; i1 < s1_len; ++i1) {
c = a + (s1.charCodeAt(i1) === c2 ? 0 : 1);
a = row[i1];
b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c);
row[i1] = b;
}
}
return b;
} else {
return s1_len + s2_len;
}
}
};
})();
See also http://jsperf.com/levenshtein-distance/12
Most speed was gained by eliminating some array usages.
Solution 3 - Javascript
Updated: http://jsperf.com/levenshtein-distance/5
The new Revision annihilates all other benchmarks. I was specifically chasing Chromium/Firefox performance as I don't have an IE8/9/10 test environment, but the optimisations made should apply in general to most browsers.
Levenshtein Distance
The matrix to perform Levenshtein Distance can be reused again and again. This was an obvious target for optimisation (but be careful, this now imposes a limit on string length (unless you were to resize the matrix dynamically)).
The only option for optimisation not pursued in jsPerf Revision 5 is memoisation. Depending on your use of Levenshtein Distance, this could help drastically but was omitted due to its implementation specific nature.
// Cache the matrix. Note this implementation is limited to
// strings of 64 char or less. This could be altered to update
// dynamically, or a larger value could be used.
var matrix = [];
for (var i = 0; i < 64; i++) {
matrix[i] = [i];
matrix[i].length = 64;
}
for (var i = 0; i < 64; i++) {
matrix[0][i] = i;
}
// Functional implementation of Levenshtein Distance.
String.levenshteinDistance = function(__this, that, limit) {
var thisLength = __this.length, thatLength = that.length;
if (Math.abs(thisLength - thatLength) > (limit || 32)) return limit || 32;
if (thisLength === 0) return thatLength;
if (thatLength === 0) return thisLength;
// Calculate matrix.
var this_i, that_j, cost, min, t;
for (i = 1; i <= thisLength; ++i) {
this_i = __this[i-1];
for (j = 1; j <= thatLength; ++j) {
// Check the jagged ld total so far
if (i === j && matrix[i][j] > 4) return thisLength;
that_j = that[j-1];
cost = (this_i === that_j) ? 0 : 1; // Chars already match, no ++op to count.
// Calculate the minimum (much faster than Math.min(...)).
min = matrix[i - 1][j ] + 1; // Deletion.
if ((t = matrix[i ][j - 1] + 1 ) < min) min = t; // Insertion.
if ((t = matrix[i - 1][j - 1] + cost) < min) min = t; // Substitution.
matrix[i][j] = min; // Update matrix.
}
}
return matrix[thisLength][thatLength];
};
Damerau-Levenshtein Distance
jsperf.com/damerau-levenshtein-distance
Damerau-Levenshtein Distance is a small modification to Levenshtein Distance to include transpositions. There is very little to optimise.
// Damerau transposition.
if (i > 1 && j > 1 && this_i === that[j-2] && this[i-2] === that_j
&& (t = matrix[i-2][j-2]+cost) < matrix[i][j]) matrix[i][j] = t;
Sorting Algorithm
The second part of this answer is to choose an appropriate sort function. I will upload optimised sort functions to http://jsperf.com/sort soon.
Solution 4 - Javascript
I implemented a very performant implementation of levenshtein distance calculation if you still need this.
function levenshtein(s, t) {
if (s === t) {
return 0;
}
var n = s.length, m = t.length;
if (n === 0 || m === 0) {
return n + m;
}
var x = 0, y, a, b, c, d, g, h, k;
var p = new Array(n);
for (y = 0; y < n;) {
p[y] = ++y;
}
for (; (x + 3) < m; x += 4) {
var e1 = t.charCodeAt(x);
var e2 = t.charCodeAt(x + 1);
var e3 = t.charCodeAt(x + 2);
var e4 = t.charCodeAt(x + 3);
c = x;
b = x + 1;
d = x + 2;
g = x + 3;
h = x + 4;
for (y = 0; y < n; y++) {
k = s.charCodeAt(y);
a = p[y];
if (a < c || b < c) {
c = (a > b ? b + 1 : a + 1);
}
else {
if (e1 !== k) {
c++;
}
}
if (c < b || d < b) {
b = (c > d ? d + 1 : c + 1);
}
else {
if (e2 !== k) {
b++;
}
}
if (b < d || g < d) {
d = (b > g ? g + 1 : b + 1);
}
else {
if (e3 !== k) {
d++;
}
}
if (d < g || h < g) {
g = (d > h ? h + 1 : d + 1);
}
else {
if (e4 !== k) {
g++;
}
}
p[y] = h = g;
g = d;
d = b;
b = c;
c = a;
}
}
for (; x < m;) {
var e = t.charCodeAt(x);
c = x;
d = ++x;
for (y = 0; y < n; y++) {
a = p[y];
if (a < c || d < c) {
d = (a > d ? d + 1 : a + 1);
}
else {
if (e !== s.charCodeAt(y)) {
d = c + 1;
}
else {
d = c;
}
}
p[y] = d;
c = a;
}
h = d;
}
return h;
}
It was my answer to a similar SO question https://stackoverflow.com/questions/18516942/fastest-general-purpose-levenshtein-javascript-implementation/35279162#35279162
Update
A improved version of the above is now on github/npm see https://github.com/gustf/js-levenshtein
Solution 5 - Javascript
The obvious way of doing this is to map each string to a (distance, string) pair, then sort this list, then drop the distances again. This way you ensure the levenstein distance only has to be computed once. Maybe merge duplicates first, too.
Solution 6 - Javascript
I would definitely suggest using a better Levenshtein method like the one in @James Westgate's answer.
That said, DOM manipulations are often a great expense. You can certainly improve your jQuery usage.
Your loops are rather small in the example above, but concatenating the generated html for each oneResult
into a single string and doing one append
at the end of the loop will be much more efficient.
Your selectors are slow. $('.oneResult')
will search all elements in the DOM and test their className
in older IE browsers. You may want to consider something like atRes.find('.oneResult')
to scope the search.
In the case of adding the click
handlers, we may want to do one better avoid setting handlers on every keyup
. You could leverage event delegation by setting a single handler on atRest
for all results in the same block you are setting the keyup
handler:
atRest.on('click', '.oneResult', function(){
window.location.href = 'http://hashtag.ly/' + $(this).html();
});
See http://api.jquery.com/on/ for more info.
Solution 7 - Javascript
I just wrote an new revision: http://jsperf.com/levenshtein-algorithms/16
function levenshtein(a, b) {
if (a === b) return 0;
var aLen = a.length;
var bLen = b.length;
if (0 === aLen) return bLen;
if (0 === bLen) return aLen;
var len = aLen + 1;
var v0 = new Array(len);
var v1 = new Array(len);
var i = 0;
var j = 0;
var c2, min, tmp;
while (i < len) v0[i] = i++;
while (j < bLen) {
c2 = b.charAt(j++);
v1[0] = j;
i = 0;
while (i < aLen) {
min = v0[i] - (a.charAt(i) === c2 ? 1 : 0);
if (v1[i] < min) min = v1[i];
if (v0[++i] < min) min = v0[i];
v1[i] = min + 1;
}
tmp = v0;
v0 = v1;
v1 = tmp;
}
return v0[aLen];
}
This revision is faster than the other ones. Works even on IE =)