Sort an array by the "Levenshtein Distance" with best performance in Javascript

Javascript, jQuery, Sorting, Levenshtein Distance

Javascript Problem Overview


So I have a random javascript array of names...

[@larry,@nicholas,@notch] etc.

They all start with the @ symbol. I'd like to sort them by the Levenshtein Distance so that the ones at the top of the list are closest to the search term. At the moment, I have some javascript that filters the array with jQuery's $.grep(), using the javascript .match() method against the entered search term on key press:

(code edited since first publish)

// Lower-case the search term once; it is reused by the filter and by the sort.
var searchTerm = searchy.toLowerCase();

// Keep only the names that match the search term.
limitArr = $.grep(imTheCallback, function(n){
	return n.match(searchTerm);
});

// Array.prototype.sort needs a comparator FUNCTION that returns a number
// (negative: a first, positive: b first). Passing the numeric result of a
// single levenshtein() call is not a valid comparator. Ranking by each
// name's distance to the search term puts the closest matches first.
modArr = limitArr.sort(function(a, b){
	return levenshtein(a, searchTerm) - levenshtein(b, searchTerm);
});

// Guard against an empty result list before looking at the first entry.
if (modArr.length > 0 && modArr[0].substr(0, 1) == '@') {
	// Mentions: fill the list only while it holds fewer than 6 entries.
	if (atRes.children('div').length < 6) {
		modArr.forEach(function(i){
			atRes.append('<div class="oneResult">' + i + '</div>');
		});
	}
} else if (modArr.length > 0 && modArr[0].substr(0, 1) == '#') {
	// Hashtags: same rule, different container.
	if (tagRes.children('div').length < 6) {
		modArr.forEach(function(i){
			tagRes.append('<div class="oneResult">' + i + '</div>');
		});
	}
}

$('.oneResult:first-child').addClass('active');

$('.oneResult').click(function(){
	window.location.href = 'http://hashtag.ly/' + $(this).html();
});

It also has some if statements detecting whether the array contains hashtags (#) or mentions (@). Ignore that. imTheCallback is the array of names (either hashtags or mentions), and modArr is that array after sorting. The .atResults and .tagResults elements are the elements that each item of the array is appended to; this forms a list of names based on the entered search term.

I also have the Levenshtein Distance algorithm:

var levenshtein = function(min, split) {
    // Levenshtein Algorithm Revisited - WebReflection
    // Feature-detect bracket indexing on strings. Where "0"[0] yields nothing
    // (or throws), both operands are split into arrays before being indexed.
    try {
        split = !("0")[0];
    } catch (err) {
        split = true;
    }

    /**
     * Levenshtein distance between a and b, computed with a full
     * (a.length + 1) x (b.length + 1) table.
     *
     * @param {string} a - First string.
     * @param {string} b - Second string.
     * @returns {number} 0 when a == b, the other operand's length when one
     *     is empty, otherwise the value in the table's last cell.
     */
    return function(a, b) {
        if (a == b) {
            return 0;
        }
        if (!a.length || !b.length) {
            return b.length || a.length;
        }
        if (split) {
            a = a.split("");
            b = b.split("");
        }

        var rows = a.length,
            cols = b.length,
            table = [[0]],
            r, c, ch, prevRow, thisRow;

        // First row: distance from the empty prefix of a to each prefix of b.
        for (c = 1; c <= cols; c++) {
            table[0][c] = c;
        }

        for (r = 1; r <= rows; r++) {
            ch = a[r - 1];
            prevRow = table[r - 1];
            thisRow = table[r] = [r];
            for (c = 1; c <= cols; c++) {
                // The boolean (ch != b[c - 1]) coerces to 0 or 1 when added.
                thisRow[c] = min(
                    prevRow[c] + 1,
                    thisRow[c - 1] + 1,
                    prevRow[c - 1] + (ch != b[c - 1])
                );
            }
        }

        return table[rows][cols];
    };
}(Math.min, false);

How can I work this algorithm (or a similar one) into my current code to sort it without bad performance?

UPDATE:

So I'm now using James Westgate's Lev Dist function. Works WAYYYY fast. So performance is solved, the issue now is using it with source...

// Array.prototype.sort expects the comparator to RETURN a number: negative
// when a belongs before b, positive when it belongs after, 0 on a tie.
// Returning the difference of the two distances puts the names closest to
// the search term first.
modArr = limitArr.sort(function(a, b){
    return levDist(a, searchy) - levDist(b, searchy);
});

My problem now is general understanding on using the .sort() method. Help is appreciated, thanks.

Thanks!

Javascript Solutions


Solution 1 - Javascript

I wrote an inline spell checker a few years ago and implemented a Levenshtein algorithm - since it was inline and for IE8 I did quite a lot of performance optimisation.

/**
 * Edit distance between s and t: Levenshtein distance extended with a
 * Damerau step that also considers swapping two adjacent characters.
 *
 * The original contained an early exit (`if (i == j && d[i][j] > 4) return n;`)
 * that read d[i][j] before that cell had been written. The comparison was
 * therefore always `undefined > 4`, i.e. false, so the branch never ran. It
 * has been removed; results are unchanged.
 *
 * @param {string} s - First string.
 * @param {string} t - Second string.
 * @returns {number} The distance; the other string's length when one is empty.
 */
var levDist = function(s, t) {
    var d = []; // 2d matrix, (n + 1) x (m + 1)
    var n = s.length;
    var m = t.length;
    var i, j, s_i, t_j, cost, mi, b, c;

    // Step 1: trivial cases
    if (n == 0) return m;
    if (m == 0) return n;

    // Step 2: create the rows and seed the first column and the first row
    for (i = n; i >= 0; i--) {
        d[i] = [];
        d[i][0] = i;
    }
    for (j = m; j >= 0; j--) d[0][j] = j;

    // Step 3: walk the characters of s
    for (i = 1; i <= n; i++) {
        s_i = s.charAt(i - 1);

        // Step 4: walk the characters of t
        for (j = 1; j <= m; j++) {
            t_j = t.charAt(j - 1);
            cost = (s_i == t_j) ? 0 : 1; // Step 5

            // Calculate the minimum of deletion, insertion and substitution
            mi = d[i - 1][j] + 1;
            b = d[i][j - 1] + 1;
            c = d[i - 1][j - 1] + cost;

            if (b < mi) mi = b;
            if (c < mi) mi = c;

            d[i][j] = mi; // Step 6

            // Damerau transposition: the last two characters of each prefix
            // are the same pair in swapped order
            if (i > 1 && j > 1 && s_i == t.charAt(j - 2) && s.charAt(i - 2) == t_j) {
                d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
            }
        }
    }

    // Step 7
    return d[n][m];
};

Solution 2 - Javascript

I came to this solution:

var levenshtein = (function() {
		// Scratch row kept in the closure and reused by every call; only the
		// first s1.length entries are (re)initialised and read per call.
		var sharedRow = [];

		/**
		 * Levenshtein distance between s1 and s2 using a single working row.
		 * Characters are compared by char code.
		 *
		 * @param {string} s1 - First string.
		 * @param {string} s2 - Second string.
		 * @returns {number} 0 for identical strings, the sum of the lengths
		 *     when either is empty, otherwise the computed distance.
		 */
		return function(s1, s2) {
			if (s1 === s2) {
				return 0;
			}

			var len1 = s1.length, len2 = s2.length;
			if (!(len1 && len2)) {
				return len1 + len2;
			}

			var row = sharedRow, col, line, code, diag, above, sub, best;

			for (col = 0; col < len1; ++col) {
				row[col] = col + 1;
			}

			for (line = 0; line < len2; ++line) {
				code = s2.charCodeAt(line);
				diag = line;      // value diagonally up-left of the first cell
				best = line + 1;  // value to the left of the first cell
				for (col = 0; col < len1; ++col) {
					sub = diag + (s1.charCodeAt(col) === code ? 0 : 1);
					above = row[col];
					diag = above; // becomes the diagonal for the next column
					if (best < above) {
						best = best < sub ? best + 1 : sub;
					} else {
						best = above < sub ? above + 1 : sub;
					}
					row[col] = best;
				}
			}

			return best;
		};
})();

See also http://jsperf.com/levenshtein-distance/12

Most speed was gained by eliminating some array usages.

Solution 3 - Javascript

Updated: http://jsperf.com/levenshtein-distance/5

The new Revision annihilates all other benchmarks. I was specifically chasing Chromium/Firefox performance as I don't have an IE8/9/10 test environment, but the optimisations made should apply in general to most browsers.

Levenshtein Distance

The matrix to perform Levenshtein Distance can be reused again and again. This was an obvious target for optimisation (but be careful, this now imposes a limit on string length (unless you were to resize the matrix dynamically)).

The only option for optimisation not pursued in jsPerf Revision 5 is memoisation. Depending on your use of Levenshtein Distance, this could help drastically but was omitted due to its implementation specific nature.

// Cache the matrix so it can be reused between calls. It starts out with 64
// rows and a 64-entry first row, and is grown on demand by
// String.levenshteinDistance when longer strings are passed in.
var matrix = [];
for (var i = 0; i < 64; i++) {
    matrix[i] = [i];
    matrix[i].length = 64;
    matrix[0][i] = i;
}

/**
 * Functional implementation of Levenshtein Distance on the cached matrix.
 *
 * Changes from the original:
 *  - i and j are declared locally (j was an implicit global, which throws in
 *    strict mode / ES modules);
 *  - the "jagged total" early exit was removed: it read matrix[i][j] before
 *    writing it, so it tested a stale value left behind by an earlier call
 *    and could return a wrong distance;
 *  - the cache is extended when an input needs more rows/columns than it
 *    currently has, instead of failing on long strings.
 *
 * @param {string} __this - First string.
 * @param {string} that - Second string.
 * @param {number} [limit] - Cut-off; falls back to 32 when omitted or falsy.
 * @returns {number} The cut-off value when the lengths differ by more than
 *     the cut-off, the other string's length when one is empty, otherwise
 *     the Levenshtein distance.
 */
String.levenshteinDistance = function(__this, that, limit) {
    var thisLength = __this.length, thatLength = that.length;

    if (Math.abs(thisLength - thatLength) > (limit || 32)) return limit || 32;
    if (thisLength === 0) return thatLength;
    if (thatLength === 0) return thisLength;

    var i, j, this_i, that_j, cost, min, t;

    // Grow the cache if needed: every row needs its column-0 seed and row 0
    // needs a seed for every column. Other cells are written before being read.
    for (i = matrix.length; i <= thisLength; ++i) matrix[i] = [i];
    for (j = matrix[0].length; j <= thatLength; ++j) matrix[0][j] = j;

    // Calculate matrix.
    for (i = 1; i <= thisLength; ++i) {
        this_i = __this[i-1];

        for (j = 1; j <= thatLength; ++j) {
            that_j = that[j-1];
            cost = (this_i === that_j) ? 0 : 1;  // Chars already match, no ++op to count.
            // Calculate the minimum (much faster than Math.min(...)).
            min    = matrix[i - 1][j    ] + 1;                      // Deletion.
            if ((t = matrix[i    ][j - 1] + 1   ) < min) min = t;   // Insertion.
            if ((t = matrix[i - 1][j - 1] + cost) < min) min = t;   // Substitution.

            matrix[i][j] = min; // Update matrix.
        }
    }

    return matrix[thisLength][thatLength];
};

Damerau-Levenshtein Distance

jsperf.com/damerau-levenshtein-distance

Damerau-Levenshtein Distance is a small modification to Levenshtein Distance to include transpositions. There is very little to optimise.

// Damerau transposition: when the last two characters of each prefix are the
// same pair in swapped order, also consider the cell two rows and two columns
// back. Goes inside the inner loop, right after matrix[i][j] is written.
// The first string is the __this parameter, not `this`.
if (i > 1 && j > 1 && this_i === that[j-2] && __this[i-2] === that_j
&& (t = matrix[i-2][j-2]+cost) < matrix[i][j]) matrix[i][j] = t;

Sorting Algorithm

The second part of this answer is to choose an appropriate sort function. I will upload optimised sort functions to http://jsperf.com/sort soon.

Solution 4 - Javascript

I implemented a very performant implementation of levenshtein distance calculation if you still need this.

/**
 * Computes the Levenshtein edit distance between two strings.
 *
 * Characters are compared by UTF-16 code unit (charCodeAt). A single array
 * of length s.length is used as working storage instead of a full matrix,
 * and the characters of t are consumed four at a time where possible.
 *
 * @param {string} s - First string.
 * @param {string} t - Second string.
 * @returns {number} 0 when the strings are identical, the length of the
 *     other string when one is empty, otherwise the computed distance.
 */
function levenshtein(s, t) {
    if (s === t) {
        return 0;
    }
    var n = s.length, m = t.length;
    // At least one string is empty: the distance is the other one's length.
    if (n === 0 || m === 0) {
        return n + m;
    }
    var x = 0, y, a, b, c, d, g, h, k;
    // p[y] starts as y + 1 (1..n) and is overwritten in place as t is consumed.
    var p = new Array(n);
    for (y = 0; y < n;) {
        p[y] = ++y;
    }

    // Main loop: handles t four characters at a time while at least four remain.
    for (; (x + 3) < m; x += 4) {
        var e1 = t.charCodeAt(x);
        var e2 = t.charCodeAt(x + 1);
        var e3 = t.charCodeAt(x + 2);
        var e4 = t.charCodeAt(x + 3);
        // Seed the five running values from the current offset into t.
        c = x;
        b = x + 1;
        d = x + 2;
        g = x + 3;
        h = x + 4;
        for (y = 0; y < n; y++) {
            // k: current character of s; a: value stored at this position by
            // the previous pass over s.
            k = s.charCodeAt(y);
            a = p[y];
            // The same update is repeated below for e1..e4: either take
            // 1 + the smaller neighbour, or keep the running value and add 1
            // when the characters differ.
            if (a < c || b < c) {
                c = (a > b ? b + 1 : a + 1);
            }
            else {
                if (e1 !== k) {
                    c++;
                }
            }

            if (c < b || d < b) {
                b = (c > d ? d + 1 : c + 1);
            }
            else {
                if (e2 !== k) {
                    b++;
                }
            }

            if (b < d || g < d) {
                d = (b > g ? g + 1 : b + 1);
            }
            else {
                if (e3 !== k) {
                    d++;
                }
            }

            if (d < g || h < g) {
                g = (d > h ? h + 1 : d + 1);
            }
            else {
                if (e4 !== k) {
                    g++;
                }
            }
            // Store the result for the fourth character, then shift the
            // running values along for the next position in s.
            p[y] = h = g;
            g = d;
            d = b;
            b = c;
            c = a;
        }
    }

    // Tail loop: handles the remaining (at most three) characters of t one at a time.
    for (; x < m;) {
        var e = t.charCodeAt(x);
        c = x;
        d = ++x;
        for (y = 0; y < n; y++) {
            a = p[y];
            if (a < c || d < c) {
                d = (a > d ? d + 1 : a + 1);
            }
            else {
                if (e !== s.charCodeAt(y)) {
                    d = c + 1;
                }
                else {
                    d = c;
                }
            }
            p[y] = d;
            c = a;
        }
        // Keep the latest result in h, which is what gets returned.
        h = d;
    }

    return h;
}

It was my answer to a similar SO question https://stackoverflow.com/questions/18516942/fastest-general-purpose-levenshtein-javascript-implementation/35279162#35279162

Update

An improved version of the above is now on github/npm; see https://github.com/gustf/js-levenshtein

Solution 5 - Javascript

The obvious way of doing this is to map each string to a (distance, string) pair, then sort this list, then drop the distances again. This way you ensure the levenstein distance only has to be computed once. Maybe merge duplicates first, too.

Solution 6 - Javascript

I would definitely suggest using a better Levenshtein method like the one in @James Westgate's answer.

That said, DOM manipulations are often a great expense. You can certainly improve your jQuery usage.

Your loops are rather small in the example above, but concatenating the generated html for each oneResult into a single string and doing one append at the end of the loop will be much more efficient.

Your selectors are slow. $('.oneResult') will search all elements in the DOM and test their className in older IE browsers. You may want to consider something like atRes.find('.oneResult') to scope the search.

In the case of adding the click handlers, we may want to do one better and avoid setting handlers on every keyup. You could leverage event delegation by setting a single handler on atRest for all results, in the same block where you are setting the keyup handler:

// Delegated click handling: a single listener on the container reacts to
// clicks on any .oneResult inside it and navigates to that result's page.
atRest.on('click', '.oneResult', function(){
  var destination = 'http://hashtag.ly/' + $(this).html();
  window.location.href = destination;
});

See http://api.jquery.com/on/ for more info.

Solution 7 - Javascript

I just wrote a new revision: http://jsperf.com/levenshtein-algorithms/16

/**
 * Levenshtein distance between a and b, computed with two rows of length
 * a.length + 1 that swap roles after every character of b.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} 0 for identical strings, the other string's length when
 *     one is empty, otherwise the computed distance.
 */
function levenshtein(a, b) {
  if (a === b) {
    return 0;
  }

  var lenA = a.length;
  var lenB = b.length;

  if (0 === lenA) {
    return lenB;
  }
  if (0 === lenB) {
    return lenA;
  }

  var prev = new Array(lenA + 1);
  var curr = new Array(lenA + 1);
  var col, row, ch, best, swap;

  // Row for the empty prefix of b: 0, 1, 2, ... lenA.
  for (col = 0; col <= lenA; col++) {
    prev[col] = col;
  }

  for (row = 0; row < lenB; row++) {
    ch = b.charAt(row);
    curr[0] = row + 1;

    for (col = 0; col < lenA; col++) {
      // Start from the diagonal, pre-discounted by 1 on a match because 1 is
      // added back unconditionally below.
      best = prev[col] - (a.charAt(col) === ch ? 1 : 0);
      if (curr[col] < best) {
        best = curr[col];
      }
      if (prev[col + 1] < best) {
        best = prev[col + 1];
      }
      curr[col + 1] = best + 1;
    }

    // The freshly filled row becomes the previous row for the next pass.
    swap = prev;
    prev = curr;
    curr = swap;
  }

  return prev[lenA];
}

This revision is faster than the other ones. Works even on IE =)

Attributions

All content for this solution is sourced from the original question on Stackoverflow.

The content on this page is licensed under the Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) license.

Content Type | Original Author | Original Content on Stackoverflow
Question | alt | View Question on Stackoverflow
Solution 1 - Javascript | James Westgate | View Answer on Stackoverflow
Solution 2 - Javascript | Marco de Wit | View Answer on Stackoverflow
Solution 3 - Javascript | TheSpanishInquisition | View Answer on Stackoverflow
Solution 4 - Javascript | gustf | View Answer on Stackoverflow
Solution 5 - Javascript | Has QUIT--Anony-Mousse | View Answer on Stackoverflow
Solution 6 - Javascript | Jacob Swartwood | View Answer on Stackoverflow
Solution 7 - Javascript | gtournie | View Answer on Stackoverflow