UXP/devtools/shared/gcli/source/lib/gcli/util/spell.js

/*
 * Copyright 2012, Mozilla Foundation and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

'use strict';

/*
 * A spell-checker based on Damerau-Levenshtein distance.
 */

var CASE_CHANGE_COST = 1;
var INSERTION_COST = 10;
var DELETION_COST = 10;
var SWAP_COST = 10;
var SUBSTITUTION_COST = 20;
var MAX_EDIT_DISTANCE = 40;

/**
 * Compute Damerau-Levenshtein Distance, with a modification to allow a low
 * case-change cost (1/10th of a swap-cost)
 * @see http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
 */
var distance = exports.distance = function(wordi, wordj) {
  var wordiLen = wordi.length;
  var wordjLen = wordj.length;

  // We only need to store three rows of our dynamic programming matrix.
  // (Without swap, it would have been two.)
  var row0 = new Array(wordiLen+1);
  var row1 = new Array(wordiLen+1);
  var row2 = new Array(wordiLen+1);
  var tmp;

  var i, j;

  // The distance between the empty string and a string of size i is the cost
  // of i insertions.
  for (i = 0; i <= wordiLen; i++) {
    row1[i] = i * INSERTION_COST;
  }

  // Row-by-row, we're computing the edit distance between substrings wordi[0..i]
  // and wordj[0..j].
  for (j = 1; j <= wordjLen; j++) {
    // Edit distance between wordi[0..0] and wordj[0..j] is the cost of j
    // insertions.
    row0[0] = j * INSERTION_COST;

    for (i = 1; i <= wordiLen; i++) {
      // Handle deletion, insertion and substitution: we can reach each cell
      // from three other cells corresponding to those three operations. We
      // want the minimum cost.
      var dc = row0[i - 1] + DELETION_COST;
      var ic = row1[i] + INSERTION_COST;
      var sc0;
      if (wordi[i-1] === wordj[j-1]) {
        sc0 = 0;
      }
      else {
        if (wordi[i-1].toLowerCase() === wordj[j-1].toLowerCase()) {
          sc0 = CASE_CHANGE_COST;
        }
        else {
          sc0 = SUBSTITUTION_COST;
        }
      }
      var sc = row1[i-1] + sc0;

      row0[i] = Math.min(dc, ic, sc);

      // We handle swap too, eg. distance between help and hlep should be 1. If
      // we find such a swap, there's a chance to update row0[1] to be lower.
      if (i > 1 && j > 1 && wordi[i-1] === wordj[j-2] && wordj[j-1] === wordi[i-2]) {
        row0[i] = Math.min(row0[i], row2[i-2] + SWAP_COST);
      }
    }

    tmp = row2;
    row2 = row1;
    row1 = row0;
    row0 = tmp;
  }

  return row1[wordiLen];
};

/**
 * As distance() except that we say that if word is a prefix of name then we
 * only count the case changes. This allows us to use words that can be
 * completed by typing as more likely than short words
 */
var distancePrefix = exports.distancePrefix = function(word, name) {
  var dist = 0;

  for (var i = 0; i < word.length; i++) {
    if (name[i] !== word[i]) {
      if (name[i].toLowerCase() === word[i].toLowerCase()) {
        dist++;
      }
      else {
        // name does not start with word, even ignoring case, use
        // Damerau-Levenshtein
        return exports.distance(word, name);
      }
    }
  }

  return dist;
};

/**
 * A function that returns the correction for the specified word.
 */
exports.correct = function(word, names) {
  if (names.length === 0) {
    return undefined;
  }

  var distances = {};
  var sortedCandidates;

  names.forEach(function(candidate) {
    distances[candidate] = exports.distance(word, candidate);
  });

  sortedCandidates = names.sort(function(worda, wordb) {
    if (distances[worda] !== distances[wordb]) {
      return distances[worda] - distances[wordb];
    }
    else {
      // if the score is the same, always return the first string
      // in the lexicographical order
      return worda < wordb;
    }
  });

  if (distances[sortedCandidates[0]] <= MAX_EDIT_DISTANCE) {
    return sortedCandidates[0];
  }
  else {
    return undefined;
  }
};

/**
 * Return a ranked list of matches:
 *
 *   spell.rank('fred', [ 'banana', 'fred', 'ed', 'red' ]);
 *     ↓
 *   [
 *      { name: 'fred', dist: 0 },
 *      { name: 'red', dist: 1 },
 *      { name: 'ed', dist: 2 },
 *      { name: 'banana', dist: 10 },
 *   ]
 *
 * @param word The string that we're comparing names against
 * @param names An array of strings to compare word against
 * @param options Comparison options:
 * - noSort: Do not sort the output by distance
 * - prefixZero: Count prefix matches as edit distance 0 (i.e. word='bana' and
 *   names=['banana'], would return { name:'banana': dist: 0 }) This is useful
 *   if someone is typing the matches and may not have finished yet
 */
exports.rank = function(word, names, options) {
  options = options || {};

  var reply = names.map(function(name) {
    // If any name starts with the word then the distance is based on the
    // number of case changes rather than Damerau-Levenshtein
    var algo = options.prefixZero ? distancePrefix : distance;
    return {
      name: name,
      dist: algo(word, name)
    };
  });

  if (!options.noSort) {
    reply = reply.sort(function(d1, d2) {
      return d1.dist - d2.dist;
    });
  }

  return reply;
};