unicode to ascii (v2)

Revision 2 of this benchmark created by mac on


Description

Testing various methods of converting Unicode characters to ASCII

Preparation HTML

<script>
  var uniStrings = new Array("René Magritt", "András Süto", "Nicómede", "ééééé", "john", "charles", "cow");
</script>

Test runner

Ready to run.

Testing in
Test | Ops/sec
String replace
/**
 * Replaces a fixed set of accented lower-case letters (plus ñ and ç) with
 * their unaccented ASCII counterparts.
 *
 * A fresh global RegExp is built for every accented character on every call;
 * that per-call construction is part of what this variant does.
 *
 * @param {string} str - Text to convert.
 * @returns {string} The text with every listed accented letter replaced.
 */
function strReplace(str) {
 // accented.charAt(n) is replaced by plain.charAt(n).
 var accented = "àáäâèéëêìíïîòóöôùúüûñç";
 var plain = "aaaaeeeeiiiioooouuuunc";
 var result = str;
 var total = accented.length;
 var idx = 0;
 while (idx < total) {
  var pattern = new RegExp(accented.charAt(idx), 'g');
  result = result.replace(pattern, plain.charAt(idx));
  idx++;
 }
 return result;
}
// Driver: run strReplace over every entry of uniStrings, 1000 times.
var cn = 0;
while (cn < 1000) {
 var c = 0;
 while (c < uniStrings.length) {
  strReplace(uniStrings[c]);
  c++;
 }
 cn++;
}
ready
lookup table
// Replacement table for stripping accents. Entry n is the ASCII replacement
// for code point U+00C0 + n, covering U+00C0 through U+017F in rows of 16.
// A '.' entry means "no ASCII equivalent: leave the character unchanged".
var stripstring =
 'AAAAAAACEEEEIIII' + // U+00C0 - U+00CF
 'DNOOOOO.OUUUUY..' + // U+00D0 - U+00DF
 'aaaaaaaceeeeiiii' + // U+00E0 - U+00EF
 'dnooooo.ouuuuy.y' + // U+00F0 - U+00FF
 'AaAaAaCcCcCcCcDd' + // U+0100 - U+010F
 'DdEeEeEeEeEeGgGg' + // U+0110 - U+011F
 'GgGgHhHhIiIiIiIi' + // U+0120 - U+012F
 'IiIiJjKkkLlLlLlL' + // U+0130 - U+013F
 'lLlNnNnNnnNnOoOo' + // U+0140 - U+014F (U+0141 "L with stroke" maps to 'L')
 'OoOoRrRrRrSsSsSs' + // U+0150 - U+015F
 'SsTtTtTtUuUuUuUu' + // U+0160 - U+016F
 'UuUuWwYyYZzZzZz.'; //  U+0170 - U+017F

/**
 * Strips accents by looking each character up in `stripstring` and building
 * the result by appending one character at a time.
 *
 * @param {string} str - Text to convert.
 * @returns {string} The text with every character in U+00C0..U+017F replaced
 *   by its table entry, except where the entry is '.'; all other characters
 *   are copied through unchanged.
 */
function stripaccents(str) {
 var answer = '';
 for (var i = 0; i < str.length; i++) {
  var ch = str[i];
  var chindex = ch.charCodeAt(0) - 192; // Index of character code in the strip string
  if (chindex >= 0 && chindex < stripstring.length) {
   // Character is within our table, so we can strip the accent...
   var outch = stripstring.charAt(chindex);
   // ...unless it was shown as a '.'
   if (outch !== '.') ch = outch;
  }
  answer += ch;
 }
 return answer;
}
// Driver: run stripaccents over every entry of uniStrings, 1000 times.
var cn = 0;
while (cn < 1000) {
 var c = 0;
 while (c < uniStrings.length) {
  stripaccents(uniStrings[c]);
  c++;
 }
 cn++;
}
ready
unicode ranges
/**
 * Strips accents from Latin-1 vowels using one character-class regex per
 * target letter. The regex list is rebuilt on every call.
 *
 * Each range covers every accented form of the vowel in Latin-1 (grave,
 * acute, circumflex, tilde, diaeresis and, for A/a, ring), so inputs such as
 * "ü" or "Ë" are converted as well as "é".
 *
 * @param {string} str - Text to convert.
 * @returns {string} The text with accented Latin-1 vowels replaced by their
 *   plain ASCII vowel; every other character is left unchanged.
 */
function stripRange(str) {
 var s = str;

 // repChar[n] is the replacement for anything matched by rExps[n].
 var repChar = ['A', 'a', 'E', 'e', 'I', 'i', 'O', 'o', 'U', 'u'];
 var rExps = [
  /[\xC0-\xC5]/g, // À Á Â Ã Ä Å
  /[\xE0-\xE5]/g, // à á â ã ä å
  /[\xC8-\xCB]/g, // È É Ê Ë
  /[\xE8-\xEB]/g, // è é ê ë
  /[\xCC-\xCF]/g, // Ì Í Î Ï
  /[\xEC-\xEF]/g, // ì í î ï
  /[\xD2-\xD6]/g, // Ò Ó Ô Õ Ö  (stops before the multiplication sign U+00D7)
  /[\xF2-\xF6]/g, // ò ó ô õ ö  (stops before the division sign U+00F7)
  /[\xD9-\xDC]/g, // Ù Ú Û Ü
  /[\xF9-\xFC]/g //  ù ú û ü
 ];

 for (var i = 0; i < rExps.length; i++) {
  s = s.replace(rExps[i], repChar[i]);
 }

 return s;
}
// Driver: run stripRange over every entry of uniStrings, 1000 times.
var cn = 0;
while (cn < 1000) {
 var c = 0;
 while (c < uniStrings.length) {
  stripRange(uniStrings[c]);
  c++;
 }
 cn++;
}
ready
String replace lazy init
/**
 * Builds the list of global regular expressions used by the two-argument
 * strReplace, one per accented character, in table order.
 *
 * @returns {RegExp[]} One `/<char>/g` expression for each accented character.
 */
function createRegex() {
 var accented = "àáäâèéëêìíïîòóöôùúüûñç";
 return accented.split('').map(function (letter) {
  return new RegExp(letter, 'g');
 });
}

/**
 * Replaces accented characters using a caller-supplied list of patterns.
 *
 * The n-th pattern in `from` is replaced by the n-th character of the fixed
 * replacement string "aaaaeeeeiiiioooouuuunc".
 *
 * @param {string} str - Text to convert.
 * @param {RegExp[]} from - Patterns to replace, in the same order as the
 *   replacement string (as produced by createRegex).
 * @returns {string} The text after all replacements have been applied.
 */
function strReplace(str, from) {
 var plain = "aaaaeeeeiiiioooouuuunc";
 var result = str;
 var total = from.length;
 var idx = 0;
 while (idx < total) {
  result = result.replace(from[idx], plain.charAt(idx));
  idx++;
 }
 return result;
}
// Build the pattern list once, before the loops, and reuse it for every call.
var regArray = createRegex();
// Driver: run strReplace over every entry of uniStrings, 1000 times.
var cn = 0;
while (cn < 1000) {
 var c = 0;
 while (c < uniStrings.length) {
  strReplace(uniStrings[c], regArray);
  c++;
 }
 cn++;
}
ready
unicode ranges with lazy init
// Character-class patterns, created once and shared by every stripRange call.
// Entry n is replaced by repChar[n] inside stripRange. Each range covers every
// accented form of the vowel in Latin-1 (grave, acute, circumflex, tilde,
// diaeresis and, for A/a, ring).
var rExps = [
 /[\xC0-\xC5]/g, // À Á Â Ã Ä Å
 /[\xE0-\xE5]/g, // à á â ã ä å
 /[\xC8-\xCB]/g, // È É Ê Ë
 /[\xE8-\xEB]/g, // è é ê ë
 /[\xCC-\xCF]/g, // Ì Í Î Ï
 /[\xEC-\xEF]/g, // ì í î ï
 /[\xD2-\xD6]/g, // Ò Ó Ô Õ Ö  (stops before the multiplication sign U+00D7)
 /[\xF2-\xF6]/g, // ò ó ô õ ö  (stops before the division sign U+00F7)
 /[\xD9-\xDC]/g, // Ù Ú Û Ü
 /[\xF9-\xFC]/g //  ù ú û ü
];

/**
 * Strips accents from Latin-1 vowels using the shared `rExps` patterns.
 *
 * @param {string} str - Text to convert.
 * @returns {string} The text with accented Latin-1 vowels replaced by their
 *   plain ASCII vowel; every other character is left unchanged.
 */
function stripRange(str) {
 var s = str;

 var repChar = ['A', 'a', 'E', 'e', 'I', 'i', 'O', 'o', 'U', 'u'];

 for (var i = 0; i < rExps.length; i++) {
  s = s.replace(rExps[i], repChar[i]);
 }

 return s;
}
// Driver: run stripRange over every entry of uniStrings, 1000 times.
var cn = 0;
while (cn < 1000) {
 var c = 0;
 while (c < uniStrings.length) {
  stripRange(uniStrings[c]);
  c++;
 }
 cn++;
}
ready
lookup table substring
// Replacement table for stripping accents. Entry n is the ASCII replacement
// for code point U+00C0 + n, covering U+00C0 through U+017F in rows of 16.
// A '.' entry means "no ASCII equivalent: leave the character unchanged".
var stripstring =
 'AAAAAAACEEEEIIII' + // U+00C0 - U+00CF
 'DNOOOOO.OUUUUY..' + // U+00D0 - U+00DF
 'aaaaaaaceeeeiiii' + // U+00E0 - U+00EF
 'dnooooo.ouuuuy.y' + // U+00F0 - U+00FF
 'AaAaAaCcCcCcCcDd' + // U+0100 - U+010F
 'DdEeEeEeEeEeGgGg' + // U+0110 - U+011F
 'GgGgHhHhIiIiIiIi' + // U+0120 - U+012F
 'IiIiJjKkkLlLlLlL' + // U+0130 - U+013F
 'lLlNnNnNnnNnOoOo' + // U+0140 - U+014F (U+0141 "L with stroke" maps to 'L')
 'OoOoRrRrRrSsSsSs' + // U+0150 - U+015F
 'SsTtTtTtUuUuUuUu' + // U+0160 - U+016F
 'UuUuWwYyYZzZzZz.'; //  U+0170 - U+017F

/**
 * Strips accents by looking each character up in `stripstring` and splicing
 * the replacement into the string with substring(), instead of building a
 * separate result string.
 *
 * @param {string} str - Text to convert.
 * @returns {string} The text with every character in U+00C0..U+017F replaced
 *   by its table entry, except where the entry is '.'; all other characters
 *   are left unchanged.
 */
function stripaccents(str) {

 for (var i = 0; i < str.length; i++) {
  var ch = str[i];
  var chindex = ch.charCodeAt(0) - 192; // Index of character code in the strip string
  if (chindex >= 0 && chindex < stripstring.length) {
   // Character is within our table, so we can strip the accent...
   var outch = stripstring.charAt(chindex);
   // ...unless it was shown as a '.'
   if (outch !== '.') {
    // Every replacement is exactly one character, so the length (and the
    // loop bound) does not change when the string is rebuilt.
    str = str.substring(0, i) + outch + str.substring(i + 1, str.length);
   }
  }
 }
 return str;
}
// Driver: run stripaccents over every entry of uniStrings, 1000 times.
var cn = 0;
while (cn < 1000) {
 var c = 0;
 while (c < uniStrings.length) {
  stripaccents(uniStrings[c]);
  c++;
 }
 cn++;
}
ready

Revisions

You can edit these tests or add more tests to this page by appending /edit to the URL.