UTF-8 Decoder Benchmark (v2)

Revision 2 of this benchmark created on


Setup

const fromCharCode = String.fromCharCode;
const cache = Array.from({ length: 0xffff }, (_, i) => fromCharCode(i));

function decodeNaive(buffer, start, end) {
  var str = "";
  for (var i = start; i < end;) {
    var t = buffer[i++];
    if (t <= 0x7F) {
      str += cache[t];
    } else if (t >= 0xC0 && t < 0xE0) {
      str += fromCharCode((t & 0x1F) << 6 | buffer[i++] & 0x3F);
    } else if (t >= 0xE0 && t < 0xF0) {
      str += fromCharCode((t & 0xF) << 12 | (buffer[i++] & 0x3F) << 6 | buffer[i++] & 0x3F);
    } else if (t >= 0xF0) {
      var t2 = ((t & 7) << 18 | (buffer[i++] & 0x3F) << 12 | (buffer[i++] & 0x3F) << 6 | buffer[i++] & 0x3F) - 0x10000;
      str += fromCharCode(0xD800 + (t2 >> 10));
      str += fromCharCode(0xDC00 + (t2 & 0x3FF));
    }
  }
  return str;
}
/* eslint-disable @typescript-eslint/no-unused-vars */
// prettier is disabled because it messes up the table
/* prettier-ignore */

// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// The remapped transition table is justified at
// https://docs.google.com/spreadsheets/d/1AZcQwuEL93HmNCljJWUwFMGqf7JAQ0puawZaUgP0E14
/* prettier-ignore */
const utf8d = new Uint8Array([
    // This first table maps bytes to character to a transition.
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 00-0F
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 10-1F
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 20-2F
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 30-3F
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 40-4F
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 50-5F
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 60-6F
    0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 70-7F
    1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 80-8F
    2,  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 90-9F
    3,  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // A0-AF
    3,  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // B0-BF
    9,  9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // C0-CF
    4,  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // D0-DF
    10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5,  // E0-EF
    11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,  // F0-FF
]);
/* prettier-ignore */
const utf8t = new Uint8Array([
  // This second table maps a state to a new state when adding a transition
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0,   // REJECT = 0
    12, 0,  0,  0,  24, 36, 48, 60, 72, 0, 84, 96,  // ACCEPT = 12
    0,  12, 12, 12, 0,  0,  0,  0,  0,  0, 0,  0,   // 2-byte = 24
    0,  24, 24, 24, 0,  0,  0,  0,  0,  0, 0,  0,   // 3-byte = 36
    0,  24, 24, 0,  0,  0,  0,  0,  0,  0, 0,  0,   // 3-byte low/mid = 48
    0,  36, 36, 36, 0,  0,  0,  0,  0,  0, 0,  0,   // 4-byte = 60
    0,  36, 0,  0,  0,  0,  0,  0,  0,  0, 0,  0,   // 4-byte low = 72
    0,  0,  0,  24, 0,  0,  0,  0,  0,  0, 0,  0,   // 3-byte high = 84
    0,  0,  36, 36, 0,  0,  0,  0,  0,  0, 0,  0,   // 4-byte mid/high = 96

])
var invalidCodePoint = cache[0xfffd];
function decodeDFA(buffer) {
  var start = 0,
    end = buffer.length;

  var res = ""; // final string

  var i = start,
    state = 12,
    current = 0,
    type = 0,
    bite = 0,
    prevState = 0;
  while (i < end) {
    bite = buffer[i];
    type = utf8d[bite];
    prevState = state;
    state = utf8t[state + type];
    current = (current << 6) | (bite & (0x7f >> (type >> 1)));
    if (state === 12) {
      res +=
        current <= 0xff_ff
          ? cache[current]
          : fromCharCode(
              0xd8_00 + (((current - 0x1_00_00) >>> 10) & 0x3_ff),
              0xdc_00 + (current & 0x3_ff),
            );
      current = 0;
    } else if (state < 12) {
      state = 12;
      res += invalidCodePoint;
      current = 0;
      if (prevState !== 12) {
        continue;
      }
    }
    i++;
  }
  return res;
}

const buffer = new TextEncoder().encode("The quick brown fox jumps over the lazy dog 中文🤣 ÁáÀàĂăẮắẰằẴẵẲẳÂâẤấẦầẪẫẨẩǍǎÅåǺǻÄäǞǟÃãȦȧǠǡĄąĀāẢảȀȁȂȃẠạẶặẬậḀḁȺⱥᶏẚɐɑᶐɒᴀÆæǼǽǢǣᴂᴁ".repeat(100_00));

Test runner

Ready to run.

Testing in
TestOps/sec
DFA Decoder
decodeDFA(buffer, 0, buffer.length)
ready
Naive Decoder
decodeNaive(buffer, 0, buffer.length)
ready

Revisions

You can edit these tests or add more tests to this page by appending /edit to the URL.