utf8 encode (v2)

Revision 2 of this benchmark created on


Setup

// Size knob for the benchmark input: used both as the number of strings in
// `arr` and as the number of characters requested from each string generator.
const len = 10;

/**
 * Picks a pseudo-random integer from the inclusive range [min, max].
 *
 * Fractional bounds are tightened inward first: `min` is rounded up and
 * `max` is rounded down.
 *
 * @param {number} min - Lower bound (inclusive after rounding up).
 * @param {number} max - Upper bound (inclusive after rounding down).
 * @returns {number} An integer between the rounded bounds.
 */
function getRandomInt(min, max) {
    const low = Math.ceil(min);
    const high = Math.floor(max);
    const span = high - low + 1;
    return low + Math.floor(Math.random() * span);
}

/**
 * Builds a string of `len` random characters drawn from U+29100..U+2A6DF
 * (inside the CJK Unified Ideographs Extension B block). Every code point in
 * that range is above U+FFFF, so each character takes two UTF-16 code units
 * and the resulting string's `.length` is `2 * len`.
 *
 * @param {number} len - Number of characters (code points) to generate.
 * @returns {string} The random string.
 */
const generateChineseStr = (len) => {
	const pickChar = () => String.fromCodePoint(getRandomInt(0x29100, 0x2A6DF));
	// Array(len) keeps the original length handling; Array.from visits every
	// slot of the sparse array, so pickChar runs once per slot.
	return Array.from(Array(len), pickChar).join('');
};
	
/**
 * Builds a string of `len` random printable ASCII characters
 * (code points 32 through 126, i.e. space through tilde).
 *
 * @param {number} len - Number of characters to generate.
 * @returns {string} The random string.
 */
const generateASCIIStr = (len) => {
	const pickChar = () => String.fromCodePoint(getRandomInt(32, 126));
	// Array(len) keeps the original length handling; Array.from visits every
	// slot of the sparse array, so pickChar runs once per slot.
	return Array.from(Array(len), pickChar).join('');
};
	
// Benchmark input: `len` strings of `len` characters each. Even indexes hold
// supplementary-plane CJK text, odd indexes hold printable ASCII.
const arr = Array.from(Array(len), (_, i) => {
	const makeStr = i % 2 === 0 ? generateChineseStr : generateASCIIStr;
	return makeStr(len);
});


// Shared encoder for the "TextEncoder" benchmark case. The TextEncoder
// constructor takes no arguments and always produces UTF-8, so no encoding
// label is passed.
const textEncoder = new TextEncoder();

/**
 * Computes how many bytes the UTF-8 encoding of `str` occupies, without
 * producing the encoding itself.
 *
 * The string is walked by code point: a surrogate pair is read as one code
 * point (4 bytes), while an unpaired surrogate is read as a single code unit
 * and therefore counted as 3 bytes.
 *
 * @param {string} str - Text to measure.
 * @returns {number} UTF-8 length in bytes.
 */
const utf8ByteSize = (str) => {
  const unitCount = str.length;
  let total = 0;
  let pos = 0;

  while (pos < unitCount) {
    const cp = str.codePointAt(pos);

    if (cp < 0x80) {
      total += 1;
    } else if (cp < 0x800) {
      total += 2;
    } else if (cp < 0x10000) {
      total += 3;
    } else {
      // codePointAt never yields more than 0x10ffff, so this is the 4-byte case.
      total += 4;
    }

    // A code point above U+FFFF spans two UTF-16 code units; skip both.
    pos += cp > 0xffff ? 2 : 1;
  }

  return total;
};

/**
 * Writes the UTF-8 encoding of a single code point into `arr`, starting at
 * index `offset`.
 *
 * Surrogate code points (U+D800..U+DFFF) are not Unicode scalar values and
 * have no valid UTF-8 form, so U+FFFD (bytes EF BF BD) is written in their
 * place. This is the same substitution TextEncoder makes for an unpaired
 * surrogate, and it is still three bytes long, like every other code point
 * in the U+0800..U+FFFF range.
 *
 * @param {number} codePoint - Code point to encode; an integer in 0..0x10FFFF.
 * @param {Uint8Array|number[]} arr - Destination that receives the bytes by
 *   index assignment.
 * @param {number} [offset=0] - Index in `arr` of the first byte to write.
 * @returns {number} Number of bytes written (1 to 4), or -1 when `codePoint`
 *   is not an integer in 0..0x10FFFF, in which case nothing is written.
 */
const encodeCodePoint = (codePoint, arr, offset = 0) => {
  if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
    return -1;
  }

  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  const scalar = isSurrogate ? 0xfffd : codePoint;

  if (scalar <= 0x7f) {
    arr[offset] = scalar;
    return 1;
  }

  if (scalar <= 0x7ff) {
    // 110xxxxx 10xxxxxx
    arr[offset] = 0xc0 | (scalar >> 6);
    arr[offset + 1] = 0x80 | (scalar & 0x3f);
    return 2;
  }

  if (scalar <= 0xffff) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    arr[offset] = 0xe0 | ((scalar >> 12) & 0x0f);
    arr[offset + 1] = 0x80 | ((scalar >> 6) & 0x3f);
    arr[offset + 2] = 0x80 | (scalar & 0x3f);
    return 3;
  }

  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  arr[offset] = 0xf0 | ((scalar >> 18) & 0x07);
  arr[offset + 1] = 0x80 | ((scalar >> 12) & 0x3f);
  arr[offset + 2] = 0x80 | ((scalar >> 6) & 0x3f);
  arr[offset + 3] = 0x80 | (scalar & 0x3f);
  return 4;
};

/**
 * Encodes `str` as UTF-8 into a freshly allocated Uint8Array.
 *
 * The output buffer is sized up front with `utf8ByteSize`, then the string is
 * walked code point by code point and each one is handed to
 * `encodeCodePoint`, which writes its bytes at the current output position.
 *
 * @param {string} str - Text to encode.
 * @returns {Uint8Array} The encoded bytes.
 * @throws {Error} If `encodeCodePoint` reports -1 for a code point.
 */
const utf8Encode = (str) => {
  const out = new Uint8Array(utf8ByteSize(str));
  const unitCount = str.length;
  let readPos = 0;
  let writePos = 0;

  while (readPos < unitCount) {
    const cp = str.codePointAt(readPos);
    const written = encodeCodePoint(cp, out, writePos);

    if (written === -1) {
      throw new Error();
    }

    writePos += written;
    // A code point above U+FFFF spans two UTF-16 code units; skip both.
    readPos += cp > 0xffff ? 2 : 1;
  }

  return out;
};


// Result sinks, one per benchmark case; each case writes its i-th output here.
// er  <- Uint8Array results of the "TextEncoder" case
const er = [];
// er2 <- Uint8Array results of the "custom" (utf8Encode) case
const er2 = [];
// er3 <- Blob objects created by the "Blob" case
const er3 = []

Test runner

Ready to run.

Testing in
Test | Ops/sec
TextEncoder
// Benchmark case "TextEncoder": encode every input string with the shared
// built-in TextEncoder and keep each resulting byte array in `er`.
// NOTE(review): storing the results presumably keeps the work observable so
// it is not optimized away; confirm against the benchmark harness.
for (let i = 0; i< arr.length; i++) {
	er[i] = textEncoder.encode(arr[i])
}
ready
custom
// Benchmark case "custom": encode every input string with the hand-written
// utf8Encode from the setup code and keep each resulting byte array in `er2`.
for (let i = 0; i< arr.length; i++) {
	er2[i] = utf8Encode(arr[i])
}
ready
Blob
// Benchmark case "Blob": wrap every input string in a Blob and keep it in
// `er3`. String parts of a Blob are always stored as UTF-8, and the Blob
// options bag only recognizes `type` and `endings`, so no options are passed.
// Note that this yields Blob objects, not byte arrays like the other cases.
for (let i = 0; i < arr.length; i++) {
	er3[i] = new Blob([arr[i]]);
}
ready

Revisions

You can edit these tests or add more tests to this page by appending /edit to the URL.