// JSDoc: Source: conversion.js

/*!
 * Module Conversion
 */

/**
 * @namespace Conversion
 */

// name of this module, used as the leading part of the error messages thrown by the functions below
const MODULE_NAME = 'Conversion';



//###[ IMPORTS ]########################################################################################################

import {isInt, isArray, orDefault} from './basic.js';
import {pad, trim} from './strings.js';



//###[ DATA ]###########################################################################################################

// character groups all base alphabets are assembled from
const UPPER_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
const LOWER_CHARS = 'abcdefghijklmnopqrstuvwxyz';
const NUMBERS = '0123456789';

// predefined alphabets, keyed by their size; alphabets for integer bases are sliced from these
const BASE_ALPHABETS = {
	2 : NUMBERS.substring(0, 2),
	8 : NUMBERS.substring(0, 8),
	10 : NUMBERS,
	16 : NUMBERS + UPPER_CHARS.substring(0, 6),
	26 : UPPER_CHARS,
	36 : NUMBERS + UPPER_CHARS,
	63 : NUMBERS + UPPER_CHARS + LOWER_CHARS + '_',
	64 : UPPER_CHARS + LOWER_CHARS + NUMBERS + '+/'
};

// the base64 alphabet doubles as the reference ordering and as the intermediate encoding
const BASE64_ALPHABET = BASE_ALPHABETS[64];



//###[ HELPERS ]########################################################################################################

/**
 * Builds an alphabet string, based on an integer, an alphabet string or an array of strings containing the alphabet's
 * chars. An integer uses BASE_ALPHABETS to select a base alphabet to slice the alphabet from. The smallest base
 * alphabet having enough chars is going to be used.
 *
 * Characters in custom alphabets are de-duplicated and sorted according to base64 definition, with additional chars
 * appended at the end, sorted ascending based on char value.
 *
 * @param {String} [__methodName__='buildAlphabet'] - name of the calling function, only used to label error messages
 * @param {?Number|String|Array<String>} [baseOrAlphabet=64] - integer base between 2 and 64 or the characters of a custom alphabet; passed through orDefault() with 64 as the fallback value
 * @param {Boolean} [useChunks=false] - if truthy, the alphabet needs at least 3 chars, since one char is reserved as chunk delimiter by the callers
 * @throws error if an integer base is smaller than 2 (3 with chunks) or larger than 64
 * @throws error if the resulting alphabet has fewer than 2 (3 with chunks) or more than 64 chars
 * @returns {String} the alphabet, one character per digit value
 *
 * @private
 */
function buildAlphabet(__methodName__='buildAlphabet', baseOrAlphabet=64, useChunks=false){
	baseOrAlphabet = orDefault(baseOrAlphabet, 64);

	let alphabet;

	if( isInt(baseOrAlphabet) ){
		if( (baseOrAlphabet < 2) || (baseOrAlphabet > 64) ){
			throw new Error(`${MODULE_NAME}:${__methodName__} | base not usable, smaller than 2 or larger than 64`);
		}

		if( useChunks && (baseOrAlphabet < 3) ){
			throw new Error(`${MODULE_NAME}:${__methodName__} | base not usable for chunks, smaller than 3`);
		}

		// the sizes have to be compared as numbers: a plain sort() orders the keys as strings ("10" < "2" < "8"),
		// which would only pick the right base alphabet as long as all small alphabets are prefixes of each other
		const baseAlphabetSizes = Object.keys(BASE_ALPHABETS).map(Number).sort((a, b) => a - b);
		for( const baseAlphabetSize of baseAlphabetSizes ){
			if( baseAlphabetSize >= baseOrAlphabet ){
				alphabet = BASE_ALPHABETS[baseAlphabetSize].slice(0, baseOrAlphabet);
				break;
			}
		}
	} else {
		alphabet = [];

		if( !isArray(baseOrAlphabet) ){
			baseOrAlphabet = `${baseOrAlphabet}`.split('');
		}

		// array entries may consist of several chars, so every entry is split again
		baseOrAlphabet.forEach(char => {
			alphabet = alphabet.concat(`${char}`.split(''));
		});

		// remove duplicates, then order by base64 position, unknown chars last (ascending by char value)
		alphabet = Array.from(new Set(alphabet));
		alphabet.sort((a, b) => {
			const
				aBase64Index = BASE64_ALPHABET.indexOf(a),
				bBase64Index = BASE64_ALPHABET.indexOf(b)
			;

			if( (aBase64Index < 0) && (bBase64Index < 0) ){
				return (a === b) ? 0 : ((a < b) ? -1 : 1);
			} else if( aBase64Index < 0 ){
				return 1;
			} else if( bBase64Index < 0 ){
				return -1;
			} else {
				return (aBase64Index === bBase64Index) ? 0 : ((aBase64Index < bBase64Index) ? -1 : 1);
			}
		});

		alphabet = alphabet.join('');
	}

	if( (alphabet.length < 2) || (alphabet.length > 64) ){
		throw new Error(`${MODULE_NAME}:${__methodName__} | alphabet not usable, must have between two and 64 chars`);
	}

	if( useChunks && (alphabet.length < 3) ){
		throw new Error(`${MODULE_NAME}:${__methodName__} | alphabet not usable for chunks, less than 3 chars`);
	}

	return alphabet;
}



/**
 * Determines the number of character mapping pages (and thereby page prefixes) an alphabet of the given size
 * needs, to be able to express all 64 base64 characters.
 *
 * Every additional page takes one character away from the usable pool, until half of the alphabet
 * (rounded down) is reserved for prefixes; after that, further pages leave the pool untouched.
 *
 * @param {Number} base - the size of the target alphabet
 * @returns {Number} the number of pages needed in addition to the unprefixed first page
 *
 * @private
 */
function calculateNeededPages(base){
	const maxPrefixChars = Math.floor(base / 2);
	let pageCount = 0;
	let poolSize = base;

	// "reachable" is the number of distinct tokens, expressible with the current page count and pool size
	for( let reachable = poolSize; reachable < 64; reachable = (pageCount + 1) * poolSize ){
		pageCount += 1;
		if( pageCount <= maxPrefixChars ){
			poolSize -= 1;
		}
	}

	return pageCount;
}



/**
 * Builds the list of page prefixes for an alphabet. A prefix is put in front of a character, to move it to another
 * code page, which is necessary to express base64 characters lying beyond the size of the target alphabet.
 *
 * The first page has the empty prefix. The following pages cycle through the first half of the alphabet
 * (rounded down); each completed cycle increases the length handed to pad() by one.
 *
 * @param {String} alphabet - the target alphabet
 * @returns {Array<String>} the prefixes, indexed by page number
 *
 * @private
 */
function buildPageMap(alphabet){
	const alphabetSize = alphabet.length;
	const prefixCharCount = Math.floor(alphabetSize / 2);
	const pageCount = calculateNeededPages(alphabetSize);
	const prefixes = [''];

	for( let page = 1; page <= pageCount; page++ ){
		const positionInCycle = page % prefixCharCount;
		// a position of 0 means, that the page is the last one of its cycle
		const prefixChar = alphabet[(positionInCycle > 0) ? (positionInCycle - 1) : (prefixCharCount - 1)];
		const prefixLength = Math.ceil(page / prefixCharCount);

		// NOTE(review): pad() presumably fills the empty string with prefixChar up to prefixLength - confirm in strings.js
		prefixes.push(pad('', prefixChar, prefixLength));
	}

	return prefixes;
}



/**
 * Returns a dictionary, mapping each base64 character to one or more characters of the target alphabet.
 * In cases, where the character to encode is beyond the target alphabet, page prefixes are prepended to
 * cover all characters by increasing length.
 *
 * The characters used for mapping are the alphabet without its leading characters, which are reserved as
 * page prefixes (as many as pages are needed, but at most half of the alphabet, rounded down).
 *
 * @param {Array<String>} pageMap - the page prefixes, indexed by page number, as built by buildPageMap()
 * @param {String} alphabet - the target alphabet
 * @returns {Object} dictionary with one entry per base64 character, the value being prefix plus mapped character
 *
 * @private
 */
function buildCharMap(pageMap, alphabet){
	const
		base = alphabet.length,
		neededPages = calculateNeededPages(base),
		availablePages = Math.floor(base / 2),
		pagedAlphabet = alphabet.slice(Math.min(neededPages, availablePages)),
		pagedBase = pagedAlphabet.length,
		charMap = {}
	;

	// iterate by numeric index: for...in would hand out string keys and also visit enumerable prototype extensions
	for( let i = 0; i < BASE64_ALPHABET.length; i++ ){
		const
			remainder = i % pagedBase,
			quotient = Math.floor(i / pagedBase)
		;
		charMap[BASE64_ALPHABET[i]] = `${pageMap[quotient]}${pagedAlphabet[remainder]}`;
	}

	return charMap;
}



/**
 * Converts a string to base64, while handling unicode characters correctly, by encoding the UTF-8 bytes
 * of the value instead of its characters. Padding characters ("=") are removed from the result.
 *
 * Be advised, that the result needs to be decoded with base64ToString() again, since
 * we also need to correctly handle unicode on the way back.
 *
 * @param {*} value - the value to encode, gets converted to a string first
 * @returns {String} the unpadded base64 representation of the value's UTF-8 bytes
 *
 * @private
 */
function stringToBase64(value){
	const bytes = (new TextEncoder()).encode(`${value}`);

	// spreading all bytes into a single call exceeds the engine's argument limit for large values
	// (resulting in a RangeError), so the binary string is assembled from slices of bounded size
	const BYTES_PER_CALL = 0x2000;
	let binaryString = '';
	for( let offset = 0; offset < bytes.length; offset += BYTES_PER_CALL ){
		binaryString += String.fromCodePoint(...bytes.subarray(offset, offset + BYTES_PER_CALL));
	}

	return btoa(binaryString).replaceAll('=', '');
}



/**
 * Decodes a base64-encoded string to its original value.
 * Be advised, that the base64 value has to be encoded using stringToBase64(), since unicode characters need
 * special handling during en/decoding: the decoded bytes are interpreted as UTF-8.
 *
 * This function will fail with an error, if the given value is not actually decodable with base64.
 * The underlying exception is available as `cause` of the thrown error.
 *
 * @param {*} value - the base64 value to decode, gets converted to a string first
 * @param {String} [__methodName__='base64ToString'] - name of the calling function, only used to label the error message
 * @throws error if decoding fails
 * @returns {String} the decoded value
 *
 * @private
 */
function base64ToString(value, __methodName__='base64ToString'){
	try {
		// atob() returns one character per byte, which we turn back into the actual byte values
		const bytes = Uint8Array.from(atob(`${value}`), char => char.codePointAt(0));
		return (new TextDecoder()).decode(bytes);
	} catch(ex){
		// keep the original exception, so the actual reason for the failure does not get lost
		throw new Error(`${MODULE_NAME}:${__methodName__} | cannot decode "${value}"`, {cause : ex});
	}
}



/**
 * Converts a decimal/base10 value to a different base numerically.
 * Be aware that this needs the value to be a safe integer to be precise.
 * This function does not deal with negative numbers by itself and rejects them, together with everything else
 * that is not an integer (NaN, Infinity, floats), since the conversion would never terminate or produce
 * undefined digits for such values.
 *
 * @param {Number|String} value - the non-negative integer to convert, numeric strings are converted to numbers
 * @param {String} alphabet - the alphabet of the target base, one character per digit value
 * @throws error if the value is not a non-negative integer
 * @throws error if the alphabet has fewer than two characters
 * @returns {String} the representation of the value in the target base
 *
 * @private
 */
function base10toBaseX(value, alphabet){
	const base = alphabet.length;
	let quotient = Number(value);

	if( base < 2 ){
		throw new Error(`${MODULE_NAME}:base10toBaseX | alphabet not usable, must have at least two chars`);
	}

	// negative values never reach 0 by floored division, NaN and Infinity never change at all,
	// so these would keep the loop below running forever
	if( !Number.isInteger(quotient) || (quotient < 0) ){
		throw new Error(
			`${MODULE_NAME}:base10toBaseX | value "${value}" not convertible, must be a non-negative integer; `
			+`the value may have exceeded the usable number range or contain characters not matching the alphabet`
		);
	}

	if( quotient === 0 ){
		return `${alphabet[0]}`;
	}

	let baseXValue = '';
	while( quotient !== 0 ){
		const remainder = quotient % base;
		quotient = Math.floor(quotient / base);
		baseXValue = `${alphabet[remainder]}${baseXValue}`;
	}

	return baseXValue;
}



/**
 * Converts a value, based on a defined alphabet, to its decimal/base10 representation.
 * Be aware that this needs the result to be a safe integer to be precise.
 * This function does not deal with negative numbers by itself.
 *
 * Every character of the value has to be part of the alphabet; unknown characters raise an error,
 * instead of silently being counted with a digit value of -1.
 *
 * @param {Number|String} value - the value to convert, gets converted to a string first
 * @param {String} alphabet - the alphabet of the value's base, one character per digit value
 * @throws error if the value contains a character not being part of the alphabet
 * @returns {Number} the decimal value, 0 for an empty value
 *
 * @private
 */
function baseXToBase10(value, alphabet){
	const
		digits = `${value}`,
		base = alphabet.length
	;
	let base10Value = 0;

	// walk the digits from the least significant (last) one to the most significant (first) one
	for( let i = 0; i < digits.length; i++ ){
		const
			digit = digits[digits.length - 1 - i],
			digitValue = alphabet.indexOf(digit)
		;

		if( digitValue < 0 ){
			throw new Error(
				`${MODULE_NAME}:baseXToBase10 | character "${digit}" of "${digits}" is not part of the alphabet "${alphabet}"`
			);
		}

		base10Value += Math.pow(base, i) * digitValue;
	}

	return base10Value;
}



//###[ EXPORTS ]########################################################################################################

/**
 * @namespace Conversion:toBaseX
 */

/**
 * This function converts a value to a representation in a defined base between 2 and 64.
 * So this covers common use cases like binary, octal, hexadecimal, alphabetical, alphanumeric and of course base64.
 *
 * The result of this function is always either a decimal number or a string, just as the input value. All numbers
 * apart from decimal ones are returned as strings without prefix. So, decimal 5 will be the number 5, but the binary
 * version will be the string "101". Positive and negative decimal integers are valid numbers here, but this
 * implementation does not support floats (multiply and divide if needed). Only numerical bases above 36 contain
 * lower case characters, so decimal 255 is "FF" in base 16 and not "ff".
 *
 * This function is unicode safe, by using byte conversion
 * (see: https://developer.mozilla.org/en-US/docs/Glossary/Base64#the_unicode_problem).
 * Be aware, that this also means, that results of `btoa/atob()` and `toBaseX/fromBaseX()` are _not_ interchangeable,
 * since they work with different values internally.
 *
 * There are three approaches to changing the base of a value in JavaScript:
 *
 * 1. Either you are taking the numerical/mathematical road, treating a value as a number in its alphabet being
 * interpreted as a number, where each character, counting from the back is the base to the power of the
 * character index. This is the approach you'd expect, when, for instance, you'd want to convert the decimal number 5
 * to binary 101. The downside of this approach is, that the relatively small max safe integer in JavaScript makes
 * converting large numbers, such as longer strings, impossible.
 *
 * 2. Therefore, the second approach takes the numeric approach, but combines it with chunking, splitting the value into
 * pieces, which are, by themselves, safely convertible. The downside is, that we need an extra character to delimit
 * chunks in the result, since values have non-uniform lengths. This means, that this does not work with the basic
 * binary base, and we need at least 3 alphabet characters.
 *
 * 3. The last approach uses the native base64 string encoding with `btoa()` as a safe translation layer, mapping the
 * resulting string to the target base, using a generated (and possibly paged) character map. This way treats all
 * values as strings and is not compatible to numerical conversion anymore, but uses the same characters. The result
 * of this approach can encode every string of every length without structural tricks, but has the longest results.
 *
 * This function is capable of all three approaches, which are equally safe for unicode values. The numerical
 * approach is the default. If you want to encode large numbers or strings longer than ~6 characters, select
 * a different approach using the `useCharacterMap` or `useChunks` parameters. Character mapping has preference, while
 * chunks have no effect in character mapping.
 *
 * Each encoding process ends with a self-test, checking if the result is actually decodable using
 * `fromBaseX()`, using the same settings again. This ensures, that every result is valid and retrievable in the future,
 * preventing any undiscovered errors, which would make it impossible to work with the original value again.
 * The only exception is character mapping with the integer base 64, which returns the base64 value directly.
 *
 * You may define the base as an integer between 2 and 64 or as a custom alphabet in the same range. Integer based
 * alphabets are generated using defined base alphabets, which are sliced if necessary. Custom alphabets are
 * automatically sorted to match base64 as far as possible, pushing additional characters to the end, which are then
 * sorted ascending by character value.
 *
 * "{" and "}" are the only forbidden characters in a custom alphabet, since we need these to mark number values in
 * `fromBaseX()`.
 *
 * Negative integers keep their sign: the result is marked with a preceding "-" in all modes.
 *
 * Hint: if you want to generate codes to be presented to the user, see `Random:randomUserCode`.
 *
 * @param {Number|String} value - value to be encoded
 * @param {?Number|String|Array<String>} [baseOrAlphabet=64] - either the numerical base to convert to (64, 36, ...) or the alphabet of characters to use in encoding; numerical bases must be between 2 and 64 (if the result is chunked, we need a base 3)
 * @param {?Boolean} [useCharacterMap=false] - set to true, to use a character map, based on btoa(), instead of numerical conversion
 * @param {?Boolean} [useChunks=false] - set to true, to add chunking to the numerical approach, converting the value in groups separated by a delimiter, which is the first letter of the base's alphabet
 * @param {?Number} [chunkSize=6] - define a different chunks size; only change this, if 6 seems too big in your context, going higher is not advisable
 * @throws error if baseOrAlphabet is not usable
 * @throws error if chunks are used and chunkSize is smaller than 1
 * @throws error if result is not decodable again using the same settings
 * @returns {String} the encoded value
 *
 * @memberof Conversion:toBaseX
 * @alias toBaseX
 * @see fromBaseX
 * @see https://developer.mozilla.org/en-US/docs/Glossary/Base64#the_unicode_problem
 * @see Random:randomUserCode
 * @example
 * toBaseX('foobar')
 * => 'Zm9vYmFy'
 * toBaseX(-5, 2)
 * => '-101'
 * toBaseX(42, 'abcdefghij')
 * => 'ec'
 * toBaseX('too-long-for-number-conversion', 36, true)
 * => 'U70R0DCN0F0DS04T0BQ040R0GCN0N0JSNA03TZ0J01S0K0N0KQOA0HRN0R0C'
 * toBaseX('too-long-for-number-conversion', 16, false, true)
 * => 'D3EF5D81F026D9DFDA970BBF17222402A47D5AD650CF6C2FE2102A494BCBDD0A2864C'
 */
export function toBaseX(value, baseOrAlphabet=64, useCharacterMap=false, useChunks=false, chunkSize=6){
	const __methodName__ = 'toBaseX';

	const
		valueIsNumber = isInt(value),
		valueIsNegativeNumber = valueIsNumber && (value < 0),
		sign = valueIsNegativeNumber ? '-' : ''
	;

	// from here on, the value is the string representation of the absolute value, the sign gets re-added at the end
	value = valueIsNumber ? `${Math.abs(value)}` : `${value}`;
	useCharacterMap = orDefault(useCharacterMap, false, 'bool');
	useChunks = orDefault(useChunks, false, 'bool');
	// the fallback has to match the documented default of the signature
	chunkSize = orDefault(chunkSize, 6, 'int');

	const alphabet = buildAlphabet(__methodName__, baseOrAlphabet, useChunks);
	if( alphabet.includes('{') || alphabet.includes('}') ){
		throw new Error(`${MODULE_NAME}:${__methodName__} | invalid alphabet, must not contain "{" or "}"`);
	}

	let
		base64Value = '',
		base10Value = 0,
		baseXValue = ''
	;

	if( useCharacterMap ){
		base64Value = stringToBase64(value);
		// base64 maps to itself, so we can skip the mapping, but must not lose the sign of a negative number
		if( baseOrAlphabet === 64 ) return `${sign}${base64Value}`;

		const
			pageMap = buildPageMap(alphabet),
			charMap = buildCharMap(pageMap, alphabet)
		;

		for( const char of base64Value ){
			baseXValue += charMap[char];
		}
	} else {
		// numbers are converted to base64 numerically, everything else by encoding the string's bytes
		if( valueIsNumber ){
			base64Value = base10toBaseX(value, BASE64_ALPHABET);
		} else {
			base64Value = stringToBase64(value);
		}

		const
			chunks = [],
			// with chunks, the first char of the alphabet is reserved as delimiter and not available as a digit
			chunkAlphabet = useChunks ? alphabet.slice(1) : alphabet,
			chunkSeparator = useChunks ? alphabet[0] : ''
		;

		if( useChunks ){
			// a chunk size below 1 would never advance the chunk start and therefore never terminate
			if( !(chunkSize >= 1) ){
				throw new Error(`${MODULE_NAME}:${__methodName__} | chunk size not usable, must be at least 1`);
			}

			let chunkStart = 0;
			while( chunkStart < base64Value.length ){
				chunks.push(base64Value.slice(chunkStart, chunkStart + chunkSize));
				chunkStart += chunkSize;
			}
		} else {
			chunks.push(base64Value);
		}

		for( const chunk of chunks ){
			base10Value = baseXToBase10(chunk, BASE64_ALPHABET);

			// unchunked decimal results are the number itself, no digit mapping necessary
			if( !useChunks && (baseOrAlphabet === 10) ){
				baseXValue += base10Value;
				break;
			}

			baseXValue += `${chunkSeparator}${base10toBaseX(base10Value, chunkAlphabet)}`;
		}

		// remove the separator in front of the first chunk
		if( chunkSeparator !== '' ){
			baseXValue = baseXValue.slice(1);
		}
	}

	baseXValue = `${sign}${baseXValue}`;

	// self-test: the result has to decode to the original value again, using the same settings
	const decodedValue = `${fromBaseX(baseXValue, baseOrAlphabet, useCharacterMap, useChunks, valueIsNumber)}`;
	if( decodedValue !== `${sign}${value}` ){
		throw new Error(
			`${MODULE_NAME}:${__methodName__} | critical error, encoded value "${baseXValue}" `
			+`not decodable to "${value}", is "${decodedValue}" instead; `
			+`if this looks "cut off", this may be a problem with JS max safe integer size `
			+`(safe value length for number-based conversion is just ~8 chars), `
			+`try using character mapping or chunks to circumvent this problem`
		);
	}
	return baseXValue;
}



/**
 * @namespace Conversion:fromBaseX
 */

/**
 * This function converts a based representation back to its original number or string value.
 * This is the mirror function to `toBaseX()` and expects a value encoded with that function. See that function
 * for implementation details, modes and restrictions.
 *
 * The result of this function is always either a decimal number or a string, just as the input value. All numbers
 * apart from decimal ones are returned as strings without prefix. So, decimal 5 will be the number 5, but the binary
 * version will be the string "101". Only unchunked numerical decoding of a number value returns an actual number,
 * character mapped and chunked decoding always return strings.
 *
 * You may define the base as an integer between 2 and 64 or as a custom alphabet in the same range. Integer based
 * alphabets are generated using defined base alphabets, which are sliced if necessary. Custom alphabets are
 * automatically sorted to match base64 as far as possible, pushing additional characters to the end, which are then
 * sorted ascending by character value.
 *
 * "{" and "}" are the only forbidden characters in a custom alphabet, since we need these to mark number values:
 * a value wrapped in curly braces is treated as a number, just as if `valueIsNumber` was set.
 *
 * Negative numbers are marked with a preceding "-", the decoded result keeps the sign in all modes.
 *
 * @param {Number|String} value - value to be decoded
 * @param {?Number|String|Array<String>} [baseOrAlphabet=64] - either the numerical base to convert from (64, 36, ...) or the alphabet of characters used in encoding; numerical bases must be between 2 and 64 (if the value is chunked, we need a base 3)
 * @param {?Boolean} [useCharacterMap=false] - set to true, to use a character map, based on btoa(), instead of numerical conversion
 * @param {?Boolean} [useChunks=false] - set to true, to add chunking to the numerical approach, converting the value in groups separated by a delimiter, which is the first letter of the base's alphabet
 * @param {?Boolean} [valueIsNumber=false] - if true, the given value is treated as a number for numerical conversion; this is necessary, since numbers such as binaries are defined as strings and are therefore not auto-detectable
 * @throws error if baseOrAlphabet is not usable
 * @throws error character mapped decoding fails, due to missing token/unmatched alphabet
 * @returns {Number|String} the decoded value
 *
 * @memberof Conversion:fromBaseX
 * @alias fromBaseX
 * @see toBaseX
 * @example
 * fromBaseX('Zm9vYmFy')
 * => 'foobar'
 * fromBaseX('16W33YPUS', 36)
 * => 'äす'
 * fromBaseX('{-3C3}', 13)
 * => -666
 * fromBaseX('q', 64, false, false, true)
 * => 42
 * fromBaseX('U70R0DCN0F0DS04T0BQ040R0GCN0N0JSNA03TZ0J01S0K0N0KQOA0HRN0R0C', 36, true)
 * => 'too-long-for-number-conversion'
 * fromBaseX('D3EF5D81F026D9DFDA970BBF17222402A47D5AD650CF6C2FE2102A494BCBDD0A2864C', 16, false, true)
 * => 'too-long-for-number-conversion'
 */
export function fromBaseX(value, baseOrAlphabet=64, useCharacterMap=false, useChunks=false, valueIsNumber=false){
	const __methodName__ = 'fromBaseX';

	// a value is a number if the caller says so, if it is an actual integer or if it is wrapped in curly braces
	valueIsNumber = !!valueIsNumber
		|| isInt(value)
		|| (`${value}`.startsWith('{') && `${value}`.endsWith('}'))
	;
	value = `${value}`;
	if( valueIsNumber ){
		value = trim(value, ['{', '}']);
	}
	// the sign is split off here and re-applied to the decoded result
	const valueIsNegativeNumber = valueIsNumber && value.startsWith('-');
	if( valueIsNegativeNumber ){
		value = value.slice(1);
	}

	useCharacterMap = orDefault(useCharacterMap, false, 'bool');
	useChunks = orDefault(useChunks, false, 'bool');

	// plain base64 strings can be decoded directly
	if(
		(baseOrAlphabet === 64)
		&& !useCharacterMap
		&& !useChunks
		&& !valueIsNumber
	) return base64ToString(value, __methodName__);

	const alphabet = buildAlphabet(__methodName__, baseOrAlphabet, useChunks);
	if( alphabet.includes('{') || alphabet.includes('}') ){
		throw new Error(`${MODULE_NAME}:${__methodName__} | invalid alphabet, must not contain "{" or "}"`);
	}

	let
		base64Value = '',
		base10Value = 0,
		decodedValue
	;

	if( useCharacterMap ){
		const
			pageMap = buildPageMap(alphabet),
			charMap = buildCharMap(pageMap, alphabet),
			inverseCharMap = Object.fromEntries(
				Object
				.entries(charMap)
				.map(([base64Char, token]) => [token, base64Char])
			),
			// longest tokens first, so a prefixed token is never mistaken for a shorter one
			tokensByLength = Object.keys(inverseCharMap).sort((a, b) => b.length - a.length)
		;

		// consume the value token by token from the front
		while( value !== '' ){
			const token = tokensByLength.find(candidate => value.startsWith(candidate));

			if( token === undefined ){
				throw new Error(
					`${MODULE_NAME}:${__methodName__} | unknown token at start of "${value}", likely due to non-matching alphabet`
				);
			}

			base64Value += inverseCharMap[token];
			value = value.slice(token.length);
		}

		decodedValue = base64ToString(base64Value, __methodName__);
		if( valueIsNegativeNumber ){
			decodedValue = `-${decodedValue}`;
		}

		return decodedValue;
	} else {
		decodedValue = '';

		const
			// with chunks, the first char of the alphabet is the delimiter and not available as a digit
			chunkAlphabet = useChunks ? alphabet.slice(1) : alphabet,
			chunkSeparator = useChunks ? alphabet[0] : '',
			chunks = useChunks ? value.split(chunkSeparator) : [value]
		;

		for( const chunk of chunks ){
			base10Value = baseXToBase10(chunk, chunkAlphabet);

			if( valueIsNumber ){
				decodedValue += `${base10Value}`;
			} else {
				base64Value += base10toBaseX(base10Value, BASE64_ALPHABET);
			}
		}

		if( !valueIsNumber ){
			// strings have been rebuilt as base64 and still need to be decoded
			decodedValue = base64ToString(base64Value, __methodName__);
		} else if( !useChunks ){
			decodedValue = Number(decodedValue);
			if( valueIsNegativeNumber && (decodedValue >= 0) ){
				decodedValue = -decodedValue;
			}
		} else if( valueIsNegativeNumber ){
			// chunked numbers stay strings, but must not lose their sign,
			// otherwise negative numbers encoded with chunks could never be decoded to their original value
			decodedValue = `-${decodedValue}`;
		}

		return decodedValue;
	}
}