/**
* This file is part of Threema Web.
*
* Threema Web is free software: you can redistribute it and/or modify it
* under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
* General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with Threema Web. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * The kinds of tokens the tokenizer can emit.
 */
export const enum TokenType {
    /** A run of plain text; the text is carried in the token's `value`. */
    Text = 0,
    /** A line break (`\n`). */
    Newline = 1,
    /** A `*` markup character. */
    Asterisk = 2,
    /** A `_` markup character. */
    Underscore = 3,
    /** A `~` markup character. */
    Tilde = 4,
}
/**
* A single token as produced by `tokenize` and consumed by `parse`.
*/
export interface Token {
// The kind of this token.
kind: TokenType;
// The text content. Set for `TokenType.Text` tokens; the tokenizer leaves
// it unset for newline and markup tokens.
value?: string;
}
// The markup characters, keyed by markup token type.
// Used to turn a markup token back into its literal character when it is
// not part of a matching pair, and to decide whether a token type is a
// markup token at all (see `isMarkupToken`).
const markupChars = {
[TokenType.Asterisk]: '*',
[TokenType.Underscore]: '_',
[TokenType.Tilde]: '~',
};
// CSS classes for the HTML markup, keyed by markup token type.
// `parse` looks up the class here when a pair of matching markup tokens
// is closed.
const cssClasses = {
[TokenType.Asterisk]: 'text-bold',
[TokenType.Underscore]: 'text-italic',
[TokenType.Tilde]: 'text-strike',
};
/**
 * Return whether the specified token type is a markup token, i.e. whether
 * it has an entry in `markupChars`.
 *
 * @param tokenType The token type to check.
 * @returns `true` for markup token types, `false` otherwise.
 */
function isMarkupToken(tokenType: TokenType): boolean {
    // Call the method through Object.prototype instead of on the object
    // itself, so the check does not depend on the object's own properties
    // or prototype.
    return Object.prototype.hasOwnProperty.call(markupChars, tokenType);
}
/**
 * Tell whether a character counts as a boundary character (whitespace,
 * punctuation, brackets, quotes or one of the markup characters).
 *
 * A missing character (`undefined`, e.g. when looking past either end of
 * the text) is treated as a boundary as well.
 */
function isBoundary(character?: string): boolean {
    if (character === undefined) {
        return true;
    }
    const boundaryPattern = /[\s.,!?¡¿‽⸮;:&(){}\[\]⟨⟩‹›«»'"‘’“”*~\-_…⋯᠁]/;
    return boundaryPattern.test(character);
}
/**
 * Tell whether a character terminates a URL, i.e. whether it is NOT one of
 * the characters that may appear in a URL.
 *
 * A missing character (`undefined`) is treated as a URL boundary.
 *
 * Characters that may be in a URL according to RFC 3986:
 * ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%
 */
function isUrlBoundary(character?: string): boolean {
    if (character === undefined) {
        return true;
    }
    const urlCharPattern = /[a-zA-Z0-9\-._~:/?#\[\]@!$&'()*+,;=%]/;
    const isUrlChar = urlCharPattern.test(character);
    return !isUrlChar;
}
/**
 * Tell whether the given string begins with a URL scheme, i.e. one or more
 * ASCII letters immediately followed by `://`.
 */
function isUrlStart(substring: string): boolean {
    const schemePrefix = /^[a-zA-Z]+:\/\//;
    return schemePrefix.test(substring);
}
/**
 * Split a string into a flat list of tokens.
 *
 * Newlines become `Newline` tokens. The characters `*`, `_` and `~` become
 * markup tokens, but only when the character before or after them is a
 * boundary character (or the start/end of the text). Everything else is
 * collected into `Text` tokens.
 *
 * While inside a URL (detected by a scheme followed by `://`), characters
 * are always treated as text until a character that cannot be part of a
 * URL follows.
 */
export function tokenize(text: string): Token[] {
    const result: Token[] = [];
    let pending = '';
    let insideUrl = false;

    // Emit the buffered characters (if any) as a single text token.
    const flushPending = (): void => {
        if (pending.length === 0) {
            return;
        }
        result.push({ kind: TokenType.Text, value: pending });
        pending = '';
    };

    for (let pos = 0; pos < text.length; pos++) {
        const ch = text[pos];

        // Check whether a URL starts at the current position.
        if (!insideUrl && isUrlStart(text.substring(pos))) {
            insideUrl = true;
        }

        // URLs use their own, smaller set of boundary characters, so markup
        // characters inside them stay part of the text.
        if (insideUrl) {
            pending += ch;
            if (isUrlBoundary(text[pos + 1])) {
                flushPending();
                insideUrl = false;
            }
            continue;
        }

        if (ch === '\n') {
            flushPending();
            result.push({ kind: TokenType.Newline });
            continue;
        }

        // Map the character to a markup token type, if it is one.
        let markupKind: TokenType | undefined;
        switch (ch) {
            case '*':
                markupKind = TokenType.Asterisk;
                break;
            case '_':
                markupKind = TokenType.Underscore;
                break;
            case '~':
                markupKind = TokenType.Tilde;
                break;
        }

        // A markup character only counts as markup next to a boundary.
        const atBoundary = isBoundary(text[pos - 1]) || isBoundary(text[pos + 1]);
        if (markupKind !== undefined && atBoundary) {
            flushPending();
            result.push({ kind: markupKind });
        } else {
            pending += ch;
        }
    }

    flushPending();
    return result;
}
/**
 * Convert a list of tokens into an HTML string.
 *
 * Tokens are pushed onto a stack. When the second token of a markup pair is
 * found (e.g. the second asterisk), the stack is popped back to the first
 * one and everything in between is replaced by a single text token wrapped
 * in `<span class="...">` with the CSS class of that markup type. Markup
 * tokens that never find a partner are emitted as their literal character.
 * Formatting is never applied across a newline.
 *
 * Text token values are copied to the output verbatim: this function does
 * not do any HTML escaping, so untrusted text must be escaped by the caller.
 *
 * @param tokens The tokens to process, e.g. the output of `tokenize`.
 * @returns The text with complete markup pairs converted to HTML spans.
 * @throws Error if a token with an unknown kind is encountered.
 */
export function parse(tokens: Token[]): string {
    const stack: Token[] = [];

    // For every markup token type: is an unmatched token of that type
    // currently on the stack? This avoids searching the stack.
    const tokensPresent: { [kind: number]: boolean } = {
        [TokenType.Asterisk]: false,
        [TokenType.Underscore]: false,
        [TokenType.Tilde]: false,
    };

    // Helper: The text of a token; a missing value counts as empty text.
    const textOf = (token: Token): string => (token.value === undefined ? '' : token.value);

    // Helper: Consume the stack, return a string.
    const consumeStack = (): string => {
        let textBuf = '';
        for (const token of stack) {
            switch (token.kind) {
                case TokenType.Text:
                    textBuf += textOf(token);
                    break;
                case TokenType.Asterisk:
                case TokenType.Underscore:
                case TokenType.Tilde:
                    // Unmatched markup token: emit the literal character.
                    textBuf += markupChars[token.kind];
                    break;
                case TokenType.Newline:
                    throw new Error('Unexpected newline token on stack');
                default:
                    throw new Error('Unknown token on stack: ' + token.kind);
            }
        }
        // Clear the stack in place.
        stack.length = 0;
        return textBuf;
    };

    // Helper: Pop the stack, throw an exception if it's empty.
    const popStack = (): Token => {
        const stackTop = stack.pop();
        if (stackTop === undefined) {
            throw new Error('Stack is empty');
        }
        return stackTop;
    };

    // Helper: Wrap the text parts in a span with the given CSS class and
    // push the result onto the stack as a text token.
    const pushMarkup = (textParts: string[], cssClass: string): void => {
        let html = `<span class="${cssClass}">`;
        // The parts were collected while popping the stack, so they are in
        // reverse order.
        for (let i = textParts.length - 1; i >= 0; i--) {
            html += textParts[i];
        }
        html += '</span>';
        stack.push({ kind: TokenType.Text, value: html });
    };

    for (const token of tokens) {
        switch (token.kind) {
            // Keep text as-is
            case TokenType.Text:
                stack.push(token);
                break;

            // If a markup token is found, try to find a matching token.
            case TokenType.Asterisk:
            case TokenType.Underscore:
            case TokenType.Tilde:
                if (!tokensPresent[token.kind]) {
                    // First token of a potential pair.
                    stack.push(token);
                    tokensPresent[token.kind] = true;
                    break;
                }

                // A token of this type is on the stack. Pop until it is
                // found and apply markup to everything in between.
                {
                    const textParts: string[] = [];
                    while (true) {
                        const stackTop = popStack();
                        if (stackTop.kind === TokenType.Text) {
                            textParts.push(textOf(stackTop));
                        } else if (stackTop.kind === token.kind) {
                            if (textParts.length > 0) {
                                pushMarkup(textParts, cssClasses[token.kind]);
                            } else {
                                // Two markup chars directly followed each other (e.g. **hello).
                                // Keep them as regular text characters, without applying markup.
                                const markupChar = markupChars[token.kind];
                                stack.push({ kind: TokenType.Text, value: markupChar + markupChar });
                            }
                            tokensPresent[token.kind] = false;
                            break;
                        } else if (isMarkupToken(stackTop.kind)) {
                            // A different, unmatched markup token inside the pair:
                            // it becomes plain text and is no longer on the stack.
                            textParts.push(markupChars[stackTop.kind]);
                            tokensPresent[stackTop.kind] = false;
                        } else {
                            throw new Error('Unknown token on stack: ' + stackTop.kind);
                        }
                    }
                }
                break;

            // Don't apply formatting across newlines, consume the current stack!
            case TokenType.Newline:
                stack.push({ kind: TokenType.Text, value: consumeStack() + '\n' });
                tokensPresent[TokenType.Asterisk] = false;
                tokensPresent[TokenType.Underscore] = false;
                tokensPresent[TokenType.Tilde] = false;
                break;

            default:
                throw new Error('Invalid token kind: ' + token.kind);
        }
    }

    // Concatenate processed tokens
    return consumeStack();
}
/**
 * Apply markup to a text: tokenize it, then convert the tokens to a string
 * with `parse`.
 */
export function markify(text: string): string {
    const tokens = tokenize(text);
    return parse(tokens);
}