markup_parser.ts 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. /**
  2. * This file is part of Threema Web.
  3. *
  4. * Threema Web is free software: you can redistribute it and/or modify it
  5. * under the terms of the GNU Affero General Public License as published by
  6. * the Free Software Foundation, either version 3 of the License, or (at
  7. * your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
  12. * General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Affero General Public License
  15. * along with Threema Web. If not, see <http://www.gnu.org/licenses/>.
  16. */
/**
 * The kinds of tokens emitted by the tokenizer.
 *
 * Asterisk, Underscore and Tilde are the markup delimiter tokens; their
 * literal characters and CSS classes are defined in `markupChars` and
 * `cssClasses`.
 */
export const enum TokenType {
    Text,
    Newline,
    Asterisk,
    Underscore,
    Tilde,
}
/**
 * A single token produced by `tokenize`.
 * The `value` field is only set for Text tokens and holds the raw text.
 */
export interface Token {
    kind: TokenType;
    value?: string;
}
// The markup characters, keyed by their markup token type.
const markupChars = {
    [TokenType.Asterisk]: '*',
    [TokenType.Underscore]: '_',
    [TokenType.Tilde]: '~',
};
// CSS classes for the HTML markup, keyed by the markup token type.
const cssClasses = {
    [TokenType.Asterisk]: 'text-bold',
    [TokenType.Underscore]: 'text-italic',
    [TokenType.Tilde]: 'text-strike',
};
  40. /**
  41. * Return whether the specified token type is a markup token.
  42. */
  43. function isMarkupToken(tokenType: TokenType) {
  44. return markupChars.hasOwnProperty(tokenType);
  45. }
  46. /**
  47. * Return whether the specified character is a boundary character.
  48. * When `character` is undefined, the function will return true.
  49. */
  50. function isBoundary(character?: string): boolean {
  51. return character === undefined || /[\s.,!?¡¿‽⸮;:&(){}\[\]⟨⟩‹›«»'"‘’“”*~\-_…⋯᠁]/.test(character);
  52. }
  53. /**
  54. * Return whether the specified character is a URL boundary character.
  55. * When `character` is undefined, the function will return true.
  56. *
  57. * Characters that may be in an URL according to RFC 3986:
  58. * ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%
  59. */
  60. function isUrlBoundary(character?: string): boolean {
  61. return character === undefined || !/[a-zA-Z0-9\-._~:/?#\[\]@!$&'()*+,;=%]/.test(character);
  62. }
  63. /**
  64. * Return whether the specified string starts an URL.
  65. */
  66. function isUrlStart(substring: string): boolean {
  67. return substring.match(/^[a-zA-Z]+:\/\//) != null;
  68. }
  69. /**
  70. * This function accepts a string and returns a list of tokens.
  71. */
  72. export function tokenize(text: string): Token[] {
  73. const tokens = [];
  74. let textBuf = '';
  75. let matchingUrl = false;
  76. const pushTextBufToken = () => {
  77. if (textBuf.length > 0) {
  78. tokens.push({ kind: TokenType.Text, value: textBuf });
  79. textBuf = '';
  80. }
  81. };
  82. for (let i = 0; i < text.length; i++) {
  83. const currentChar = text[i];
  84. // Detect URLs
  85. if (!matchingUrl) {
  86. matchingUrl = isUrlStart(text.substring(i));
  87. }
  88. // URLs have a limited set of boundary characters, therefore we need to
  89. // treat them separately.
  90. if (matchingUrl) {
  91. textBuf += currentChar;
  92. const nextIsUrlBoundary = isUrlBoundary(text[i + 1]);
  93. if (nextIsUrlBoundary) {
  94. pushTextBufToken();
  95. matchingUrl = false;
  96. }
  97. } else {
  98. const prevIsBoundary = isBoundary(text[i - 1]);
  99. const nextIsBoundary = isBoundary(text[i + 1]);
  100. if (currentChar === '*' && (prevIsBoundary || nextIsBoundary)) {
  101. pushTextBufToken();
  102. tokens.push({ kind: TokenType.Asterisk });
  103. } else if (currentChar === '_' && (prevIsBoundary || nextIsBoundary)) {
  104. pushTextBufToken();
  105. tokens.push({ kind: TokenType.Underscore });
  106. } else if (currentChar === '~' && (prevIsBoundary || nextIsBoundary)) {
  107. pushTextBufToken();
  108. tokens.push({ kind: TokenType.Tilde });
  109. } else if (currentChar === '\n') {
  110. pushTextBufToken();
  111. tokens.push({ kind: TokenType.Newline });
  112. } else {
  113. textBuf += currentChar;
  114. }
  115. }
  116. }
  117. pushTextBufToken();
  118. return tokens;
  119. }
  120. export function parse(tokens: Token[]): string {
  121. const stack: Token[] = [];
  122. // Booleans to avoid searching the stack.
  123. // This is used for optimization.
  124. const tokensPresent = {
  125. [TokenType.Asterisk]: false,
  126. [TokenType.Underscore]: false,
  127. [TokenType.Tilde]: false,
  128. };
  129. // Helper: When called with a value, mark the token type as present or not.
  130. // When called without a value, return whether this token type is present.
  131. const hasToken = (token: TokenType, value?: boolean) => {
  132. if (value === undefined) {
  133. return tokensPresent[token];
  134. }
  135. tokensPresent[token] = value;
  136. };
  137. // Helper: Consume the stack, return a string.
  138. const consumeStack = () => {
  139. let textBuf = '';
  140. for (const token of stack) {
  141. switch (token.kind) {
  142. case TokenType.Text:
  143. textBuf += token.value;
  144. break;
  145. case TokenType.Asterisk:
  146. case TokenType.Underscore:
  147. case TokenType.Tilde:
  148. textBuf += markupChars[token.kind];
  149. break;
  150. case TokenType.Newline:
  151. throw new Error('Unexpected newline token on stack');
  152. default:
  153. throw new Error('Unknown token on stack: ' + token.kind);
  154. }
  155. }
  156. // Clear stack
  157. // https://stackoverflow.com/a/1232046
  158. stack.splice(0, stack.length);
  159. return textBuf;
  160. };
  161. // Helper: Pop the stack, throw an exception if it's empty
  162. const popStack = () => {
  163. const stackTop = stack.pop();
  164. if (stackTop === undefined) {
  165. throw new Error('Stack is empty');
  166. }
  167. return stackTop;
  168. };
  169. // Helper: Add markup HTML to the stack
  170. const pushMarkup = (textParts: string[], cssClass: string) => {
  171. let html = `<span class="${cssClass}">`;
  172. for (let i = textParts.length - 1; i >= 0; i--) {
  173. html += textParts[i];
  174. }
  175. html += '</span>';
  176. stack.push({ kind: TokenType.Text, value: html });
  177. };
  178. // Process the tokens. Add them to a stack. When a token pair is complete
  179. // (e.g. the second asterisk is found), pop the stack until you find the
  180. // matching token and convert everything in between to formatted text.
  181. for (const token of tokens) {
  182. switch (token.kind) {
  183. // Keep text as-is
  184. case TokenType.Text:
  185. stack.push(token);
  186. break;
  187. // If a markup token is found, try to find a matching token.
  188. case TokenType.Asterisk:
  189. case TokenType.Underscore:
  190. case TokenType.Tilde:
  191. // Optimization: Only search the stack if a token with this token type exists
  192. if (hasToken(token.kind)) {
  193. // Pop tokens from the stack. If a matching token was found, apply
  194. // markup to the text parts in between those two tokens.
  195. const textParts = [];
  196. while (true) {
  197. const stackTop = popStack();
  198. if (stackTop.kind === TokenType.Text) {
  199. textParts.push(stackTop.value);
  200. } else if (stackTop.kind === token.kind) {
  201. if (textParts.length > 0) {
  202. pushMarkup(textParts, cssClasses[token.kind]);
  203. } else {
  204. // If this happens, then two markup chars were following each other (e.g. **hello).
  205. // In that case, just keep them as regular text characters, without applying any markup.
  206. const markupChar = markupChars[token.kind];
  207. stack.push({ kind: TokenType.Text, value: markupChar + markupChar });
  208. }
  209. hasToken(token.kind, false);
  210. break;
  211. } else if (isMarkupToken(stackTop.kind)) {
  212. textParts.push(markupChars[stackTop.kind]);
  213. } else {
  214. throw new Error('Unknown token on stack: ' + token.kind);
  215. }
  216. hasToken(stackTop.kind, false);
  217. }
  218. } else {
  219. stack.push(token);
  220. hasToken(token.kind, true);
  221. }
  222. break;
  223. // Don't apply formatting across newlines, consume the current stack!
  224. case TokenType.Newline:
  225. stack.push({ kind: TokenType.Text, value: consumeStack() + '\n' });
  226. hasToken(TokenType.Asterisk, false);
  227. hasToken(TokenType.Underscore, false);
  228. hasToken(TokenType.Tilde, false);
  229. break;
  230. default:
  231. throw new Error('Invalid token kind: ' + token.kind);
  232. }
  233. }
  234. // Concatenate processed tokens
  235. return consumeStack();
  236. }
  237. export function markify(text: string): string {
  238. return parse(tokenize(text));
  239. }