Jelajahi Sumber

New markup parser (#590)

Replace regexes with a proper stack-based parser.

Fixes #453 and #458.
Danilo Bargen 6 tahun lalu
induk
melakukan
53fbcb92af
5 mengubah file dengan 491 tambahan dan 91 penghapusan
  1. 2 9
      src/filters.ts
  2. 225 0
      src/markup_parser.ts
  3. 0 82
      tests/filters.js
  4. 1 0
      tests/ts/main.ts
  5. 263 0
      tests/ts/markup_parser.ts

+ 2 - 9
src/filters.ts

@@ -16,6 +16,7 @@
  */
 
 import {bufferToUrl, escapeRegExp, filter, hasValue, logAdapter} from './helpers';
+import {markify} from './markup_parser';
 import {MimeService} from './services/mime';
 import {NotificationService} from './services/notification';
 import {WebClientService} from './services/webclient';
@@ -137,15 +138,7 @@ angular.module('3ema.filters', [])
  * Convert markdown elements to html elements
  */
 .filter('markify', function() {
-    return function(text) {
-        if (text !== null) {
-            text = text.replace(/\B\*([^\r\n]+?)\*\B/g, '<span class="text-bold">$1</span>');
-            text = text.replace(/\b_([^\r\n]+?)_\b/g, '<span class="text-italic">$1</span>');
-            text = text.replace(/\B~([^\r\n]+?)~\B/g, '<span class="text-strike">$1</span>');
-            return text;
-        }
-        return text;
-    };
+    return markify;
 })
 
 /**

+ 225 - 0
src/markup_parser.ts

@@ -0,0 +1,225 @@
+/**
+ * This file is part of Threema Web.
+ *
+ * Threema Web is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Threema Web. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Kinds of tokens emitted by `tokenize` and consumed by `parse`.
+ */
+export const enum TokenType {
+    // A run of plain text; the text itself is stored in the token's `value`.
+    Text,
+    // A line break (`\n`).
+    Newline,
+    // The `*` markup character (rendered with the `text-bold` class).
+    Asterisk,
+    // The `_` markup character (rendered with the `text-italic` class).
+    Underscore,
+    // The `~` markup character (rendered with the `text-strike` class).
+    Tilde,
+}
+
+/**
+ * A single token as produced by the tokenizer.
+ */
+export interface Token {
+    // The type of this token.
+    kind: TokenType;
+    // The text content. The tokenizer only sets this for `TokenType.Text` tokens.
+    value?: string;
+}
+
+// The literal character belonging to every markup token type. The parser
+// uses this table to turn markup tokens without a partner back into text.
+const markupChars = {
+    [TokenType.Asterisk]: '*',
+    [TokenType.Underscore]: '_',
+    [TokenType.Tilde]: '~',
+};
+
+// CSS class of the `<span>` element that wraps the text enclosed by a
+// matching pair of the respective markup token type.
+const cssClasses = {
+    [TokenType.Asterisk]: 'text-bold',
+    [TokenType.Underscore]: 'text-italic',
+    [TokenType.Tilde]: 'text-strike',
+};
+
+/**
+ * Tell whether `tokenType` denotes a markup token, i.e. whether the
+ * `markupChars` table has an own entry for it.
+ */
+function isMarkupToken(tokenType: TokenType): boolean {
+    const hasOwn = Object.prototype.hasOwnProperty;
+    return hasOwn.call(markupChars, tokenType);
+}
+
+/**
+ * Decide whether `character` counts as a boundary character (whitespace,
+ * punctuation, brackets, quotes or one of the markup characters).
+ *
+ * A missing character (`undefined`, e.g. the position before the first or
+ * after the last character of a string) is treated as a boundary as well.
+ */
+function isBoundary(character?: string) {
+    if (character === undefined) {
+        return true;
+    }
+    const boundaryPattern = /[\s.,!?¡¿‽⸮;:&(){}\[\]⟨⟩‹›«»'"‘’“”*~\-_…⋯᠁]/;
+    return boundaryPattern.test(character);
+}
+
+/**
+ * Split `text` into a flat list of tokens.
+ *
+ * A `*`, `_` or `~` only becomes a markup token if the character before or
+ * after it is a boundary character (see `isBoundary`); otherwise it stays
+ * part of the surrounding text. Every `\n` becomes a newline token. All
+ * remaining characters are collected into text tokens.
+ */
+export function tokenize(text: string): Token[] {
+    const result: Token[] = [];
+    let pending = '';
+
+    // Emit the buffered plain text (if there is any) as one text token.
+    const flushText = () => {
+        if (pending.length === 0) {
+            return;
+        }
+        result.push({ kind: TokenType.Text, value: pending });
+        pending = '';
+    };
+
+    // Emit a token that carries no value, after flushing buffered text.
+    const emit = (kind: TokenType) => {
+        flushText();
+        result.push({ kind: kind });
+    };
+
+    for (let pos = 0; pos < text.length; pos++) {
+        const ch = text[pos];
+        const atBoundary = isBoundary(text[pos - 1]) || isBoundary(text[pos + 1]);
+
+        // Determine the token kind for this character. It stays undefined
+        // when the character is ordinary text.
+        let kind: TokenType | undefined;
+        switch (ch) {
+            case '*':
+                if (atBoundary) {
+                    kind = TokenType.Asterisk;
+                }
+                break;
+            case '_':
+                if (atBoundary) {
+                    kind = TokenType.Underscore;
+                }
+                break;
+            case '~':
+                if (atBoundary) {
+                    kind = TokenType.Tilde;
+                }
+                break;
+            case '\n':
+                kind = TokenType.Newline;
+                break;
+            default:
+                break;
+        }
+
+        if (kind === undefined) {
+            pending += ch;
+        } else {
+            emit(kind);
+        }
+    }
+
+    // Do not lose text that follows the last markup or newline token.
+    flushText();
+
+    return result;
+}
+
+/**
+ * Convert a list of tokens into an HTML string.
+ *
+ * The tokens are pushed onto a stack. When the second markup token of a pair
+ * is found, everything between the two tokens is wrapped in a `<span>` with
+ * the CSS class belonging to that markup type. Markup tokens without a
+ * partner are emitted as their literal character. Formatting never spans
+ * newlines: a newline token flushes the stack as plain text.
+ *
+ * Two identical markup tokens directly following each other (e.g. `**`)
+ * enclose nothing and are kept as literal text instead of producing an empty
+ * `<span>`.
+ *
+ * @param tokens The tokens to process, e.g. the result of `tokenize`.
+ * @returns The resulting HTML string.
+ * @throws Error if an unexpected token kind is encountered.
+ */
+export function parse(tokens: Token[]): string {
+    const stack: Token[] = [];
+
+    // Booleans to avoid searching the stack.
+    // This is used for optimization.
+    const tokensPresent = {
+        [TokenType.Asterisk]: false,
+        [TokenType.Underscore]: false,
+        [TokenType.Tilde]: false,
+    };
+
+    // Helper: When called with a value, mark the token type as present or not.
+    // When called without a value, return whether this token type is present.
+    const hasToken = (token: TokenType, value?: boolean) => {
+        if (value === undefined) {
+            return tokensPresent[token];
+        }
+        tokensPresent[token] = value;
+    };
+
+    // Helper: Consume the stack, return a string.
+    const consumeStack = () => {
+        let textBuf = '';
+        for (const token of stack) {
+            switch (token.kind) {
+                case TokenType.Text:
+                    textBuf += token.value;
+                    break;
+                case TokenType.Asterisk:
+                case TokenType.Underscore:
+                case TokenType.Tilde:
+                    // Markup token without a partner: emit the literal character
+                    textBuf += markupChars[token.kind];
+                    break;
+                case TokenType.Newline:
+                    throw new Error('Unexpected newline token on stack');
+                default:
+                    throw new Error('Unknown token on stack: ' + token.kind);
+            }
+        }
+        // Clear stack
+        // https://stackoverflow.com/a/1232046
+        stack.splice(0, stack.length);
+        return textBuf;
+    };
+
+    // Helper: Pop the stack, throw an exception if it's empty
+    const popStack = () => {
+        const stackTop = stack.pop();
+        if (stackTop === undefined) {
+            throw new Error('Stack is empty');
+        }
+        return stackTop;
+    };
+
+    // Helper: Add markup HTML to the stack.
+    // The text parts were collected while popping the stack, so they are in
+    // reverse order and need to be concatenated back to front.
+    const pushMarkup = (textParts: string[], cssClass: string) => {
+        let html = `<span class="${cssClass}">`;
+        for (let i = textParts.length - 1; i >= 0; i--) {
+            html += textParts[i];
+        }
+        html += '</span>';
+        stack.push({ kind: TokenType.Text, value: html });
+    };
+
+    // Process the tokens. Add them to a stack. When a token pair is complete
+    // (e.g. the second asterisk is found), pop the stack until you find the
+    // matching token and convert everything in between to formatted text.
+    for (const token of tokens) {
+        switch (token.kind) {
+
+            // Keep text as-is
+            case TokenType.Text:
+                stack.push(token);
+                break;
+
+            // If a markup token is found, try to find a matching token.
+            case TokenType.Asterisk:
+            case TokenType.Underscore:
+            case TokenType.Tilde:
+                // Optimization: Only search the stack if a token with this token type exists
+                if (hasToken(token.kind)) {
+                    // Pop tokens from the stack. If a matching token was found, apply
+                    // markup to the text parts in between those two tokens.
+                    const textParts = [];
+                    while (true) {
+                        const stackTop = popStack();
+                        if (stackTop.kind === TokenType.Text) {
+                            textParts.push(stackTop.value);
+                        } else if (stackTop.kind === token.kind) {
+                            if (textParts.length > 0) {
+                                pushMarkup(textParts, cssClasses[token.kind]);
+                            } else {
+                                // Nothing between the two tokens (e.g. `**hello`):
+                                // keep both markup characters as regular text
+                                // instead of emitting an empty <span>.
+                                const markupChar = markupChars[token.kind];
+                                stack.push({ kind: TokenType.Text, value: markupChar + markupChar });
+                            }
+                            hasToken(token.kind, false);
+                            break;
+                        } else if (isMarkupToken(stackTop.kind)) {
+                            // A different, unmatched markup token inside the pair:
+                            // it becomes literal text and is no longer on the stack.
+                            textParts.push(markupChars[stackTop.kind]);
+                            hasToken(stackTop.kind, false);
+                        } else {
+                            // Report the kind of the offending stack entry
+                            throw new Error('Unknown token on stack: ' + stackTop.kind);
+                        }
+                    }
+                } else {
+                    stack.push(token);
+                    hasToken(token.kind, true);
+                }
+                break;
+
+            // Don't apply formatting across newlines, consume the current stack!
+            case TokenType.Newline:
+                stack.push({ kind: TokenType.Text, value: consumeStack() + '\n' });
+                hasToken(TokenType.Asterisk, false);
+                hasToken(TokenType.Underscore, false);
+                hasToken(TokenType.Tilde, false);
+                break;
+
+            default:
+                throw new Error('Invalid token kind: ' + token.kind);
+        }
+    }
+
+    // Concatenate processed tokens
+    return consumeStack();
+}
+
+/**
+ * Convert the markup in `text` (`*bold*`, `_italic_`, `~strikethrough~`) to
+ * HTML `<span>` elements by tokenizing and then parsing the text.
+ *
+ * @param text The text to convert.
+ * @returns The text with markup replaced by HTML. A `null` or `undefined`
+ *   input is returned unchanged.
+ */
+export function markify(text: string): string {
+    // Pass missing values through instead of failing on `text.length` in the
+    // tokenizer.
+    if (text === null || text === undefined) {
+        return text;
+    }
+    return parse(tokenize(text));
+}

+ 0 - 82
tests/filters.js

@@ -71,87 +71,6 @@ describe('Filters', function() {
         };
     };
 
-    describe('markify', function() {
-
-        this.testPatterns = (cases) => testPatterns('markify', cases);
-
-        it('detects bold text', () => {
-            this.testPatterns([
-                ['*bold text (not italic)*',
-                 '<span class="text-bold">bold text (not italic)</span>'],
-            ]);
-        });
-
-        it('detects italic text', () => {
-            this.testPatterns([
-                ['This text is not italic.',
-                 'This text is not italic.'],
-                ['_This text is italic._',
-                 '<span class="text-italic">This text is italic.</span>'],
-                ['This text is _partially_ italic',
-                 'This text is <span class="text-italic">partially</span> italic'],
-                ['This text has _two_ _italic_ bits',
-                 'This text has <span class="text-italic">two</span> <span class="text-italic">italic</span> bits'],
-            ]);
-
-        });
-
-        it('detects strikethrough text', () => {
-            this.testPatterns([
-                ['so ~strikethrough~', 'so <span class="text-strike">strikethrough</span>'],
-            ]);
-        });
-
-        it('detects mixed markup', () => {
-            this.testPatterns([
-                ['*bold text with _italic_ *',
-                 '<span class="text-bold">bold text with <span class="text-italic">italic</span> </span>'],
-                ['*part bold,* _part italic_',
-                 '<span class="text-bold">part bold,</span> <span class="text-italic">part italic</span>'],
-                ['_italic text with *bold* _',
-                 '<span class="text-italic">italic text with <span class="text-bold">bold</span> </span>'],
-            ]);
-        });
-
-        it('is only applied on word boundaries', () => {
-            this.testPatterns([
-                ['so not_really_italic',
-                 'so not_really_italic'],
-                ['invalid*bold*stuff',
-                 'invalid*bold*stuff'],
-                ['no~strike~through',
-                 'no~strike~through'],
-                ['*bold_but_no~strike~through*',
-                 '<span class="text-bold">bold_but_no~strike~through</span>'],
-            ]);
-        });
-
-        it('does not break URLs', () => {
-            this.testPatterns([
-                ['https://en.wikipedia.org/wiki/Java_class_file *nice*',
-                 'https://en.wikipedia.org/wiki/Java_class_file <span class="text-bold">nice</span>'],
-                ['<a href="https://threema.ch">_Threema_</a>',
-                 '<a href="https://threema.ch"><span class="text-italic">Threema</span></a>'],
-            ]);
-        });
-
-        it('ignores invalid markup', () => {
-            this.testPatterns([
-                ['*invalid markup (do not parse)_', '*invalid markup (do not parse)_'],
-                ['random *asterisk', 'random *asterisk'],
-            ]);
-        });
-
-        it('ignores markup with \\n (newline)', () => {
-            this.testPatterns([
-                ['*First line\n and a new one. (do not parse)*', '*First line\n and a new one. (do not parse)*'],
-                ['*\nbegins with linebreak. (do not parse)*', '*\nbegins with linebreak. (do not parse)*'],
-                ['*Just some text. But it ends with newline (do not parse)\n*', '*Just some text. But it ends with newline (do not parse)\n*'],
-            ]);
-        });
-
-    });
-
     describe('escapeHtml', function() {
 
         this.testPatterns = (cases) => testPatterns('escapeHtml', cases);
@@ -168,7 +87,6 @@ describe('Filters', function() {
 
     describe('mentionify', function() {
 
-
         this.testPatterns = (cases) => testPatterns('mentionify', cases);
 
         it('no mentions', () => {

+ 1 - 0
tests/ts/main.ts

@@ -23,4 +23,5 @@
 import './containers';
 import './crypto_helpers';
 import './helpers';
+import './markup_parser';
 import './receiver_helpers';

+ 263 - 0
tests/ts/markup_parser.ts

@@ -0,0 +1,263 @@
+/**
+ * Copyright © 2016-2018 Threema GmbH (https://threema.ch/).
+ *
+ * This file is part of Threema Web.
+ *
+ * Threema Web is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Threema Web. If not, see <http://www.gnu.org/licenses/>.
+ */
+// tslint:disable:max-line-length
+
+import {markify, parse, tokenize, TokenType} from '../../src/markup_parser';
+
+describe('Markup Parser', () => {
+    describe('tokenizer', () => {
+        it('simple', function() {
+            const text = 'hello *there*!';
+            const tokens = tokenize(text);
+            expect(tokens).toEqual([
+                { kind: TokenType.Text, value: 'hello ' },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Text, value: 'there' },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Text, value: '!' },
+            ]);
+        });
+
+        it('nested', function() {
+            const text = 'this is *_nested_*!';
+            const tokens = tokenize(text);
+            expect(tokens).toEqual([
+                { kind: TokenType.Text, value: 'this is ' },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Underscore },
+                { kind: TokenType.Text, value: 'nested' },
+                { kind: TokenType.Underscore },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Text, value: '!' },
+            ]);
+        });
+
+        it('ignore if not along boundary', function() {
+            const text = 'this*is_not~at-boundary';
+            const tokens = tokenize(text);
+            expect(tokens).toEqual([
+                { kind: TokenType.Text, value: 'this*is_not~at-boundary' },
+            ]);
+        });
+
+        it('ignore in URLs', function() {
+            const text = 'ignore if *in* a link: https://example.com/_pub_/horse.jpg';
+            const tokens = tokenize(text);
+            expect(tokens).toEqual([
+                { kind: TokenType.Text, value: 'ignore if ' },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Text, value: 'in' },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Text, value: ' a link: https://example.com/_pub_/horse.jpg' },
+            ]);
+        });
+
+        it('with newlines', function() {
+            const text = 'hello\n*world*\n';
+            const tokens = tokenize(text);
+            expect(tokens).toEqual([
+                { kind: TokenType.Text, value: 'hello' },
+                { kind: TokenType.Newline },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Text, value: 'world' },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Newline },
+            ]);
+        });
+    });
+
+    describe('parser', () => {
+        it('simple text without formatting', () => {
+            const tokens = [{ kind: TokenType.Text, value: 'hello world' }];
+            const html = parse(tokens);
+            expect(html).toEqual('hello world');
+        });
+
+        it('simple bold text', () => {
+            const tokens = [
+                { kind: TokenType.Text, value: 'hello ' },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Text, value: 'bold' },
+                { kind: TokenType.Asterisk },
+            ];
+            const html = parse(tokens);
+            expect(html).toEqual('hello <span class="text-bold">bold</span>');
+        });
+
+        it('simple italic text', () => {
+            const tokens = [
+                { kind: TokenType.Text, value: 'hello ' },
+                { kind: TokenType.Underscore },
+                { kind: TokenType.Text, value: 'italic' },
+                { kind: TokenType.Underscore },
+            ];
+            const html = parse(tokens);
+            expect(html).toEqual('hello <span class="text-italic">italic</span>');
+        });
+
+        it('simple strikethrough text', () => {
+            const tokens = [
+                { kind: TokenType.Text, value: 'hello ' },
+                { kind: TokenType.Tilde },
+                { kind: TokenType.Text, value: 'strikethrough' },
+                { kind: TokenType.Tilde },
+            ];
+            const html = parse(tokens);
+            expect(html).toEqual('hello <span class="text-strike">strikethrough</span>');
+        });
+
+        it('correct nesting', () => {
+            const tokens = [
+                { kind: TokenType.Text, value: 'hello ' },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Text, value: 'bold and ' },
+                { kind: TokenType.Underscore },
+                { kind: TokenType.Text, value: 'italic' },
+                { kind: TokenType.Underscore },
+                { kind: TokenType.Asterisk },
+            ];
+            const html = parse(tokens);
+            expect(html).toEqual('hello <span class="text-bold">bold and <span class="text-italic">italic</span></span>');
+        });
+
+        it('incorrect nesting', () => {
+            const tokens = [
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Text, value: 'hi ' },
+                { kind: TokenType.Underscore },
+                { kind: TokenType.Text, value: 'there' },
+                { kind: TokenType.Asterisk },
+                { kind: TokenType.Underscore },
+            ];
+            const html = parse(tokens);
+            expect(html).toEqual('<span class="text-bold">hi _there</span>_');
+        });
+    });
+
+    // Run `markify` on the first element of every case and expect the
+    // second element of that case as the result.
+    function testPatterns(cases) {
+        cases.forEach((testcase) => {
+            const [input, expected] = testcase;
+            expect(markify(input)).toEqual(expected);
+        });
+    }
+
+    describe('markify', () => {
+
+        it('detects bold text', () => {
+            testPatterns([
+                ['*bold text (not italic)*',
+                 '<span class="text-bold">bold text (not italic)</span>'],
+            ]);
+        });
+
+        it('detects italic text', () => {
+            testPatterns([
+                ['This text is not italic.',
+                 'This text is not italic.'],
+                ['_This text is italic._',
+                 '<span class="text-italic">This text is italic.</span>'],
+                ['This text is _partially_ italic',
+                 'This text is <span class="text-italic">partially</span> italic'],
+                ['This text has _two_ _italic_ bits',
+                 'This text has <span class="text-italic">two</span> <span class="text-italic">italic</span> bits'],
+            ]);
+
+        });
+
+        it('detects strikethrough text', () => {
+            testPatterns([
+                ['so ~strikethrough~', 'so <span class="text-strike">strikethrough</span>'],
+            ]);
+        });
+
+        it('detects mixed markup', () => {
+            testPatterns([
+                ['*bold text with _italic_ *',
+                 '<span class="text-bold">bold text with <span class="text-italic">italic</span> </span>'],
+                ['*part bold,* _part italic_',
+                 '<span class="text-bold">part bold,</span> <span class="text-italic">part italic</span>'],
+                ['_italic text with *bold* _',
+                 '<span class="text-italic">italic text with <span class="text-bold">bold</span> </span>'],
+            ]);
+        });
+
+        it('is applied on word boundaries', () => {
+            testPatterns([
+                ['(*bold*)',
+                 '(<span class="text-bold">bold</span>)'],
+                ['¡*Threema* es fantástico!',
+                 '¡<span class="text-bold">Threema</span> es fantástico!'],
+                ['«_great_ service»',
+                 '«<span class="text-italic">great</span> service»'],
+                ['"_great_" service',
+                 '"<span class="text-italic">great</span>" service'],
+                ['*bold*…',
+                 '<span class="text-bold">bold</span>…'],
+                ['_<a href="https://threema.ch">Threema</a>_',
+                 '<span class="text-italic"><a href="https://threema.ch">Threema</a></span>'],
+            ]);
+        });
+
+        it('is only applied on word boundaries', () => {
+            testPatterns([
+                ['so not_really_italic',
+                 'so not_really_italic'],
+                ['invalid*bold*stuff',
+                 'invalid*bold*stuff'],
+                ['no~strike~through',
+                 'no~strike~through'],
+                ['*bold_but_no~strike~through*',
+                 '<span class="text-bold">bold_but_no~strike~through</span>'],
+                ['<_< >_>',
+                 '<_< >_>'],
+                ['<a href="https://threema.ch">_Threema_</a>',
+                 '<a href="https://threema.ch">_Threema_</a>'],
+            ]);
+        });
+
+        it('does not break URLs', () => {
+            testPatterns([
+                ['https://en.wikipedia.org/wiki/Java_class_file *nice*',
+                 'https://en.wikipedia.org/wiki/Java_class_file <span class="text-bold">nice</span>'],
+                ['https://example.com/_output_/',
+                 'https://example.com/_output_/'],
+                ['https://example.com/*output*/',
+                 'https://example.com/*output*/'],
+            ]);
+        });
+
+        it('ignores invalid markup', () => {
+            testPatterns([
+                ['*invalid markup (do not parse)_', '*invalid markup (do not parse)_'],
+                ['random *asterisk', 'random *asterisk'],
+            ]);
+        });
+
+        it('ignores markup with \\n (newline)', () => {
+            testPatterns([
+                ['*First line\n and a new one. (do not parse)*', '*First line\n and a new one. (do not parse)*'],
+                ['*\nbegins with linebreak. (do not parse)*', '*\nbegins with linebreak. (do not parse)*'],
+                ['*Just some text. But it ends with newline (do not parse)\n*', '*Just some text. But it ends with newline (do not parse)\n*'],
+            ]);
+        });
+
+    });
+
+});