فهرست منبع

Markup parser: Ignore markup in URLs (#778)

In the tokenizer, process URLs as text tokens. Use a different, smaller
set of boundary characters.

This avoids processing markup in URLs like
`https://example.com/file_-_1.jpg` (see test cases).
Danilo Bargen 6 سال پیش
والد
کامیت
6d8ae7e058
2 فایلهای تغییر یافته به همراه 57 افزوده شده و 20 حذف شده
  1. 52 17
      src/markup_parser.ts
  2. 5 3
      tests/ts/markup_parser.ts

+ 52 - 17
src/markup_parser.ts

@@ -53,16 +53,35 @@ function isMarkupToken(tokenType: TokenType) {
  * Return whether the specified character is a boundary character.
  * When `character` is undefined, the function will return true.
  */
-function isBoundary(character?: string) {
+function isBoundary(character?: string): boolean {
     return character === undefined || /[\s.,!?¡¿‽⸮;:&(){}\[\]⟨⟩‹›«»'"‘’“”*~\-_…⋯᠁]/.test(character);
 }
 
+/**
+ * Return whether the specified character is a URL boundary character.
+ * When `character` is undefined, the function will return true.
+ *
+ * Characters that may be in a URL according to RFC 3986:
+ * ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~:/?#[]@!$&'()*+,;=%
+ */
+function isUrlBoundary(character?: string): boolean {
+    return character === undefined || !/[a-zA-Z0-9\-._~:/?#\[\]@!$&'()*+,;=%]/.test(character);
+}
+
+/**
+ * Return whether the specified string starts a URL.
+ */
+function isUrlStart(substring: string): boolean {
+    return substring.match(/^[a-zA-Z]+:\/\//) != null;
+}
+
 /**
  * This function accepts a string and returns a list of tokens.
  */
 export function tokenize(text: string): Token[] {
     const tokens = [];
     let textBuf = '';
+    let matchingUrl = false;
 
     const pushTextBufToken = () => {
         if (textBuf.length > 0) {
@@ -73,23 +92,39 @@ export function tokenize(text: string): Token[] {
 
     for (let i = 0; i < text.length; i++) {
         const currentChar = text[i];
-        const prevIsBoundary = isBoundary(text[i - 1]);
-        const nextIsBoundary = isBoundary(text[i + 1]);
-
-        if (currentChar === '*' && (prevIsBoundary || nextIsBoundary)) {
-            pushTextBufToken();
-            tokens.push({ kind: TokenType.Asterisk });
-        } else if (currentChar === '_' && (prevIsBoundary || nextIsBoundary)) {
-            pushTextBufToken();
-            tokens.push({ kind: TokenType.Underscore });
-        } else if (currentChar === '~' && (prevIsBoundary || nextIsBoundary)) {
-            pushTextBufToken();
-            tokens.push({ kind: TokenType.Tilde });
-        } else if (currentChar === '\n') {
-            pushTextBufToken();
-            tokens.push({ kind: TokenType.Newline });
-        } else {
+
+        // Detect URLs
+        if (!matchingUrl) {
+            matchingUrl = isUrlStart(text.substring(i));
+        }
+
+        // URLs have a limited set of boundary characters, therefore we need to
+        // treat them separately.
+        if (matchingUrl) {
             textBuf += currentChar;
+            const nextIsUrlBoundary = isUrlBoundary(text[i + 1]);
+            if (nextIsUrlBoundary) {
+                pushTextBufToken();
+                matchingUrl = false;
+            }
+        } else {
+            const prevIsBoundary = isBoundary(text[i - 1]);
+            const nextIsBoundary = isBoundary(text[i + 1]);
+            if (currentChar === '*' && (prevIsBoundary || nextIsBoundary)) {
+                pushTextBufToken();
+                tokens.push({ kind: TokenType.Asterisk });
+            } else if (currentChar === '_' && (prevIsBoundary || nextIsBoundary)) {
+                pushTextBufToken();
+                tokens.push({ kind: TokenType.Underscore });
+            } else if (currentChar === '~' && (prevIsBoundary || nextIsBoundary)) {
+                pushTextBufToken();
+                tokens.push({ kind: TokenType.Tilde });
+            } else if (currentChar === '\n') {
+                pushTextBufToken();
+                tokens.push({ kind: TokenType.Newline });
+            } else {
+                textBuf += currentChar;
+            }
         }
     }
 

+ 5 - 3
tests/ts/markup_parser.ts

@@ -56,15 +56,15 @@ describe('Markup Parser', () => {
             ]);
         });
 
-        it('ignore in URLs', function() {
-            const text = 'ignore if *in* a link: https://example.com/_pub_/horse.jpg';
+        it('ignore markup in URLs', function() {
+            const text = 'ignore if *in* a link: https://example.com/pic_-_a.jpg';
             const tokens = tokenize(text);
             expect(tokens).toEqual([
                 { kind: TokenType.Text, value: 'ignore if ' },
                 { kind: TokenType.Asterisk },
                 { kind: TokenType.Text, value: 'in' },
                 { kind: TokenType.Asterisk },
-                { kind: TokenType.Text, value: ' a link: https://example.com/_pub_/horse.jpg' },
+                { kind: TokenType.Text, value: ' a link: https://example.com/pic_-_a.jpg' },
             ]);
         });
 
@@ -246,6 +246,8 @@ describe('Markup Parser', () => {
                  'https://example.com?__twitter_impression=true'],
                 ['https://example.com?___twitter_impression=true',
                  'https://example.com?___twitter_impression=true'],
+                ['https://example.com/image_-_1.jpg',
+                 'https://example.com/image_-_1.jpg'],
             ]);
         });