Improve MFM parser (#3337)

* wip

* wip

* Refactor

* Refactor

* wip

* wip

* wip

* wip

* Refactor

* Refactor

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* Clean up

* Update misskey-flavored-markdown.ts

* wip

* wip

* wip

* wip

* Update parser.ts

* wip

* Add new test

* wip

* Add new test

* Add new test

* wip

* Refactor

* Update parse.ts

* Refactor

* Update parser.ts

* wip
This commit is contained in:
syuilo
2018-11-21 05:11:00 +09:00
committed by GitHub
parent 6e347e4221
commit 79ffbf95db
44 changed files with 1097 additions and 916 deletions

View File

@@ -1,127 +1,135 @@
const { lib: emojilib } = require('emojilib');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
import config from '../config';
import { INote } from '../models/note';
import { TextElement } from './parse';
import { Node } from './parser';
import { intersperse } from '../prelude/array';
/**
 * Renderers for the flat token list, keyed by token `type`.
 *
 * Each handler receives the JSDOM window, the token to render and the list of
 * remote users mentioned by the note. It builds the DOM representation of the
 * token and appends it directly to `document.body`; nothing is returned.
 */
const handlers: { [key: string]: (window: any, token: any, mentionedRemoteUsers: INote['mentionedRemoteUsers']) => void } = {
	/** `**bold**` -> <b> */
	bold({ document }, { bold }) {
		const b = document.createElement('b');
		b.textContent = bold;
		document.body.appendChild(b);
	},

	/** `***big***` -> <strong> */
	big({ document }, { big }) {
		const b = document.createElement('strong');
		b.textContent = big;
		document.body.appendChild(b);
	},

	/** Motion text -> <strong> */
	motion({ document }, { motion }) {
		// Motion tokens carry their text in `motion`; reading `big` here
		// always produced an element without any text.
		const b = document.createElement('strong');
		b.textContent = motion;
		document.body.appendChild(b);
	},

	/** Code block -> <pre><code> */
	code({ document }, { code }) {
		const pre = document.createElement('pre');
		const inner = document.createElement('code');
		// `code` is raw user input: assign it as text so that it is escaped
		// instead of being parsed as markup.
		inner.textContent = code;
		pre.appendChild(inner);
		document.body.appendChild(pre);
	},

	/** Emoji -> the emojilib character for the token's `emoji` key, else the original source text */
	emoji({ document }, { content, emoji }) {
		const found = emojilib[emoji];
		const node = document.createTextNode(found ? found.char : content);
		document.body.appendChild(node);
	},

	/** Hashtag -> link to the local tag page, marked with rel="tag" */
	hashtag({ document }, { hashtag }) {
		const a = document.createElement('a');
		a.href = `${config.url}/tags/${hashtag}`;
		a.textContent = `#${hashtag}`;
		a.setAttribute('rel', 'tag');
		document.body.appendChild(a);
	},

	/** Inline code -> <code> */
	'inline-code'({ document }, { code }) {
		const element = document.createElement('code');
		element.textContent = code;
		document.body.appendChild(element);
	},

	/** Math formula -> <code> containing the formula source */
	math({ document }, { formula }) {
		const element = document.createElement('code');
		element.textContent = formula;
		document.body.appendChild(element);
	},

	/** Titled link -> <a> */
	link({ document }, { url, title }) {
		const a = document.createElement('a');
		a.href = url;
		a.textContent = title;
		document.body.appendChild(a);
	},

	/** Mention -> link to the remote user's URI when known, else to the local profile path */
	mention({ document }, { content, username, host }, mentionedRemoteUsers) {
		const a = document.createElement('a');
		const remoteUserInfo = mentionedRemoteUsers.find(remoteUser => remoteUser.username === username && remoteUser.host === host);
		a.href = remoteUserInfo ? remoteUserInfo.uri : `${config.url}/${content}`;
		a.textContent = content;
		document.body.appendChild(a);
	},

	/** Quote -> <blockquote> */
	quote({ document }, { quote }) {
		const blockquote = document.createElement('blockquote');
		blockquote.textContent = quote;
		document.body.appendChild(blockquote);
	},

	/** Title -> <h1> */
	title({ document }, { content }) {
		const h1 = document.createElement('h1');
		h1.textContent = content;
		document.body.appendChild(h1);
	},

	/** Plain text -> text nodes, with a <br> between consecutive lines */
	text({ document }, { content }) {
		const nodes = (content as string).split('\n').map(x => document.createTextNode(x));
		for (const x of intersperse('br', nodes)) {
			if (x === 'br') {
				document.body.appendChild(document.createElement('br'));
			} else {
				document.body.appendChild(x);
			}
		}
	},

	/** Bare URL -> <a> whose text is the URL itself */
	url({ document }, { url }) {
		const a = document.createElement('a');
		a.href = url;
		a.textContent = url;
		document.body.appendChild(a);
	},

	/** Search -> link to a Google query, labelled with the original source text */
	search({ document }, { content, query }) {
		const a = document.createElement('a');
		a.href = `https://www.google.com/?#q=${query}`;
		a.textContent = content;
		document.body.appendChild(a);
	}
};
export default (tokens: TextElement[], mentionedRemoteUsers: INote['mentionedRemoteUsers'] = []) => {
export default (tokens: Node[], mentionedRemoteUsers: INote['mentionedRemoteUsers'] = []) => {
if (tokens == null) {
return null;
}
const { window } = new JSDOM('');
for (const token of tokens) {
handlers[token.type](window, token, mentionedRemoteUsers);
const doc = window.document;
/**
 * Renders every node with the handler registered under the node's name and
 * returns the results in the same order.
 */
function dive(nodes: Node[]): any[] {
	const rendered: any[] = [];
	for (const node of nodes) {
		const render = handlers[node.name];
		rendered.push(render(node));
	}
	return rendered;
}
return `<p>${window.document.body.innerHTML}</p>`;
/**
 * Renders the children of `token` and appends them, in order, to `parent`.
 * Returns `parent` so that handlers can return the call directly.
 */
function appendChildren(parent: any, token: Node): any {
	for (const child of dive(token.children)) {
		parent.appendChild(child);
	}
	return parent;
}

/**
 * Renderers for the node tree, keyed by node name. Each handler builds and
 * returns the DOM node for one MFM node; container nodes render their
 * children recursively through `dive`.
 */
const handlers: { [key: string]: (token: Node) => any } = {
	/** Bold -> <b> */
	bold(token) {
		return appendChildren(doc.createElement('b'), token);
	},

	/** Big -> <strong> */
	big(token) {
		return appendChildren(doc.createElement('strong'), token);
	},

	/** Motion -> <i> */
	motion(token) {
		return appendChildren(doc.createElement('i'), token);
	},

	/** Code block -> <pre><code> */
	blockCode(token) {
		const pre = doc.createElement('pre');
		const inner = doc.createElement('code');
		// The code is raw user input: assign it as text so that it is escaped
		// in the serialized HTML instead of being parsed as markup.
		inner.textContent = token.props.code;
		pre.appendChild(inner);
		return pre;
	},

	/** Emoji -> the emoji character when the node carries one, else the `:name:` shortcode */
	emoji(token) {
		return doc.createTextNode(token.props.emoji ? token.props.emoji : `:${token.props.name}:`);
	},

	/** Hashtag -> link to the local tag page, marked with rel="tag" */
	hashtag(token) {
		const a = doc.createElement('a');
		a.href = `${config.url}/tags/${token.props.hashtag}`;
		a.textContent = `#${token.props.hashtag}`;
		a.setAttribute('rel', 'tag');
		return a;
	},

	/** Inline code -> <code> */
	inlineCode(token) {
		const el = doc.createElement('code');
		el.textContent = token.props.code;
		return el;
	},

	/** Math formula -> <code> containing the formula source */
	math(token) {
		const el = doc.createElement('code');
		el.textContent = token.props.formula;
		return el;
	},

	/** Titled link -> <a> wrapping the rendered children */
	link(token) {
		const a = doc.createElement('a');
		a.href = token.props.url;
		return appendChildren(a, token);
	},

	/** Mention -> link to the remote user's URI when known, else to the local profile path */
	mention(token) {
		const a = doc.createElement('a');
		const { username, host, acct } = token.props;
		const remoteUserInfo = mentionedRemoteUsers.find(remoteUser => remoteUser.username === username && remoteUser.host === host);
		a.href = remoteUserInfo ? remoteUserInfo.uri : `${config.url}/${acct}`;
		a.textContent = acct;
		return a;
	},

	/** Quote -> <blockquote> */
	quote(token) {
		return appendChildren(doc.createElement('blockquote'), token);
	},

	/** Title -> <h1> */
	title(token) {
		return appendChildren(doc.createElement('h1'), token);
	},

	/** Plain text -> <span> holding text nodes, with a <br> between consecutive lines */
	text(token) {
		const el = doc.createElement('span');
		const nodes = (token.props.text as string).split('\n').map(x => doc.createTextNode(x));
		for (const x of intersperse('br', nodes)) {
			if (x === 'br') {
				el.appendChild(doc.createElement('br'));
			} else {
				el.appendChild(x);
			}
		}
		return el;
	},

	/** Bare URL -> <a> whose text is the URL itself */
	url(token) {
		const a = doc.createElement('a');
		a.href = token.props.url;
		a.textContent = token.props.url;
		return a;
	},

	/** Search -> link to a Google query, labelled with the original source text */
	search(token) {
		const a = doc.createElement('a');
		a.href = `https://www.google.com/?#q=${token.props.query}`;
		a.textContent = token.props.content;
		return a;
	}
};
dive(tokens).forEach(x => {
doc.body.appendChild(x);
});
return `<p>${doc.body.innerHTML}</p>`;
};

81
src/mfm/parse.ts Normal file
View File

@@ -0,0 +1,81 @@
import parser, { Node } from './parser';
import * as A from '../prelude/array';
import * as S from '../prelude/string';
/**
 * Parses MFM source text into a node tree and normalizes the result.
 *
 * After running the root parser the tree is post-processed:
 *  1. runs of text nodes are merged into a single text node (recursively),
 *  2. line breaks directly adjacent to block nodes are removed,
 *  3. text nodes that ended up empty are dropped.
 *
 * @param source MFM source text
 * @returns the normalized top-level nodes, or `null` when `source` is null,
 *          undefined or empty
 */
export default (source: string): Node[] => {
	if (source == null || source == '') {
		return null;
	}

	let nodes: Node[] = parser.root.tryParse(source);

	// Builds one text node holding the concatenated text of `es`.
	const combineText = (es: Node[]): Node =>
		({ name: 'text', props: { text: S.concat(es.map(e => e.props.text)) } });

	// Replaces each group of text nodes produced by `A.groupOn` with a single
	// combined text node (presumably the groups are runs of adjacent nodes
	// sharing a name; verify against prelude/array).
	const concatText = (nodes: Node[]): Node[] =>
		A.concat(A.groupOn(x => x.name, nodes).map(es =>
			es[0].name === 'text' ? [combineText(es)] : es
		));

	// Applies `concatText` to the children of every node, at every depth.
	const concatTextRecursive = (es: Node[]): void =>
		es.filter(x => x.children).forEach(x => {
			x.children = concatText(x.children);
			concatTextRecursive(x.children);
		});

	nodes = concatText(nodes);
	concatTextRecursive(nodes);

	// Returns the text node at the very END of `node` (the node itself when it
	// is text, otherwise found by descending through last children), or null.
	function getBeforeTextNode(node: Node): Node {
		if (node == null) return null;
		if (node.name == 'text') return node;
		if (node.children) return getBeforeTextNode(node.children[node.children.length - 1]);
		return null;
	}

	// Returns the text node at the very START of `node` (the node itself when
	// it is text, otherwise found by descending through first children), or null.
	function getAfterTextNode(node: Node): Node {
		if (node == null) return null;
		if (node.name == 'text') return node;
		// Must keep following FIRST children; recursing through
		// getBeforeTextNode here would jump to the last descendant instead.
		if (node.children) return getAfterTextNode(node.children[0]);
		return null;
	}

	function isBlockNode(node: Node): boolean {
		return ['blockCode', 'quote', 'title'].includes(node.name);
	}

	/**
	 * Removes the line break directly before and after each block node. A
	 * block node already acts as a line break, so keeping the surrounding
	 * ones would render an extra line break.
	 * @param nodes sibling nodes to process (children are processed recursively)
	 */
	const removeNeedlessLineBreaks = (nodes: Node[]) => {
		nodes.forEach((node, i) => {
			if (node.children) removeNeedlessLineBreaks(node.children);
			if (isBlockNode(node)) {
				// Out-of-range neighbours are undefined and yield null below.
				const before = getBeforeTextNode(nodes[i - 1]);
				const after = getAfterTextNode(nodes[i + 1]);
				if (before && before.props.text.endsWith('\n')) {
					before.props.text = before.props.text.substring(0, before.props.text.length - 1);
				}
				if (after && after.props.text.startsWith('\n')) {
					after.props.text = after.props.text.substring(1);
				}
			}
		});
	};

	// Drops text nodes whose text is empty, at every depth. Children are
	// replaced in place; the filtered sibling list is returned.
	const removeEmptyTextNodes = (nodes: Node[]) => {
		nodes.forEach(n => {
			if (n.children) {
				n.children = removeEmptyTextNodes(n.children);
			}
		});
		return nodes.filter(n => !(n.name == 'text' && n.props.text == ''));
	};

	removeNeedlessLineBreaks(nodes);
	nodes = removeEmptyTextNodes(nodes);

	return nodes;
};

View File

@@ -1,20 +0,0 @@
/**
* Big
*/
/** Token for text wrapped in triple asterisks. */
export type TextElementBig = {
	type: 'big';
	/** Full matched source, including the surrounding `***`. */
	content: string;
	/** Text between the asterisks. */
	big: string;
};

/**
 * Reads a big element from the start of `text`.
 * @returns the token, or `null` when `text` does not start with `***…***`
 */
export default function(text: string) {
	const found = text.match(/^\*\*\*(.+?)\*\*\*/);
	if (found == null) return null;

	const [whole, inner] = found;
	return {
		type: 'big',
		content: whole,
		big: inner
	} as TextElementBig;
}

View File

@@ -1,20 +0,0 @@
/**
* Bold
*/
/** Token for text wrapped in double asterisks. */
export type TextElementBold = {
	type: 'bold';
	/** Full matched source, including the surrounding `**`. */
	content: string;
	/** Text between the asterisks. */
	bold: string;
};

/**
 * Reads a bold element from the start of `text`.
 * @returns the token, or `null` when `text` does not start with `**…**`
 */
export default function(text: string) {
	const found = text.match(/^\*\*(.+?)\*\*/);
	if (found == null) return null;

	const [whole, inner] = found;
	return {
		type: 'bold',
		content: whole,
		bold: inner
	} as TextElementBold;
}

View File

@@ -1,24 +0,0 @@
/**
* Code (block)
*/
import genHtml from '../core/syntax-highlighter';
/** Token for a fenced code block. */
export type TextElementCode = {
	type: 'code';
	/** Full matched source, including both fences. */
	content: string;
	/** Raw text between the fences (untrimmed). */
	code: string;
	/** Output of the syntax highlighter for the trimmed code. */
	html: string;
};

/**
 * Reads a code block delimited by triple backticks from the start of `text`.
 * The body may span multiple lines.
 * @returns the token, or `null` when `text` does not start with a code block
 */
export default function(text: string) {
	const found = text.match(/^```([\s\S]+?)```/);
	if (found == null) return null;

	const [whole, body] = found;
	return {
		type: 'code',
		content: whole,
		code: body,
		html: genHtml(body.trim())
	} as TextElementCode;
}

File diff suppressed because one or more lines are too long

View File

@@ -1,33 +0,0 @@
/**
* Emoji
*/
import { emojiRegex } from "./emoji.regex";
/** Token for an emoji, written either as a `:name:` shortcode or as a character matched by `emojiRegex`. */
export type TextElementEmoji = {
	type: 'emoji';
	/** Full matched source. */
	content: string;
	/** The matched emoji text; set only for the `emojiRegex` form. */
	emoji?: string;
	/** The shortcode name without colons; set only for the `:name:` form. */
	name?: string;
};

/**
 * Reads an emoji from `text`. The shortcode form is tried first, then
 * `emojiRegex`.
 * @returns the token, or `null` when neither form matches
 */
export default function(text: string) {
	const shortcode = text.match(/^:([a-zA-Z0-9+_-]+):/);
	if (shortcode) {
		const [whole, name] = shortcode;
		return {
			type: 'emoji',
			content: whole,
			name: name
		} as TextElementEmoji;
	}

	const unicode = text.match(emojiRegex);
	if (!unicode) return null;

	const glyph = unicode[0];
	return {
		type: 'emoji',
		content: glyph,
		emoji: glyph
	} as TextElementEmoji;
}

View File

@@ -1,27 +0,0 @@
/**
* Hashtag
*/
/** Token for a hashtag. */
export type TextElementHashtag = {
	type: 'hashtag';
	/** Matched source starting at `#`. */
	content: string;
	/** Tag name without the leading `#`. */
	hashtag: string;
};

/**
 * Reads a hashtag from the start of `text`.
 *
 * A hashtag is accepted when it follows one whitespace character, or when it
 * is the first thing in the input (`before` is empty). In the whitespace case
 * that character is returned as a separate leading text token.
 *
 * @param text remaining source
 * @param before source consumed so far
 * @returns one or two tokens, or `null` when no hashtag starts here
 */
export default function(text: string, before: string) {
	const followsSpace = /^\s#[^\s\.,!\?#]+/.test(text);
	const opensInput = before == '' && /^#[^\s\.,!\?#]+/.test(text);
	if (!followsSpace && !opensInput) return null;

	const matched = text.match(/^\s?#[^\s\.,!\?#]+/)[0];
	// When a whitespace character precedes the '#', skip it in the tag fields.
	const offset = text.startsWith('#') ? 0 : 1;

	const out: any[] = [];
	if (offset === 1) {
		out.push({
			type: 'text',
			content: text[0]
		});
	}
	out.push({
		type: 'hashtag',
		content: matched.substr(offset),
		hashtag: matched.substr(offset + 1)
	});

	return out as TextElementHashtag[];
}

View File

@@ -1,25 +0,0 @@
/**
* Code (inline)
*/
import genHtml from '../core/syntax-highlighter';
/** Token for code wrapped in single backticks. */
export type TextElementInlineCode = {
	type: 'inline-code';
	/** Full matched source, including the backticks. */
	content: string;
	/** Text between the backticks. */
	code: string;
	/** Output of the syntax highlighter for the code. */
	html: string;
};

/**
 * Reads inline code from the start of `text`. A match whose body contains
 * an acute accent (´) is rejected.
 * @returns the token, or `null` when there is no acceptable match
 */
export default function(text: string) {
	const found = text.match(/^`(.+?)`/);
	if (found == null || found[1].includes('´')) return null;

	const [whole, body] = found;
	return {
		type: 'inline-code',
		content: whole,
		code: body,
		html: genHtml(body)
	} as TextElementInlineCode;
}

View File

@@ -1,27 +0,0 @@
/**
* Link
*/
/** Token for a titled link written as `[title](url)`, optionally prefixed with `?`. */
export type TextElementLink = {
	type: 'link';
	/** Full matched source. */
	content: string;
	/** Text between the square brackets. */
	title: string;
	/** http(s) URL between the parentheses. */
	url: string;
	/** True when the source text starts with `?`. */
	silent: boolean;
};

/**
 * Reads a titled link from the start of `text`.
 * @returns the token, or `null` when `text` does not start with a link
 */
export default function(text: string) {
	const found = text.match(/^\??\[([^\[\]]+?)\]\((https?:\/\/[\w\/:%#@\$&\?!\(\)\[\]~\.=\+\-]+?)\)/);
	if (!found) return null;

	const [whole, label, target] = found;
	return {
		type: 'link',
		content: whole,
		title: label,
		url: target,
		silent: text.startsWith('?')
	} as TextElementLink;
}

View File

@@ -1,20 +0,0 @@
/**
* Math
*/
/** Token for a formula written between `\(` and `\)`. */
export type TextElementMath = {
	type: 'math';
	/** Full matched source, including the delimiters. */
	content: string;
	/** Formula text between the delimiters. */
	formula: string;
};

/**
 * Reads a math formula from the start of `text`.
 * @returns the token, or `null` when `text` does not start with `\(…\)`
 */
export default function(text: string) {
	const found = text.match(/^\\\((.+?)\\\)/);
	if (found == null) return null;

	const [whole, formula] = found;
	return {
		type: 'math',
		content: whole,
		formula: formula
	} as TextElementMath;
}

View File

@@ -1,29 +0,0 @@
/**
* Mention
*/
import parseAcct from '../../../misc/acct/parse';
import { toUnicode } from 'punycode';
/** Token for a user mention (`@user` or `@user@host`). */
export type TextElementMention = {
	type: 'mention';
	/** Matched source exactly as written. */
	content: string;
	/** `@user@host` with the host passed through `toUnicode`; equals `content` when no host was parsed. */
	canonical: string;
	username: string;
	host: string;
};

/**
 * Reads a mention from the start of `text`. A mention is rejected when the
 * preceding text ends with an ASCII letter or digit.
 *
 * @param text remaining source
 * @param before source consumed so far
 * @returns the token, or `null` when no mention starts here
 */
export default function(text: string, before: string) {
	const found = text.match(/^@[a-z0-9_]+(?:@[a-z0-9\.\-]+[a-z0-9])?/i);
	if (!found || /[a-zA-Z0-9]$/.test(before)) return null;

	const [mention] = found;
	const { username, host } = parseAcct(mention.substr(1));

	let canonical = mention;
	if (host != null) {
		canonical = `@${username}@${toUnicode(host)}`;
	}

	return {
		type: 'mention',
		content: mention,
		canonical,
		username,
		host
	} as TextElementMention;
}

View File

@@ -1,20 +0,0 @@
/**
* Motion
*/
/** Token for motion text, written as `(((…)))` or `<motion>…</motion>`. */
export type TextElementMotion = {
	type: 'motion';
	/** Full matched source, including the delimiters. */
	content: string;
	/** Text between the delimiters. */
	motion: string;
};

/**
 * Reads a motion element from the start of `text`. The parenthesis form is
 * tried first, then the tag form.
 * @returns the token, or `null` when neither form matches
 */
export default function(text: string) {
	let found = text.match(/^\(\(\((.+?)\)\)\)/);
	if (!found) {
		found = text.match(/^<motion>(.+?)<\/motion>/);
	}
	if (!found) return null;

	const [whole, inner] = found;
	return {
		type: 'motion',
		content: whole,
		motion: inner
	} as TextElementMotion;
}

View File

@@ -1,30 +0,0 @@
/**
* Quoted text
*/
/** Token for quoted text. */
export type TextElementQuote = {
	type: 'quote';
	/** Full matched source. */
	content: string;
	/** Quoted text with leading `>` markers and surrounding whitespace removed from each line. */
	quote: string;
};

/**
 * Reads a quote from the start of `text`. Three forms are tried in order:
 *  1. text opened by `"` and closed by a `"` at the start of a later line,
 *  2. a newline followed by `>` text, ending at a blank line or end of input,
 *  3. the same `>` form without the leading newline, only when nothing has
 *     been consumed yet (`before` is empty).
 *
 * @param text remaining source
 * @param before source consumed so far
 * @returns the token, or `null` when no form matches
 */
export default function(text: string, before: string) {
	let found = text.match(/^"([\s\S]+?)\n"/);
	if (!found) {
		found = text.match(/^\n>([\s\S]+?)(\n\n|$)/);
	}
	if (!found && before == '') {
		found = text.match(/^>([\s\S]+?)(\n\n|$)/);
	}
	if (!found) return null;

	const cleanedLines = found[1]
		.split('\n')
		.map(line => line.replace(/^>+/g, '').trim());

	return {
		type: 'quote',
		content: found[0],
		quote: cleanedLines.join('\n').trim(),
	} as TextElementQuote;
}

View File

@@ -1,19 +0,0 @@
/**
* Search
*/
/** Token for a search request: a query followed by a search keyword. */
export type TextElementSearch = {
	type: 'search';
	/** Full matched source, including the keyword and any trailing newline. */
	content: string;
	/** Text before the separator and the keyword. */
	query: string;
};

/**
 * Reads a search request from the start of `text`: a query, a separator and
 * one of the search keywords (case-insensitive), ending at a newline or the
 * end of input.
 * @returns the token, or `null` when `text` is not a search request
 */
export default function(text: string) {
	const found = text.match(/^(.+?)( | )(検索|\[検索\]|Search|\[Search\])(\n|$)/i);
	if (found == null) return null;

	const [whole, query] = found;
	return {
		type: 'search',
		content: whole,
		query: query
	};
}

View File

@@ -1,21 +0,0 @@
/**
* Title
*/
/** Token for a title line written in `【…】` or `[…]` brackets. */
export type TextElementTitle = {
	type: 'title';
	/** Full matched source, including brackets and newlines. */
	content: string;
	/** Text between the brackets. */
	title: string;
};

/**
 * Reads a title from the start of `text`. The bracketed text must be followed
 * by a newline; unless nothing has been consumed yet (`before` is empty) it
 * must also be preceded by one.
 *
 * @param text remaining source
 * @param before source consumed so far
 * @returns the token, or `null` when no title starts here
 */
export default function(text: string, before: string) {
	const pattern = before == ''
		? /^(【|\[)(.+?)(】|])\n/
		: /^\n(【|\[)(.+?)(】|])\n/;

	const found = text.match(pattern);
	if (found == null) return null;

	return {
		type: 'title',
		content: found[0],
		title: found[2]
	} as TextElementTitle;
}

View File

@@ -1,23 +0,0 @@
/**
* URL
*/
/** Token for a bare http(s) URL. */
export type TextElementUrl = {
	type: 'url';
	/** The URL as matched, after trailing punctuation was trimmed. */
	content: string;
	/** Same value as `content`. */
	url: string;
};

/**
 * Reads a bare URL from the start of `text`.
 *
 * Trailing punctuation is trimmed, one character per step and in this order:
 * a final `.`, then a final `,`, then a final `)` when the text consumed so
 * far ends with `(`.
 *
 * @param text remaining source
 * @param before source consumed so far
 * @returns the token, or `null` when `text` does not start with a URL
 */
export default function(text: string, before: string) {
	const found = text.match(/^https?:\/\/[\w\/:%#@\$&\?!\(\)\[\]~\.,=\+\-]+/);
	if (found == null) return null;

	// Each step removes exactly the last character.
	const dropLast = (s: string) => s.slice(0, -1);

	let link = found[0];
	if (link.endsWith('.')) link = dropLast(link);
	if (link.endsWith(',')) link = dropLast(link);
	if (link.endsWith(')') && before.endsWith('(')) link = dropLast(link);

	return {
		type: 'url',
		content: link,
		url: link
	} as TextElementUrl;
}

View File

@@ -1,100 +0,0 @@
/**
* Misskey Text Analyzer
*/
import { TextElementBold } from './elements/bold';
import { TextElementBig } from './elements/big';
import { TextElementCode } from './elements/code';
import { TextElementEmoji } from './elements/emoji';
import { TextElementHashtag } from './elements/hashtag';
import { TextElementInlineCode } from './elements/inline-code';
import { TextElementMath } from './elements/math';
import { TextElementLink } from './elements/link';
import { TextElementMention } from './elements/mention';
import { TextElementQuote } from './elements/quote';
import { TextElementSearch } from './elements/search';
import { TextElementTitle } from './elements/title';
import { TextElementUrl } from './elements/url';
import { TextElementMotion } from './elements/motion';
import { groupOn } from '../../prelude/array';
import * as A from '../../prelude/array';
import * as S from '../../prelude/string';
// Element processors, taken from the default export of each element module.
// The parse loop in this file tries them in array order and uses the first one
// that matches, so the order is significant (e.g. big is listed before bold).
const elements = [
require('./elements/big'),
require('./elements/bold'),
require('./elements/title'),
require('./elements/url'),
require('./elements/link'),
require('./elements/mention'),
require('./elements/hashtag'),
require('./elements/code'),
require('./elements/inline-code'),
require('./elements/math'),
require('./elements/quote'),
require('./elements/emoji'),
require('./elements/search'),
require('./elements/motion')
].map(element => element.default as TextElementProcessor);
/**
 * Any token the analyzer can produce: plain text or one of the element tokens.
 * Every variant has a `type` tag and a `content` string; the parse loop
 * advances through the source by `content.length`.
 */
export type TextElement = { type: 'text', content: string }
| TextElementBold
| TextElementBig
| TextElementCode
| TextElementEmoji
| TextElementHashtag
| TextElementInlineCode
| TextElementMath
| TextElementLink
| TextElementMention
| TextElementQuote
| TextElementSearch
| TextElementTitle
| TextElementUrl
| TextElementMotion;
/**
 * An element parser. `text` is the remaining source and `before` is the source
 * consumed so far. Returns the token(s) found at the start of `text`; a falsy
 * result is treated as "no match" by the parse loop.
 */
export type TextElementProcessor = (text: string, before: string) => TextElement | TextElement[];
/**
 * Splits MFM source text into a flat list of tokens.
 *
 * At each position every element processor is tried in order and the first
 * match is consumed; when none matches, a single character is consumed as
 * text. Runs of text tokens are merged before returning.
 *
 * @param source MFM source text
 * @returns the tokens, or `null` when `source` is null, undefined or empty
 */
export default (source: string): TextElement[] => {
	if (source == null || source == '') {
		return null;
	}

	const tokens: TextElement[] = [];

	// Source consumed so far. Kept as a running string so that it is not
	// rebuilt from `tokens` for every processor at every position.
	let before = '';

	function push(token: TextElement) {
		if (token != null) {
			tokens.push(token);
			before += token.content;
			source = source.substr(token.content.length);
		}
	}

	// Parse
	while (source != '') {
		const parsed = elements.some(el => {
			let _tokens = el(source, before);
			if (_tokens) {
				if (!Array.isArray(_tokens)) {
					_tokens = [_tokens];
				}
				_tokens.forEach(push);
				return true;
			} else {
				return false;
			}
		});

		// No processor matched: consume one character as plain text.
		if (!parsed) {
			push({
				type: 'text',
				content: source[0]
			});
		}
	}

	// Builds one text token holding the concatenated content of `es`.
	const combineText = (es: TextElement[]): TextElement =>
		({ type: 'text', content: S.concat(es.map(e => e.content)) });

	// Replaces each group of text tokens produced by `groupOn` with a single
	// combined token (presumably the groups are runs of adjacent tokens
	// sharing a type; verify against prelude/array).
	return A.concat(groupOn(x => x.type, tokens).map(es =>
		es[0].type === 'text' ? [combineText(es)] : es
	));
};

256
src/mfm/parser.ts Normal file

File diff suppressed because one or more lines are too long

View File

@@ -1,4 +1,4 @@
import { capitalize, toUpperCase } from "../../../prelude/string";
import { capitalize, toUpperCase } from "../prelude/string";
function escape(text: string) {
return text
@@ -308,7 +308,7 @@ const elements: Element[] = [
];
// specify lang is todo
export default (source: string, lang?: string) => {
export default (source: string, lang?: string): string => {
let code = source;
let html = '';