Use mfm-js for MFM parsing (#7415)

* wip

* Update mfm.ts

* wip

* update mfmjs

* refactor

* nanka

* Update mfm.ts

* Update to-html.ts

* Update to-html.ts

* wip

* fix test

* fix test
This commit is contained in:
syuilo
2021-04-02 10:36:11 +09:00
committed by GitHub
parent b378066ebf
commit 1f4ae2f63a
31 changed files with 262 additions and 1771 deletions

View File

@@ -1,7 +1,9 @@
import * as parse5 from 'parse5';
import treeAdapter = require('parse5/lib/tree-adapters/default');
import { URL } from 'url';
import { urlRegex, urlRegexFull } from './prelude';
// Matches an http(s) URL anchored at the start of the input; text may follow the match.
const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/;
// Same character set, but anchored at both ends: the whole input must be the URL.
const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/;
export function fromHtml(html: string, hashtagNames?: string[]): string {
const dom = parse5.parseFragment(html);

View File

@@ -1,191 +0,0 @@
import * as P from 'parsimmon';
import { createLeaf, createTree, urlRegex } from './prelude';
import { takeWhile, cumulativeSum } from '../prelude/array';
import parseAcct from '@/misc/acct/parse';
import { toUnicode } from 'punycode';
import { emojiRegex } from '@/misc/emoji-regex';
/**
 * Truncates a string at the point where its brackets stop being balanced.
 *
 * Opening brackets `(`, `「`, `[` count +1 and closing brackets `)`, `」`, `]`
 * count -1; the bracket kinds are not matched against each other, only the
 * running total matters.
 *
 * @param s - Text to trim.
 * @returns Everything before the first closing bracket that has no opener, or,
 *   when there is none, everything up to and including the last position at
 *   which the running total is zero (the empty string if it never is).
 */
export function removeOrphanedBrackets(s: string): string {
	const openBrackets = ['(', '「', '['];
	const closeBrackets = [')', '」', ']'];

	// Per-character contribution to the nesting depth.
	const delta = (c: string): number => {
		if (openBrackets.includes(c)) return 1;
		return closeBrackets.includes(c) ? -1 : 0;
	};

	// depths[k] is the nesting depth after consuming character k.
	const depths = cumulativeSum(s.split('').map(c => delta(c)));

	const orphanAt = depths.findIndex(depth => depth < 0);
	if (orphanAt !== -1) {
		return s.slice(0, orphanAt);
	}

	// No orphaned closer: cut after the last fully balanced position.
	const balancedAt = depths.lastIndexOf(0);
	return s.slice(0, balancedAt + 1);
}
/**
 * Parsimmon grammar for MFM text, built with `P.createLanguage`.
 *
 * `root` accepts one or more block or inline nodes, `plain` accepts only emoji
 * and text. Node-producing rules build their results with `createLeaf` /
 * `createTree`; `text` consumes a single character and is listed last in
 * `inline` as the catch-all.
 */
export const mfmLanguage = P.createLanguage({
// Full syntax: one or more block or inline nodes.
root: r => P.alt(r.block, r.inline).atLeast(1),
// Restricted syntax: only emoji and plain text.
plain: r => P.alt(r.emoji, r.text).atLeast(1),
// Block-level rules; each of them is prefixed with `startOfLine`.
block: r => P.alt(
r.quote,
r.search,
r.blockCode,
r.mathBlock,
r.center,
),
// Succeeds without consuming input when at offset 0, on a '\n', or directly after a '\n'.
startOfLine: () => P((input, i) => {
if (i === 0 || input[i] === '\n' || input[i - 1] === '\n') {
return P.makeSuccess(i, null);
} else {
return P.makeFailure(i, 'not newline');
}
}),
// Quote: the leading run of lines that start with '>'.
quote: r => r.startOfLine.then(P((input, i) => {
const text = input.substr(i);
// Requires '>' followed by at least one character.
if (!text.match(/^>[\s\S]+?/)) return P.makeFailure(i, 'not a quote');
const quote = takeWhile(line => line.startsWith('>'), text.split('\n'));
// Strip the leading '>' and then one leading space from every quoted line.
const qInner = quote.join('\n').replace(/^>/gm, '').replace(/^ /gm, '');
if (qInner === '') return P.makeFailure(i, 'not a quote');
// The quoted body is parsed recursively with the full `root` rule.
const contents = r.root.tryParse(qInner);
// NOTE(review): the extra +1 presumably skips the newline that ends the quote - confirm.
return P.makeSuccess(i + quote.join('\n').length + 1, createTree('quote', contents, {}));
})),
// Search: "<query>", a separating space, then one of 検索 / [検索] / Search / [Search]
// (case-insensitive) at the end of the line.
search: r => r.startOfLine.then(P((input, i) => {
const text = input.substr(i);
const match = text.match(/^(.+?)( |　)(検索|\[検索\]|Search|\[Search\])(\n|$)/i);
if (!match) return P.makeFailure(i, 'not a search');
return P.makeSuccess(i + match[0].length, createLeaf('search', { query: match[1], content: match[0].trim() }));
})),
// Fenced code block: ``` with an optional language on the opening line, closed by ``` on its own line.
blockCode: r => r.startOfLine.then(P((input, i) => {
const text = input.substr(i);
const match = text.match(/^```(.+?)?\n([\s\S]+?)\n```(\n|$)/i);
if (!match) return P.makeFailure(i, 'not a blockCode');
// `lang` is the trimmed text after the opening fence, or null when absent.
return P.makeSuccess(i + match[0].length, createLeaf('blockCode', { code: match[2], lang: match[1] ? match[1].trim() : null }));
})),
// Inline-level rules; `text` comes last so that any single character is accepted as a fallback.
inline: r => P.alt(
r.big,
r.bold,
r.small,
r.italic,
r.strike,
r.inlineCode,
r.mathInline,
r.mention,
r.hashtag,
r.url,
r.link,
r.emoji,
r.fn,
r.text
),
// TODO: remove this eventually
// `***text***` is turned into an `fn` node named 'tada' with no arguments.
big: r => P.regexp(/^\*\*\*([\s\S]+?)\*\*\*/, 1).map(x => createTree('fn', r.inline.atLeast(1).tryParse(x), {
name: 'tada',
args: {}
})),
// Bold: `**text**`, or `__text__` where the text is limited to ASCII letters, digits and whitespace.
bold: r => {
const asterisk = P.regexp(/\*\*([\s\S]+?)\*\*/, 1);
const underscore = P.regexp(/__([a-zA-Z0-9\s]+?)__/, 1);
return P.alt(asterisk, underscore).map(x => createTree('bold', r.inline.atLeast(1).tryParse(x), {}));
},
// Small: `<small>text</small>`.
small: r => P.regexp(/<small>([\s\S]+?)<\/small>/, 1).map(x => createTree('small', r.inline.atLeast(1).tryParse(x), {})),
// Italic: `<i>text</i>`, or `*text*` / `_text_` whose content starts with an ASCII letter or digit.
italic: r => {
const xml = P.regexp(/<i>([\s\S]+?)<\/i>/, 1);
const underscore = P((input, i) => {
const text = input.substr(i);
const match = text.match(/^(\*|_)([a-zA-Z0-9]+?[\s\S]*?)\1/);
if (!match) return P.makeFailure(i, 'not a italic');
// The delimiter form is only accepted at the start of input or after a space / newline.
if (input[i - 1] != null && input[i - 1] != ' ' && input[i - 1] != '\n') return P.makeFailure(i, 'not a italic');
return P.makeSuccess(i + match[0].length, match[2]);
});
return P.alt(xml, underscore).map(x => createTree('italic', r.inline.atLeast(1).tryParse(x), {}));
},
// Strike: `~~text~~`; the text may not contain '~' or a newline.
strike: r => P.regexp(/~~([^\n~]+?)~~/, 1).map(x => createTree('strike', r.inline.atLeast(1).tryParse(x), {})),
// Center: `<center>text</center>` at the start of a line.
center: r => r.startOfLine.then(P.regexp(/<center>([\s\S]+?)<\/center>/, 1).map(x => createTree('center', r.inline.atLeast(1).tryParse(x), {}))),
// Inline code between backticks; the content may not contain '´' (U+00B4) or a newline.
inlineCode: () => P.regexp(/`([^´\n]+?)`/, 1).map(x => createLeaf('inlineCode', { code: x })),
// Math block: `\[ formula \]` at the start of a line; the formula is trimmed.
mathBlock: r => r.startOfLine.then(P.regexp(/\\\[([\s\S]+?)\\\]/, 1).map(x => createLeaf('mathBlock', { formula: x.trim() }))),
// Inline math: `\( formula \)` on a single line.
mathInline: () => P.regexp(/\\\((.+?)\\\)/, 1).map(x => createLeaf('mathInline', { formula: x })),
// Mention: `@user` or `@user@host`.
mention: () => {
return P((input, i) => {
const text = input.substr(i);
const match = text.match(/^@\w([\w-]*\w)?(?:@[\w.\-]+\w)?/);
if (!match) return P.makeFailure(i, 'not a mention');
// Rejected when the preceding character is an ASCII letter or digit.
if (input[i - 1] != null && input[i - 1].match(/[a-z0-9]/i)) return P.makeFailure(i, 'not a mention');
return P.makeSuccess(i + match[0].length, match[0]);
}).map(x => {
const { username, host } = parseAcct(x.substr(1));
// With a host, the canonical form uses the host converted by `toUnicode`; otherwise it is the matched text.
const canonical = host != null ? `@${username}@${toUnicode(host)}` : x;
return createLeaf('mention', { canonical, username, host, acct: x });
});
},
// Hashtag: '#' followed by characters other than whitespace and the listed punctuation.
hashtag: () => P((input, i) => {
const text = input.substr(i);
const match = text.match(/^#([^\s.,!?'"#:\/\[\]【】]+)/i);
if (!match) return P.makeFailure(i, 'not a hashtag');
let hashtag = match[1];
hashtag = removeOrphanedBrackets(hashtag);
// Rejected when the name starts with U+20E3 or U+FE0F.
if (hashtag.match(/^(\u20e3|\ufe0f)/)) return P.makeFailure(i, 'not a hashtag');
// Rejected when the name consists of digits only.
if (hashtag.match(/^[0-9]+$/)) return P.makeFailure(i, 'not a hashtag');
// Rejected when the preceding character is an ASCII letter or digit.
if (input[i - 1] != null && input[i - 1].match(/[a-z0-9]/i)) return P.makeFailure(i, 'not a hashtag');
// Rejected when the name is longer than 128 elements as counted by Array.from.
if (Array.from(hashtag || '').length > 128) return P.makeFailure(i, 'not a hashtag');
return P.makeSuccess(i + ('#' + hashtag).length, createLeaf('hashtag', { hashtag: hashtag }));
}),
// URL: a bare http(s) URL matched by `urlRegex`, or one wrapped as `<https://...>`.
url: () => {
return P((input, i) => {
const text = input.substr(i);
const match = text.match(urlRegex);
let url: string;
if (!match) {
const match = text.match(/^<(https?:\/\/.*?)>/);
if (!match) {
return P.makeFailure(i, 'not a url');
}
url = match[1];
// NOTE(review): presumably accounts for the two angle brackets around the URL - confirm.
i += 2;
} else {
url = match[0];
}
url = removeOrphanedBrackets(url);
// Trailing '.' and ',' characters are not part of the URL.
url = url.replace(/[.,]*$/, '');
return P.makeSuccess(i + url.length, url);
}).map(x => createLeaf('url', { url: x }));
},
// Link: `[text](url)`; an optional leading '?' sets `silent` to true.
link: r => {
return P.seqObj(
['silent', P.string('?').fallback(null).map(x => x != null)] as any,
P.string('['), ['text', P.regexp(/[^\n\[\]]+/)] as any, P.string(']'),
P.string('('), ['url', r.url] as any, P.string(')'),
).map((x: any) => {
return createTree('link', r.inline.atLeast(1).tryParse(x.text), {
silent: x.silent,
url: x.url.node.props.url
});
});
},
// Emoji: `:name:` (stored as `name`) or a character sequence matched by `emojiRegex` (stored as `emoji`).
emoji: () => {
const name = P.regexp(/:([a-z0-9_+-]+):/i, 1).map(x => createLeaf('emoji', { name: x }));
const code = P.regexp(emojiRegex).map(x => createLeaf('emoji', { emoji: x }));
return P.alt(name, code);
},
// Function: `[name text]` or `[name.arg1,arg2=value text]`.
fn: r => {
return P.seqObj(
P.string('['), ['fn', P.regexp(/[^\s\n\[\]]+/)] as any, P.string(' '), P.optWhitespace, ['text', P.regexp(/[^\n\[\]]+/)] as any, P.string(']'),
).map((x: any) => {
let name = x.fn;
const args = {};
// Everything after the first '.' is a comma-separated argument list.
const separator = x.fn.indexOf('.');
if (separator > -1) {
name = x.fn.substr(0, separator);
for (const arg of x.fn.substr(separator + 1).split(',')) {
const kv = arg.split('=');
// A bare key becomes `true`; `key=value` stores the value as a string.
if (kv.length === 1) {
args[kv[0]] = true;
} else {
args[kv[0]] = kv[1];
}
}
}
return createTree('fn', r.inline.atLeast(1).tryParse(x.text), {
name,
args
});
});
},
// Fallback: any single character becomes a text leaf.
text: () => P.any.map(x => createLeaf('text', { text: x }))
});

View File

@@ -1,31 +0,0 @@
import * as A from '../prelude/array';
import * as S from '../prelude/string';
import { MfmForest, MfmTree } from './prelude';
import { createTree, createLeaf } from '../prelude/tree';
/** Tells whether the tree is a `text` node whose text is the empty string. */
function isEmptyTextTree(t: MfmTree): boolean {
	const { node } = t;
	if (node.type !== 'text') {
		return false;
	}
	return node.props.text === '';
}
/** Builds a single text leaf whose text is the concatenation of the given trees' texts. */
function concatTextTrees(ts: MfmForest): MfmTree {
	const pieces = ts.map(tree => tree.node.props.text);
	const text = S.concat(pieces);
	return createLeaf({ type: 'text', props: { text } });
}
/**
 * Collapses a group of trees into one text leaf when the group's first tree is
 * a text node; any other group is returned as is.
 */
function concatIfTextTrees(ts: MfmForest): MfmForest {
	const first = ts[0];
	if (first.node.type === 'text') {
		return [concatTextTrees(ts)];
	}
	return ts;
}
/**
 * Groups the forest by node type with `A.groupOn`, merges each group that
 * starts with a text node into a single text leaf, and applies the same
 * treatment to the children of every resulting tree.
 */
function concatConsecutiveTextTrees(ts: MfmForest): MfmForest {
	const groups = A.groupOn(t => t.node.type, ts);
	const merged = A.concat(groups.map(concatIfTextTrees));
	return merged.map(tree => {
		const node = tree.node;
		const children = concatConsecutiveTextTrees(tree.children);
		return createTree(node, children);
	});
}
/**
 * Returns a copy of the forest without empty text nodes, at every depth.
 * Each kept tree is rebuilt with `createTree` from its node and its cleaned children.
 */
function removeEmptyTextNodes(ts: MfmForest): MfmForest {
	const kept: MfmForest = [];
	for (const tree of ts) {
		if (isEmptyTextTree(tree)) continue;
		kept.push(createTree(tree.node, removeEmptyTextNodes(tree.children)));
	}
	return kept;
}
/**
 * Normalizes a parsed forest: text trees are merged by
 * `concatConsecutiveTextTrees` first, then empty text nodes are removed.
 */
export function normalize(ts: MfmForest): MfmForest {
	const merged = concatConsecutiveTextTrees(ts);
	return removeEmptyTextNodes(merged);
}

View File

@@ -1,19 +0,0 @@
import { mfmLanguage } from './language';
import { MfmForest } from './prelude';
import { normalize } from './normalize';
/**
 * Parses MFM source with the grammar's `root` rule and normalizes the result.
 *
 * @param source - Text to parse.
 * @returns The normalized forest, or `null` when `source` is null, undefined or empty.
 */
export function parse(source: string | null): MfmForest | null {
	if (source != null && source !== '') {
		const forest = mfmLanguage.root.tryParse(source);
		return normalize(forest);
	}
	return null;
}
/**
 * Parses MFM source with the grammar's `plain` rule and normalizes the result.
 *
 * @param source - Text to parse.
 * @returns The normalized forest, or `null` when `source` is null, undefined or empty.
 */
export function parsePlain(source: string | null): MfmForest | null {
	if (source != null && source !== '') {
		const forest = mfmLanguage.plain.tryParse(source);
		return normalize(forest);
	}
	return null;
}

View File

@@ -1,40 +0,0 @@
import { Tree } from '../prelude/tree';
import * as T from '../prelude/tree';
/** Generic node shape: a `type` tag plus a `props` payload. */
type Node<T, P> = { type: T, props: P };
/**
 * Node for a mention.
 * NOTE(review): the parser's mention rule checks `host != null`, so `host`
 * may be null at runtime despite being typed `string` - confirm.
 */
export type MentionNode = Node<'mention', {
canonical: string,
username: string,
host: string,
acct: string
}>;
/** Node for a hashtag; `hashtag` is the tag text without the leading '#'. */
export type HashtagNode = Node<'hashtag', {
hashtag: string
}>;
/**
 * Node for an emoji.
 * NOTE(review): the parser also creates `emoji` leaves carrying an `emoji`
 * prop instead of `name`, which this type does not describe.
 */
export type EmojiNode = Node<'emoji', {
name: string
}>;
/** Any MFM node; the last member accepts every other `type` with untyped props. */
export type MfmNode =
MentionNode |
HashtagNode |
EmojiNode |
Node<string, any>;
/** A tree whose nodes are MFM nodes. */
export type MfmTree = Tree<MfmNode>;
/** A list of MFM trees. */
export type MfmForest = MfmTree[];
/**
 * Creates a childless MFM tree via `T.createLeaf`.
 *
 * @param type - Node type tag.
 * @param props - Node payload.
 */
export function createLeaf(type: string, props: any): MfmTree {
	const node = { type, props };
	return T.createLeaf(node);
}
/**
 * Creates an MFM tree with the given children via `T.createTree`.
 *
 * @param type - Node type tag.
 * @param children - Child trees.
 * @param props - Node payload.
 */
export function createTree(type: string, children: MfmForest, props: any): MfmTree {
	const node = { type, props };
	return T.createTree(node, children);
}
/** Matches an http(s) URL anchored at the start of the input; text may follow the match. */
export const urlRegex = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+/;
/** Same character set, but anchored at both ends: the whole input must be the URL. */
export const urlRegexFull = /^https?:\/\/[\w\/:%#@$&?!()\[\]~.,=+\-]+$/;

View File

@@ -1,12 +1,12 @@
import { JSDOM } from 'jsdom';
import * as mfm from 'mfm-js';
import config from '@/config';
import { intersperse } from '../prelude/array';
import { MfmForest, MfmTree } from './prelude';
import { IMentionedRemoteUsers } from '../models/entities/note';
import { wellKnownServices } from '../well-known-services';
export function toHtml(tokens: MfmForest | null, mentionedRemoteUsers: IMentionedRemoteUsers = []) {
if (tokens == null) {
export function toHtml(nodes: mfm.MfmNode[] | null, mentionedRemoteUsers: IMentionedRemoteUsers = []) {
if (nodes == null) {
return null;
}
@@ -14,95 +14,101 @@ export function toHtml(tokens: MfmForest | null, mentionedRemoteUsers: IMentione
const doc = window.document;
function appendChildren(children: MfmForest, targetElement: any): void {
for (const child of children.map(t => handlers[t.node.type](t))) targetElement.appendChild(child);
function appendChildren(children: mfm.MfmNode[], targetElement: any): void {
if (children) {
for (const child of children.map(x => (handlers as any)[x.type](x))) targetElement.appendChild(child);
}
}
const handlers: { [key: string]: (token: MfmTree) => any } = {
bold(token) {
const handlers: { [K in mfm.MfmNode['type']]: (node: mfm.NodeType<K>) => any } = {
bold(node) {
const el = doc.createElement('b');
appendChildren(token.children, el);
appendChildren(node.children, el);
return el;
},
small(token) {
small(node) {
const el = doc.createElement('small');
appendChildren(token.children, el);
appendChildren(node.children, el);
return el;
},
strike(token) {
strike(node) {
const el = doc.createElement('del');
appendChildren(token.children, el);
appendChildren(node.children, el);
return el;
},
italic(token) {
italic(node) {
const el = doc.createElement('i');
appendChildren(token.children, el);
appendChildren(node.children, el);
return el;
},
fn(token) {
fn(node) {
const el = doc.createElement('i');
appendChildren(token.children, el);
appendChildren(node.children, el);
return el;
},
blockCode(token) {
blockCode(node) {
const pre = doc.createElement('pre');
const inner = doc.createElement('code');
inner.textContent = token.node.props.code;
inner.textContent = node.props.code;
pre.appendChild(inner);
return pre;
},
center(token) {
center(node) {
const el = doc.createElement('div');
appendChildren(token.children, el);
appendChildren(node.children, el);
return el;
},
emoji(token) {
return doc.createTextNode(token.node.props.emoji ? token.node.props.emoji : `\u200B:${token.node.props.name}:\u200B`);
emojiCode(node) {
return doc.createTextNode(`\u200B:${node.props.name}:\u200B`);
},
hashtag(token) {
unicodeEmoji(node) {
return doc.createTextNode(node.props.emoji);
},
hashtag(node) {
const a = doc.createElement('a');
a.href = `${config.url}/tags/${token.node.props.hashtag}`;
a.textContent = `#${token.node.props.hashtag}`;
a.href = `${config.url}/tags/${node.props.hashtag}`;
a.textContent = `#${node.props.hashtag}`;
a.setAttribute('rel', 'tag');
return a;
},
inlineCode(token) {
inlineCode(node) {
const el = doc.createElement('code');
el.textContent = token.node.props.code;
el.textContent = node.props.code;
return el;
},
mathInline(token) {
mathInline(node) {
const el = doc.createElement('code');
el.textContent = token.node.props.formula;
el.textContent = node.props.formula;
return el;
},
mathBlock(token) {
mathBlock(node) {
const el = doc.createElement('code');
el.textContent = token.node.props.formula;
el.textContent = node.props.formula;
return el;
},
link(token) {
link(node) {
const a = doc.createElement('a');
a.href = token.node.props.url;
appendChildren(token.children, a);
a.href = node.props.url;
appendChildren(node.children, a);
return a;
},
mention(token) {
mention(node) {
const a = doc.createElement('a');
const { username, host, acct } = token.node.props;
const { username, host, acct } = node.props;
const wellKnown = wellKnownServices.find(x => x[0] === host);
if (wellKnown) {
a.href = wellKnown[1](username);
@@ -115,39 +121,39 @@ export function toHtml(tokens: MfmForest | null, mentionedRemoteUsers: IMentione
return a;
},
quote(token) {
quote(node) {
const el = doc.createElement('blockquote');
appendChildren(token.children, el);
appendChildren(node.children, el);
return el;
},
text(token) {
text(node) {
const el = doc.createElement('span');
const nodes = (token.node.props.text as string).split(/\r\n|\r|\n/).map(x => doc.createTextNode(x) as Node);
const nodes = node.props.text.split(/\r\n|\r|\n/).map(x => doc.createTextNode(x));
for (const x of intersperse<Node | 'br'>('br', nodes)) {
for (const x of intersperse<FIXME | 'br'>('br', nodes)) {
el.appendChild(x === 'br' ? doc.createElement('br') : x);
}
return el;
},
url(token) {
url(node) {
const a = doc.createElement('a');
a.href = token.node.props.url;
a.textContent = token.node.props.url;
a.href = node.props.url;
a.textContent = node.props.url;
return a;
},
search(token) {
search(node) {
const a = doc.createElement('a');
a.href = `https://www.google.com/search?q=${token.node.props.query}`;
a.textContent = token.node.props.content;
a.href = `https://www.google.com/search?q=${node.props.query}`;
a.textContent = node.props.content;
return a;
}
};
appendChildren(tokens, doc.body);
appendChildren(nodes, doc.body);
return `<p>${doc.body.innerHTML}</p>`;
}

View File

@@ -1,99 +0,0 @@
import { MfmForest, MfmTree } from './prelude';
import { nyaize } from '@/misc/nyaize';
/** Options for `toString`. */
export type RestoreOptions = {
// When truthy, text nodes and search queries are passed through `nyaize`.
doNyaize?: boolean;
};
/**
 * Serializes a parsed MFM forest back into MFM source text.
 *
 * @param tokens - Forest to serialize. `null` (or `undefined`) yields `''`.
 * @param opts - When `opts.doNyaize` is truthy, text nodes and search queries
 *   are passed through `nyaize`; the contents of quotes are never nyaized.
 * @returns The serialized text with leading and trailing whitespace trimmed.
 * @throws TypeError when a node's type has no entry in the handler table.
 */
export function toString(tokens: MfmForest | null, opts?: RestoreOptions): string {
	// Loose comparison so that `undefined` is treated like `null` instead of
	// crashing on `.map` below.
	if (tokens == null) return '';

	// Serializes every child with the handler for its node type and joins the results.
	function appendChildren(children: MfmForest, opts?: RestoreOptions): string {
		return children.map(t => handlers[t.node.type](t, opts)).join('');
	}

	// One serializer per node type.
	const handlers: { [key: string]: (token: MfmTree, opts?: RestoreOptions) => string } = {
		bold(token, opts) {
			return `**${appendChildren(token.children, opts)}**`;
		},

		small(token, opts) {
			return `<small>${appendChildren(token.children, opts)}</small>`;
		},

		strike(token, opts) {
			return `~~${appendChildren(token.children, opts)}~~`;
		},

		italic(token, opts) {
			return `<i>${appendChildren(token.children, opts)}</i>`;
		},

		fn(token, opts) {
			const name = token.node.props?.name;
			const args = token.node.props?.args || {};
			// Flag arguments (value `true`) are written as the bare key, others as `key=value`.
			const argsStr = Object.entries(args).map(([k, v]) => v === true ? k : `${k}=${v}`).join(',');
			return `[${name}${argsStr !== '' ? '.' + argsStr : ''} ${appendChildren(token.children, opts)}]`;
		},

		blockCode(token) {
			return `\`\`\`${token.node.props.lang || ''}\n${token.node.props.code}\n\`\`\`\n`;
		},

		center(token, opts) {
			return `<center>${appendChildren(token.children, opts)}</center>`;
		},

		emoji(token) {
			// Unicode emoji are emitted verbatim, custom emoji as `:name:`.
			return (token.node.props.emoji ? token.node.props.emoji : `:${token.node.props.name}:`);
		},

		hashtag(token) {
			return `#${token.node.props.hashtag}`;
		},

		inlineCode(token) {
			return `\`${token.node.props.code}\``;
		},

		mathInline(token) {
			return `\\(${token.node.props.formula}\\)`;
		},

		mathBlock(token) {
			return `\\[${token.node.props.formula}\\]`;
		},

		link(token, opts) {
			if (token.node.props.silent) {
				return `?[${appendChildren(token.children, opts)}](${token.node.props.url})`;
			} else {
				return `[${appendChildren(token.children, opts)}](${token.node.props.url})`;
			}
		},

		mention(token) {
			return token.node.props.canonical;
		},

		quote(token) {
			// Quote contents are serialized with nyaize disabled, then every line is prefixed with '>'.
			return `${appendChildren(token.children, {doNyaize: false}).replace(/^/gm,'>').trim()}\n`;
		},

		text(token, opts) {
			return (opts && opts.doNyaize) ? nyaize(token.node.props.text) : token.node.props.text;
		},

		url(token) {
			return `<${token.node.props.url}>`;
		},

		search(token, opts) {
			const query = token.node.props.query;
			return `${(opts && opts.doNyaize ? nyaize(query) : query)} [search]\n`;
		}
	};

	return appendChildren(tokens, { doNyaize: (opts && opts.doNyaize) || false }).trim();
}