feat: 抽取 html 标签包裹的单行文本(通常出现于表格中)

This commit is contained in:
Zhicheng Wang 2018-03-03 11:45:03 +08:00
parent b8400fad7c
commit eb73ad0027
6 changed files with 988 additions and 3060 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -30,6 +30,21 @@ describe('从对照翻译文件中采集生成字典', () => {
expect(result).eql([{original: 'a', translation: '一'}]);
});
// An English paragraph followed by its Chinese translation, each wrapped in its
// own <p> element, must be gathered as a single original/translation pair.
// The second paragraph has to contain the translated text ('一'); without it the
// input holds no translation at all and the expectation below cannot be met.
it('处理 html 标签包裹的翻译文本', () => {
  const result = gatherTranslations(`
<p>
a
</p>
<p>
一
</p>
`);
  expect(result).eql([{original: 'a', translation: '一'}]);
});
it('从真实的文件中采集(测试)', function () {
const fs = require('fs');
const content = fs.readFileSync(dirs.content + 'guide/forms.md', 'utf-8');

View File

@ -2,6 +2,7 @@ import * as globby from 'globby';
import { DictEntry } from './dict-entry';
import {
isNotCnPages,
isOnlyTag,
normalizeLines,
originalIsNotChinese,
originalIsNotTag,
@ -28,7 +29,20 @@ export function gatherTranslations(text: string): DictEntry[] {
const translation = purifyText(lines[i]);
if (isTranslation(translation)) {
const original = purifyText(lines[i - 1]);
result.push({original, translation});
// 对于包裹在 html tag 中的翻译文本进行特殊处理
if (isOnlyTag(original)) {
const prevTag = lines[i - 4].trim();
const prevEndTag = lines[i - 2].trim();
const thisEndTag = lines[i + 1].trim();
if (original === prevTag && prevEndTag === thisEndTag) {
result.push({
original: lines[i - 3],
translation: lines[i],
});
}
} else {
result.push({original, translation});
}
}
}
return result

View File

@ -9,12 +9,20 @@ export function originalIsNotChinese(entry: DictEntry): boolean {
return !isTranslation(entry.original);
}
/**
 * Reports whether a line starts with a tag.
 *
 * Leading whitespace is ignored; the first non-blank characters must be `<`
 * or `</` immediately followed by at least one word character.
 *
 * @param text the line to inspect
 * @returns `true` when the line begins with an opening or closing tag
 */
export function isTagLine(text: string) {
  const leadingTag = /^\s*<\/?\w+.*/;
  return leadingTag.test(text);
}
/**
 * Filter predicate: keeps entries whose original text is not a tag line.
 *
 * Delegates to `isTagLine`, so any line starting with an opening or closing
 * tag is rejected, not only lines starting with `<div`.
 *
 * @param entry the dictionary entry to check
 * @returns `true` when `entry.original` does NOT start with a tag
 */
export function originalIsNotTag(entry: DictEntry): boolean {
  return !isTagLine(entry.original);
}
/**
 * Reports whether the text consists of nothing but a single bare opening tag
 * such as `<p>`, optionally surrounded by whitespace.
 *
 * Closing tags, tags with attributes and tags followed by content do not match.
 *
 * @param text the text to inspect
 * @returns `true` when the text is exactly one attribute-less opening tag
 */
export function isOnlyTag(text: string) {
  const loneOpeningTag = /^\s*<\w+>\s*$/;
  return loneOpeningTag.test(text);
}
/**
 * Filter predicate over dictionary entries, based on `isOnlyTag`.
 *
 * NOTE: despite its name, the result is negated: it is `true` when the
 * entry's original text is NOT just a single bare opening tag, and `false`
 * when it is.
 *
 * @param entry the dictionary entry to check
 * @returns `true` when `entry.original` is not a lone opening tag
 */
export function originalIsOnlyTag(entry: DictEntry): boolean {
  return !isOnlyTag(entry.original);
}
export function isNotImg(entry: DictEntry): boolean {
@ -40,10 +48,11 @@ export function isHead(line: string): boolean {
/**
 * Normalizes line breaks so that block-level constructs are separated from
 * their neighbours by exactly one blank line.
 *
 * Steps, applied in order:
 * 1. A newline that is followed by optional spaces, a block marker
 *    (`1.`, `-`, `*`, `#` or `<`) and a space is doubled.
 * 2. Every `<...>` span that ends its line (together with the whitespace in
 *    front of it) is surrounded by blank lines.
 * 3. Every run of two or more newlines is collapsed to exactly two.
 *
 * @param text the raw document text
 * @returns the text with normalized blank lines
 */
export function normalizeLines(text: string): string {
  // Markdown constructs such as lists and headings that imply a line break on their own.
  const blockElementPattern = /(?=\n *(\d+\.|-|\*|#|<) )\n/g;
  const htmlTagPattern = /(\s*<.*?>\s*?)\n/g;
  // Must be global: without the `g` flag `replace` only collapses the first run
  // of newlines and leaves every later run (e.g. those created by step 2) as is.
  const blankLineRunPattern = /\n\n+/g;
  return text
    .replace(blockElementPattern, '\n\n')
    .replace(htmlTagPattern, '\n\n$1\n\n')
    .replace(blankLineRunPattern, '\n\n');
}
export function indentOf(line): number {