feat: 抽取 html 标签包裹的单行文本(通常出现于表格中)

This commit is contained in:
Zhicheng Wang 2018-03-03 11:45:03 +08:00
parent b8400fad7c
commit eb73ad0027
6 changed files with 988 additions and 3060 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -30,6 +30,21 @@ describe('从对照翻译文件中采集生成字典', () => {
expect(result).eql([{original: 'a', translation: '一'}]); expect(result).eql([{original: 'a', translation: '一'}]);
}); });
// Checks that a single line of text wrapped in an HTML tag pair is collected
// together with the translated line wrapped in the immediately following,
// identical tag pair. The second <p> must contain the translated line,
// otherwise there is nothing for the expected entry to be built from.
it('处理 html 标签包裹的翻译文本', () => {
  const result = gatherTranslations(`
<p>
a
</p>
<p>
一
</p>
`);
  expect(result).eql([{original: 'a', translation: '一'}]);
});
it('从真实的文件中采集(测试)', function () { it('从真实的文件中采集(测试)', function () {
const fs = require('fs'); const fs = require('fs');
const content = fs.readFileSync(dirs.content + 'guide/forms.md', 'utf-8'); const content = fs.readFileSync(dirs.content + 'guide/forms.md', 'utf-8');

View File

@ -2,6 +2,7 @@ import * as globby from 'globby';
import { DictEntry } from './dict-entry'; import { DictEntry } from './dict-entry';
import { import {
isNotCnPages, isNotCnPages,
isOnlyTag,
normalizeLines, normalizeLines,
originalIsNotChinese, originalIsNotChinese,
originalIsNotTag, originalIsNotTag,
@ -28,7 +29,20 @@ export function gatherTranslations(text: string): DictEntry[] {
const translation = purifyText(lines[i]); const translation = purifyText(lines[i]);
if (isTranslation(translation)) { if (isTranslation(translation)) {
const original = purifyText(lines[i - 1]); const original = purifyText(lines[i - 1]);
result.push({original, translation}); // 对于包裹在 html tag 中的翻译文本进行特殊处理
if (isOnlyTag(original)) {
const prevTag = lines[i - 4].trim();
const prevEndTag = lines[i - 2].trim();
const thisEndTag = lines[i + 1].trim();
if (original === prevTag && prevEndTag === thisEndTag) {
result.push({
original: lines[i - 3],
translation: lines[i],
});
}
} else {
result.push({original, translation});
}
} }
} }
return result return result

View File

@ -9,12 +9,20 @@ export function originalIsNotChinese(entry: DictEntry): boolean {
return !isTranslation(entry.original); return !isTranslation(entry.original);
} }
/**
 * Tells whether a line begins with an HTML tag.
 *
 * Leading whitespace is ignored; the first non-space text must be `<` or `</`
 * immediately followed by at least one word character. Whatever follows the
 * tag name (attributes, the closing `>`, trailing text) is not inspected.
 *
 * @param text The line to inspect.
 * @returns `true` when the line starts with an opening or closing tag.
 */
export function isTagLine(text: string): boolean {
  return /^\s*<\/?\w+/.test(text);
}
/**
 * Filter predicate: keeps entries whose original text does not start with an
 * HTML tag.
 *
 * @param entry The dictionary entry to check; only `entry.original` is read.
 * @returns `true` when `isTagLine(entry.original)` is `false`.
 */
export function originalIsNotTag(entry: DictEntry): boolean {
  return !isTagLine(entry.original);
}
/**
 * Tells whether a line consists of nothing but one attribute-less opening tag,
 * e.g. `<p>` or `  <td>  `.
 *
 * Closing tags (`</p>`), tags with attributes and tags followed by text do not
 * match.
 *
 * @param text The line to inspect.
 * @returns `true` when the whole line is a single bare opening tag, optionally
 *   surrounded by whitespace.
 */
export function isOnlyTag(text: string): boolean {
  return /^\s*<\w+>\s*$/.test(text);
}
/**
 * Filter predicate on the original text of an entry.
 *
 * NOTE: despite the name, the result is negated: it is `true` when
 * `entry.original` is NOT a lone opening tag, so filtering with it keeps the
 * entries that carry more than a bare tag.
 *
 * @param entry The dictionary entry to check; only `entry.original` is read.
 * @returns `true` when `isOnlyTag(entry.original)` is `false`.
 */
export function originalIsOnlyTag(entry: DictEntry): boolean {
  return !isOnlyTag(entry.original);
}
export function isNotImg(entry: DictEntry): boolean { export function isNotImg(entry: DictEntry): boolean {
@ -40,10 +48,11 @@ export function isHead(line: string): boolean {
/**
 * Normalizes paragraph breaks so that block-level constructs are separated
 * from their neighbours by a blank line.
 *
 * Steps, applied in order:
 * 1. A newline that precedes a list marker (`1.`, `-`, `*`), a `#` or a `<`
 *    followed by a space is doubled.
 * 2. Every line ending in something that looks like an HTML tag gets a blank
 *    line inserted before and after it.
 * 3. Every run of two or more newlines is collapsed to exactly two.
 *
 * @param text The raw markdown text.
 * @returns The text with normalized blank lines; the input is not modified.
 */
export function normalizeLines(text: string): string {
  // Markdown constructs that imply a line break on their own (lists, headings, ...)
  const blockElementPattern = /(?=\n *(\d+\.|-|\*|#|<) )\n/g;
  const htmlTagPattern = /(\s*<.*?>\s*?)\n/g;
  // Must be global: without the `g` flag only the first run of blank lines
  // would be collapsed and every later run would keep its extra newlines.
  const blankLinesPattern = /\n\n+/g;

  return text
    .replace(blockElementPattern, '\n\n')
    .replace(htmlTagPattern, '\n\n$1\n\n')
    .replace(blankLinesPattern, '\n\n');
}
export function indentOf(line): number { export function indentOf(line): number {