feat: 抽取 html 标签包裹的单行文本(通常出现于表格中)
This commit is contained in:
parent
b8400fad7c
commit
eb73ad0027
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -30,6 +30,21 @@ describe('从对照翻译文件中采集生成字典', () => {
|
||||
expect(result).eql([{original: 'a', translation: '一'}]);
|
||||
});
|
||||
|
||||
it('处理 html 标签包裹的翻译文本', () => {
|
||||
const result = gatherTranslations(`
|
||||
<p>
|
||||
a
|
||||
</p>
|
||||
|
||||
<p>
|
||||
一
|
||||
</p>
|
||||
|
||||
`);
|
||||
expect(result).eql([{original: 'a', translation: '一'}]);
|
||||
|
||||
});
|
||||
|
||||
it('从真实的文件中采集(测试)', function () {
|
||||
const fs = require('fs');
|
||||
const content = fs.readFileSync(dirs.content + 'guide/forms.md', 'utf-8');
|
||||
|
@ -2,6 +2,7 @@ import * as globby from 'globby';
|
||||
import { DictEntry } from './dict-entry';
|
||||
import {
|
||||
isNotCnPages,
|
||||
isOnlyTag,
|
||||
normalizeLines,
|
||||
originalIsNotChinese,
|
||||
originalIsNotTag,
|
||||
@ -28,7 +29,20 @@ export function gatherTranslations(text: string): DictEntry[] {
|
||||
const translation = purifyText(lines[i]);
|
||||
if (isTranslation(translation)) {
|
||||
const original = purifyText(lines[i - 1]);
|
||||
result.push({original, translation});
|
||||
// 对于包裹在 html tag 中的翻译文本进行特殊处理
|
||||
if (isOnlyTag(original)) {
|
||||
const prevTag = lines[i - 4].trim();
|
||||
const prevEndTag = lines[i - 2].trim();
|
||||
const thisEndTag = lines[i + 1].trim();
|
||||
if (original === prevTag && prevEndTag === thisEndTag) {
|
||||
result.push({
|
||||
original: lines[i - 3],
|
||||
translation: lines[i],
|
||||
});
|
||||
}
|
||||
} else {
|
||||
result.push({original, translation});
|
||||
}
|
||||
}
|
||||
}
|
||||
return result
|
||||
|
@ -9,12 +9,20 @@ export function originalIsNotChinese(entry: DictEntry): boolean {
|
||||
return !isTranslation(entry.original);
|
||||
}
|
||||
|
||||
/**
 * Tells whether a line begins with an HTML-like tag.
 *
 * Leading whitespace is ignored. The first non-blank characters must be `<`,
 * optionally followed by `/` (closing tag), and then at least one word
 * character. Anything after the tag name is not inspected.
 *
 * @param text the line to inspect
 * @returns `true` when the line starts with an opening or closing tag
 */
export function isTagLine(text: string): boolean {
  // A trailing `.*` would not change the outcome of `test`, so it is omitted.
  return /^\s*<\/?\w+/.test(text);
}
|
||||
|
||||
/**
 * Filter predicate: keeps entries whose original text does not start with a tag.
 *
 * Delegates to `isTagLine`, so any opening or closing tag at the start of
 * `entry.original` (not only `<div`) makes this return `false`.
 *
 * @param entry the dictionary entry to check
 * @returns `true` when `entry.original` is not a tag line
 */
export function originalIsNotTag(entry: DictEntry): boolean {
  return !isTagLine(entry.original);
}
|
||||
|
||||
/**
 * Tells whether a line consists of nothing but a single bare opening tag.
 *
 * The tag must have the form `<name>`: no attributes and no closing slash.
 * Only whitespace may surround it.
 *
 * @param text the line to inspect
 * @returns `true` when the whole line is one attribute-less opening tag
 */
export function isOnlyTag(text: string): boolean {
  return /^\s*<\w+>\s*$/.test(text);
}
|
||||
|
||||
/**
 * Filter predicate: keeps entries whose original text is NOT just a bare tag.
 *
 * NOTE: despite the name, this returns the negation of `isOnlyTag`. It is
 * `true` when `entry.original` is something other than a lone `<name>` tag.
 * The name is kept unchanged so existing callers continue to work.
 *
 * @param entry the dictionary entry to check
 * @returns `true` when `entry.original` is not a lone opening tag
 */
export function originalIsOnlyTag(entry: DictEntry): boolean {
  return !isOnlyTag(entry.original);
}
|
||||
|
||||
export function isNotImg(entry: DictEntry): boolean {
|
||||
@ -40,10 +48,11 @@ export function isHead(line: string): boolean {
|
||||
/**
 * Normalizes line breaks so that block-level constructs are separated by
 * exactly one blank line.
 *
 * Steps, in order:
 * 1. A newline followed by optional spaces, a block marker (`1.`, `-`, `*`,
 *    `#` or `<`) and a space is doubled.
 * 2. Every `<...>` run that is followed by a newline (together with any
 *    whitespace directly before it) is surrounded by blank lines.
 * 3. Every run of two or more newlines is collapsed to exactly two.
 *
 * @param text the raw document text
 * @returns the text with normalized blank lines
 */
export function normalizeLines(text: string): string {
  // Markdown constructs that carry an implicit line break (lists, headings, ...).
  const blockElementPattern = /(?=\n *(\d+\.|-|\*|#|<) )\n/g;
  // No leading `\n` is consumed here, so tags on consecutive lines can each match.
  const htmlTagPattern = /(\s*<.*?>\s*?)\n/g;
  return text
    .replace(blockElementPattern, '\n\n')
    .replace(htmlTagPattern, '\n\n$1\n\n')
    // The `g` flag is required: without it only the first run of blank lines is collapsed.
    .replace(/\n\n+/g, '\n\n');
}
|
||||
|
||||
export function indentOf(line): number {
|
||||
|
Loading…
x
Reference in New Issue
Block a user