feat: 抽取 html 标签包裹的单行文本(通常出现于表格中)
This commit is contained in:
parent
b8400fad7c
commit
eb73ad0027
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -30,6 +30,21 @@ describe('从对照翻译文件中采集生成字典', () => {
|
|||||||
expect(result).eql([{original: 'a', translation: '一'}]);
|
expect(result).eql([{original: 'a', translation: '一'}]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Test case: an original line and its translation that each sit alone between the same <p>…</p>
// tag pair must be gathered as one {original, translation} entry, with the tags left out.
it('处理 html 标签包裹的翻译文本', () => {
|
||||||
|
const result = gatherTranslations(`
|
||||||
|
<p>
|
||||||
|
a
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
一
|
||||||
|
</p>
|
||||||
|
|
||||||
|
`);
|
||||||
|
expect(result).eql([{original: 'a', translation: '一'}]);
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
it('从真实的文件中采集(测试)', function () {
|
it('从真实的文件中采集(测试)', function () {
|
||||||
const fs = require('fs');
|
const fs = require('fs');
|
||||||
const content = fs.readFileSync(dirs.content + 'guide/forms.md', 'utf-8');
|
const content = fs.readFileSync(dirs.content + 'guide/forms.md', 'utf-8');
|
||||||
|
@ -2,6 +2,7 @@ import * as globby from 'globby';
|
|||||||
import { DictEntry } from './dict-entry';
|
import { DictEntry } from './dict-entry';
|
||||||
import {
|
import {
|
||||||
isNotCnPages,
|
isNotCnPages,
|
||||||
|
isOnlyTag,
|
||||||
normalizeLines,
|
normalizeLines,
|
||||||
originalIsNotChinese,
|
originalIsNotChinese,
|
||||||
originalIsNotTag,
|
originalIsNotTag,
|
||||||
@ -28,7 +29,20 @@ export function gatherTranslations(text: string): DictEntry[] {
|
|||||||
const translation = purifyText(lines[i]);
|
const translation = purifyText(lines[i]);
|
||||||
if (isTranslation(translation)) {
|
if (isTranslation(translation)) {
|
||||||
const original = purifyText(lines[i - 1]);
|
const original = purifyText(lines[i - 1]);
|
||||||
result.push({original, translation});
|
// 对于包裹在 html tag 中的翻译文本进行特殊处理
|
||||||
|
if (isOnlyTag(original)) {
|
||||||
|
const prevTag = lines[i - 4].trim();
|
||||||
|
const prevEndTag = lines[i - 2].trim();
|
||||||
|
const thisEndTag = lines[i + 1].trim();
|
||||||
|
if (original === prevTag && prevEndTag === thisEndTag) {
|
||||||
|
result.push({
|
||||||
|
original: lines[i - 3],
|
||||||
|
translation: lines[i],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
result.push({original, translation});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
|
@ -9,12 +9,20 @@ export function originalIsNotChinese(entry: DictEntry): boolean {
|
|||||||
return !isTranslation(entry.original);
|
return !isTranslation(entry.original);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Tests whether a piece of text starts with an HTML-like tag.
 *
 * Leading whitespace is ignored; the text must then continue with `<` or `</`
 * followed by at least one word character. Anything after that is accepted.
 *
 * @param text the text to inspect
 * @returns `true` when the text starts with an opening or closing tag
 */
export function isTagLine(text: string): boolean {
  return /^\s*<\/?\w+.*/.test(text);
}
|
||||||
|
|
||||||
/**
 * Entry predicate: keeps entries whose original text does not start with a tag.
 *
 * @param entry the dictionary entry to check
 * @returns `true` when `isTagLine` rejects `entry.original`
 */
export function originalIsNotTag(entry: DictEntry): boolean {
  const startsWithTag = isTagLine(entry.original);
  return !startsWithTag;
}
|
||||||
|
|
||||||
|
/**
 * Tests whether a piece of text consists of exactly one bare opening tag.
 *
 * Surrounding whitespace is allowed, but the tag itself must be `<` + word
 * characters + `>`: closing tags and tags carrying attributes do not match.
 *
 * @param text the text to inspect
 * @returns `true` when the text is nothing but a single attribute-less opening tag
 */
export function isOnlyTag(text: string): boolean {
  return /^\s*<\w+>\s*$/.test(text);
}
|
||||||
|
|
||||||
/**
 * Entry predicate based on `isOnlyTag`.
 *
 * NOTE: despite its name, the result is negated — it is `true` when the
 * entry's original text is NOT just a single bare opening tag.
 *
 * @param entry the dictionary entry to check
 * @returns `true` when `isOnlyTag` rejects `entry.original`
 */
export function originalIsOnlyTag(entry: DictEntry): boolean {
  const loneTag = isOnlyTag(entry.original);
  return !loneTag;
}
|
||||||
|
|
||||||
export function isNotImg(entry: DictEntry): boolean {
|
export function isNotImg(entry: DictEntry): boolean {
|
||||||
@ -40,10 +48,11 @@ export function isHead(line: string): boolean {
|
|||||||
/**
 * Rewrites line breaks so that block-level pieces of the text are separated
 * by exactly one blank line.
 *
 * Steps, in order:
 *  1. a line break in front of a markdown block marker becomes a blank line;
 *  2. every `<...>` tag that is followed by a line break is surrounded by blank lines;
 *  3. every run of two or more line breaks is collapsed to exactly two.
 *
 * @param text the raw text
 * @returns the text with normalized paragraph breaks
 */
export function normalizeLines(text: string): string {
  // Markdown that implies a line break on its own (list items, headings, ...):
  // a newline followed by optional spaces, a marker and a space.
  const blockElementPattern = /(?=\n *(\d+\.|-|\*|#|<) )\n/g;
  // A `<...>` tag, together with any whitespace in front of it, that is followed by a newline.
  const htmlTagPattern = /(\s*<.*?>\s*?)\n/g;
  // Must be global: the two steps above can create several over-long newline runs,
  // and a non-global replace would only collapse the first one.
  const newlineRunPattern = /\n\n+/g;

  return text
    .replace(blockElementPattern, '\n\n')
    .replace(htmlTagPattern, '\n\n$1\n\n')
    .replace(newlineRunPattern, '\n\n');
}
|
||||||
|
|
||||||
export function indentOf(line): number {
|
export function indentOf(line): number {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user