angular-docs-cn/aio/tools/translator/utils.ts

import { DictEntry } from './dict-entry';
import { isTranslation } from './extractor';
import * as _ from 'lodash';

/**
 * Checks that the translated text of an entry contains no `<code-example`
 * opening tag.
 *
 * @param entry Dictionary entry whose `translation` text is inspected.
 * @returns `true` when `<code-example` does not occur in the translation.
 */
export function translationHasNotCodeExample(entry: DictEntry): boolean {
  const markerPosition = entry.translation.indexOf('<code-example');
  return markerPosition < 0;
}

/**
 * Reports whether the original text of an entry is NOT recognised by
 * `isTranslation`.
 *
 * NOTE(review): judging by this function's name, `isTranslation` presumably
 * detects Chinese text - confirm against `./extractor`.
 *
 * @param entry Dictionary entry whose `original` text is inspected.
 * @returns The negation of `isTranslation(entry.original)`.
 */
export function originalIsNotChinese(entry: DictEntry): boolean {
  const recognisedAsTranslation = isTranslation(entry.original);
  return !recognisedAsTranslation;
}

/**
 * Checks that the original text is not just a lone `<div class="...">`
 * opening tag.
 *
 * @param entry Dictionary entry whose `original` text is inspected.
 * @returns `false` when the whole text is a single-line `<div class="...">`
 *   opening tag, `true` otherwise.
 */
export function originalIsNotSpecialDivTag(entry: DictEntry): boolean {
  const specialDivTag = /^<div class="\w+\b[^>\n]*>$/;
  const isSpecialDivTag = specialDivTag.test(entry.original);
  return !isSpecialDivTag;
}

/**
 * Checks that the original text is not just a lone `<code-example ...>` or
 * `</code-example>` tag.
 *
 * @param entry Dictionary entry whose `original` text is inspected.
 * @returns `false` when the whole text is a single opening or closing
 *   `code-example` tag, `true` otherwise.
 */
export function originalIsNotCodeExampleTag(entry: DictEntry): boolean {
  const codeExampleTag = /^<\/?code-example\b[^>\n]*>$/;
  const isCodeExampleTag = codeExampleTag.test(entry.original);
  return !isCodeExampleTag;
}

/**
 * Checks that the original text is not just a bare closing tag for one of
 * `td`, `a`, `div`, `header`, `p` or `figure`.
 *
 * @param entry Dictionary entry whose `original` text is inspected.
 * @returns `false` when the whole text is one of those closing tags,
 *   `true` otherwise.
 */
export function originalIsNotPureCloseTag(entry: DictEntry): boolean {
  const pureCloseTag = /^<\/(td|a|div|header|p|figure)>$/;
  const isPureCloseTag = pureCloseTag.test(entry.original);
  return !isPureCloseTag;
}

/**
 * Tells whether the text consists solely of one attribute-less opening tag
 * (e.g. `<div>`), optionally surrounded by whitespace.
 *
 * @param text Text to inspect.
 * @returns `true` when the text is only such an opening tag.
 */
export function isOnlyBeginTag(text: string) {
  const bareOpeningTag = /^\s*<\w+>\s*$/;
  return bareOpeningTag.test(text);
}

/**
 * Checks that the original text of an entry is not merely a bare opening tag,
 * as decided by `isOnlyBeginTag`.
 *
 * @param entry Dictionary entry whose `original` text is inspected.
 * @returns The negation of `isOnlyBeginTag(entry.original)`.
 */
export function originalIsNotOnlyBeginTag(entry: DictEntry): boolean {
  const onlyBeginTag = isOnlyBeginTag(entry.original);
  return !onlyBeginTag;
}

/**
 * Checks that the translated text does not start with `<img` or `<figure`.
 *
 * @param entry Dictionary entry whose `translation` text is inspected.
 * @returns `false` when the translation begins with `<img` or `<figure`,
 *   `true` otherwise.
 */
export function isNotImg(entry: DictEntry): boolean {
  const imageLikePrefix = /^<(img|figure)/;
  const startsWithImage = imageLikePrefix.test(entry.translation);
  return !startsWithImage;
}

/**
 * Checks that the entry does not come from a `cheatsheet.md` source file.
 *
 * The dot in the file extension is matched literally, so names such as
 * `cheatsheet-md` are not mistaken for the cheat sheet.
 *
 * @param entry Dictionary entry whose `sourceFile` path is inspected.
 * @returns `false` when `sourceFile` ends with `cheatsheet.md`; `true`
 *   otherwise, including when `sourceFile` is missing.
 */
export function isNotCheatSheet(entry: DictEntry): boolean {
  // A missing source file cannot be the cheat sheet; fall back to an empty path.
  const sourceFile = entry.sourceFile || '';
  return !/cheatsheet\.md$/.test(sourceFile);
}

/**
 * Checks that the entry does not come from the `marketing/docs.md` source file.
 *
 * The dot in the file extension is matched literally, so a path such as
 * `marketing/docsXmd` is not mistaken for the marketing docs page.
 *
 * @param entry Dictionary entry whose `sourceFile` path is inspected.
 * @returns `false` when `sourceFile` ends with `marketing/docs.md`; `true`
 *   otherwise, including when `sourceFile` is missing.
 */
export function isNotMarketingDocs(entry: DictEntry): boolean {
  // A missing source file cannot be the marketing page; fall back to an empty path.
  const sourceFile = entry.sourceFile || '';
  return !/marketing\/docs\.md$/.test(sourceFile);
}

/**
 * Checks that the entry does not come from a markdown file under a `cn/` path.
 *
 * The dot in the `.md` extension is matched literally, so a path such as
 * `cn/readmd` is not mistaken for a markdown page.
 *
 * @param entry Dictionary entry whose `sourceFile` path is inspected.
 * @returns `false` when `sourceFile` contains `cn/` followed later by a
 *   trailing `.md`; `true` otherwise, including when `sourceFile` is missing.
 */
export function isNotCnPages(entry: DictEntry): boolean {
  // A missing source file cannot be a cn page; fall back to an empty path.
  const sourceFile = entry.sourceFile || '';
  return !/cn\/.*?\.md$/.test(sourceFile);
}

/**
 * Tells whether a line starts with `#`, i.e. looks like a markdown heading.
 *
 * @param line Line of text to inspect; leading whitespace is not skipped.
 * @returns `true` when the very first character is `#`.
 */
export function isHead(line: string): boolean {
  const firstChar = line.charAt(0);
  return firstChar === '#';
}

/**
 * Normalizes the line layout of a markdown/HTML document.
 *
 * A fixed sequence of regular-expression replacements repairs a few known
 * irregularities of the source text and then inserts blank lines around
 * block-level constructs (list items, headings, block tags, comments, fenced
 * code, table rows/cells). Finally whitespace-only lines are emptied, runs of
 * newlines are collapsed to exactly one blank line, and the result is trimmed.
 *
 * The order of the replacements matters: later patterns run on the output of
 * earlier ones.
 *
 * @param text Full document text.
 * @returns The normalized text.
 */
export function normalizeLines(text: string): string {
  // The source sometimes has "Back to top" split across two lines; normalize it to one line.
  const specialBackToTopPattern = /<a href="#toc">Back to top\s+<\/a>/g;
  text = text.replace(specialBackToTopPattern, '<a href="#toc">Back to top</a>');
  // The source contains an unterminated `</table`; complete it.
  text = text.replace(/^ *<\/table$/gm, '</table>');
  // Some line breaks in the source interfere with the generated HTML layout; replace this known occurrence.
  // tslint:disable:max-line-length
  text = text.replace(`<a href="https://docs.npmjs.com/getting-started/installing-node" target="_blank" title="Installing Node.js and updating npm">
Get them now</a> if they're not already installed on your machine.
`, `<a href="https://docs.npmjs.com/getting-started/installing-node" target="_blank" title="Installing Node.js and updating npm">Get them now</a> if they're not already installed on your machine.
`);

  // Add an extra blank line before every list item (numbered, `-` or `*`).
  const listElementPattern = /(?=\n *(\d+\.|-|\*) )\n/g;
  text = text.replace(listElementPattern, '\n\n');
  // Add blank lines around markdown headings.
  const hxPattern = /^( *#+ .*)$/gm;
  text = text.replace(hxPattern, '\n$1\n');
  // Collapse multi-line HTML heading, p or header elements onto a single line.
  const hxMultilinePattern = /^( *)<(h\d|p|header)([^>]*)>\s*(.*)\s*<\/\2>$/gm;
  text = text.replace(hxMultilinePattern, '\n$1<$2$3>$4</$2>\n');
  // Add blank lines around single-line paired tags.
  const oneLinePairedTagPattern = /^( *)<(p|div|h\d+|code-example|section)\b([^>]*)>([^\n]*?)<\/\2>( *)$/gm;
  text = text.replace(oneLinePairedTagPattern, '\n$1<$2$3>$4</$2>$5\n');
  // Add blank lines around HTML comments that sit on their own line(s)
  // (the lazy `[\s\S]*?` also lets a comment spanning several lines match).
  const oneLineCommentPattern = /^( *<!--[\s\S]*?--> *)$/gm;
  text = text.replace(oneLineCommentPattern, '\n$1\n');
  // Add blank lines around a single-line "Back to top" link.
  const backToTopPattern = /^( *<a href="#toc">Back to top<\/a> *)$/gm;
  text = text.replace(backToTopPattern, '\n$1\n');
  // Add blank lines around a single-line `{@a ...}` statement.
  const atTagCommentPattern = /^( *{@a.*} *)$/gm;
  text = text.replace(atTagCommentPattern, '\n$1\n');
  // Add blank lines around a single-line self-closing tag (`<hr>` / `<hr/>`).
  const oneLineClosedTagPattern = /^( *<hr *\/?>) *$/gm;
  text = text.replace(oneLineClosedTagPattern, '\n$1\n');
  // Add blank lines around a single-line <br class="clear">.
  const oneLineBrTagPattern = /^( *<br class="clear"> *)$/gm;
  text = text.replace(oneLineBrTagPattern, '\n$1\n');
  // Add blank lines around a standalone opening or closing div/li/ul/ol tag.
  const oneLineDivTagPattern = /^( *<\/?(div|li|ul|ol)\b([^>]*)> *)$/gm;
  text = text.replace(oneLineDivTagPattern, '\n$1\n');
  // Add a blank line before `<pre` and after `</pre>`.
  const preBeginTagPattern = /(^ *<pre)/gm;
  text = text.replace(preBeginTagPattern, '\n$1');
  const preEndTagPattern = /(<\/pre> *)$/gm;
  text = text.replace(preEndTagPattern, '$1\n');

  // Add blank lines around ``` fence lines.
  const multiLineCodePattern = /^( *```\w* *)$/gm;
  text = text.replace(multiLineCodePattern, '\n$1\n');

  // Split a single-line tr into multiple lines so it can be translated.
  const trTagPattern = /^( *)(<tr\b[^>]*>)(.*)(<\/tr>)$/gm;
  text = text.replace(trTagPattern, '\n$1$2\n\n$1    $3\n\n$1$4\n');

  // Split a single-line th/td/li into multiple lines so it can be translated.
  const oneLineThTdTagPattern = /^( *)<(th|td|li)\b([^>]*)>(.*?)<\/\2>$/gm;
  text = text.replace(oneLineThTdTagPattern, '\n$1<$2$3>\n\n$1    $4\n\n$1</$2>\n');

  // Add blank lines inside th/td elements that already span multiple lines.
  const thTdTagPattern = /^( *)<(th|td)\b( *[^>]*)>([\s\S]*?)<\/\2>$/gm;
  text = text.replace(thTdTagPattern, '\n\n$1<$2$3>\n\n$1    $4\n\n$1</$2>\n\n');

  // Add a blank line before every block opening tag (code-example, code-tabs, pre, p).
  const blockBeginTagPattern = /^( *)<(code-example|code-tabs|pre|p)\b( *[^>]*)>( *)$/gm;
  text = text.replace(blockBeginTagPattern, '\n$1<$2$3>$4');

  // Add a blank line after every block closing tag (code-example, code-tabs, pre, p).
  const blockEndTagPattern = /^( *)<\/(code-example|code-tabs|pre|p)>( *)$/gm;
  text = text.replace(blockEndTagPattern, '$1</$2>$3\n');

  // Empty out lines that consist only of spaces or tabs.
  const blankLinePattern = /^[ \t]+$/gm;
  text = text.replace(blankLinePattern, '');
  // Collapse every run of two or more newlines into exactly two newlines.
  const multipleBlankLinePattern = /\n{2,}/g;
  text = text.replace(multipleBlankLinePattern, '\n\n');

  // Strip leading and trailing whitespace from the whole text.
  text = text.trim();
  return text;
}

/**
 * Computes the indentation width of a line.
 *
 * The width is the number of leading space characters (tabs are not counted).
 * When the line starts, after its leading spaces, with a list marker
 * (`1.`, `-` or `*` followed by a space), a fixed 3 is added to the width.
 *
 * @param line Line of text to measure; only the leading spaces at the very
 *   start of the string are counted.
 * @returns The indentation width in characters.
 */
export function indentOf(line: string): number {
  // Counting the stripped prefix can never fail, so no "no match" branch is needed.
  const leadSpaces = line.length - line.replace(/^ +/, '').length;
  const isListItem = /^ *(\d+\.|-|\*) /.test(line);
  return isListItem ? leadSpaces + 3 : leadSpaces;
}

/**
 * Builds a string made of space characters.
 *
 * @param indent Number of spaces requested; zero or negative values yield
 *   an empty string.
 * @returns A string of spaces.
 */
export function repeat(indent: number): string {
  const spaces: string[] = [];
  for (let remaining = indent; remaining > 0; remaining--) {
    spaces.push(' ');
  }
  return spaces.join('');
}

/**
 * Fuzzy comparison of two texts based on the tokens they share.
 *
 * NOTE: the original author marked this function as not working correctly
 * yet; treat it as experimental.
 *
 * Both texts are tokenized and de-duplicated. They are considered similar
 * when they share more than 5 distinct tokens and the shared tokens make up
 * at least 80% of the larger distinct-token set.
 *
 * @param text1 First text.
 * @param text2 Second text.
 * @returns `true` when the two texts are considered similar.
 */
export function fuzzyTest(text1: string, text2: string): boolean {
  // `_.intersection` yields unique values, so the denominators must be
  // de-duplicated too; otherwise repeated words make even identical texts
  // fall below the threshold.
  const tokens1 = _.uniq(tokenize(text1));
  const tokens2 = _.uniq(tokenize(text2));
  const sameTokens = _.intersection(tokens1, tokens2);
  const maxTokens = Math.max(tokens1.length, tokens2.length);
  return sameTokens.length > 5 && sameTokens.length / maxTokens >= 0.8;
}

/**
 * Exact comparison of a key against a text.
 *
 * @param key Key to look for; an empty key never matches.
 * @param text Text to compare with.
 * @returns `true` only when `key` is non-empty and strictly equal to `text`.
 */
export function exactlyTest(key: string, text: string): boolean {
  if (!key) {
    return false;
  }
  return key === text;
}

/**
 * Reduces a text to its "kernel": only ASCII letters, digits, `#` and `:`
 * are kept, and the result is upper-cased.
 *
 * @param text Text to reduce.
 * @returns The upper-cased text with every other character removed.
 */
export function kernelText(text: string): string {
  const noisePattern = /([^a-zA-Z0-9#:]|\s|\.$)/g;
  const stripped = text.replace(noisePattern, '');
  const upperCased = stripped.toUpperCase();
  return upperCased.trim();
}

/**
 * Splits a text into word tokens.
 *
 * The text is cut at every non-word character (`\W`); empty pieces are
 * discarded.
 *
 * @param text Text to split.
 * @returns The non-empty tokens in order of appearance.
 */
export function tokenize(text: string): string[] {
  const tokens: string[] = [];
  for (const piece of text.split(/\W/)) {
    const trimmed = piece.trim();
    if (trimmed) {
      tokens.push(trimmed);
    }
  }
  return tokens;
}

/**
 * Tells whether the text contains two `<t>...</t>` elements in a row,
 * separated only by optional spaces.
 *
 * @param text Text to inspect.
 * @returns `true` when at least one such `<t>` pair is present
 *   (the match is case-sensitive).
 */
export function hasInlineText(text: string): boolean {
  // A fresh regex per call keeps the `g` flag's lastIndex state from leaking.
  const inlinePairPattern = /<t>(.*?)<\/t> *<t>.*?<\/t>/g;
  return inlinePairPattern.test(text);
}

/**
 * Extracts the first part of every inline `<t>...</t> <t>...</t>` pair.
 *
 * When the text contains no such pair (per `hasInlineText`), it is returned
 * untouched. Otherwise each pair is replaced by the content of its first
 * `<t>` element and runs of spaces are collapsed to a single space.
 *
 * @param text Text possibly containing inline `<t>` pairs.
 * @returns The text with each pair reduced to its first element's content.
 */
export function extractOriginalContent(text: string): string {
  if (hasInlineText(text)) {
    const firstPartsOnly = text.replace(/<t>(.*?)<\/t> *<t>.*?<\/t>/gi, '$1');
    return firstPartsOnly.replace(/ +/g, ' ');
  }
  return text;
}
-												refactor: extract checker functions

											
										
										
											2018-03-03 08:13:52 +08:00
+								import { DictEntry } from './dict-entry';
 								import { isTranslation } from './extractor';
-												refactor: 提供模糊匹配的功能（未完成）

											
										
										
											2018-03-15 16:40:11 +08:00
+								import * as _ from 'lodash';
-												refactor: extract checker functions

											
										
										
											2018-03-03 08:13:52 +08:00
 								export function translationHasNotCodeExample(entry: DictEntry): boolean {
 								  return entry.translation.indexOf('<code-example') === -1;
 								}
 								export function originalIsNotChinese(entry: DictEntry): boolean {
 								  return !isTranslation(entry.original);
 								}
-												fix: 修正第一版和第二版的字典
fix: 修正原文过滤逻辑

											
										
										
											2018-03-07 14:19:47 +08:00
+								export function originalIsNotSpecialDivTag(entry: DictEntry): boolean {
 								  return !/^<div class="\w+\b[^>\n]*>$/.test(entry.original);
 								}
 								export function originalIsNotCodeExampleTag(entry: DictEntry): boolean {
 								  return !/^<\/?code-example\b[^>\n]*>$/.test(entry.original);
 								}
 								export function originalIsNotPureCloseTag(entry: DictEntry): boolean {
-												fix: 修正残留错误

											
										
										
											2018-03-07 15:48:58 +08:00
+								  return !/^<\/(td|a|div|header|p|figure)>$/.test(entry.original);
-												feat: 抽取 html 标签包裹的单行文本（通常出现于表格中）

											
										
										
											2018-03-03 11:45:03 +08:00
+								}
-												fix: 错误的忽略了单行的 a 标签

											
										
										
											2018-03-07 09:08:44 +08:00
+								export function isOnlyBeginTag(text: string) {
-												feat: 抽取 html 标签包裹的单行文本（通常出现于表格中）

											
										
										
											2018-03-03 11:45:03 +08:00
+								  return /^\s*<\w+>\s*$/.test(text);
-												refactor: extract checker functions

											
										
										
											2018-03-03 08:13:52 +08:00
+								}
-												fix: 错误的忽略了单行的 a 标签

											
										
										
											2018-03-07 09:08:44 +08:00
+								export function originalIsNotOnlyBeginTag(entry: DictEntry): boolean {
 								  return !isOnlyBeginTag(entry.original);
-												refactor: extract checker functions

											
										
										
											2018-03-03 08:13:52 +08:00
+								}
 								export function isNotImg(entry: DictEntry): boolean {
 								  return !/^<(img|figure)/.test(entry.translation);
 								}
 								export function isNotCheatSheet(entry: DictEntry): boolean {
-												fix: 修复 TypeScript 最新版本报的编译错误

											
										
										
											2018-07-27 16:34:19 +08:00
+								  return !/cheatsheet.md$/.test(entry.sourceFile!!);
-												refactor: extract checker functions

											
										
										
											2018-03-03 08:13:52 +08:00
+								}
 								export function isNotMarketingDocs(entry: DictEntry): boolean {
-												fix: 修复 TypeScript 最新版本报的编译错误

											
										
										
											2018-07-27 16:34:19 +08:00
+								  return !/marketing\/docs.md$/.test(entry.sourceFile!!);
-												refactor: extract checker functions

											
										
										
											2018-03-03 08:13:52 +08:00
+								}
 								export function isNotCnPages(entry: DictEntry): boolean {
-												fix: 修复 TypeScript 最新版本报的编译错误

											
										
										
											2018-07-27 16:34:19 +08:00
+								  return !/cn\/.*?.md$/.test(entry.sourceFile!!);
-												refactor: extract checker functions

											
										
										
											2018-03-03 08:13:52 +08:00
+								}
 								export function isHead(line: string): boolean {
 								  return /^#/.test(line);
 								}
-												feat: 三行简化为两行
refactor: 抽取出工具函数

											
										
										
											2018-03-03 11:01:57 +08:00
 								export function normalizeLines(text: string): string {
-												fix: 人工修订自动翻译
fix: 标准化原文中的异常语句

											
										
										
											2018-03-07 13:28:58 +08:00
+								  // 原文中有 back to top 被分成两行的情况，这里把它标准化一下
 								  const specialBackToTopPattern = /<a href="#toc">Back to top\s+<\/a>/g;
 								  text = text.replace(specialBackToTopPattern, '<a href="#toc">Back to top</a>');
-												refactor: 重构自动翻译代码

											
										
										
											2018-03-24 16:33:17 +08:00
+								  // 原文中有</table，为它补齐
 								  text = text.replace(/^ *<\/table$/gm, '</table>');
 								  // 原文中有的换行会干扰生成 html 的格式，替换一下
 								  // tslint:disable:max-line-length
 								  text = text.replace(`<a href="https://docs.npmjs.com/getting-started/installing-node" target="_blank" title="Installing Node.js and updating npm">
 								Get them now</a> if they're not already installed on your machine.
 								`, `<a href="https://docs.npmjs.com/getting-started/installing-node" target="_blank" title="Installing Node.js and updating npm">Get them now</a> if they're not already installed on your machine.
 								`);
 								  // 为各种列表多加一个空行
 								  const listElementPattern = /(?=\n *(\d+\.|-|\*) )\n/g;
 								  text = text.replace(listElementPattern, '\n\n');
 								  // 为标题增加空行
 								  const hxPattern = /^( *#+ .*)$/gm;
 								  text = text.replace(hxPattern, '\n$1\n');
 								  // 把多行的 HTML 标题或 p 元素变成单行
 								  const hxMultilinePattern = /^( *)<(h\d|p|header)([^>]*)>\s*(.*)\s*<\/\2>$/gm;
 								  text = text.replace(hxMultilinePattern, '\n$1<$2$3>$4</$2>\n');
 								  // 为单行的成对标签前后添加空行
 								  const oneLinePairedTagPattern = /^( *)<(p|div|h\d+|code-example|section)\b([^>]*)>([^\n]*?)<\/\2>( *)$/gm;
 								  text = text.replace(oneLinePairedTagPattern, '\n$1<$2$3>$4</$2>$5\n');
 								  // 为单行的注释前后添加空行
 								  const oneLineCommentPattern = /^( *<!--[\s\S]*?--> *)$/gm;
 								  text = text.replace(oneLineCommentPattern, '\n$1\n');
 								  // 为单行的 back to top 前后添加空行
 								  const backToTopPattern = /^( *<a href="#toc">Back to top<\/a> *)$/gm;
 								  text = text.replace(backToTopPattern, '\n$1\n');
 								  // 为单行的 {@ 语句前后添加空行
 								  const atTagCommentPattern = /^( *{@a.*} *)$/gm;
 								  text = text.replace(atTagCommentPattern, '\n$1\n');
 								  // 为单行的自封闭标签前后添加空行
 								  const oneLineClosedTagPattern = /^( *<hr *\/?>) *$/gm;
 								  text = text.replace(oneLineClosedTagPattern, '\n$1\n');
 								  // 为单行的 <br class="clear"> 前后添加空行
 								  const oneLineBrTagPattern = /^( *<br class="clear"> *)$/gm;
 								  text = text.replace(oneLineBrTagPattern, '\n$1\n');
 								  // 为单独的 div 前后添加空行
 								  const oneLineDivTagPattern = /^( *<\/?(div|li|ul|ol)\b([^>]*)> *)$/gm;
 								  text = text.replace(oneLineDivTagPattern, '\n$1\n');
 								  // 在 pre 前后添加空行
 								  const preBeginTagPattern = /(^ *<pre)/gm;
 								  text = text.replace(preBeginTagPattern, '\n$1');
 								  const preEndTagPattern = /(<\/pre> *)$/gm;
 								  text = text.replace(preEndTagPattern, '$1\n');
 								  // 为 ``` 前后添加空行
 								  const multiLineCodePattern = /^( *```\w* *)$/gm;
 								  text = text.replace(multiLineCodePattern, '\n$1\n');
 								  // 把单行的 tr 拆成多行，以便翻译
 								  const trTagPattern = /^( *)(<tr\b[^>]*>)(.*)(<\/tr>)$/gm;
 								  text = text.replace(trTagPattern, '\n$1$2\n\n$1    $3\n\n$1$4\n');
 								  // 把单行的 th/td/li 等拆成多行，以便翻译，
 								  const oneLineThTdTagPattern = /^( *)<(th|td|li)\b([^>]*)>(.*?)<\/\2>$/gm;
 								  text = text.replace(oneLineThTdTagPattern, '\n$1<$2$3>\n\n$1    $4\n\n$1</$2>\n');
 								  // 把原本就是多行的 th/td 中间添加空行
 								  const thTdTagPattern = /^( *)<(th|td)\b( *[^>]*)>([\s\S]*?)<\/\2>$/gm;
-												fix: 拆解单行的 td/th

											
										
										
											2018-03-07 13:09:36 +08:00
+								  text = text.replace(thTdTagPattern, '\n\n$1<$2$3>\n\n$1    $4\n\n$1</$2>\n\n');
-												refactor: 重构自动翻译代码

											
										
										
											2018-03-24 16:33:17 +08:00
+								  // 在所有的起始标签前面加空行
 								  const blockBeginTagPattern = /^( *)<(code-example|code-tabs|pre|p)\b( *[^>]*)>( *)$/gm;
 								  text = text.replace(blockBeginTagPattern, '\n$1<$2$3>$4');
-												fix: 全面改用环视进行匹配，以免“吃掉”回车，影响下一个匹配
fix: html 标签中的属性匹配允许空格

											
										
										
											2018-03-07 10:43:12 +08:00
-												refactor: 重构自动翻译代码

											
										
										
											2018-03-24 16:33:17 +08:00
+								  // 在所有的结束标签前面加空行
 								  const blockEndTagPattern = /^( *)<\/(code-example|code-tabs|pre|p)>( *)$/gm;
 								  text = text.replace(blockEndTagPattern, '$1</$2>$3\n');
-												fix: 拆解其它单行块标签

											
										
										
											2018-03-06 17:32:21 +08:00
-												refactor: 重构自动翻译代码

											
										
										
											2018-03-24 16:33:17 +08:00
+								  // 把所有由空格组成的空行都去掉
 								  const blankLinePattern = /^[ \t]+$/gm;
 								  text = text.replace(blankLinePattern, '');
 								  // 把中间的多个回车都变成两个回车
 								  const multipleBlankLinePattern = /\n{2,}/g;
-												fix: header 的拆解

											
										
										
											2018-03-06 17:08:10 +08:00
+								  text = text.replace(multipleBlankLinePattern, '\n\n');
-												fix: 全面改用环视进行匹配，以免“吃掉”回车，影响下一个匹配
fix: html 标签中的属性匹配允许空格

											
										
										
											2018-03-07 10:43:12 +08:00
-												refactor: 重构自动翻译代码

											
										
										
											2018-03-24 16:33:17 +08:00
+								  // 去掉全文头尾的空白
 								  text = text.trim();
-												fix: 全面改用环视进行匹配，以免“吃掉”回车，影响下一个匹配
fix: html 标签中的属性匹配允许空格

											
										
										
											2018-03-07 10:43:12 +08:00
+								  return text;
-												feat: 三行简化为两行
refactor: 抽取出工具函数

											
										
										
											2018-03-03 11:01:57 +08:00
+								}
 								export function indentOf(line): number {
 								  let pattern = /^( *)[\s\S]*/;
 								  if (!pattern.test(line)) {
 								    return 0;
 								  }
 								  const leadSpaces = line.replace(pattern, '$1').length;
 								  if (/^ *(\d+\.|-|\*) /.test(line)) {
 								    return leadSpaces + 3;
 								  } else {
 								    return leadSpaces;
 								  }
 								}
 								export function repeat(indent: number): string {
 								  let result = '';
 								  for (let i = 0; i < indent; ++i) {
 								    result = result + ' ';
 								  }
 								  return result;
 								}
-												refactor: 提供模糊匹配的功能（未完成）

											
										
										
											2018-03-15 16:40:11 +08:00
 								// 目前还不能正常工作
 								export function fuzzyTest(text1: string, text2: string): boolean {
 								  const tokens1 = tokenize(text1);
 								  const tokens2 = tokenize(text2);
 								  const sameTokens = _.intersection(tokens1, tokens2);
 								  const maxTokens = Math.max(tokens1.length, tokens2.length);
 								  return sameTokens.length > 5 && sameTokens.length / maxTokens >= 0.8;
 								}
-												refactor: 重构自动翻译代码

											
										
										
											2018-03-24 16:33:17 +08:00
+								export function exactlyTest(key: string, text: string): boolean {
-												fix: 修复 TypeScript 最新版本报的编译错误

											
										
										
											2018-07-27 16:34:19 +08:00
+								  return !!key && key === text;
-												refactor: 提供模糊匹配的功能（未完成）

											
										
										
											2018-03-15 16:40:11 +08:00
+								}
 								export function kernelText(text: string): string {
 								  return text
-												refactor: 重构自动翻译代码

											
										
										
											2018-03-24 16:33:17 +08:00
+								    .replace(/([^a-zA-Z0-9#:]|\s|\.$)/g, '')
-												refactor: 提供模糊匹配的功能（未完成）

											
										
										
											2018-03-15 16:40:11 +08:00
+								    .toUpperCase()
 								    .trim();
 								}
 								export function tokenize(text: string): string[] {
 								  return text.split(/\W/)
 								    .map(token => token.trim())
 								    .filter(token => !!token);
 								}
-												refactor: 重构自动翻译代码

											
										
										
											2018-03-24 16:33:17 +08:00
 								export function hasInlineText(text: string): boolean {
 								  return /<t>(.*?)<\/t> *<t>.*?<\/t>/g.test(text);
 								}
 								export function extractOriginalContent(text: string): string {
 								  if (!hasInlineText(text)) {
 								    return text;
 								  }
 								  return text.replace(/<t>(.*?)<\/t> *<t>.*?<\/t>/gi, '$1')
 								    .replace(/ +/g, ' ');
 								}