refactor: provide fuzzy matching (unfinished)
parent 18ac616385
commit 8a711cba59
@@ -3,5 +3,5 @@
 import { dirs } from '../dirs';
 import { translateFile } from '../translate';

-const filename = 'guide/router.md';
+const filename = 'guide/testing.md';
 translateFile(__dirname + '/../../../../../content-en/' + filename, dirs.content + filename);

File diff suppressed because it is too large

@@ -1,5 +1,6 @@
 import { expect } from 'chai';
-import { kernelText, lookup } from './translate';
+import { lookup } from './translate';
+import { kernelText } from './utils';


 describe('dictionary-based translation', () => {

@@ -3,24 +3,19 @@ import * as _ from 'lodash';
 import { DictEntry } from './dict-entry';
 import { dirs } from './dirs';
 import { listMarkdownFiles } from './extractor';
-import { indentOf, normalizeLines, repeat } from './utils';
+import { exactlyTest, indentOf, normalizeLines, repeat } from './utils';

 // TODO: reimplement this with a markdown parser

 export const dict = require('./dict-latest.json') as DictEntry[];

 export function lookup(english: string, filename: RegExp = /.*/): DictEntry[] {
   const entries = dict
     .filter(entry => filename.test(entry.sourceFile))
-    .filter(entry => kernelText(entry.original) === kernelText(english));
+    .filter(entry => exactlyTest(entry.original, english));
   return _.uniqBy(entries, 'translation');
 }

-export function kernelText(text: string): string {
-  return text
-    .replace(/[\s\n]+/g, '')
-    .replace(/\.$/g, '')
-    .trim();
-}
-
 export function translate(content: string): string[] {
   const lines = normalizeLines(content)
     .split(/\n+\s*\n+/);
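
For reference, a minimal sketch of how the reworked lookup might be called after this change; the query string and the filename pattern below are made-up examples, not entries from the real dict-latest.json:

import { lookup } from './translate';

// exactlyTest() normalizes both sides with kernelText(), so differences in
// whitespace, letter case and a trailing period do not affect the match.
const entries = lookup('Testing is as important as coding', /guide\/testing\.md$/);
entries.forEach(entry => console.log(entry.translation));
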
@@ -1,5 +1,5 @@
 import { expect } from 'chai';
-import { normalizeLines } from './utils';
+import { fuzzyTest, normalizeLines, tokenize } from './utils';

 describe('utility functions', () => {
   it('normalizes "1. " lists into blank-line separated blocks for further processing', function () {
@@ -338,4 +338,16 @@ a <b> c
 `);
   });

+  it('tokenize', function () {
+    expect(tokenize('abc def,abc.')).eql(['abc', 'def', 'abc']);
+  });
+
+  it('fuzzy matching', function () {
+    expect(fuzzyTest(`a b c d e`, `a b c d e`)).is.false;
+    expect(fuzzyTest(`a b c d e f g`, `a b c d e`)).is.false;
+    expect(fuzzyTest(`Make that easy by encapsulating the _click-triggering_ process in a helper such as the \`click\` function below:`,
+      `Make that consistent and easy by encapsulating the _click-triggering_ process
+in a helper such as the \`click()\` function below:
+`)).is.true;
+  });
 });
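
A note on the two is.false expectations above: as implemented later in this commit, fuzzyTest requires strictly more than five shared tokens as well as an overlap ratio of at least 0.8, so even two identical five-token strings do not match. A rough sketch of that arithmetic, using lodash's intersection the same way utils.ts does:

import * as _ from 'lodash';

const tokens1 = 'a b c d e'.split(/\W/).filter(t => !!t);      // ['a', 'b', 'c', 'd', 'e']
const tokens2 = 'a b c d e f g'.split(/\W/).filter(t => !!t);  // 7 tokens

const same = _.intersection(tokens1, tokens2);                 // 5 shared tokens
const ratio = same.length / Math.max(tokens1.length, tokens2.length);  // 5 / 7, about 0.71

// fuzzyTest checks same.length > 5 && ratio >= 0.8, so both pairs tested above fail
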
@@ -1,5 +1,6 @@
 import { DictEntry } from './dict-entry';
 import { isTranslation } from './extractor';
+import * as _ from 'lodash';

 export function translationHasNotCodeExample(entry: DictEntry): boolean {
   return entry.translation.indexOf('<code-example') === -1;
@@ -51,7 +52,7 @@ export function isHead(line: string): boolean {

 export function normalizeLines(text: string): string {
   text = '\n' + text + '\n';
-  // markdown constructs such as lists and headings imply their own line breaks
+  // add an extra blank line before markdown constructs such as lists and headings, which imply their own line breaks
   const blockElementPattern = /(?=\n *(\d+\.|-|\*) )\n/g;
   text = text.replace(blockElementPattern, '\n\n');
   const hxPattern = /(\n *#+ .*)(?=\n)/g;
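
As a rough illustration of the blockElementPattern shown above (only the regex from this hunk, not the full normalizeLines function): the lookahead matches a newline that is immediately followed by a list marker ("1. ", "- ", "* ") and doubles it, so list items end up separated by blank lines. The sample string is invented for this sketch:

const blockElementPattern = /(?=\n *(\d+\.|-|\*) )\n/g;

const normalized = 'intro\n1. first\n2. second'.replace(blockElementPattern, '\n\n');
// normalized === 'intro\n\n1. first\n\n2. second'
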
@@ -120,3 +121,30 @@ export function repeat(indent: number): string {
   }
   return result;
 }
+
+// not working correctly yet
+export function fuzzyTest(text1: string, text2: string): boolean {
+  const tokens1 = tokenize(text1);
+  const tokens2 = tokenize(text2);
+  const sameTokens = _.intersection(tokens1, tokens2);
+  const maxTokens = Math.max(tokens1.length, tokens2.length);
+  return sameTokens.length > 5 && sameTokens.length / maxTokens >= 0.8;
+}
+
+export function exactlyTest(text1: string, text2: string): boolean {
+  return kernelText(text1) === kernelText(text2);
+}
+
+export function kernelText(text: string): string {
+  return text
+    .replace(/[\s\n]+/g, '')
+    .replace(/\.$/g, '')
+    .toUpperCase()
+    .trim();
+}
+
+export function tokenize(text: string): string[] {
+  return text.split(/\W/)
+    .map(token => token.trim())
+    .filter(token => !!token);
+}
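
A minimal usage sketch of the helpers added above, assuming they are imported from './utils' the same way the spec file does; the return values noted in the comments follow directly from the implementations in this diff:

import { exactlyTest, fuzzyTest, kernelText, tokenize } from './utils';

// kernelText strips all whitespace and a trailing period, then upper-cases,
// so trivially reformatted sentences normalize to the same key:
kernelText('Hello  world.');                   // 'HELLOWORLD'
exactlyTest('Hello world', 'hello   world.');  // true

// tokenize splits on non-word characters and drops empty tokens:
tokenize('abc def,abc.');                      // ['abc', 'def', 'abc']

// fuzzyTest demands more than five shared tokens plus an 80% overlap,
// which is why two identical five-token strings still come back false:
fuzzyTest('a b c d e', 'a b c d e');           // false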