build: update stemmer to version 2.0.0 (#41724)

NOTE:
`stemmer` v2.0.0 switched to ES modules (see words/stemmer@03519229c8), which means that the only way to consume it in our CommonJS setup (for example, in [generateKeywords][1]) is via an async `import()`.

This commit makes the `generateKeywords` processor asynchronous in order to be able to dynamically import and use `stemmer`.

[1]: 251bec159a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js

PR Close #41724
2021-04-24 11:18:01 +03:00 · 2021-04-24 11:18:01 +03:00 · 99f2ffc740
commit 99f2ffc740
parent a938849148
5 changed files with 114 additions and 119 deletions
--- a/aio/package.json
+++ b/aio/package.json
@ -162,7 +162,7 @@
    "rimraf": "^3.0.2",
    "semver": "^7.3.5",
    "shelljs": "^0.8.4",
-    "stemmer": "^1.0.5",
+    "stemmer": "^2.0.0",
    "timezone-mock": "^1.1.3",
    "tree-kill": "^1.1.0",
    "ts-node": "^9.1.1",
--- a/aio/tools/transforms/.eslintrc.js
+++ b/aio/tools/transforms/.eslintrc.js
@ -8,6 +8,9 @@ module.exports = {
    'eslint:recommended',
    'plugin:jasmine/recommended'
  ],
+  'parserOptions': {
+    'ecmaVersion': 2020,
+  },
  'plugins': [
    'jasmine'
  ],
--- a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js
+++ b/aio/tools/transforms/angular-base-package/processors/generateKeywords.js
@ -1,7 +1,5 @@
 'use strict';

-const stem = require('stemmer');
-
 /**
 * @dgProcessor generateKeywordsProcessor
 * @description
@ -23,7 +21,9 @@ module.exports = function generateKeywordsProcessor(log) {
    },
    $runAfter: ['postProcessHtml'],
    $runBefore: ['writing-files'],
-    $process(docs) {
+    async $process(docs) {
+      const {stemmer: stem} = await import('stemmer');
+

      const dictionary = new Map();

@ -110,6 +110,77 @@ module.exports = function generateKeywordsProcessor(log) {
        data: searchData,
        renderedContent: JSON.stringify(searchData)
      });
+
+      return docs;
+
+      // Helpers
+      function tokenize(text, ignoreWords, dictionary) {
+        // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but reduces the unwanted tokens that are indexed).
+        const rawTokens = text.split(new RegExp(
+                                            '[\\s/]+' +                                // whitespace
+                                            '|' +                                      // or
+                                            '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>',  // simple HTML tags (e.g. <td>, <hr/>, </table>, etc.)
+                                            'ig'));
+        const tokens = [];
+        for (let token of rawTokens) {
+          token = token.trim();
+
+          // Trim unwanted trivia characters from the start and end of the token
+          const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
+          // Tokens can contain letters, numbers, underscore, dot or hyphen but not at the start or end.
+          // The leading TRIVIA_CHARS will capture any leading `.`, `-` or `_` so we don't have to avoid them in this regular expression.
+          // But we do need to ensure we don't capture them at the end of the token.
+          const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
+          token = token.replace(new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i'), '$1');
+
+          // Skip if blank or in the ignored words list
+          if (token === '' || ignoreWords.has(token.toLowerCase())) {
+            continue;
+          }
+
+          // Skip tokens that contain weird characters
+          if (!/^\w[\w.-]*$/.test(token)) {
+            continue;
+          }
+
+          storeToken(token, tokens, dictionary);
+          if (token.startsWith('ng')) {
+            // Strip off `ng`, `ng-`, `ng1`, `ng2`, etc
+            storeToken(token.replace(/^ng[-12]*/, ''), tokens, dictionary);
+          }
+        }
+
+        return tokens;
+      }
+
+      function storeToken(token, tokens, dictionary) {
+        token = stem(token);
+        if (!dictionary.has(token)) {
+          dictionary.set(token, dictionary.size);
+        }
+        tokens.push(dictionary.get(token));
+      }
+
+      function extractMemberTokens(doc, ignoreWords, dictionary) {
+        if (!doc) return [];
+
+        let memberContent = [];
+
+        if (doc.members) {
+          doc.members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
+        }
+        if (doc.statics) {
+          doc.statics.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
+        }
+        if (doc.extendsClauses) {
+          doc.extendsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
+        }
+        if (doc.implementsClauses) {
+          doc.implementsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
+        }
+
+        return memberContent;
+      }
    }
  };
 };
@ -117,71 +188,3 @@ module.exports = function generateKeywordsProcessor(log) {
 function isString(value) {
  return typeof value == 'string';
 }
-
-function tokenize(text, ignoreWords, dictionary) {
-  // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but reduces the unwanted tokens that are indexed).
-  const rawTokens = text.split(new RegExp(
-                                      '[\\s/]+' +                                // whitespace
-                                      '|' +                                      // or
-                                      '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>',  // simple HTML tags (e.g. <td>, <hr/>, </table>, etc.)
-                                      'ig'));
-  const tokens = [];
-  for (let token of rawTokens) {
-    token = token.trim();
-
-    // Trim unwanted trivia characters from the start and end of the token
-    const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
-    // Tokens can contain letters, numbers, underscore, dot or hyphen but not at the start or end.
-    // The leading TRIVIA_CHARS will capture any leading `.`, '-`' or `_` so we don't have to avoid them in this regular expression.
-    // But we do need to ensure we don't capture the at the end of the token.
-    const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
-    token = token.replace(new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i'), '$1');
-
-    // Skip if blank or in the ignored words list
-    if (token === '' || ignoreWords.has(token.toLowerCase())) {
-      continue;
-    }
-
-    // Skip tokens that contain weird characters
-    if (!/^\w[\w.-]*$/.test(token)) {
-      continue;
-    }
-
-    storeToken(token, tokens, dictionary);
-    if (token.startsWith('ng')) {
-      // Strip off `ng`, `ng-`, `ng1`, `ng2`, etc
-      storeToken(token.replace(/^ng[-12]*/, ''), tokens, dictionary);
-    }
-  }
-
-  return tokens;
-}
-
-function storeToken(token, tokens, dictionary) {
-  token = stem(token);
-  if (!dictionary.has(token)) {
-    dictionary.set(token, dictionary.size);
-  }
-  tokens.push(dictionary.get(token));
-}
-
-function extractMemberTokens(doc, ignoreWords, dictionary) {
-  if (!doc) return [];
-
-  let memberContent = [];
-
-  if (doc.members) {
-    doc.members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
-  }
-  if (doc.statics) {
-    doc.statics.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
-  }
-  if (doc.extendsClauses) {
-    doc.extendsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
-  }
-  if (doc.implementsClauses) {
-    doc.implementsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
-  }
-
-  return memberContent;
-}
--- a/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js
+++ b/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js
@ -36,42 +36,39 @@ describe('generateKeywords processor', () => {
    expect(processor.$runBefore).toEqual(['writing-files']);
  });

-  it('should ignore internal and private exports', () => {
+  it('should ignore internal and private exports', async () => {
    const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
      { docType: 'class', name: 'PublicExport' },
      { docType: 'class', name: 'PrivateExport', privateExport: true },
      { docType: 'class', name: 'InternalExport', internal: true }
-    ];
-    processor.$process(docs);
+    ]);
    expect(docs[docs.length - 1].data.pages).toEqual([
      jasmine.objectContaining({ title: 'PublicExport', type: 'class' })
    ]);
  });

-  it('should ignore docs that are in the `docTypesToIgnore` list', () => {
+  it('should ignore docs that are in the `docTypesToIgnore` list', async () => {
    const processor = createProcessor();
    processor.docTypesToIgnore = ['interface'];
-    const docs = [
+    const docs = await processor.$process([
      { docType: 'class', name: 'Class' },
      { docType: 'interface', name: 'Interface' },
      { docType: 'content', name: 'Guide' },
-    ];
-    processor.$process(docs);
+    ]);
    expect(docs[docs.length - 1].data.pages).toEqual([
      jasmine.objectContaining({ title: 'Class', type: 'class' }),
      jasmine.objectContaining({ title: 'Guide', type: 'content' }),
    ]);
  });

-  it('should not collect keywords from properties that are in the `propertiesToIgnore` list', () => {
+  it('should not collect keywords from properties that are in the `propertiesToIgnore` list', async () => {
    const processor = createProcessor();
    processor.propertiesToIgnore = ['docType', 'ignore'];
-    const docs = [
+    const docs = await processor.$process([
      { docType: 'class', name: 'FooClass', ignore: 'ignore this content' },
      { docType: 'interface', name: 'BarInterface', capture: 'capture this content' },
-    ];
-    processor.$process(docs);
+    ]);
    expect(docs[docs.length - 1].data).toEqual({
      dictionary: 'fooclass barinterfac captur content',
      pages: [
@ -81,17 +78,16 @@ describe('generateKeywords processor', () => {
    });
  });

-  it('should not collect keywords that look like HTML tags', () => {
+  it('should not collect keywords that look like HTML tags', async () => {
    const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
      { docType: 'class', name: 'FooClass', content: `
      <table id="foo">
        <tr class="moo" id="bar">
          <td>Content inside a table</td>
        </tr>
      </table>` },
-    ];
-    processor.$process(docs);
+    ]);
    expect(docs[docs.length - 1].data).toEqual({
      dictionary: 'class fooclass content insid tabl',
      pages: [
@ -100,15 +96,14 @@ describe('generateKeywords processor', () => {
    });
  });

-  it('should compute `doc.searchTitle` from the doc properties if not already provided', () => {
+  it('should compute `doc.searchTitle` from the doc properties if not already provided', async () => {
    const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
      { docType: 'class', name: 'A', searchTitle: 'searchTitle A', title: 'title A', vFile: { headings: { h1: ['vFile A'] } } },
      { docType: 'class', name: 'B', title: 'title B', vFile: { headings: { h1: ['vFile B'] } } },
      { docType: 'class', name: 'C', vFile: { title: 'vFile C', headings: { h1: ['vFile C'] } } },
      { docType: 'class', name: 'D' },
-    ];
-    processor.$process(docs);
+    ]);
    expect(docs[docs.length - 1].data.pages).toEqual([
      jasmine.objectContaining({ title: 'searchTitle A' }),
      jasmine.objectContaining({ title: 'title B' }),
@ -117,29 +112,27 @@ describe('generateKeywords processor', () => {
    ]);
  });

-  it('should use `doc.searchTitle` as the title in the search index', () => {
+  it('should use `doc.searchTitle` as the title in the search index', async () => {
    const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
      { docType: 'class', name: 'PublicExport', searchTitle: 'class PublicExport' },
-    ];
-    processor.$process(docs);
+    ]);
    const keywordsDoc = docs[docs.length - 1];
    expect(keywordsDoc.data.pages).toEqual([
      jasmine.objectContaining({ title: 'class PublicExport', type: 'class' })
    ]);
  });

-  it('should add heading words to the search terms', () => {
+  it('should add heading words to the search terms', async () => {
    const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
      {
        docType: 'class',
        name: 'PublicExport',
        searchTitle: 'class PublicExport',
        vFile: { headings: { h2: ['Important heading', 'Secondary heading'] } }
      },
-    ];
-    processor.$process(docs);
+    ]);
    const keywordsDoc = docs[docs.length - 1];
    expect(keywordsDoc.data).toEqual({
      dictionary: 'class publicexport head secondari',
@ -149,9 +142,9 @@ describe('generateKeywords processor', () => {
    });
  });

-  it('should add member doc properties to the search terms', () => {
+  it('should add member doc properties to the search terms', async () => {
    const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
      {
        docType: 'class',
        name: 'PublicExport',
@ -171,8 +164,7 @@ describe('generateKeywords processor', () => {
          { name: 'staticPropertyB' },
        ],
      },
-    ];
-    processor.$process(docs);
+    ]);
    const keywordsDoc = docs[docs.length - 1];
    expect(keywordsDoc.data).toEqual({
      dictionary: 'class publicexport content ngclass instancemethoda instancepropertya instancemethodb instancepropertyb staticmethoda staticpropertya staticmethodb staticpropertyb head',
@ -184,7 +176,7 @@ describe('generateKeywords processor', () => {
    });
  });

-  it('should add inherited member doc properties to the search terms', () => {
+  it('should add inherited member doc properties to the search terms', async () => {
    const processor = createProcessor();
    const parentClass =       {
      docType: 'class',
@ -216,8 +208,7 @@ describe('generateKeywords processor', () => {
      extendsClauses: [{ doc: parentClass }],
      implementsClauses: [{ doc: parentInterface }]
    };
-    const docs = [childClass, parentClass, parentInterface];
-    processor.$process(docs);
+    const docs = await processor.$process([childClass, parentClass, parentInterface]);
    const keywordsDoc = docs[docs.length - 1];
    expect(keywordsDoc.data).toEqual({
      dictionary: 'class child childmember1 childmember2 parentmember1 parentmember2 parentmember3 parentclass interfac parentinterfac',
@ -238,9 +229,9 @@ describe('generateKeywords processor', () => {
    });
  });

-  it('should include both stripped and unstripped "ng" prefixed tokens', () => {
+  it('should include both stripped and unstripped "ng" prefixed tokens', async () => {
    const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
      {
        docType: 'class',
        name: 'PublicExport',
@ -248,8 +239,7 @@ describe('generateKeywords processor', () => {
        vFile: { headings: { h2: ['ngModel'] } },
        content: 'Some content with ngClass in it.'
      },
-    ];
-    processor.$process(docs);
+    ]);
    const keywordsDoc = docs[docs.length - 1];
    expect(keywordsDoc.data).toEqual({
      dictionary: 'class publicexport ngcontrol control content ngclass ngmodel model',
@ -262,9 +252,9 @@ describe('generateKeywords processor', () => {
    });
  });

-  it('should generate compressed encoded renderedContent property', () => {
+  it('should generate compressed encoded renderedContent property', async () => {
    const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
      {
        docType: 'class',
        name: 'SomeClass',
@ -280,8 +270,7 @@ describe('generateKeywords processor', () => {
        ],
        deprecated: true
      },
-    ];
-    processor.$process(docs);
+    ]);
    const keywordsDoc = docs[docs.length - 1];
    expect(JSON.parse(keywordsDoc.renderedContent)).toEqual({
      dictionary: 'class someclass document api head someclass2 descript member1',
--- a/aio/yarn.lock
+++ b/aio/yarn.lock
@ -11955,10 +11955,10 @@ stealthy-require@^1.1.1:
  resolved "https://registry.yarnpkg.com/stealthy-require/-/stealthy-require-1.1.1.tgz#35b09875b4ff49f26a777e509b3090a3226bf24b"
  integrity sha1-NbCYdbT/SfJqd35QmzCQoyJr8ks=

-stemmer@^1.0.5:
-  version "1.0.5"
-  resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-1.0.5.tgz#fd89beaf8bff5d04b6643bfffcaed0fc420deec0"
-  integrity sha512-SLq7annzSKRDStasOJJoftCSCzBCKmBmH38jC4fDtCunAqOzpTpIm9zmaHmwNJiZ8gLe9qpVdBVbEG2DC5dE2A==
+stemmer@^2.0.0:
+  version "2.0.0"
+  resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-2.0.0.tgz#05fcaf174c423b0fec85e660759ebd4867d811c9"
+  integrity sha512-0YS2oMdTZ/wAWUHMMpf7AAJ8Gm6dHXyHddJ0zCu2DIfOfIbdwqAm1bbk4+Vti6gxNIcOrnm5jAP7vYTzQDvc5A==

 stream-browserify@^2.0.1:
  version "2.0.2"