build: update stemmer to version 2.0.0 (#41724)

NOTE:
`stemmer` v2.0.0 switched to ES modules (see
words/stemmer@03519229c8), which means
that the only way to consume it in our CommonJS setup (for example, in
[generateKeywords][1]) is via an async `import()`.

This commit makes the `generateKeywords` processor asynchronous in order
to be able to dynamically import and use `stemmer`.

[1]: 251bec159a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js
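
For reference, the general shape of the workaround is sketched below (minimal sketch; the `loadStemmer` wrapper is hypothetical, but the named `stemmer` export is what v2.0.0 actually provides):

    // In a CommonJS module, `require('stemmer')` now fails with
    // ERR_REQUIRE_ESM, so the package has to be loaded lazily via a dynamic
    // `import()` -- which in turn forces the calling code to be async.
    async function loadStemmer() {
      const {stemmer} = await import('stemmer');  // named export in v2.0.0
      return stemmer;
    }

    loadStemmer().then(stem => console.log(stem('searching')));  // 'search'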

PR Close #41724
George Kalpakas 2021-04-24 11:18:01 +03:00 committed by Jessica Janiuk
parent a938849148
commit 99f2ffc740
5 changed files with 114 additions and 119 deletions

View File

@@ -162,7 +162,7 @@
   "rimraf": "^3.0.2",
   "semver": "^7.3.5",
   "shelljs": "^0.8.4",
-  "stemmer": "^1.0.5",
+  "stemmer": "^2.0.0",
   "timezone-mock": "^1.1.3",
   "tree-kill": "^1.1.0",
   "ts-node": "^9.1.1",

View File

@@ -8,6 +8,9 @@ module.exports = {
     'eslint:recommended',
     'plugin:jasmine/recommended'
   ],
+  'parserOptions': {
+    'ecmaVersion': 2020,
+  },
   'plugins': [
     'jasmine'
  ],
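
This `parserOptions` bump is what lets ESLint parse the new code: dynamic `import()` is ES2020 syntax, and espree reports a parse error for it under earlier `ecmaVersion` settings. A minimal sketch of the kind of expression it unlocks (hypothetical variable name):

    // Parses cleanly with `ecmaVersion: 2020`; a parse error with, say, 2018.
    const stemmerPromise = import('stemmer').then(m => m.stemmer);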

View File

@ -1,7 +1,5 @@
'use strict'; 'use strict';
const stem = require('stemmer');
/** /**
* @dgProcessor generateKeywordsProcessor * @dgProcessor generateKeywordsProcessor
* @description * @description
@ -23,7 +21,9 @@ module.exports = function generateKeywordsProcessor(log) {
}, },
$runAfter: ['postProcessHtml'], $runAfter: ['postProcessHtml'],
$runBefore: ['writing-files'], $runBefore: ['writing-files'],
$process(docs) { async $process(docs) {
const {stemmer: stem} = await import('stemmer');
const dictionary = new Map(); const dictionary = new Map();
@@ -110,6 +110,77 @@ module.exports = function generateKeywordsProcessor(log) {
       data: searchData,
       renderedContent: JSON.stringify(searchData)
     });
+
+    return docs;
+
+    // Helpers
+    function tokenize(text, ignoreWords, dictionary) {
+      // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive
+      // but reduces the unwanted tokens that are indexed).
+      const rawTokens = text.split(new RegExp(
+          '[\\s/]+' +                                // whitespace
+          '|' +                                      // or
+          '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>',  // simple HTML tags (e.g. <td>, <hr/>, </table>, etc.)
+          'ig'));
+      const tokens = [];
+      for (let token of rawTokens) {
+        token = token.trim();
+
+        // Trim unwanted trivia characters from the start and end of the token
+        const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
+        // Tokens can contain letters, numbers, underscore, dot or hyphen but not at the start or end.
+        // The leading TRIVIA_CHARS will capture any leading `.`, `-` or `_` so we don't have to
+        // avoid them in this regular expression. But we do need to ensure we don't capture the
+        // `.`, `-` or `_` at the end of the token.
+        const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
+        token = token.replace(new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i'), '$1');
+
+        // Skip if blank or in the ignored words list
+        if (token === '' || ignoreWords.has(token.toLowerCase())) {
+          continue;
+        }
+
+        // Skip tokens that contain weird characters
+        if (!/^\w[\w.-]*$/.test(token)) {
+          continue;
+        }
+
+        storeToken(token, tokens, dictionary);
+        if (token.startsWith('ng')) {
+          // Strip off `ng`, `ng-`, `ng1`, `ng2`, etc
+          storeToken(token.replace(/^ng[-12]*/, ''), tokens, dictionary);
+        }
+      }
+
+      return tokens;
+    }
+
+    function storeToken(token, tokens, dictionary) {
+      token = stem(token);
+      if (!dictionary.has(token)) {
+        dictionary.set(token, dictionary.size);
+      }
+      tokens.push(dictionary.get(token));
+    }
+
+    function extractMemberTokens(doc, ignoreWords, dictionary) {
+      if (!doc) return [];
+
+      let memberContent = [];
+
+      if (doc.members) {
+        doc.members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
+      }
+      if (doc.statics) {
+        doc.statics.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
+      }
+      if (doc.extendsClauses) {
+        doc.extendsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
+      }
+      if (doc.implementsClauses) {
+        doc.implementsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
+      }
+
+      return memberContent;
+    }
   }
 };
};
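
Two things are going on in the hunk above. First, dgeni accepts a promise-returning `$process` and waits for it, using the resolved value as the docs collection, hence the explicit `return docs;`. Second, the helper functions move inside `$process` so they can close over the dynamically imported `stem`. A stripped-down sketch of the same pattern (hypothetical processor and property names):

    module.exports = function stemNamesProcessor() {
      return {
        async $process(docs) {
          // Load the ESM-only dependency, then use it from a closure.
          const {stemmer} = await import('stemmer');
          docs.forEach(doc => doc.stemmedName = stemmer(doc.name || ''));
          return docs;  // dgeni awaits the returned promise
        },
      };
    };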
@@ -117,71 +188,3 @@ module.exports = function generateKeywordsProcessor(log) {
 function isString(value) {
   return typeof value == 'string';
 }
-
-function tokenize(text, ignoreWords, dictionary) {
-  // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive
-  // but reduces the unwanted tokens that are indexed).
-  const rawTokens = text.split(new RegExp(
-      '[\\s/]+' +                                // whitespace
-      '|' +                                      // or
-      '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>',  // simple HTML tags (e.g. <td>, <hr/>, </table>, etc.)
-      'ig'));
-  const tokens = [];
-  for (let token of rawTokens) {
-    token = token.trim();
-
-    // Trim unwanted trivia characters from the start and end of the token
-    const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
-    // Tokens can contain letters, numbers, underscore, dot or hyphen but not at the start or end.
-    // The leading TRIVIA_CHARS will capture any leading `.`, `-` or `_` so we don't have to
-    // avoid them in this regular expression. But we do need to ensure we don't capture the
-    // `.`, `-` or `_` at the end of the token.
-    const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
-    token = token.replace(new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i'), '$1');
-
-    // Skip if blank or in the ignored words list
-    if (token === '' || ignoreWords.has(token.toLowerCase())) {
-      continue;
-    }
-
-    // Skip tokens that contain weird characters
-    if (!/^\w[\w.-]*$/.test(token)) {
-      continue;
-    }
-
-    storeToken(token, tokens, dictionary);
-    if (token.startsWith('ng')) {
-      // Strip off `ng`, `ng-`, `ng1`, `ng2`, etc
-      storeToken(token.replace(/^ng[-12]*/, ''), tokens, dictionary);
-    }
-  }
-
-  return tokens;
-}
-
-function storeToken(token, tokens, dictionary) {
-  token = stem(token);
-  if (!dictionary.has(token)) {
-    dictionary.set(token, dictionary.size);
-  }
-  tokens.push(dictionary.get(token));
-}
-
-function extractMemberTokens(doc, ignoreWords, dictionary) {
-  if (!doc) return [];
-
-  let memberContent = [];
-
-  if (doc.members) {
-    doc.members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
-  }
-  if (doc.statics) {
-    doc.statics.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
-  }
-  if (doc.extendsClauses) {
-    doc.extendsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
-  }
-  if (doc.implementsClauses) {
-    doc.implementsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
-  }
-
-  return memberContent;
-}
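
To make the `storeToken` bookkeeping concrete: the dictionary interns each stemmed token, so pages carry small integer indices rather than repeated strings, and distinct words that share a stem collapse to a single entry. A hypothetical run, assuming the `storeToken` helper and Porter `stem` from the patch above:

    const dictionary = new Map();
    const tokens = [];
    storeToken('keywords', tokens, dictionary);  // stem('keywords') === 'keyword' -> new index 0
    storeToken('keyword', tokens, dictionary);   // same stem -> reuses index 0
    storeToken('search', tokens, dictionary);    // new stem -> index 1
    // tokens is now [0, 0, 1]; dictionary is Map { 'keyword' => 0, 'search' => 1 }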

View File

@@ -36,42 +36,39 @@ describe('generateKeywords processor', () => {
     expect(processor.$runBefore).toEqual(['writing-files']);
   });
 
-  it('should ignore internal and private exports', () => {
+  it('should ignore internal and private exports', async () => {
     const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
       { docType: 'class', name: 'PublicExport' },
       { docType: 'class', name: 'PrivateExport', privateExport: true },
       { docType: 'class', name: 'InternalExport', internal: true }
-    ];
-    processor.$process(docs);
+    ]);
     expect(docs[docs.length - 1].data.pages).toEqual([
       jasmine.objectContaining({ title: 'PublicExport', type: 'class' })
     ]);
   });
 
-  it('should ignore docs that are in the `docTypesToIgnore` list', () => {
+  it('should ignore docs that are in the `docTypesToIgnore` list', async () => {
     const processor = createProcessor();
     processor.docTypesToIgnore = ['interface'];
-    const docs = [
+    const docs = await processor.$process([
       { docType: 'class', name: 'Class' },
       { docType: 'interface', name: 'Interface' },
       { docType: 'content', name: 'Guide' },
-    ];
-    processor.$process(docs);
+    ]);
     expect(docs[docs.length - 1].data.pages).toEqual([
       jasmine.objectContaining({ title: 'Class', type: 'class' }),
       jasmine.objectContaining({ title: 'Guide', type: 'content' }),
     ]);
   });
 
-  it('should not collect keywords from properties that are in the `propertiesToIgnore` list', () => {
+  it('should not collect keywords from properties that are in the `propertiesToIgnore` list', async () => {
     const processor = createProcessor();
     processor.propertiesToIgnore = ['docType', 'ignore'];
-    const docs = [
+    const docs = await processor.$process([
       { docType: 'class', name: 'FooClass', ignore: 'ignore this content' },
       { docType: 'interface', name: 'BarInterface', capture: 'capture this content' },
-    ];
-    processor.$process(docs);
+    ]);
     expect(docs[docs.length - 1].data).toEqual({
       dictionary: 'fooclass barinterfac captur content',
       pages: [
@@ -81,17 +78,16 @@ describe('generateKeywords processor', () => {
     });
   });
 
-  it('should not collect keywords that look like HTML tags', () => {
+  it('should not collect keywords that look like HTML tags', async () => {
     const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
       { docType: 'class', name: 'FooClass', content: `
         <table id="foo">
           <tr class="moo" id="bar">
            <td>Content inside a table</td>
          </tr>
        </table>` },
-    ];
-    processor.$process(docs);
+    ]);
     expect(docs[docs.length - 1].data).toEqual({
       dictionary: 'class fooclass content insid tabl',
       pages: [
@@ -100,15 +96,14 @@ describe('generateKeywords processor', () => {
     });
   });
 
-  it('should compute `doc.searchTitle` from the doc properties if not already provided', () => {
+  it('should compute `doc.searchTitle` from the doc properties if not already provided', async () => {
     const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
       { docType: 'class', name: 'A', searchTitle: 'searchTitle A', title: 'title A', vFile: { headings: { h1: ['vFile A'] } } },
       { docType: 'class', name: 'B', title: 'title B', vFile: { headings: { h1: ['vFile B'] } } },
       { docType: 'class', name: 'C', vFile: { title: 'vFile C', headings: { h1: ['vFile C'] } } },
       { docType: 'class', name: 'D' },
-    ];
-    processor.$process(docs);
+    ]);
     expect(docs[docs.length - 1].data.pages).toEqual([
       jasmine.objectContaining({ title: 'searchTitle A' }),
       jasmine.objectContaining({ title: 'title B' }),
@@ -117,29 +112,27 @@ describe('generateKeywords processor', () => {
     ]);
   });
 
-  it('should use `doc.searchTitle` as the title in the search index', () => {
+  it('should use `doc.searchTitle` as the title in the search index', async () => {
     const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
       { docType: 'class', name: 'PublicExport', searchTitle: 'class PublicExport' },
-    ];
-    processor.$process(docs);
+    ]);
     const keywordsDoc = docs[docs.length - 1];
     expect(keywordsDoc.data.pages).toEqual([
       jasmine.objectContaining({ title: 'class PublicExport', type: 'class' })
     ]);
   });
 
-  it('should add heading words to the search terms', () => {
+  it('should add heading words to the search terms', async () => {
     const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
       {
         docType: 'class',
         name: 'PublicExport',
         searchTitle: 'class PublicExport',
         vFile: { headings: { h2: ['Important heading', 'Secondary heading'] } }
       },
-    ];
-    processor.$process(docs);
+    ]);
     const keywordsDoc = docs[docs.length - 1];
     expect(keywordsDoc.data).toEqual({
       dictionary: 'class publicexport head secondari',
@@ -149,9 +142,9 @@ describe('generateKeywords processor', () => {
     });
   });
 
-  it('should add member doc properties to the search terms', () => {
+  it('should add member doc properties to the search terms', async () => {
     const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
       {
         docType: 'class',
         name: 'PublicExport',
@@ -171,8 +164,7 @@ describe('generateKeywords processor', () => {
           { name: 'staticPropertyB' },
         ],
       },
-    ];
-    processor.$process(docs);
+    ]);
     const keywordsDoc = docs[docs.length - 1];
     expect(keywordsDoc.data).toEqual({
       dictionary: 'class publicexport content ngclass instancemethoda instancepropertya instancemethodb instancepropertyb staticmethoda staticpropertya staticmethodb staticpropertyb head',
@@ -184,7 +176,7 @@ describe('generateKeywords processor', () => {
     });
   });
 
-  it('should add inherited member doc properties to the search terms', () => {
+  it('should add inherited member doc properties to the search terms', async () => {
     const processor = createProcessor();
     const parentClass = {
       docType: 'class',
@@ -216,8 +208,7 @@ describe('generateKeywords processor', () => {
       extendsClauses: [{ doc: parentClass }],
       implementsClauses: [{ doc: parentInterface }]
     };
-    const docs = [childClass, parentClass, parentInterface];
-    processor.$process(docs);
+    const docs = await processor.$process([childClass, parentClass, parentInterface]);
     const keywordsDoc = docs[docs.length - 1];
     expect(keywordsDoc.data).toEqual({
       dictionary: 'class child childmember1 childmember2 parentmember1 parentmember2 parentmember3 parentclass interfac parentinterfac',
@@ -238,9 +229,9 @@ describe('generateKeywords processor', () => {
     });
   });
 
-  it('should include both stripped and unstripped "ng" prefixed tokens', () => {
+  it('should include both stripped and unstripped "ng" prefixed tokens', async () => {
     const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
       {
         docType: 'class',
         name: 'PublicExport',
@@ -248,8 +239,7 @@ describe('generateKeywords processor', () => {
         vFile: { headings: { h2: ['ngModel'] } },
         content: 'Some content with ngClass in it.'
       },
-    ];
-    processor.$process(docs);
+    ]);
     const keywordsDoc = docs[docs.length - 1];
     expect(keywordsDoc.data).toEqual({
       dictionary: 'class publicexport ngcontrol control content ngclass ngmodel model',
@@ -262,9 +252,9 @@ describe('generateKeywords processor', () => {
     });
   });
 
-  it('should generate compressed encoded renderedContent property', () => {
+  it('should generate compressed encoded renderedContent property', async () => {
     const processor = createProcessor();
-    const docs = [
+    const docs = await processor.$process([
       {
         docType: 'class',
         name: 'SomeClass',
@@ -280,8 +270,7 @@ describe('generateKeywords processor', () => {
         ],
         deprecated: true
       },
-    ];
-    processor.$process(docs);
+    ]);
     const keywordsDoc = docs[docs.length - 1];
     expect(JSON.parse(keywordsDoc.renderedContent)).toEqual({
       dictionary: 'class someclass document api head someclass2 descript member1',
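
All of the spec changes above follow one mechanical pattern, which Jasmine supports natively: an `async` spec function returns a promise that the framework awaits. Condensed to a single hypothetical example:

    // Before: const docs = [...]; processor.$process(docs);
    // After:
    it('awaits the async processor', async () => {
      const processor = createProcessor();
      const docs = await processor.$process([{ docType: 'class', name: 'Foo' }]);
      expect(docs[docs.length - 1].data.pages.length).toBe(1);
    });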

View File

@@ -11955,10 +11955,10 @@ stealthy-require@^1.1.1:
   resolved "https://registry.yarnpkg.com/stealthy-require/-/stealthy-require-1.1.1.tgz#35b09875b4ff49f26a777e509b3090a3226bf24b"
   integrity sha1-NbCYdbT/SfJqd35QmzCQoyJr8ks=
 
-stemmer@^1.0.5:
-  version "1.0.5"
-  resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-1.0.5.tgz#fd89beaf8bff5d04b6643bfffcaed0fc420deec0"
-  integrity sha512-SLq7annzSKRDStasOJJoftCSCzBCKmBmH38jC4fDtCunAqOzpTpIm9zmaHmwNJiZ8gLe9qpVdBVbEG2DC5dE2A==
+stemmer@^2.0.0:
+  version "2.0.0"
+  resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-2.0.0.tgz#05fcaf174c423b0fec85e660759ebd4867d811c9"
+  integrity sha512-0YS2oMdTZ/wAWUHMMpf7AAJ8Gm6dHXyHddJ0zCu2DIfOfIbdwqAm1bbk4+Vti6gxNIcOrnm5jAP7vYTzQDvc5A==
 
 stream-browserify@^2.0.1:
   version "2.0.2"