build: update stemmer to version 2.0.0 (#41724)

NOTE:
`stemmer` v2.0.0 switched to ES modules (see
words/stemmer@03519229c8), which means
that the only way to consume it in our CommonJS setup (for example, in
[generateKeywords][1]) is via an async `import()`.

This commit makes the `generateKeywords` processor asynchronous in order
to be able to dynamically import and use `stemmer`.

[1]: 251bec159a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js

PR Close #41724
This commit is contained in:
George Kalpakas 2021-04-24 11:18:01 +03:00 committed by Jessica Janiuk
parent a938849148
commit 99f2ffc740
5 changed files with 114 additions and 119 deletions

View File

@ -162,7 +162,7 @@
"rimraf": "^3.0.2",
"semver": "^7.3.5",
"shelljs": "^0.8.4",
"stemmer": "^1.0.5",
"stemmer": "^2.0.0",
"timezone-mock": "^1.1.3",
"tree-kill": "^1.1.0",
"ts-node": "^9.1.1",

View File

@ -8,6 +8,9 @@ module.exports = {
'eslint:recommended',
'plugin:jasmine/recommended'
],
'parserOptions': {
'ecmaVersion': 2020,
},
'plugins': [
'jasmine'
],

View File

@ -1,7 +1,5 @@
'use strict';
const stem = require('stemmer');
/**
* @dgProcessor generateKeywordsProcessor
* @description
@ -23,7 +21,9 @@ module.exports = function generateKeywordsProcessor(log) {
},
$runAfter: ['postProcessHtml'],
$runBefore: ['writing-files'],
$process(docs) {
async $process(docs) {
const {stemmer: stem} = await import('stemmer');
const dictionary = new Map();
@ -110,6 +110,77 @@ module.exports = function generateKeywordsProcessor(log) {
data: searchData,
renderedContent: JSON.stringify(searchData)
});
return docs;
// Helpers
/**
 * Split `text` into index tokens and record each accepted one via `storeToken()`.
 *
 * The text is split on whitespace, `/` and simple HTML tags. Each piece then has surrounding
 * trivia characters stripped and is dropped if it ends up blank, is found (lower-cased) in
 * `ignoreWords`, or contains anything other than word characters, `.` and `-`.
 * Tokens starting with `ng` are stored twice: once as-is and once with the `ng` prefix removed.
 *
 * @param {string} text the raw text to tokenize
 * @param {{has: function(string): boolean}} ignoreWords collection queried with the lower-cased token
 * @param {*} dictionary passed through unchanged to `storeToken()`
 * @returns {Array} the array that `storeToken()` was asked to fill, in encounter order
 */
function tokenize(text, ignoreWords, dictionary) {
  // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but
  // reduces the unwanted tokens that are indexed).
  const rawTokens = text.split(new RegExp(
      '[\\s/]+' +                                // whitespace
      '|' +                                      // or
      '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>',  // simple HTML tags (e.g. <td>, <hr/>, </table>, etc.)
      'ig'));

  // Unwanted trivia characters that are trimmed from the start and end of each token.
  const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
  // Tokens can contain letters, numbers, underscore, dot or hyphen but not at the start or end.
  // The leading TRIVIA_CHARS will capture any leading `.`, `-` or `_` so we don't have to avoid
  // them in this regular expression. But we do need to ensure we don't capture them at the end
  // of the token.
  const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
  // Built once per call rather than once per token; it has no `g` flag, so reuse is stateless.
  const trimTrivia = new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i');

  const tokens = [];
  for (const rawToken of rawTokens) {
    const token = rawToken.trim().replace(trimTrivia, '$1');

    // Skip if blank or in the ignored words list
    if (token === '' || ignoreWords.has(token.toLowerCase())) {
      continue;
    }

    // Skip tokens that contain weird characters
    if (!/^\w[\w.-]*$/.test(token)) {
      continue;
    }

    storeToken(token, tokens, dictionary);
    if (token.startsWith('ng')) {
      // Strip off `ng`, `ng-`, `ng1`, `ng2`, etc
      // NOTE(review): a token that is exactly the prefix (e.g. `ng`) is stored as an empty
      // string here - confirm that this is intended.
      storeToken(token.replace(/^ng[-12]*/, ''), tokens, dictionary);
    }
  }

  return tokens;
}
/**
 * Stem `token`, make sure the stemmed form has an entry in `dictionary`, and append that
 * entry's value to `tokens`.
 *
 * A stemmed form that is not yet in the dictionary is inserted with the dictionary's size at
 * insertion time as its value.
 *
 * @param {string} token the token to stem (via the `stem` function from the enclosing scope)
 * @param {Array} tokens list that receives the dictionary value; mutated in place
 * @param {Map} dictionary stemmed token -> value map; mutated in place
 */
function storeToken(token, tokens, dictionary) {
  const stemmed = stem(token);
  const isKnown = dictionary.has(stemmed);
  if (!isKnown) {
    dictionary.set(stemmed, dictionary.size);
  }
  const index = dictionary.get(stemmed);
  tokens.push(index);
}
/**
 * Collect the tokens for the names of a doc's members and statics, followed by the tokens
 * gathered recursively from the docs referenced by its `extendsClauses` and
 * `implementsClauses` (in that order).
 *
 * @param {Object} doc the doc to inspect; a falsy value yields an empty array
 * @param {*} ignoreWords forwarded unchanged to `tokenize()`
 * @param {*} dictionary forwarded unchanged to `tokenize()`
 * @returns {Array} the concatenated results of the `tokenize()` calls
 */
function extractMemberTokens(doc, ignoreWords, dictionary) {
  if (!doc) {
    return [];
  }

  const collected = [];

  // Names of the doc's own instance members, then its static members.
  if (doc.members) {
    for (const member of doc.members) {
      collected.push(...tokenize(member.name, ignoreWords, dictionary));
    }
  }
  if (doc.statics) {
    for (const member of doc.statics) {
      collected.push(...tokenize(member.name, ignoreWords, dictionary));
    }
  }

  // Members reachable through `extends`, then through `implements`.
  if (doc.extendsClauses) {
    for (const clause of doc.extendsClauses) {
      collected.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary));
    }
  }
  if (doc.implementsClauses) {
    for (const clause of doc.implementsClauses) {
      collected.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary));
    }
  }

  return collected;
}
}
};
};
@ -117,71 +188,3 @@ module.exports = function generateKeywordsProcessor(log) {
/**
 * Check whether `value` is a primitive string.
 *
 * @param {*} value the value to test
 * @returns {boolean} `true` when `typeof value` is `'string'`, otherwise `false`
 */
function isString(value) {
  return typeof value === 'string';
}
/**
 * Split `text` into index tokens and record each accepted one via `storeToken()`.
 *
 * The text is split on whitespace, `/` and simple HTML tags. Each piece then has surrounding
 * trivia characters stripped and is dropped if it ends up blank, is found (lower-cased) in
 * `ignoreWords`, or contains anything other than word characters, `.` and `-`.
 * Tokens starting with `ng` are stored twice: once as-is and once with the `ng` prefix removed.
 *
 * @param {string} text the raw text to tokenize
 * @param {{has: function(string): boolean}} ignoreWords collection queried with the lower-cased token
 * @param {*} dictionary passed through unchanged to `storeToken()`
 * @returns {Array} the array that `storeToken()` was asked to fill, in encounter order
 */
function tokenize(text, ignoreWords, dictionary) {
  // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but
  // reduces the unwanted tokens that are indexed).
  const rawTokens = text.split(new RegExp(
      '[\\s/]+' +                                // whitespace
      '|' +                                      // or
      '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>',  // simple HTML tags (e.g. <td>, <hr/>, </table>, etc.)
      'ig'));

  // Unwanted trivia characters that are trimmed from the start and end of each token.
  const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
  // Tokens can contain letters, numbers, underscore, dot or hyphen but not at the start or end.
  // The leading TRIVIA_CHARS will capture any leading `.`, `-` or `_` so we don't have to avoid
  // them in this regular expression. But we do need to ensure we don't capture them at the end
  // of the token.
  const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
  // Built once per call rather than once per token; it has no `g` flag, so reuse is stateless.
  const trimTrivia = new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i');

  const tokens = [];
  for (const rawToken of rawTokens) {
    const token = rawToken.trim().replace(trimTrivia, '$1');

    // Skip if blank or in the ignored words list
    if (token === '' || ignoreWords.has(token.toLowerCase())) {
      continue;
    }

    // Skip tokens that contain weird characters
    if (!/^\w[\w.-]*$/.test(token)) {
      continue;
    }

    storeToken(token, tokens, dictionary);
    if (token.startsWith('ng')) {
      // Strip off `ng`, `ng-`, `ng1`, `ng2`, etc
      // NOTE(review): a token that is exactly the prefix (e.g. `ng`) is stored as an empty
      // string here - confirm that this is intended.
      storeToken(token.replace(/^ng[-12]*/, ''), tokens, dictionary);
    }
  }

  return tokens;
}
/**
 * Stem `token`, make sure the stemmed form has an entry in `dictionary`, and append that
 * entry's value to `tokens`.
 *
 * A stemmed form that is not yet in the dictionary is inserted with the dictionary's size at
 * insertion time as its value.
 *
 * @param {string} token the token to stem (via the `stem` function available in scope)
 * @param {Array} tokens list that receives the dictionary value; mutated in place
 * @param {Map} dictionary stemmed token -> value map; mutated in place
 */
function storeToken(token, tokens, dictionary) {
  const stemmed = stem(token);
  const isKnown = dictionary.has(stemmed);
  if (!isKnown) {
    dictionary.set(stemmed, dictionary.size);
  }
  const index = dictionary.get(stemmed);
  tokens.push(index);
}
/**
 * Collect the tokens for the names of a doc's members and statics, followed by the tokens
 * gathered recursively from the docs referenced by its `extendsClauses` and
 * `implementsClauses` (in that order).
 *
 * @param {Object} doc the doc to inspect; a falsy value yields an empty array
 * @param {*} ignoreWords forwarded unchanged to `tokenize()`
 * @param {*} dictionary forwarded unchanged to `tokenize()`
 * @returns {Array} the concatenated results of the `tokenize()` calls
 */
function extractMemberTokens(doc, ignoreWords, dictionary) {
  if (!doc) {
    return [];
  }

  const collected = [];

  // Names of the doc's own instance members, then its static members.
  if (doc.members) {
    for (const member of doc.members) {
      collected.push(...tokenize(member.name, ignoreWords, dictionary));
    }
  }
  if (doc.statics) {
    for (const member of doc.statics) {
      collected.push(...tokenize(member.name, ignoreWords, dictionary));
    }
  }

  // Members reachable through `extends`, then through `implements`.
  if (doc.extendsClauses) {
    for (const clause of doc.extendsClauses) {
      collected.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary));
    }
  }
  if (doc.implementsClauses) {
    for (const clause of doc.implementsClauses) {
      collected.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary));
    }
  }

  return collected;
}

View File

@ -36,42 +36,39 @@ describe('generateKeywords processor', () => {
expect(processor.$runBefore).toEqual(['writing-files']);
});
it('should ignore internal and private exports', () => {
it('should ignore internal and private exports', async () => {
const processor = createProcessor();
const docs = [
const docs = await processor.$process([
{ docType: 'class', name: 'PublicExport' },
{ docType: 'class', name: 'PrivateExport', privateExport: true },
{ docType: 'class', name: 'InternalExport', internal: true }
];
processor.$process(docs);
]);
expect(docs[docs.length - 1].data.pages).toEqual([
jasmine.objectContaining({ title: 'PublicExport', type: 'class' })
]);
});
it('should ignore docs that are in the `docTypesToIgnore` list', () => {
it('should ignore docs that are in the `docTypesToIgnore` list', async () => {
const processor = createProcessor();
processor.docTypesToIgnore = ['interface'];
const docs = [
const docs = await processor.$process([
{ docType: 'class', name: 'Class' },
{ docType: 'interface', name: 'Interface' },
{ docType: 'content', name: 'Guide' },
];
processor.$process(docs);
]);
expect(docs[docs.length - 1].data.pages).toEqual([
jasmine.objectContaining({ title: 'Class', type: 'class' }),
jasmine.objectContaining({ title: 'Guide', type: 'content' }),
]);
});
it('should not collect keywords from properties that are in the `propertiesToIgnore` list', () => {
it('should not collect keywords from properties that are in the `propertiesToIgnore` list', async () => {
const processor = createProcessor();
processor.propertiesToIgnore = ['docType', 'ignore'];
const docs = [
const docs = await processor.$process([
{ docType: 'class', name: 'FooClass', ignore: 'ignore this content' },
{ docType: 'interface', name: 'BarInterface', capture: 'capture this content' },
];
processor.$process(docs);
]);
expect(docs[docs.length - 1].data).toEqual({
dictionary: 'fooclass barinterfac captur content',
pages: [
@ -81,17 +78,16 @@ describe('generateKeywords processor', () => {
});
});
it('should not collect keywords that look like HTML tags', () => {
it('should not collect keywords that look like HTML tags', async () => {
const processor = createProcessor();
const docs = [
const docs = await processor.$process([
{ docType: 'class', name: 'FooClass', content: `
<table id="foo">
<tr class="moo" id="bar">
<td>Content inside a table</td>
</tr>
</table>` },
];
processor.$process(docs);
]);
expect(docs[docs.length - 1].data).toEqual({
dictionary: 'class fooclass content insid tabl',
pages: [
@ -100,15 +96,14 @@ describe('generateKeywords processor', () => {
});
});
it('should compute `doc.searchTitle` from the doc properties if not already provided', () => {
it('should compute `doc.searchTitle` from the doc properties if not already provided', async () => {
const processor = createProcessor();
const docs = [
const docs = await processor.$process([
{ docType: 'class', name: 'A', searchTitle: 'searchTitle A', title: 'title A', vFile: { headings: { h1: ['vFile A'] } } },
{ docType: 'class', name: 'B', title: 'title B', vFile: { headings: { h1: ['vFile B'] } } },
{ docType: 'class', name: 'C', vFile: { title: 'vFile C', headings: { h1: ['vFile C'] } } },
{ docType: 'class', name: 'D' },
];
processor.$process(docs);
]);
expect(docs[docs.length - 1].data.pages).toEqual([
jasmine.objectContaining({ title: 'searchTitle A' }),
jasmine.objectContaining({ title: 'title B' }),
@ -117,29 +112,27 @@ describe('generateKeywords processor', () => {
]);
});
it('should use `doc.searchTitle` as the title in the search index', () => {
it('should use `doc.searchTitle` as the title in the search index', async () => {
const processor = createProcessor();
const docs = [
const docs = await processor.$process([
{ docType: 'class', name: 'PublicExport', searchTitle: 'class PublicExport' },
];
processor.$process(docs);
]);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data.pages).toEqual([
jasmine.objectContaining({ title: 'class PublicExport', type: 'class' })
]);
});
it('should add heading words to the search terms', () => {
it('should add heading words to the search terms', async () => {
const processor = createProcessor();
const docs = [
const docs = await processor.$process([
{
docType: 'class',
name: 'PublicExport',
searchTitle: 'class PublicExport',
vFile: { headings: { h2: ['Important heading', 'Secondary heading'] } }
},
];
processor.$process(docs);
]);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data).toEqual({
dictionary: 'class publicexport head secondari',
@ -149,9 +142,9 @@ describe('generateKeywords processor', () => {
});
});
it('should add member doc properties to the search terms', () => {
it('should add member doc properties to the search terms', async () => {
const processor = createProcessor();
const docs = [
const docs = await processor.$process([
{
docType: 'class',
name: 'PublicExport',
@ -171,8 +164,7 @@ describe('generateKeywords processor', () => {
{ name: 'staticPropertyB' },
],
},
];
processor.$process(docs);
]);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data).toEqual({
dictionary: 'class publicexport content ngclass instancemethoda instancepropertya instancemethodb instancepropertyb staticmethoda staticpropertya staticmethodb staticpropertyb head',
@ -184,7 +176,7 @@ describe('generateKeywords processor', () => {
});
});
it('should add inherited member doc properties to the search terms', () => {
it('should add inherited member doc properties to the search terms', async () => {
const processor = createProcessor();
const parentClass = {
docType: 'class',
@ -216,8 +208,7 @@ describe('generateKeywords processor', () => {
extendsClauses: [{ doc: parentClass }],
implementsClauses: [{ doc: parentInterface }]
};
const docs = [childClass, parentClass, parentInterface];
processor.$process(docs);
const docs = await processor.$process([childClass, parentClass, parentInterface]);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data).toEqual({
dictionary: 'class child childmember1 childmember2 parentmember1 parentmember2 parentmember3 parentclass interfac parentinterfac',
@ -238,9 +229,9 @@ describe('generateKeywords processor', () => {
});
});
it('should include both stripped and unstripped "ng" prefixed tokens', () => {
it('should include both stripped and unstripped "ng" prefixed tokens', async () => {
const processor = createProcessor();
const docs = [
const docs = await processor.$process([
{
docType: 'class',
name: 'PublicExport',
@ -248,8 +239,7 @@ describe('generateKeywords processor', () => {
vFile: { headings: { h2: ['ngModel'] } },
content: 'Some content with ngClass in it.'
},
];
processor.$process(docs);
]);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data).toEqual({
dictionary: 'class publicexport ngcontrol control content ngclass ngmodel model',
@ -262,9 +252,9 @@ describe('generateKeywords processor', () => {
});
});
it('should generate compressed encoded renderedContent property', () => {
it('should generate compressed encoded renderedContent property', async () => {
const processor = createProcessor();
const docs = [
const docs = await processor.$process([
{
docType: 'class',
name: 'SomeClass',
@ -280,8 +270,7 @@ describe('generateKeywords processor', () => {
],
deprecated: true
},
];
processor.$process(docs);
]);
const keywordsDoc = docs[docs.length - 1];
expect(JSON.parse(keywordsDoc.renderedContent)).toEqual({
dictionary: 'class someclass document api head someclass2 descript member1',

View File

@ -11955,10 +11955,10 @@ stealthy-require@^1.1.1:
resolved "https://registry.yarnpkg.com/stealthy-require/-/stealthy-require-1.1.1.tgz#35b09875b4ff49f26a777e509b3090a3226bf24b"
integrity sha1-NbCYdbT/SfJqd35QmzCQoyJr8ks=
stemmer@^1.0.5:
version "1.0.5"
resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-1.0.5.tgz#fd89beaf8bff5d04b6643bfffcaed0fc420deec0"
integrity sha512-SLq7annzSKRDStasOJJoftCSCzBCKmBmH38jC4fDtCunAqOzpTpIm9zmaHmwNJiZ8gLe9qpVdBVbEG2DC5dE2A==
stemmer@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-2.0.0.tgz#05fcaf174c423b0fec85e660759ebd4867d811c9"
integrity sha512-0YS2oMdTZ/wAWUHMMpf7AAJ8Gm6dHXyHddJ0zCu2DIfOfIbdwqAm1bbk4+Vti6gxNIcOrnm5jAP7vYTzQDvc5A==
stream-browserify@^2.0.1:
version "2.0.2"