build: update stemmer to version 2.0.0 (#41724)
NOTE:
`stemmer` v2.0.0 switched to ES modules (see
words/stemmer@03519229c8), which means
that the only way to consume it in our CommonJS setup (for example, in
[generateKeywords][1]) is via an async `import()`.
This commit makes the `generateKeywords` processor asynchronous in order
to be able to dynamically import and use `stemmer`.
[1]: 251bec159a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js
PR Close #41724
This commit is contained in:
parent
a938849148
commit
99f2ffc740
|
@ -162,7 +162,7 @@
|
|||
"rimraf": "^3.0.2",
|
||||
"semver": "^7.3.5",
|
||||
"shelljs": "^0.8.4",
|
||||
"stemmer": "^1.0.5",
|
||||
"stemmer": "^2.0.0",
|
||||
"timezone-mock": "^1.1.3",
|
||||
"tree-kill": "^1.1.0",
|
||||
"ts-node": "^9.1.1",
|
||||
|
|
|
@ -8,6 +8,9 @@ module.exports = {
|
|||
'eslint:recommended',
|
||||
'plugin:jasmine/recommended'
|
||||
],
|
||||
'parserOptions': {
|
||||
'ecmaVersion': 2020,
|
||||
},
|
||||
'plugins': [
|
||||
'jasmine'
|
||||
],
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
'use strict';
|
||||
|
||||
const stem = require('stemmer');
|
||||
|
||||
/**
|
||||
* @dgProcessor generateKeywordsProcessor
|
||||
* @description
|
||||
|
@ -23,7 +21,9 @@ module.exports = function generateKeywordsProcessor(log) {
|
|||
},
|
||||
$runAfter: ['postProcessHtml'],
|
||||
$runBefore: ['writing-files'],
|
||||
$process(docs) {
|
||||
async $process(docs) {
|
||||
const {stemmer: stem} = await import('stemmer');
|
||||
|
||||
|
||||
const dictionary = new Map();
|
||||
|
||||
|
@ -110,6 +110,77 @@ module.exports = function generateKeywordsProcessor(log) {
|
|||
data: searchData,
|
||||
renderedContent: JSON.stringify(searchData)
|
||||
});
|
||||
|
||||
return docs;
|
||||
|
||||
// Helpers
|
||||
/**
 * Split `text` into searchable tokens and record each of them via `storeToken()`.
 *
 * The text is split on whitespace, `/` and anything that looks like a simple HTML tag. Each raw
 * token has leading/trailing trivia characters removed and is skipped when it is blank, is in
 * `ignoreWords` (compared in lower case) or contains unexpected characters. Tokens starting with
 * `ng` are also stored a second time without that prefix.
 *
 * @param {string} text The text to tokenize.
 * @param {{has(word: string): boolean}} ignoreWords Words to skip (presumably a `Set` of
 *     lower-cased words; only `.has()` is used here).
 * @param {Map<string, number>} dictionary Passed through to `storeToken()`.
 * @returns {Array} The values that `storeToken()` appended, in order of appearance.
 */
function tokenize(text, ignoreWords, dictionary) {
  // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but
  // reduces the unwanted tokens that are indexed).
  const TOKEN_SEPARATOR = new RegExp(
      '[\\s/]+' +                                // whitespace or `/`
      '|' +                                      // or
      '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>',  // simple HTML tags (e.g. <td>, <hr/>, </table>, etc.)
      'ig');

  // Unwanted trivia characters that are trimmed from the start and end of a token.
  const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
  // Tokens can contain letters, numbers, underscore, dot or hyphen but not at the start or end.
  // The leading TRIVIA_CHARS will capture any leading `.`, `-` or `_` so we don't have to avoid
  // them in this regular expression. But we do need to ensure we don't capture them at the end of
  // the token.
  const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
  // Both expressions are loop-invariant and have no `g` flag (i.e. no `lastIndex` state), so they
  // are built once here instead of once per token.
  const TRIVIA_WRAPPED_TOKEN = new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i');
  const WELL_FORMED_TOKEN = /^\w[\w.-]*$/;

  const tokens = [];
  for (const rawToken of text.split(TOKEN_SEPARATOR)) {
    // Trim unwanted trivia characters from the start and end of the token.
    const token = rawToken.trim().replace(TRIVIA_WRAPPED_TOKEN, '$1');

    // Skip if blank or in the ignored words list.
    if (token === '' || ignoreWords.has(token.toLowerCase())) {
      continue;
    }

    // Skip tokens that contain weird characters.
    if (!WELL_FORMED_TOKEN.test(token)) {
      continue;
    }

    storeToken(token, tokens, dictionary);
    if (token.startsWith('ng')) {
      // Strip off `ng`, `ng-`, `ng1`, `ng2`, etc.
      const strippedToken = token.replace(/^ng[-12]*/, '');
      // A bare `ng`/`ng1`/`ng2` leaves nothing behind; do not index the empty string.
      if (strippedToken !== '') {
        storeToken(strippedToken, tokens, dictionary);
      }
    }
  }

  return tokens;
}
|
||||
|
||||
/**
 * Stem `token`, make sure the stem has an entry in `dictionary` and append that entry's index to
 * `tokens`.
 *
 * A stem that is not yet known is assigned the current dictionary size as its index, so indexes
 * are handed out in order of first appearance.
 *
 * @param {string} token The word to store; it is passed through the `stem` function that is in
 *     scope before being looked up.
 * @param {number[]} tokens List that receives the dictionary index of the stemmed word (mutated).
 * @param {Map<string, number>} dictionary Map from stemmed word to its index (mutated).
 */
function storeToken(token, tokens, dictionary) {
  // Keep the parameter untouched; work on a separate binding for the stemmed form.
  const stemmedToken = stem(token);
  if (!dictionary.has(stemmedToken)) {
    dictionary.set(stemmedToken, dictionary.size);
  }
  tokens.push(dictionary.get(stemmedToken));
}
|
||||
|
||||
/**
 * Collect the tokens for the names of all members of `doc`, including inherited ones.
 *
 * The result contains, in this order: tokens for `doc.members`, tokens for `doc.statics`, then
 * (recursively) the member tokens of every doc referenced by `doc.extendsClauses` and by
 * `doc.implementsClauses`. Missing properties are skipped.
 *
 * @param {Object} [doc] The doc to inspect; a falsy value yields an empty list.
 * @param {{has(word: string): boolean}} ignoreWords Passed through to `tokenize()`.
 * @param {Map<string, number>} dictionary Passed through to `tokenize()`.
 * @returns {Array} The concatenated results of `tokenize()` for every member name found.
 */
function extractMemberTokens(doc, ignoreWords, dictionary) {
  if (!doc) return [];

  const memberContent = [];

  // Own members first: instance members, then static members.
  for (const members of [doc.members, doc.statics]) {
    if (members) {
      members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
    }
  }

  // Then inherited members: extended docs before implemented docs.
  for (const clauses of [doc.extendsClauses, doc.implementsClauses]) {
    if (clauses) {
      clauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
    }
  }

  return memberContent;
}
|
||||
}
|
||||
};
|
||||
};
|
||||
|
@ -117,71 +188,3 @@ module.exports = function generateKeywordsProcessor(log) {
|
|||
/**
 * Check whether `value` is a primitive string.
 *
 * @param {*} value The value to test.
 * @returns {boolean} `true` if `typeof value` is `'string'`, otherwise `false` (this includes
 *     `String` wrapper objects).
 */
function isString(value) {
  return typeof value === 'string';
}
|
||||
|
||||
/**
 * Split `text` into searchable tokens and record each of them via `storeToken()`.
 *
 * The text is split on whitespace, `/` and anything that looks like a simple HTML tag. Each raw
 * token has leading/trailing trivia characters removed and is skipped when it is blank, is in
 * `ignoreWords` (compared in lower case) or contains unexpected characters. Tokens starting with
 * `ng` are also stored a second time without that prefix.
 *
 * @param {string} text The text to tokenize.
 * @param {{has(word: string): boolean}} ignoreWords Words to skip (presumably a `Set` of
 *     lower-cased words; only `.has()` is used here).
 * @param {Map<string, number>} dictionary Passed through to `storeToken()`.
 * @returns {Array} The values that `storeToken()` appended, in order of appearance.
 */
function tokenize(text, ignoreWords, dictionary) {
  // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but
  // reduces the unwanted tokens that are indexed).
  const TOKEN_SEPARATOR = new RegExp(
      '[\\s/]+' +                                // whitespace or `/`
      '|' +                                      // or
      '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>',  // simple HTML tags (e.g. <td>, <hr/>, </table>, etc.)
      'ig');

  // Unwanted trivia characters that are trimmed from the start and end of a token.
  const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
  // Tokens can contain letters, numbers, underscore, dot or hyphen but not at the start or end.
  // The leading TRIVIA_CHARS will capture any leading `.`, `-` or `_` so we don't have to avoid
  // them in this regular expression. But we do need to ensure we don't capture them at the end of
  // the token.
  const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
  // Both expressions are loop-invariant and have no `g` flag (i.e. no `lastIndex` state), so they
  // are built once here instead of once per token.
  const TRIVIA_WRAPPED_TOKEN = new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i');
  const WELL_FORMED_TOKEN = /^\w[\w.-]*$/;

  const tokens = [];
  for (const rawToken of text.split(TOKEN_SEPARATOR)) {
    // Trim unwanted trivia characters from the start and end of the token.
    const token = rawToken.trim().replace(TRIVIA_WRAPPED_TOKEN, '$1');

    // Skip if blank or in the ignored words list.
    if (token === '' || ignoreWords.has(token.toLowerCase())) {
      continue;
    }

    // Skip tokens that contain weird characters.
    if (!WELL_FORMED_TOKEN.test(token)) {
      continue;
    }

    storeToken(token, tokens, dictionary);
    if (token.startsWith('ng')) {
      // Strip off `ng`, `ng-`, `ng1`, `ng2`, etc.
      const strippedToken = token.replace(/^ng[-12]*/, '');
      // A bare `ng`/`ng1`/`ng2` leaves nothing behind; do not index the empty string.
      if (strippedToken !== '') {
        storeToken(strippedToken, tokens, dictionary);
      }
    }
  }

  return tokens;
}
|
||||
|
||||
/**
 * Stem `token`, make sure the stem has an entry in `dictionary` and append that entry's index to
 * `tokens`.
 *
 * A stem that is not yet known is assigned the current dictionary size as its index, so indexes
 * are handed out in order of first appearance.
 *
 * @param {string} token The word to store; it is passed through the `stem` function that is in
 *     scope before being looked up.
 * @param {number[]} tokens List that receives the dictionary index of the stemmed word (mutated).
 * @param {Map<string, number>} dictionary Map from stemmed word to its index (mutated).
 */
function storeToken(token, tokens, dictionary) {
  // Keep the parameter untouched; work on a separate binding for the stemmed form.
  const stemmedToken = stem(token);
  if (!dictionary.has(stemmedToken)) {
    dictionary.set(stemmedToken, dictionary.size);
  }
  tokens.push(dictionary.get(stemmedToken));
}
|
||||
|
||||
/**
 * Collect the tokens for the names of all members of `doc`, including inherited ones.
 *
 * The result contains, in this order: tokens for `doc.members`, tokens for `doc.statics`, then
 * (recursively) the member tokens of every doc referenced by `doc.extendsClauses` and by
 * `doc.implementsClauses`. Missing properties are skipped.
 *
 * @param {Object} [doc] The doc to inspect; a falsy value yields an empty list.
 * @param {{has(word: string): boolean}} ignoreWords Passed through to `tokenize()`.
 * @param {Map<string, number>} dictionary Passed through to `tokenize()`.
 * @returns {Array} The concatenated results of `tokenize()` for every member name found.
 */
function extractMemberTokens(doc, ignoreWords, dictionary) {
  if (!doc) return [];

  const memberContent = [];

  // Own members first: instance members, then static members.
  for (const members of [doc.members, doc.statics]) {
    if (members) {
      members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
    }
  }

  // Then inherited members: extended docs before implemented docs.
  for (const clauses of [doc.extendsClauses, doc.implementsClauses]) {
    if (clauses) {
      clauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
    }
  }

  return memberContent;
}
|
||||
|
|
|
@ -36,42 +36,39 @@ describe('generateKeywords processor', () => {
|
|||
expect(processor.$runBefore).toEqual(['writing-files']);
|
||||
});
|
||||
|
||||
it('should ignore internal and private exports', () => {
|
||||
it('should ignore internal and private exports', async () => {
|
||||
const processor = createProcessor();
|
||||
const docs = [
|
||||
const docs = await processor.$process([
|
||||
{ docType: 'class', name: 'PublicExport' },
|
||||
{ docType: 'class', name: 'PrivateExport', privateExport: true },
|
||||
{ docType: 'class', name: 'InternalExport', internal: true }
|
||||
];
|
||||
processor.$process(docs);
|
||||
]);
|
||||
expect(docs[docs.length - 1].data.pages).toEqual([
|
||||
jasmine.objectContaining({ title: 'PublicExport', type: 'class' })
|
||||
]);
|
||||
});
|
||||
|
||||
it('should ignore docs that are in the `docTypesToIgnore` list', () => {
|
||||
it('should ignore docs that are in the `docTypesToIgnore` list', async () => {
|
||||
const processor = createProcessor();
|
||||
processor.docTypesToIgnore = ['interface'];
|
||||
const docs = [
|
||||
const docs = await processor.$process([
|
||||
{ docType: 'class', name: 'Class' },
|
||||
{ docType: 'interface', name: 'Interface' },
|
||||
{ docType: 'content', name: 'Guide' },
|
||||
];
|
||||
processor.$process(docs);
|
||||
]);
|
||||
expect(docs[docs.length - 1].data.pages).toEqual([
|
||||
jasmine.objectContaining({ title: 'Class', type: 'class' }),
|
||||
jasmine.objectContaining({ title: 'Guide', type: 'content' }),
|
||||
]);
|
||||
});
|
||||
|
||||
it('should not collect keywords from properties that are in the `propertiesToIgnore` list', () => {
|
||||
it('should not collect keywords from properties that are in the `propertiesToIgnore` list', async () => {
|
||||
const processor = createProcessor();
|
||||
processor.propertiesToIgnore = ['docType', 'ignore'];
|
||||
const docs = [
|
||||
const docs = await processor.$process([
|
||||
{ docType: 'class', name: 'FooClass', ignore: 'ignore this content' },
|
||||
{ docType: 'interface', name: 'BarInterface', capture: 'capture this content' },
|
||||
];
|
||||
processor.$process(docs);
|
||||
]);
|
||||
expect(docs[docs.length - 1].data).toEqual({
|
||||
dictionary: 'fooclass barinterfac captur content',
|
||||
pages: [
|
||||
|
@ -81,17 +78,16 @@ describe('generateKeywords processor', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('should not collect keywords that look like HTML tags', () => {
|
||||
it('should not collect keywords that look like HTML tags', async () => {
|
||||
const processor = createProcessor();
|
||||
const docs = [
|
||||
const docs = await processor.$process([
|
||||
{ docType: 'class', name: 'FooClass', content: `
|
||||
<table id="foo">
|
||||
<tr class="moo" id="bar">
|
||||
<td>Content inside a table</td>
|
||||
</tr>
|
||||
</table>` },
|
||||
];
|
||||
processor.$process(docs);
|
||||
]);
|
||||
expect(docs[docs.length - 1].data).toEqual({
|
||||
dictionary: 'class fooclass content insid tabl',
|
||||
pages: [
|
||||
|
@ -100,15 +96,14 @@ describe('generateKeywords processor', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('should compute `doc.searchTitle` from the doc properties if not already provided', () => {
|
||||
it('should compute `doc.searchTitle` from the doc properties if not already provided', async () => {
|
||||
const processor = createProcessor();
|
||||
const docs = [
|
||||
const docs = await processor.$process([
|
||||
{ docType: 'class', name: 'A', searchTitle: 'searchTitle A', title: 'title A', vFile: { headings: { h1: ['vFile A'] } } },
|
||||
{ docType: 'class', name: 'B', title: 'title B', vFile: { headings: { h1: ['vFile B'] } } },
|
||||
{ docType: 'class', name: 'C', vFile: { title: 'vFile C', headings: { h1: ['vFile C'] } } },
|
||||
{ docType: 'class', name: 'D' },
|
||||
];
|
||||
processor.$process(docs);
|
||||
]);
|
||||
expect(docs[docs.length - 1].data.pages).toEqual([
|
||||
jasmine.objectContaining({ title: 'searchTitle A' }),
|
||||
jasmine.objectContaining({ title: 'title B' }),
|
||||
|
@ -117,29 +112,27 @@ describe('generateKeywords processor', () => {
|
|||
]);
|
||||
});
|
||||
|
||||
it('should use `doc.searchTitle` as the title in the search index', () => {
|
||||
it('should use `doc.searchTitle` as the title in the search index', async () => {
|
||||
const processor = createProcessor();
|
||||
const docs = [
|
||||
const docs = await processor.$process([
|
||||
{ docType: 'class', name: 'PublicExport', searchTitle: 'class PublicExport' },
|
||||
];
|
||||
processor.$process(docs);
|
||||
]);
|
||||
const keywordsDoc = docs[docs.length - 1];
|
||||
expect(keywordsDoc.data.pages).toEqual([
|
||||
jasmine.objectContaining({ title: 'class PublicExport', type: 'class' })
|
||||
]);
|
||||
});
|
||||
|
||||
it('should add heading words to the search terms', () => {
|
||||
it('should add heading words to the search terms', async () => {
|
||||
const processor = createProcessor();
|
||||
const docs = [
|
||||
const docs = await processor.$process([
|
||||
{
|
||||
docType: 'class',
|
||||
name: 'PublicExport',
|
||||
searchTitle: 'class PublicExport',
|
||||
vFile: { headings: { h2: ['Important heading', 'Secondary heading'] } }
|
||||
},
|
||||
];
|
||||
processor.$process(docs);
|
||||
]);
|
||||
const keywordsDoc = docs[docs.length - 1];
|
||||
expect(keywordsDoc.data).toEqual({
|
||||
dictionary: 'class publicexport head secondari',
|
||||
|
@ -149,9 +142,9 @@ describe('generateKeywords processor', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('should add member doc properties to the search terms', () => {
|
||||
it('should add member doc properties to the search terms', async () => {
|
||||
const processor = createProcessor();
|
||||
const docs = [
|
||||
const docs = await processor.$process([
|
||||
{
|
||||
docType: 'class',
|
||||
name: 'PublicExport',
|
||||
|
@ -171,8 +164,7 @@ describe('generateKeywords processor', () => {
|
|||
{ name: 'staticPropertyB' },
|
||||
],
|
||||
},
|
||||
];
|
||||
processor.$process(docs);
|
||||
]);
|
||||
const keywordsDoc = docs[docs.length - 1];
|
||||
expect(keywordsDoc.data).toEqual({
|
||||
dictionary: 'class publicexport content ngclass instancemethoda instancepropertya instancemethodb instancepropertyb staticmethoda staticpropertya staticmethodb staticpropertyb head',
|
||||
|
@ -184,7 +176,7 @@ describe('generateKeywords processor', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('should add inherited member doc properties to the search terms', () => {
|
||||
it('should add inherited member doc properties to the search terms', async () => {
|
||||
const processor = createProcessor();
|
||||
const parentClass = {
|
||||
docType: 'class',
|
||||
|
@ -216,8 +208,7 @@ describe('generateKeywords processor', () => {
|
|||
extendsClauses: [{ doc: parentClass }],
|
||||
implementsClauses: [{ doc: parentInterface }]
|
||||
};
|
||||
const docs = [childClass, parentClass, parentInterface];
|
||||
processor.$process(docs);
|
||||
const docs = await processor.$process([childClass, parentClass, parentInterface]);
|
||||
const keywordsDoc = docs[docs.length - 1];
|
||||
expect(keywordsDoc.data).toEqual({
|
||||
dictionary: 'class child childmember1 childmember2 parentmember1 parentmember2 parentmember3 parentclass interfac parentinterfac',
|
||||
|
@ -238,9 +229,9 @@ describe('generateKeywords processor', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('should include both stripped and unstripped "ng" prefixed tokens', () => {
|
||||
it('should include both stripped and unstripped "ng" prefixed tokens', async () => {
|
||||
const processor = createProcessor();
|
||||
const docs = [
|
||||
const docs = await processor.$process([
|
||||
{
|
||||
docType: 'class',
|
||||
name: 'PublicExport',
|
||||
|
@ -248,8 +239,7 @@ describe('generateKeywords processor', () => {
|
|||
vFile: { headings: { h2: ['ngModel'] } },
|
||||
content: 'Some content with ngClass in it.'
|
||||
},
|
||||
];
|
||||
processor.$process(docs);
|
||||
]);
|
||||
const keywordsDoc = docs[docs.length - 1];
|
||||
expect(keywordsDoc.data).toEqual({
|
||||
dictionary: 'class publicexport ngcontrol control content ngclass ngmodel model',
|
||||
|
@ -262,9 +252,9 @@ describe('generateKeywords processor', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('should generate compressed encoded renderedContent property', () => {
|
||||
it('should generate compressed encoded renderedContent property', async () => {
|
||||
const processor = createProcessor();
|
||||
const docs = [
|
||||
const docs = await processor.$process([
|
||||
{
|
||||
docType: 'class',
|
||||
name: 'SomeClass',
|
||||
|
@ -280,8 +270,7 @@ describe('generateKeywords processor', () => {
|
|||
],
|
||||
deprecated: true
|
||||
},
|
||||
];
|
||||
processor.$process(docs);
|
||||
]);
|
||||
const keywordsDoc = docs[docs.length - 1];
|
||||
expect(JSON.parse(keywordsDoc.renderedContent)).toEqual({
|
||||
dictionary: 'class someclass document api head someclass2 descript member1',
|
||||
|
|
|
@ -11955,10 +11955,10 @@ stealthy-require@^1.1.1:
|
|||
resolved "https://registry.yarnpkg.com/stealthy-require/-/stealthy-require-1.1.1.tgz#35b09875b4ff49f26a777e509b3090a3226bf24b"
|
||||
integrity sha1-NbCYdbT/SfJqd35QmzCQoyJr8ks=
|
||||
|
||||
stemmer@^1.0.5:
|
||||
version "1.0.5"
|
||||
resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-1.0.5.tgz#fd89beaf8bff5d04b6643bfffcaed0fc420deec0"
|
||||
integrity sha512-SLq7annzSKRDStasOJJoftCSCzBCKmBmH38jC4fDtCunAqOzpTpIm9zmaHmwNJiZ8gLe9qpVdBVbEG2DC5dE2A==
|
||||
stemmer@^2.0.0:
|
||||
version "2.0.0"
|
||||
resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-2.0.0.tgz#05fcaf174c423b0fec85e660759ebd4867d811c9"
|
||||
integrity sha512-0YS2oMdTZ/wAWUHMMpf7AAJ8Gm6dHXyHddJ0zCu2DIfOfIbdwqAm1bbk4+Vti6gxNIcOrnm5jAP7vYTzQDvc5A==
|
||||
|
||||
stream-browserify@^2.0.1:
|
||||
version "2.0.2"
|
||||
|
|
Loading…
Reference in New Issue