build(docs-infra): tidy up the generateKeywords processor (#41447)

The recent PR #41368 contained some changes that could be improved.

PR Close #41447
Pete Bacon Darwin 2021-04-04 19:09:21 +01:00 committed by Zach Arend
parent c385e74454
commit 65cd44e731
2 changed files with 24 additions and 14 deletions


@@ -66,7 +66,7 @@ module.exports = new Package('angular-base', [
   collectExamples.exampleFolders = [];
   generateKeywordsProcessor.ignoreWords = require(path.resolve(__dirname, 'ignore-words'))['en'];
-  generateKeywordsProcessor.docTypesToIgnore = ['example-region'];
+  generateKeywordsProcessor.docTypesToIgnore = [undefined, 'example-region', 'json-doc', 'api-list-data', 'api-list-data', 'contributors-json', 'navigation-json', 'announcements-json'];
   generateKeywordsProcessor.propertiesToIgnore = ['basePath', 'renderedContent', 'docType', 'searchTitle'];
 })
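
For context, the processor is assumed to treat these lists as set-based filters over the docs collection; adding `undefined` to docTypesToIgnore means docs with no docType at all are skipped as well. A minimal sketch of that assumed filtering step (the filterIgnorableDocs helper is hypothetical; only the list contents come from the diff):

// Hypothetical sketch of how docTypesToIgnore is assumed to be applied.
const docTypesToIgnore = new Set([undefined, 'example-region', 'json-doc']);

function filterIgnorableDocs(docs) {
  // Docs whose docType is listed (including a missing docType) are skipped.
  return docs.filter(doc => !docTypesToIgnore.has(doc.docType));
}

console.log(filterIgnorableDocs([
  {docType: 'member', name: 'ngOnInit'},
  {docType: 'example-region'},  // skipped: listed doc type
  {name: 'anonymous'},          // skipped: docType is undefined
]));
// -> [{docType: 'member', name: 'ngOnInit'}]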


@@ -43,10 +43,10 @@ module.exports = function generateKeywordsProcessor(log) {
         .filter(doc => !doc.internal && !doc.privateExport);
-    for(const doc of filteredDocs) {
+    for (const doc of filteredDocs) {
       // Search each top level property of the document for search terms
       let mainTokens = [];
-      for(const key of Object.keys(doc)) {
+      for (const key of Object.keys(doc)) {
         const value = doc[key];
         if (isString(value) && !propertiesToIgnore.has(key)) {
           mainTokens.push(...tokenize(value, ignoreWords, dictionary));
@@ -58,8 +58,8 @@ module.exports = function generateKeywordsProcessor(log) {
       // Extract all the keywords from the headings
       let headingTokens = [];
       if (doc.vFile && doc.vFile.headings) {
-        for(const headingTag of Object.keys(doc.vFile.headings)) {
-          for(const headingText of doc.vFile.headings[headingTag]) {
+        for (const headingTag of Object.keys(doc.vFile.headings)) {
+          for (const headingText of doc.vFile.headings[headingTag]) {
            headingTokens.push(...tokenize(headingText, ignoreWords, dictionary));
          }
        }
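
The nested loops above assume doc.vFile.headings maps each heading tag to an array of heading strings. A made-up value, just to illustrate the shape being iterated:

// Hypothetical shape of doc.vFile.headings assumed by the loops above.
const headings = {
  h1: ['Getting started'],
  h2: ['Install the CLI', 'Create a workspace'],
};
for (const headingTag of Object.keys(headings)) {
  for (const headingText of headings[headingTag]) {
    console.log(headingTag, '->', headingText);  // e.g. "h2 -> Install the CLI"
  }
}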
@@ -120,27 +120,37 @@ function isString(value) {
 function tokenize(text, ignoreWords, dictionary) {
   // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but reduces the unwanted tokens that are indexed).
-  const rawTokens = text.split(/[\s/]+|<\/?[a-z]+(?:\s+\w+(?:="[^"]+")?)*>/img);
+  const rawTokens = text.split(new RegExp(
+      '[\\s/]+' +                               // whitespace
+      '|' +                                     // or
+      '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>', // simple HTML tags (e.g. <td>, <hr/>, </table>, etc.)
+      'ig'));
   const tokens = [];
-  for(let token of rawTokens) {
+  for (let token of rawTokens) {
     token = token.trim();
-    // Strip off unwanted trivial characters
-    token = token.replace(/^[_\-"'`({[<$*)}\]>.]+/, '').replace(/[_\-"'`({[<$*)}\]>.]+$/, '');
+    // Trim unwanted trivia characters from the start and end of the token
+    const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
+    // Tokens can contain letters, numbers, underscore, dot or hyphen but not at the start or end.
+    // The leading TRIVIA_CHARS will capture any leading `.`, `-` or `_` so we don't have to avoid them in this regular expression.
+    // But we do need to ensure we don't capture them at the end of the token.
+    const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
+    token = token.replace(new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i'), '$1');
-    // Skip if in the ignored words list
-    if (ignoreWords.has(token.toLowerCase())) {
+    // Skip if blank or in the ignored words list
+    if (token === '' || ignoreWords.has(token.toLowerCase())) {
       continue;
     }
     // Skip tokens that contain weird characters
-    if (!/^[\w._-]+$/.test(token)) {
+    if (!/^\w[\w.-]*$/.test(token)) {
       continue;
     }
     storeToken(token, tokens, dictionary);
     if (token.startsWith('ng')) {
-      storeToken(token.substr(2), tokens, dictionary);
+      // Strip off `ng`, `ng-`, `ng1`, `ng2`, etc
+      storeToken(token.replace(/^ng[-12]*/, ''), tokens, dictionary);
     }
   }
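
To see what the reworked tokenizer does end to end, here is a standalone sketch that inlines the same regular expressions as the new code; the sample text and ignore list are invented for the demo, and the storeToken/dictionary handling is reduced to a plain push:

// Standalone sketch of the reworked tokenize(); the regexes mirror the diff,
// the sample input and ignore list are invented for the demo.
function tokenize(text, ignoreWords) {
  const rawTokens = text.split(new RegExp(
      '[\\s/]+' +                               // whitespace
      '|' +                                     // or
      '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>', // simple HTML tags
      'ig'));
  const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
  const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
  const tokens = [];
  for (let token of rawTokens) {
    token = token.trim();
    // Keep only the core token, dropping surrounding trivia characters.
    token = token.replace(new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i'), '$1');
    if (token === '' || ignoreWords.has(token.toLowerCase())) continue;
    if (!/^\w[\w.-]*$/.test(token)) continue;
    tokens.push(token);
    if (token.startsWith('ng')) {
      // Also index the token with its ng/ng-/ng1/ng2 prefix removed.
      tokens.push(token.replace(/^ng[-12]*/, ''));
    }
  }
  return tokens;
}

console.log(tokenize('<td>The ng-container directive.</td>', new Set(['the'])));
// -> ['ng-container', 'container', 'directive']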
@@ -156,7 +166,7 @@ function storeToken(token, tokens, dictionary) {
 }
 function extractMemberTokens(doc, ignoreWords, dictionary) {
-  if (!doc) return '';
+  if (!doc) return [];
   let memberContent = [];
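
The switch from returning '' to returning [] matters because the result is a token list that callers are assumed to concatenate into other arrays; a string empty value composes badly even when spreading it happens to add nothing. A small illustration:

// Why `return []` is the safer empty value for a token list:
console.log([].concat(''));   // [''] – concat injects a bogus empty-string token
console.log([...'']);         // [] – spreading '' merely happens to add nothing
console.log([].concat([]));   // [] – an empty array composes cleanly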