From 65cd44e731b2a6f01e72647264d33015481e82c8 Mon Sep 17 00:00:00 2001 From: Pete Bacon Darwin Date: Sun, 4 Apr 2021 19:09:21 +0100 Subject: [PATCH] build(docs-infra): tidy up the generateKeywords processor (#41447) The recent PR #41368 contained some changes that could be improved. PR Close #41447 --- .../transforms/angular-base-package/index.js | 2 +- .../processors/generateKeywords.js | 36 ++++++++++++------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/aio/tools/transforms/angular-base-package/index.js b/aio/tools/transforms/angular-base-package/index.js index 8c946c89ec..037f52322e 100644 --- a/aio/tools/transforms/angular-base-package/index.js +++ b/aio/tools/transforms/angular-base-package/index.js @@ -66,7 +66,7 @@ module.exports = new Package('angular-base', [ collectExamples.exampleFolders = []; generateKeywordsProcessor.ignoreWords = require(path.resolve(__dirname, 'ignore-words'))['en']; - generateKeywordsProcessor.docTypesToIgnore = ['example-region']; + generateKeywordsProcessor.docTypesToIgnore = [undefined, 'example-region', 'json-doc', 'api-list-data', 'api-list-data', 'contributors-json', 'navigation-json', 'announcements-json']; generateKeywordsProcessor.propertiesToIgnore = ['basePath', 'renderedContent', 'docType', 'searchTitle']; }) diff --git a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js b/aio/tools/transforms/angular-base-package/processors/generateKeywords.js index a1d85e7e5c..d5141d88d1 100644 --- a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js +++ b/aio/tools/transforms/angular-base-package/processors/generateKeywords.js @@ -43,10 +43,10 @@ module.exports = function generateKeywordsProcessor(log) { .filter(doc => !doc.internal && !doc.privateExport); - for(const doc of filteredDocs) { + for (const doc of filteredDocs) { // Search each top level property of the document for search terms let mainTokens = []; - for(const key of Object.keys(doc)) { + for (const key of 
Object.keys(doc)) { const value = doc[key]; if (isString(value) && !propertiesToIgnore.has(key)) { mainTokens.push(...tokenize(value, ignoreWords, dictionary)); @@ -58,8 +58,8 @@ module.exports = function generateKeywordsProcessor(log) { // Extract all the keywords from the headings let headingTokens = []; if (doc.vFile && doc.vFile.headings) { - for(const headingTag of Object.keys(doc.vFile.headings)) { - for(const headingText of doc.vFile.headings[headingTag]) { + for (const headingTag of Object.keys(doc.vFile.headings)) { + for (const headingText of doc.vFile.headings[headingTag]) { headingTokens.push(...tokenize(headingText, ignoreWords, dictionary)); } } @@ -120,27 +120,37 @@ function isString(value) { function tokenize(text, ignoreWords, dictionary) { // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but reduces the unwanted tokens that are indexed). - const rawTokens = text.split(/[\s/]+|<\/?[a-z]+(?:\s+\w+(?:="[^"]+")?)*>/img); + const rawTokens = text.split(new RegExp( + '[\\s/]+' + // whitespace + '|' + // or + '', // simple HTML tags (e.g. ,
, , etc.) + 'ig')); const tokens = []; - for(let token of rawTokens) { + for (let token of rawTokens) { token = token.trim(); - // Strip off unwanted trivial characters - token = token.replace(/^[_\-"'`({[<$*)}\]>.]+/, '').replace(/[_\-"'`({[<$*)}\]>.]+$/, ''); + // Trim unwanted trivia characters from the start and end of the token + const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]'; + // Tokens can contain letters, numbers, underscore, dot or hyphen but not at the start or end. + // The leading TRIVIA_CHARS will capture any leading `.`, `-` or `_` so we don't have to avoid them in this regular expression. + // But we do need to ensure we don't capture them at the end of the token. + const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]'; + token = token.replace(new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i'), '$1'); - // Skip if in the ignored words list - if (ignoreWords.has(token.toLowerCase())) { + // Skip if blank or in the ignored words list + if (token === '' || ignoreWords.has(token.toLowerCase())) { continue; } // Skip tokens that contain weird characters - if (!/^[\w._-]+$/.test(token)) { + if (!/^\w[\w.-]*$/.test(token)) { continue; } storeToken(token, tokens, dictionary); if (token.startsWith('ng')) { - storeToken(token.substr(2), tokens, dictionary); + // Strip off `ng`, `ng-`, `ng1`, `ng2`, etc + storeToken(token.replace(/^ng[-12]*/, ''), tokens, dictionary); } } @@ -156,7 +166,7 @@ function storeToken(token, tokens, dictionary) { } function extractMemberTokens(doc, ignoreWords, dictionary) { - if (!doc) return ''; + if (!doc) return []; let memberContent = [];