From 96f9f03d25f8b1960e243cccf6e605fbb1a3face Mon Sep 17 00:00:00 2001 From: Pete Bacon Darwin Date: Fri, 14 Sep 2018 14:08:48 +0100 Subject: [PATCH] build(docs-infra): improve search quality (#25750) PR Close #25750 --- aio/src/app/search/search-worker.js | 20 ++++++---- aio/tests/e2e/search.e2e-spec.ts | 2 +- .../processors/generateKeywords.js | 38 ++++++++++++------- .../processors/generateKeywords.spec.js | 6 +-- .../transforms/angular.io-package/index.js | 2 +- 5 files changed, 41 insertions(+), 27 deletions(-) diff --git a/aio/src/app/search/search-worker.js b/aio/src/app/search/search-worker.js index 503a75aaa8..77859b7c66 100644 --- a/aio/src/app/search/search-worker.js +++ b/aio/src/app/search/search-worker.js @@ -27,12 +27,13 @@ self.onmessage = handleMessage; // Create the lunr index - the docs should be an array of objects, each object containing // the path and search terms for a page function createIndex(addFn) { + lunr.QueryLexer.termSeparator = lunr.tokenizer.separator = /\s+/; return lunr(/** @this */function() { this.ref('path'); - this.field('titleWords', {boost: 100}); - this.field('headingWords', {boost: 50}); - this.field('members', {boost: 40}); - this.field('keywords', {boost: 20}); + this.field('titleWords', {boost: 10}); + this.field('headingWords', {boost: 5}); + this.field('members', {boost: 4}); + this.field('keywords', {boost: 2}); addFn(this); }); } @@ -86,10 +87,13 @@ function loadIndex(searchInfo /*: SearchInfo */) { function queryIndex(query) { try { if (query.length) { - // Add a relaxed search in the title for the first word in the query - // E.g. if the search is "ngCont guide" then we search for "ngCont guide titleWords:ngCont*" - var titleQuery = 'titleWords:*' + query.split(' ', 1)[0] + '*'; - var results = index.search(query + ' ' + titleQuery); + var results = index.search(query); + if (results.length === 0) { + // Add a relaxed search in the title for the first word in the query + // E.g. if the search is "ngCont guide" then we search for "ngCont guide titleWords:ngCont*" + var titleQuery = 'titleWords:*' + query.split(' ', 1)[0] + '*'; + results = index.search(query + ' ' + titleQuery); + } // Map the hits into info about each page to be returned as results return results.map(function(hit) { return pages[hit.ref]; }); } diff --git a/aio/tests/e2e/search.e2e-spec.ts b/aio/tests/e2e/search.e2e-spec.ts index 84bd4e64d6..12dff497c8 100644 --- a/aio/tests/e2e/search.e2e-spec.ts +++ b/aio/tests/e2e/search.e2e-spec.ts @@ -12,7 +12,7 @@ describe('site search', () => { page.enterSearch('ngCont'); expect(page.getSearchResults()).toContain('NgControl'); - page.enterSearch('accessor'); + page.enterSearch('valueaccess'); expect(page.getSearchResults()).toContain('ControlValueAccessor'); }); diff --git a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js b/aio/tools/transforms/angular-base-package/processors/generateKeywords.js index ae52139c29..4e9aefa503 100644 --- a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js +++ b/aio/tools/transforms/angular-base-package/processors/generateKeywords.js @@ -31,9 +31,6 @@ module.exports = function generateKeywordsProcessor(log, readFilesProcessor) { var propertiesToIgnore; var docTypesToIgnore; - // Keywords start with "ng:" or one of $, _ or a letter - var KEYWORD_REGEX = /^((ng:|[$_a-z])[\w\-_]+)/; - // Load up the keywords to ignore, if specified in the config if (this.ignoreWordsFile) { var ignoreWordsPath = path.resolve(readFilesProcessor.basePath, this.ignoreWordsFile); @@ -52,20 +49,33 @@ module.exports = function generateKeywordsProcessor(log, readFilesProcessor) { // If the heading contains a name starting with ng, e.g. "ngController", then add the // name without the ng to the text, e.g. "controller". - function preprocessText(text) { - return text.replace(/(^|\s)([nN]g([A-Z]\w*))/g, '$1$2 $3'); + function tokenize(text) { + const rawTokens = text.split(/[\s\/]+/mg); + const tokens = []; + rawTokens.forEach(token => { + // Strip off unwanted trivial characters + token = token + .trim() + .replace(/^[_\-"'`({[<$*)}\]>.]+/, '') + .replace(/[_\-"'`({[<$*)}\]>.]+$/, ''); + // Ignore tokens that contain weird characters + if (/^[\w.\-]+$/.test(token)) { + tokens.push(token.toLowerCase()); + const ngTokenMatch = /^[nN]g([A-Z]\w*)/.exec(token); + if (ngTokenMatch) { + tokens.push(ngTokenMatch[1].toLowerCase()); + } + } + }); + return tokens; } function extractWords(text, words, keywordMap) { - var tokens = preprocessText(text).toLowerCase().split(/[.\s,`'"#]+/mg); + var tokens = tokenize(text); tokens.forEach(function(token) { - var match = token.match(KEYWORD_REGEX); - if (match) { - var key = match[1]; - if (!keywordMap[key]) { - keywordMap[key] = true; - words.push(key); - } + if (!keywordMap[token]) { + words.push(token); + keywordMap[token] = true; } }); } @@ -116,7 +126,7 @@ module.exports = function generateKeywordsProcessor(log, readFilesProcessor) { // Attach all this search data to the document doc.searchTerms = { - titleWords: preprocessText(doc.searchTitle), + titleWords: tokenize(doc.searchTitle).join(' '), headingWords: headingWords.sort().join(' '), keywords: words.sort().join(' '), members: members.sort().join(' ') diff --git a/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js b/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js index 7b899643d0..1bd96155ce 100644 --- a/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js +++ b/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js @@ -80,7 +80,7 @@ describe('generateKeywords processor', () => { ]; processor.$process(docs); const keywordsDoc = docs[docs.length - 1]; - expect(keywordsDoc.data[0].titleWords).toEqual('class PublicExport'); + expect(keywordsDoc.data[0].titleWords).toEqual('class publicexport'); }); it('should add heading words to the search terms', () => { @@ -141,7 +141,7 @@ describe('generateKeywords processor', () => { ]; processor.$process(docs); const keywordsDoc = docs[docs.length - 1]; - expect(keywordsDoc.data[0].titleWords).toEqual('ngController Controller'); + expect(keywordsDoc.data[0].titleWords).toEqual('ngcontroller controller'); expect(keywordsDoc.data[0].headingWords).toEqual('model ngmodel'); expect(keywordsDoc.data[0].keywords).toContain('class'); expect(keywordsDoc.data[0].keywords).toContain('ngclass'); @@ -163,7 +163,7 @@ describe('generateKeywords processor', () => { [{ 'title':'SomeClass', 'type':'class', - 'titleWords':'SomeClass', + 'titleWords':'someclass', 'headingWords':'heading some someclass', 'keywords':'api class documentation for is someclass the', 'members':'' diff --git a/aio/tools/transforms/angular.io-package/index.js b/aio/tools/transforms/angular.io-package/index.js index cb24f95e99..bd3d196739 100644 --- a/aio/tools/transforms/angular.io-package/index.js +++ b/aio/tools/transforms/angular.io-package/index.js @@ -37,7 +37,7 @@ module.exports = new Package('angular.io', [gitPackage, apiPackage, contentPacka checkAnchorLinksProcessor.$runBefore = ['convertToJsonProcessor']; checkAnchorLinksProcessor.$runAfter = ['fixInternalDocumentLinks']; // We only want to check docs that are going to be output as JSON docs. - checkAnchorLinksProcessor.checkDoc = (doc) => doc.path && doc.outputPath && extname(doc.outputPath) === '.json'; + checkAnchorLinksProcessor.checkDoc = (doc) => doc.path && doc.outputPath && extname(doc.outputPath) === '.json' && doc.docType !== 'json-doc'; // Since we have a `base[href="/"]` arrangement all links are relative to that and not relative to the source document's path checkAnchorLinksProcessor.base = '/'; // Ignore links to local assets