From fccffc647b6b4ee2a4e15b6d3563e24f853fb0d5 Mon Sep 17 00:00:00 2001
From: Pete Bacon Darwin
Date: Sun, 28 Mar 2021 20:34:09 +0100
Subject: [PATCH] refactor(docs-infra): include more info in search index data
 (#41368)

The AIO search index is built in a WebWorker on the browser from a set of
page information that is downloaded as a JSON file (`search-data.json`).
We want to keep this file as small as possible while providing enough data
to generate a useful index to query against.

Previously, we only included one copy of each (non-ignored) term from each
doc, but this prevents more subtle ranking of query results, since the
number of occurrences of a term in a doc is lost.

This commit changes the generated file in the following ways:

- All non-ignored terms are now included, in the order in which they appear
  in the doc.
- The terms are indexed into a dictionary to avoid the text of the term
  being repeated in every doc that contains the term.
- Each term is pre-"stemmed" using the same Porter stemming algorithm that
  the Lunr search engine uses.

The web-worker has been updated to decode the new format of the file.
Now that all terms are included, it may enable some level of phrase-based
matching in the future.

The size of the generated file is considerably larger than before, but on
production HTTP servers the data is sent compressed, which reduces the size
dramatically.
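For illustration, the encoded file now has roughly the following shape
(hypothetical example data; the field layout follows the `EncodedPages`
interface added to the web-worker in this change):

    {
      "dictionary": ["rout", "guid", "lazi", "load"],
      "pages": [
        {
          "path": "guide/router",
          "type": "content",
          "title": "Routing Guide",
          "headings": [0, 1],
          "keywords": [0, 1, 2, 3, 0]
        }
      ]
    }

Each entry in `headings`, `keywords` and `members` is an index into the
shared `dictionary` of pre-stemmed terms, so a repeated term costs one small
number rather than a repeated string, and the number of occurrences of each
term survives for ranking.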
PR Close #41368
---
 aio/package.json                                   |   2 +
 aio/src/app/search/search.worker.ts                |  73 +-
 .../angular-base-package/ignore-words.json         | 705 ++++++++++++++++++
 .../angular-base-package/ignore.words              | 701 -----------------
 .../transforms/angular-base-package/index.js       |   4 +-
 .../processors/generateKeywords.js                 | 197 +++--
 .../processors/generateKeywords.spec.js            | 193 +++--
 aio/yarn.lock                                      |  10 +
 8 files changed, 1007 insertions(+), 878 deletions(-)
 create mode 100644 aio/tools/transforms/angular-base-package/ignore-words.json
 delete mode 100644 aio/tools/transforms/angular-base-package/ignore.words

diff --git a/aio/package.json b/aio/package.json
index 3922a8354a..2157b78ab4 100644
--- a/aio/package.json
+++ b/aio/package.json
@@ -116,6 +116,7 @@
     "@types/jasmine": "~3.6.0",
     "@types/lunr": "^2.3.2",
     "@types/node": "^12.7.9",
+    "@types/stemmer": "^1.0.2",
     "@types/xregexp": "^3.0.30",
     "@yarnpkg/lockfile": "^1.1.0",
     "archiver": "^1.3.0",
@@ -166,6 +167,7 @@
     "rimraf": "^2.6.1",
     "semver": "^5.3.0",
     "shelljs": "^0.8.4",
+    "stemmer": "^1.0.5",
     "timezone-mock": "^1.1.3",
     "tree-kill": "^1.1.0",
     "ts-node": "^8.4.1",
diff --git a/aio/src/app/search/search.worker.ts b/aio/src/app/search/search.worker.ts
index 5c15cd1c6b..d31a5ce935 100644
--- a/aio/src/app/search/search.worker.ts
+++ b/aio/src/app/search/search.worker.ts
@@ -1,10 +1,11 @@
 /// <reference lib="webworker" />
-import { WebWorkerMessage } from '../shared/web-worker-message';
 import * as lunr from 'lunr';

+import {WebWorkerMessage} from '../shared/web-worker-message';
+
 const SEARCH_TERMS_URL = '/generated/docs/app/search-data.json';

 let index: lunr.Index;
-const pages: SearchInfo = {};
+const pageMap: SearchInfo = {};

 interface SearchInfo {
   [key: string]: PageInfo;
@@ -13,8 +14,25 @@ interface SearchInfo {
 interface PageInfo {
   path: string;
   type: string;
-  titleWords: string;
-  keyWords: string;
+  title: string;
+  headings: string;
+  keywords: string;
+  members: string;
+  topics: string;
+}
+
+interface EncodedPages {
+  dictionary: string[];
+  pages: EncodedPage[];
+}
+
+interface EncodedPage {
+  path: string;
+  type: string;
+  title: string;
+  headings: number[];
+  keywords: number[];
+  members: number[];
+  topics: string;
 }
@@ -24,42 +42,42 @@ addEventListener('message', handleMessage);
 // the path and search terms for a page
 function createIndex(loadIndexFn: IndexLoader): lunr.Index {
   // The lunr typings are missing QueryLexer so we have to add them here manually.
-  const queryLexer = (lunr as any as { QueryLexer: { termSeparator: RegExp } }).QueryLexer;
+  const queryLexer = (lunr as any as {QueryLexer: {termSeparator: RegExp}}).QueryLexer;
   queryLexer.termSeparator = lunr.tokenizer.separator = /\s+/;
   return lunr(function() {
+    this.pipeline.remove(lunr.stemmer);
     this.ref('path');
-    this.field('topics', { boost: 15 });
-    this.field('titleWords', { boost: 10 });
-    this.field('headingWords', { boost: 5 });
-    this.field('members', { boost: 4 });
-    this.field('keywords', { boost: 2 });
+    this.field('topics', {boost: 15});
+    this.field('title', {boost: 10});
+    this.field('headings', {boost: 5});
+    this.field('members', {boost: 4});
+    this.field('keywords', {boost: 2});
     loadIndexFn(this);
   });
 }

 // The worker receives a message to load the index and to query the index
-function handleMessage(message: { data: WebWorkerMessage }): void {
+function handleMessage(message: {data: WebWorkerMessage}): void {
   const type = message.data.type;
   const id = message.data.id;
   const payload = message.data.payload;
   switch (type) {
     case 'load-index':
-      makeRequest(SEARCH_TERMS_URL, (searchInfo: PageInfo[]) => {
-        index = createIndex(loadIndex(searchInfo));
-        postMessage({ type, id, payload: true });
+      makeRequest(SEARCH_TERMS_URL, (encodedPages: EncodedPages) => {
+        index = createIndex(loadIndex(encodedPages));
+        postMessage({type, id, payload: true});
       });
       break;
     case 'query-index':
-      postMessage({ type, id, payload: { query: payload, results: queryIndex(payload) } });
+      postMessage({type, id, payload: {query: payload, results: queryIndex(payload)}});
       break;
     default:
-      postMessage({ type, id, payload: { error: 'invalid message type' } });
+      postMessage({type, id, payload: {error: 'invalid message type'}});
   }
 }

 // Use XHR to make a request to the server
 function makeRequest(url: string, callback: (response: any) => void): void {
-  // The JSON file that is loaded should be an array of PageInfo:
   const searchDataRequest = new XMLHttpRequest();
   searchDataRequest.onload = function() {
@@ -70,18 +88,29 @@ function makeRequest(url: string, callback: (response: any) => void): void {
 }

-// Create the search index from the searchInfo which contains the information about each page to be indexed
-function loadIndex(pagesData: PageInfo[]): IndexLoader {
+// Create the search index from the searchInfo which contains the information about each page to be
+// indexed
+function loadIndex({dictionary, pages}: EncodedPages): IndexLoader {
   return (indexBuilder: lunr.Builder) => {
     // Store the pages data to be used in mapping query results back to pages
     // Add search terms from each page to the search index
-    pagesData.forEach(page => {
+    pages.forEach(encodedPage => {
+      const page = decodePage(encodedPage, dictionary);
       indexBuilder.add(page);
-      pages[page.path] = page;
+      pageMap[page.path] = page;
     });
   };
 }

+function decodePage(encodedPage: EncodedPage, dictionary: string[]): PageInfo {
+  return {
+    ...encodedPage,
+    headings: encodedPage.headings?.map(i => dictionary[i]).join(' ') ?? '',
+    keywords: encodedPage.keywords?.map(i => dictionary[i]).join(' ') ?? '',
+    members: encodedPage.members?.map(i => dictionary[i]).join(' ') ?? 
'', + }; +} + // Query the index and return the processed results function queryIndex(query: string): PageInfo[] { // Strip off quotes @@ -105,7 +134,7 @@ function queryIndex(query: string): PageInfo[] { } // Map the hits into info about each page to be returned as results - return results.map(hit => pages[hit.ref]); + return results.map(hit => pageMap[hit.ref]); } } catch (e) { // If the search query cannot be parsed the index throws an error diff --git a/aio/tools/transforms/angular-base-package/ignore-words.json b/aio/tools/transforms/angular-base-package/ignore-words.json new file mode 100644 index 0000000000..0f56284ddb --- /dev/null +++ b/aio/tools/transforms/angular-base-package/ignore-words.json @@ -0,0 +1,705 @@ +{ + "en": [ + "a", + "able", + "about", + "above", + "abst", + "accordance", + "according", + "accordingly", + "across", + "act", + "actually", + "added", + "adj", + "adopted", + "affected", + "affecting", + "affects", + "after", + "afterwards", + "again", + "against", + "ah", + "all", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "an", + "and", + "announce", + "another", + "any", + "anybody", + "anyhow", + "anymore", + "anyone", + "anything", + "anyway", + "anyways", + "anywhere", + "apparently", + "approximately", + "are", + "aren", + "arent", + "arise", + "around", + "as", + "aside", + "ask", + "asking", + "at", + "auth", + "available", + "away", + "awfully", + "b", + "back", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "begin", + "beginning", + "beginnings", + "begins", + "behind", + "being", + "believe", + "below", + "beside", + "besides", + "between", + "beyond", + "biol", + "both", + "brief", + "briefly", + "but", + "by", + "c", + "ca", + "came", + "can", + "cannot", + "can't", + "cant", + "cause", + "causes", + "certain", + "certainly", + "co", + "com", + "come", + "comes", + "contain", + "containing", + "contains", + "could", + "couldnt", + "d", + "date", + "did", + "didn't", + "didnt", + "different", + "do", + "does", + "doesn't", + "doesnt", + "doing", + "done", + "don't", + "dont", + "down", + "downwards", + "due", + "during", + "e", + "each", + "ed", + "edu", + "effect", + "eg", + "eight", + "eighty", + "either", + "else", + "elsewhere", + "end", + "ending", + "enough", + "especially", + "et", + "et-al", + "etc", + "even", + "ever", + "every", + "everybody", + "everyone", + "everything", + "everywhere", + "ex", + "except", + "f", + "far", + "few", + "ff", + "fifth", + "first", + "five", + "fix", + "followed", + "following", + "follows", + "for", + "former", + "formerly", + "forth", + "found", + "four", + "from", + "further", + "furthermore", + "g", + "gave", + "get", + "gets", + "getting", + "give", + "given", + "gives", + "giving", + "go", + "goes", + "gone", + "got", + "gotten", + "h", + "had", + "happens", + "hardly", + "has", + "hasn't", + "hasnt", + "have", + "haven't", + "havent", + "having", + "he", + "hed", + "hence", + "her", + "here", + "hereafter", + "hereby", + "herein", + "heres", + "hereupon", + "hers", + "herself", + "hes", + "hi", + "hid", + "him", + "himself", + "his", + "hither", + "home", + "how", + "howbeit", + "however", + "hundred", + "i", + "id", + "ie", + "if", + "i'll", + "ill", + "im", + "immediate", + "immediately", + "importance", + "important", + "in", + "inc", + "indeed", + "index", + "information", + "instead", + "into", + "invention", + "inward", + "is", + "isn't", + "isnt", + "it", + "itd", 
+ "it'll", + "itll", + "its", + "itself", + "i've", + "ive", + "j", + "just", + "k", + "keep", + "keeps", + "kept", + "keys", + "kg", + "km", + "know", + "known", + "knows", + "l", + "largely", + "last", + "lately", + "later", + "latter", + "latterly", + "least", + "less", + "lest", + "let", + "lets", + "like", + "liked", + "likely", + "line", + "little", + "'ll", + "'ll", + "look", + "looking", + "looks", + "ltd", + "m", + "made", + "mainly", + "make", + "makes", + "many", + "may", + "maybe", + "me", + "mean", + "means", + "meantime", + "meanwhile", + "merely", + "mg", + "might", + "million", + "miss", + "ml", + "more", + "moreover", + "most", + "mostly", + "mr", + "mrs", + "much", + "mug", + "must", + "my", + "myself", + "n", + "na", + "name", + "namely", + "nay", + "nd", + "near", + "nearly", + "necessarily", + "necessary", + "need", + "needs", + "neither", + "never", + "nevertheless", + "new", + "next", + "nine", + "ninety", + "no", + "nobody", + "non", + "none", + "nonetheless", + "noone", + "nor", + "normally", + "nos", + "not", + "noted", + "nothing", + "now", + "nowhere", + "o", + "obtain", + "obtained", + "obviously", + "of", + "off", + "often", + "oh", + "ok", + "okay", + "old", + "omitted", + "on", + "once", + "one", + "ones", + "only", + "onto", + "or", + "ord", + "other", + "others", + "otherwise", + "ought", + "our", + "ours", + "ourselves", + "out", + "outside", + "over", + "overall", + "owing", + "own", + "p", + "page", + "pages", + "part", + "particular", + "particularly", + "past", + "per", + "perhaps", + "placed", + "please", + "plus", + "poorly", + "possible", + "possibly", + "potentially", + "pp", + "predominantly", + "present", + "previously", + "primarily", + "probably", + "promptly", + "proud", + "provides", + "put", + "q", + "que", + "quickly", + "quite", + "qv", + "r", + "ran", + "rather", + "rd", + "re", + "readily", + "really", + "recent", + "recently", + "ref", + "refs", + "regarding", + "regardless", + "regards", + "related", + "relatively", + "research", + "respectively", + "resulted", + "resulting", + "results", + "right", + "run", + "s", + "said", + "same", + "saw", + "say", + "saying", + "says", + "sec", + "section", + "see", + "seeing", + "seem", + "seemed", + "seeming", + "seems", + "seen", + "self", + "selves", + "sent", + "seven", + "several", + "shall", + "she", + "shed", + "she'll", + "shell", + "shes", + "should", + "shouldn't", + "shouldnt", + "show", + "showed", + "shown", + "showns", + "shows", + "significant", + "significantly", + "similar", + "similarly", + "since", + "six", + "slightly", + "so", + "some", + "somebody", + "somehow", + "someone", + "somethan", + "something", + "sometime", + "sometimes", + "somewhat", + "somewhere", + "soon", + "sorry", + "specifically", + "specified", + "specify", + "specifying", + "state", + "states", + "still", + "stop", + "strongly", + "sub", + "substantially", + "successfully", + "such", + "sufficiently", + "suggest", + "sup", + "sure", + "t", + "take", + "taken", + "taking", + "tell", + "tends", + "th", + "than", + "thank", + "thanks", + "thanx", + "that", + "that'll", + "thatll", + "thats", + "that've", + "thatve", + "the", + "their", + "theirs", + "them", + "themselves", + "then", + "thence", + "there", + "thereafter", + "thereby", + "thered", + "therefore", + "therein", + "there'll", + "therell", + "thereof", + "therere", + "theres", + "thereto", + "thereupon", + "there've", + "thereve", + "these", + "they", + "theyd", + "they'll", + "theyll", + "theyre", + "they've", + "theyve", + "think", + "this", + 
"those", + "thou", + "though", + "thoughh", + "thousand", + "throug", + "through", + "throughout", + "thru", + "thus", + "til", + "tip", + "to", + "together", + "too", + "took", + "toward", + "towards", + "tried", + "tries", + "truly", + "try", + "trying", + "ts", + "twice", + "two", + "u", + "un", + "under", + "unfortunately", + "unless", + "unlike", + "unlikely", + "until", + "unto", + "up", + "upon", + "ups", + "us", + "use", + "used", + "useful", + "usefully", + "usefulness", + "uses", + "using", + "usually", + "v", + "value", + "various", + "'ve", + "'ve", + "very", + "via", + "viz", + "vol", + "vols", + "vs", + "w", + "want", + "wants", + "was", + "wasn't", + "wasnt", + "way", + "we", + "wed", + "welcome", + "we'll", + "well", + "went", + "were", + "weren't", + "werent", + "we've", + "weve", + "what", + "whatever", + "what'll", + "whatll", + "whats", + "when", + "whence", + "whenever", + "where", + "whereafter", + "whereas", + "whereby", + "wherein", + "wheres", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whim", + "whither", + "who", + "whod", + "whoever", + "whole", + "who'll", + "wholl", + "whom", + "whomever", + "whos", + "whose", + "why", + "widely", + "will", + "willing", + "wish", + "with", + "within", + "without", + "won't", + "wont", + "words", + "would", + "wouldn't", + "wouldnt", + "www", + "x", + "y", + "yes", + "yet", + "you", + "youd", + "you'll", + "youll", + "your", + "youre", + "yours", + "yourself", + "yourselves", + "you've", + "youve", + "z", + "zero" + ] +} diff --git a/aio/tools/transforms/angular-base-package/ignore.words b/aio/tools/transforms/angular-base-package/ignore.words deleted file mode 100644 index 82b9f2fc3f..0000000000 --- a/aio/tools/transforms/angular-base-package/ignore.words +++ /dev/null @@ -1,701 +0,0 @@ -a -able -about -above -abst -accordance -according -accordingly -across -act -actually -added -adj -adopted -affected -affecting -affects -after -afterwards -again -against -ah -all -almost -alone -along -already -also -although -always -am -among -amongst -an -and -announce -another -any -anybody -anyhow -anymore -anyone -anything -anyway -anyways -anywhere -apparently -approximately -are -aren -arent -arise -around -as -aside -ask -asking -at -auth -available -away -awfully -b -back -be -became -because -become -becomes -becoming -been -before -beforehand -begin -beginning -beginnings -begins -behind -being -believe -below -beside -besides -between -beyond -biol -both -brief -briefly -but -by -c -ca -came -can -cannot -can't -cant -cause -causes -certain -certainly -co -com -come -comes -contain -containing -contains -could -couldnt -d -date -did -didn't -didnt -different -do -does -doesn't -doesnt -doing -done -don't -dont -down -downwards -due -during -e -each -ed -edu -effect -eg -eight -eighty -either -else -elsewhere -end -ending -enough -especially -et -et-al -etc -even -ever -every -everybody -everyone -everything -everywhere -ex -except -f -far -few -ff -fifth -first -five -fix -followed -following -follows -for -former -formerly -forth -found -four -from -further -furthermore -g -gave -get -gets -getting -give -given -gives -giving -go -goes -gone -got -gotten -h -had -happens -hardly -has -hasn't -hasnt -have -haven't -havent -having -he -hed -hence -her -here -hereafter -hereby -herein -heres -hereupon -hers -herself -hes -hi -hid -him -himself -his -hither -home -how -howbeit -however -hundred -i -id -ie -if -i'll -ill -im -immediate -immediately -importance -important -in -inc -indeed -index -information 
-instead -into -invention -inward -is -isn't -isnt -it -itd -it'll -itll -its -itself -i've -ive -j -just -k -keep -keeps -kept -keys -kg -km -know -known -knows -l -largely -last -lately -later -latter -latterly -least -less -lest -let -lets -like -liked -likely -line -little -'ll -'ll -look -looking -looks -ltd -m -made -mainly -make -makes -many -may -maybe -me -mean -means -meantime -meanwhile -merely -mg -might -million -miss -ml -more -moreover -most -mostly -mr -mrs -much -mug -must -my -myself -n -na -name -namely -nay -nd -near -nearly -necessarily -necessary -need -needs -neither -never -nevertheless -new -next -nine -ninety -no -nobody -non -none -nonetheless -noone -nor -normally -nos -not -noted -nothing -now -nowhere -o -obtain -obtained -obviously -of -off -often -oh -ok -okay -old -omitted -on -once -one -ones -only -onto -or -ord -other -others -otherwise -ought -our -ours -ourselves -out -outside -over -overall -owing -own -p -page -pages -part -particular -particularly -past -per -perhaps -placed -please -plus -poorly -possible -possibly -potentially -pp -predominantly -present -previously -primarily -probably -promptly -proud -provides -put -q -que -quickly -quite -qv -r -ran -rather -rd -re -readily -really -recent -recently -ref -refs -regarding -regardless -regards -related -relatively -research -respectively -resulted -resulting -results -right -run -s -said -same -saw -say -saying -says -sec -section -see -seeing -seem -seemed -seeming -seems -seen -self -selves -sent -seven -several -shall -she -shed -she'll -shell -shes -should -shouldn't -shouldnt -show -showed -shown -showns -shows -significant -significantly -similar -similarly -since -six -slightly -so -some -somebody -somehow -someone -somethan -something -sometime -sometimes -somewhat -somewhere -soon -sorry -specifically -specified -specify -specifying -state -states -still -stop -strongly -sub -substantially -successfully -such -sufficiently -suggest -sup -sure -t -take -taken -taking -tell -tends -th -than -thank -thanks -thanx -that -that'll -thatll -thats -that've -thatve -the -their -theirs -them -themselves -then -thence -there -thereafter -thereby -thered -therefore -therein -there'll -therell -thereof -therere -theres -thereto -thereupon -there've -thereve -these -they -theyd -they'll -theyll -theyre -they've -theyve -think -this -those -thou -though -thoughh -thousand -throug -through -throughout -thru -thus -til -tip -to -together -too -took -toward -towards -tried -tries -truly -try -trying -ts -twice -two -u -un -under -unfortunately -unless -unlike -unlikely -until -unto -up -upon -ups -us -use -used -useful -usefully -usefulness -uses -using -usually -v -value -various -'ve -'ve -very -via -viz -vol -vols -vs -w -want -wants -was -wasn't -wasnt -way -we -wed -welcome -we'll -well -went -were -weren't -werent -we've -weve -what -whatever -what'll -whatll -whats -when -whence -whenever -where -whereafter -whereas -whereby -wherein -wheres -whereupon -wherever -whether -which -while -whim -whither -who -whod -whoever -whole -who'll -wholl -whom -whomever -whos -whose -why -widely -will -willing -wish -with -within -without -won't -wont -words -would -wouldn't -wouldnt -www -x -y -yes -yet -you -youd -you'll -youll -your -youre -yours -yourself -yourselves -you've -youve -z -zero diff --git a/aio/tools/transforms/angular-base-package/index.js b/aio/tools/transforms/angular-base-package/index.js index e62f4373fb..8c946c89ec 100644 --- a/aio/tools/transforms/angular-base-package/index.js +++ 
b/aio/tools/transforms/angular-base-package/index.js
@@ -65,9 +65,9 @@ module.exports = new Package('angular-base', [
   readFilesProcessor.sourceFiles = [];
   collectExamples.exampleFolders = [];

-  generateKeywordsProcessor.ignoreWordsFile = path.resolve(__dirname, 'ignore.words');
+  generateKeywordsProcessor.ignoreWords = require(path.resolve(__dirname, 'ignore-words'))['en'];
   generateKeywordsProcessor.docTypesToIgnore = ['example-region'];
-  generateKeywordsProcessor.propertiesToIgnore = ['basePath', 'renderedContent'];
+  generateKeywordsProcessor.propertiesToIgnore = ['basePath', 'renderedContent', 'docType', 'searchTitle'];
 })

 // Where do we write the output files?
diff --git a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js b/aio/tools/transforms/angular-base-package/processors/generateKeywords.js
index 7cad35b0db..020d460de8 100644
--- a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js
+++ b/aio/tools/transforms/angular-base-package/processors/generateKeywords.js
@@ -1,7 +1,6 @@
 'use strict';

-var fs = require('fs');
-var path = require('canonical-path');
+const stem = require('stemmer');

 /**
  * @dgProcessor generateKeywordsProcessor
@@ -10,103 +9,98 @@
  * a new document that will be rendered as a JavaScript file containing all
  * this data.
  */
-module.exports = function generateKeywordsProcessor(log, readFilesProcessor) {
+module.exports = function generateKeywordsProcessor(log) {
   return {
-    ignoreWordsFile: undefined,
+    ignoreWords: [],
     propertiesToIgnore: [],
     docTypesToIgnore: [],
     outputFolder: '',
     $validate: {
-      ignoreWordsFile: {},
+      ignoreWords: {},
       docTypesToIgnore: {},
       propertiesToIgnore: {},
       outputFolder: {presence: true}
     },
    $runAfter: ['postProcessHtml'],
    $runBefore: ['writing-files'],
-    $process: function(docs) {
+    $process(docs) {
+
+      const dictionary = new Map();

       // Keywords to ignore
-      var wordsToIgnore = [];
-      var propertiesToIgnore;
-      var docTypesToIgnore;
-
-      // Load up the keywords to ignore, if specified in the config
-      if (this.ignoreWordsFile) {
-        var ignoreWordsPath = path.resolve(readFilesProcessor.basePath, this.ignoreWordsFile);
-        wordsToIgnore = fs.readFileSync(ignoreWordsPath, 'utf8').toString().split(/[,\s\n\r]+/gm);
-
-        log.debug('Loaded ignore words from "' + ignoreWordsPath + '"');
-        log.silly(wordsToIgnore);
-      }
-
-      propertiesToIgnore = convertToMap(this.propertiesToIgnore);
+      const ignoreWords = new Set(this.ignoreWords);
+      log.debug('Words to ignore', ignoreWords);
+      const propertiesToIgnore = new Set(this.propertiesToIgnore);
       log.debug('Properties to ignore', propertiesToIgnore);
-      docTypesToIgnore = convertToMap(this.docTypesToIgnore);
+      const docTypesToIgnore = new Set(this.docTypesToIgnore);
       log.debug('Doc types to ignore', docTypesToIgnore);
-      var ignoreWordsMap = convertToMap(wordsToIgnore);

       const filteredDocs = docs
           // We are not interested in some docTypes
-          .filter(function(doc) { return !docTypesToIgnore[doc.docType]; })
+          .filter(doc => !docTypesToIgnore.has(doc.docType))
           // Ignore internals and private exports (indicated by the ɵ prefix)
-          .filter(function(doc) { return !doc.internal && !doc.privateExport; });

+          .filter(doc => !doc.internal && !doc.privateExport);

-      filteredDocs.forEach(function(doc) {
-
-        var words = [];
-        var keywordMap = Object.assign({}, ignoreWordsMap);
-        var members = [];
-        var membersMap = Object.assign({}, ignoreWordsMap);
-        const headingWords = [];
-        const headingWordMap = Object.assign({}, ignoreWordsMap);
-
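+      // For each remaining doc, convert every searchable string into arrays of
+      // dictionary indices (one entry per occurrence, in document order).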
+      for(const doc of filteredDocs) {
         // Search each top level property of the document for search terms
-        Object.keys(doc).forEach(function(key) {
+        let mainTokens = [];
+        for(const key of Object.keys(doc)) {
           const value = doc[key];
-
-          if (isString(value) && !propertiesToIgnore[key]) {
-            extractWords(value, words, keywordMap);
+          if (isString(value) && !propertiesToIgnore.has(key)) {
+            mainTokens.push(...tokenize(value, ignoreWords, dictionary));
           }
-        });
+        }

-        extractMemberWords(doc, members, membersMap);
+        const memberTokens = extractMemberTokens(doc, ignoreWords, dictionary);

         // Extract all the keywords from the headings
+        let headingTokens = [];
         if (doc.vFile && doc.vFile.headings) {
-          Object.keys(doc.vFile.headings).forEach(function(headingTag) {
-            doc.vFile.headings[headingTag].forEach(function(headingText) {
-              extractWords(headingText, headingWords, headingWordMap);
-            });
-          });
+          for(const headingTag of Object.keys(doc.vFile.headings)) {
+            for(const headingText of doc.vFile.headings[headingTag]) {
+              headingTokens.push(...tokenize(headingText, ignoreWords, dictionary));
+            }
+          }
         }

+        // Extract the title to use in searches
         doc.searchTitle = doc.searchTitle || doc.title || doc.vFile && doc.vFile.title || doc.name || '';

         // Attach all this search data to the document
-        doc.searchTerms = {
-          titleWords: tokenize(doc.searchTitle).join(' '),
-          headingWords: headingWords.sort().join(' '),
-          keywords: words.sort().join(' '),
-          members: members.sort().join(' '),
-          topics: doc.searchKeywords
-        };
-
-      });
+        doc.searchTerms = {};
+        if (headingTokens.length > 0) {
+          doc.searchTerms.headings = headingTokens;
+        }
+        if (mainTokens.length > 0) {
+          doc.searchTerms.keywords = mainTokens;
+        }
+        if (memberTokens.length > 0) {
+          doc.searchTerms.members = memberTokens;
+        }
+        if (doc.searchKeywords) {
+          doc.searchTerms.topics = doc.searchKeywords.trim();
+        }
+      }

       // Now process all the search data and collect it up to be used in creating a new document
-      var searchData = filteredDocs.map(function(page) {
-        // Copy the properties from the searchTerms object onto the search data object
-        return Object.assign({
-          path: page.path,
-          title: page.searchTitle,
-          type: page.docType,
-          deprecated: !!page.deprecated,
-        }, page.searchTerms);
-      });
+      const searchData = {
+        dictionary: Array.from(dictionary.keys()),
+        pages: filteredDocs.map(page => {
+          // Copy the properties from the searchTerms object onto the search data object
+          const searchObj = {
+            path: page.path,
+            title: page.searchTitle,
+            type: page.docType,
+          };
+          if (page.deprecated) {
+            searchObj.deprecated = true;
+          }
+          return Object.assign(searchObj, page.searchTerms);
+        }),
+      };

       docs.push({
         docType: 'json-doc',
@@ -120,63 +114,64 @@ module.exports = function generateKeywordsProcessor(log, readFilesProcessor) {
   };
 };

-
 function isString(value) {
   return typeof value == 'string';
 }

-function convertToMap(collection) {
-  const obj = {};
-  collection.forEach(key => { obj[key] = true; });
-  return obj;
-}
-
-// If the heading contains a name starting with ng, e.g. "ngController", then add the
-// name without the ng to the text, e.g. "controller".
-function tokenize(text) {
-  const rawTokens = text.split(/[\s\/]+/mg);
+function tokenize(text, ignoreWords, dictionary) {
+  // Split on whitespace and things that are likely to be HTML tags
+  // (this is not exhaustive but reduces the unwanted tokens that are indexed).
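+  // The second alternation branch matches a whole opening or closing tag,
+  // including simple attr="value" pairs, so tag markup never reaches the index.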
+  const rawTokens = text.split(/[\s\/]+|<\/?[a-z]+(?:\s+\w+(?:="[^"]+")?)*>/img);
   const tokens = [];
-  rawTokens.forEach(token => {
+  for(let token of rawTokens) {
+    token = token.trim();
+
     // Strip off unwanted trivial characters
-    token = token
-      .trim()
-      .replace(/^[_\-"'`({[<$*)}\]>.]+/, '')
-      .replace(/[_\-"'`({[<$*)}\]>.]+$/, '');
-    // Ignore tokens that contain weird characters
-    if (/^[\w.\-]+$/.test(token)) {
-      tokens.push(token.toLowerCase());
-      const ngTokenMatch = /^[nN]g([A-Z]\w*)/.exec(token);
-      if (ngTokenMatch) {
-        tokens.push(ngTokenMatch[1].toLowerCase());
-      }
+    token = token.replace(/^[_\-"'`({[<$*)}\]>.]+/, '').replace(/[_\-"'`({[<$*)}\]>.]+$/, '');
+
+    // Skip if in the ignored words list
+    if (ignoreWords.has(token.toLowerCase())) {
+      continue;
     }
-  });
+
+    // Skip tokens that contain weird characters
+    if (!/^[\w._-]+$/.test(token)) {
+      continue;
+    }
+
+    storeToken(token, tokens, dictionary);
+    if (token.startsWith('ng')) {
+      storeToken(token.substr(2), tokens, dictionary);
+    }
+  }
+
   return tokens;
 }

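+// Stem the token and record it in the shared dictionary, assigning the next
+// free index the first time a stem is seen; pages refer to terms by index.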
processor; +} + describe('generateKeywords processor', () => { it('should be available on the injector', () => { @@ -17,30 +27,81 @@ describe('generateKeywords processor', () => { }); it('should run after the correct processor', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); expect(processor.$runAfter).toEqual(['postProcessHtml']); }); it('should run before the correct processor', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); expect(processor.$runBefore).toEqual(['writing-files']); }); it('should ignore internal and private exports', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); const docs = [ { docType: 'class', name: 'PublicExport' }, { docType: 'class', name: 'PrivateExport', privateExport: true }, { docType: 'class', name: 'InternalExport', internal: true } ]; processor.$process(docs); - expect(docs[docs.length - 1].data).toEqual([ - jasmine.objectContaining({ title: 'PublicExport', type: 'class'}) + expect(docs[docs.length - 1].data.pages).toEqual([ + jasmine.objectContaining({ title: 'PublicExport', type: 'class' }) ]); }); + it('should ignore docs that are in the `docTypesToIgnore` list', () => { + const processor = createProcessor(); + processor.docTypesToIgnore = ['interface']; + const docs = [ + { docType: 'class', name: 'Class' }, + { docType: 'interface', name: 'Interface' }, + { docType: 'content', name: 'Guide' }, + ]; + processor.$process(docs); + expect(docs[docs.length - 1].data.pages).toEqual([ + jasmine.objectContaining({ title: 'Class', type: 'class' }), + jasmine.objectContaining({ title: 'Guide', type: 'content' }), + ]); + }); + + it('should not collect keywords from properties that are in the `propertiesToIgnore` list', () => { + const processor = createProcessor(); + processor.propertiesToIgnore = ['docType', 'ignore']; + const docs = [ + { docType: 'class', name: 'FooClass', ignore: 'ignore this content' }, + { docType: 'interface', name: 'BarInterface', capture: 'capture this content' }, + ]; + processor.$process(docs); + expect(docs[docs.length - 1].data).toEqual({ + dictionary: [ 'fooclass', 'barinterfac', 'captur', 'content' ], + pages: [ + jasmine.objectContaining({ title: 'FooClass', type: 'class', keywords: [0] }), + jasmine.objectContaining({ title: 'BarInterface', type: 'interface', keywords: [1, 2, 3] }), + ], + }); + }); + + it('should not collect keywords that look like HTML tags', () => { + const processor = createProcessor(); + const docs = [ + { docType: 'class', name: 'FooClass', content: ` + + + + +
-function extractMemberWords(doc, members, membersMap) {
-  if (!doc) return;
+function extractMemberTokens(doc, ignoreWords, dictionary) {
+  if (!doc) return '';
+
+  let memberContent = [];

   if (doc.members) {
-    doc.members.forEach(member => extractWords(member.name, members, membersMap));
+    doc.members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
   }
   if (doc.statics) {
-    doc.statics.forEach(member => extractWords(member.name, members, membersMap));
+    doc.statics.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
   }
   if (doc.extendsClauses) {
-    doc.extendsClauses.forEach(clause => extractMemberWords(clause.doc, members, membersMap));
+    doc.extendsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
   }
   if (doc.implementsClauses) {
-    doc.implementsClauses.forEach(clause => extractMemberWords(clause.doc, members, membersMap));
+    doc.implementsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
   }
-}
\ No newline at end of file
+
+  return memberContent;
+}
diff --git a/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js b/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js
index e482ee4a1b..3065a1c16e 100644
--- a/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js
+++ b/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js
@@ -1,12 +1,22 @@
+const path = require('canonical-path');
+const Dgeni = require('dgeni');
+
 const testPackage = require('../../helpers/test-package');
 const mockLogger = require('dgeni/lib/mocks/log')(false);
 const processorFactory = require('./generateKeywords');
-const Dgeni = require('dgeni');

 const mockReadFilesProcessor = {
   basePath: 'base/path'
 };

+const ignoreWords = require(path.resolve(__dirname, '../ignore-words'))['en'];
+
+function createProcessor() {
+  const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+  processor.ignoreWords = ignoreWords;
+  return processor;
+}
+
 describe('generateKeywords processor', () => {

   it('should be available on the injector', () => {
@@ -17,30 +27,81 @@
   });

   it('should run after the correct processor', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+    const processor = createProcessor();
     expect(processor.$runAfter).toEqual(['postProcessHtml']);
   });

   it('should run before the correct processor', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+    const processor = createProcessor();
     expect(processor.$runBefore).toEqual(['writing-files']);
   });

   it('should ignore internal and private exports', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+    const processor = createProcessor();
     const docs = [
       { docType: 'class', name: 'PublicExport' },
       { docType: 'class', name: 'PrivateExport', privateExport: true },
       { docType: 'class', name: 'InternalExport', internal: true }
     ];
     processor.$process(docs);
-    expect(docs[docs.length - 1].data).toEqual([
-      jasmine.objectContaining({ title: 'PublicExport', type: 'class'})
+    expect(docs[docs.length - 1].data.pages).toEqual([
+      jasmine.objectContaining({ title: 'PublicExport', type: 'class' })
     ]);
   });

+  it('should ignore docs that are in the `docTypesToIgnore` list', () => {
+    const processor = createProcessor();
+    processor.docTypesToIgnore = ['interface'];
+    const docs = [
+      { docType: 'class', name: 'Class' },
+      { docType: 'interface', name: 'Interface' },
+      { docType: 'content', name: 'Guide' },
+    ];
+    processor.$process(docs);
+    expect(docs[docs.length - 1].data.pages).toEqual([
+      jasmine.objectContaining({ title: 'Class', type: 'class' }),
+      jasmine.objectContaining({ title: 'Guide', type: 'content' }),
+    ]);
+  });
+
+  it('should not collect keywords from properties that are in the `propertiesToIgnore` list', () => {
+    const processor = createProcessor();
+    processor.propertiesToIgnore = ['docType', 'ignore'];
+    const docs = [
+      { docType: 'class', name: 'FooClass', ignore: 'ignore this content' },
+      { docType: 'interface', name: 'BarInterface', capture: 'capture this content' },
+    ];
+    processor.$process(docs);
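+    // Dictionary entries are Porter-stemmed: 'barinterfac' is the stem of
+    // 'BarInterface' and 'captur' the stem of 'capture'.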
+    expect(docs[docs.length - 1].data).toEqual({
+      dictionary: [ 'fooclass', 'barinterfac', 'captur', 'content' ],
+      pages: [
+        jasmine.objectContaining({ title: 'FooClass', type: 'class', keywords: [0] }),
+        jasmine.objectContaining({ title: 'BarInterface', type: 'interface', keywords: [1, 2, 3] }),
+      ],
+    });
+  });
+
+  it('should not collect keywords that look like HTML tags', () => {
+    const processor = createProcessor();
+    const docs = [
+      { docType: 'class', name: 'FooClass', content: `
+        <table>
+          <tr>
+            <td>Content inside a table</td>
+          </tr>
+        </table>
+      ` },
+    ];
+    processor.$process(docs);
+    expect(docs[docs.length - 1].data).toEqual({
+      dictionary: ['class', 'fooclass', 'content', 'insid', 'tabl'],
+      pages: [
+        jasmine.objectContaining({keywords: [0, 1, 2, 3, 4] })
+      ],
+    });
+  });
+
   it('should compute `doc.searchTitle` from the doc properties if not already provided', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+    const processor = createProcessor();
     const docs = [
       { docType: 'class', name: 'A', searchTitle: 'searchTitle A', title: 'title A', vFile: { headings: { h1: ['vFile A'] } } },
       { docType: 'class', name: 'B', title: 'title B', vFile: { headings: { h1: ['vFile B'] } } },
       { docType: 'class', name: 'C', vFile: { title: 'vFile C', headings: { h1: ['vFile C'] } } },
       { docType: 'class', name: 'D' },
     ];
     processor.$process(docs);
-    expect(docs[docs.length - 1].data).toEqual([
+    expect(docs[docs.length - 1].data.pages).toEqual([
       jasmine.objectContaining({ title: 'searchTitle A' }),
       jasmine.objectContaining({ title: 'title B' }),
       jasmine.objectContaining({ title: 'vFile C' }),
       jasmine.objectContaining({ title: 'D' }),
     ]);
   });

   it('should use `doc.searchTitle` as the title in the search index', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+    const processor = createProcessor();
     const docs = [
       { docType: 'class', name: 'PublicExport', searchTitle: 'class PublicExport' },
     ];
     processor.$process(docs);
     const keywordsDoc = docs[docs.length - 1];
-    expect(keywordsDoc.data).toEqual([
-      jasmine.objectContaining({ title: 'class PublicExport', type: 'class'})
+    expect(keywordsDoc.data.pages).toEqual([
+      jasmine.objectContaining({ title: 'class PublicExport', type: 'class' })
     ]);
   });

-  it('should add title words to the search terms', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
-    const docs = [
-      {
-        docType: 'class',
-        name: 'PublicExport',
-        searchTitle: 'class PublicExport',
-        vFile: { headings: { h2: ['heading A', 'heading B'] } }
-      },
-    ];
-    processor.$process(docs);
-    const keywordsDoc = docs[docs.length - 1];
-    expect(keywordsDoc.data[0].titleWords).toEqual('class publicexport');
-  });
-
   it('should add heading words to the search terms', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+    const processor = createProcessor();
     const docs = [
       {
         docType: 'class',
         name: 'PublicExport',
         searchTitle: 'class PublicExport',
         vFile: { headings: { h2: ['Important heading', 'Secondary heading'] } }
       },
     ];
     processor.$process(docs);
     const keywordsDoc = docs[docs.length - 1];
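+    // 'important' is an ignored word, and 'heading' (stemmed to 'head', index 2)
+    // occurs once in each heading, so the encoding keeps the repeat: [2, 3, 2].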
-    expect(keywordsDoc.data[0].headingWords).toEqual('heading important secondary');
+    expect(keywordsDoc.data).toEqual({
+      dictionary: ['class', 'publicexport', 'head', 'secondari'],
+      pages: [
+        jasmine.objectContaining({ headings: [2, 3, 2] })
+      ]
+    });
   });

   it('should add member doc properties to the search terms', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+    const processor = createProcessor();
     const docs = [
       {
         docType: 'class',
         name: 'PublicExport',
         content: 'Some content about ngClass',
         members: [
           { name: 'instanceMethodA' },
           { name: 'instancePropertyA' },
           { name: 'instanceMethodB' },
           { name: 'instancePropertyB' },
         ],
         statics: [
           { name: 'staticMethodA' },
           { name: 'staticPropertyA' },
           { name: 'staticMethodB' },
           { name: 'staticPropertyB' },
         ],
         vFile: { headings: { h2: ['Some heading'] } }
       },
     ];
     processor.$process(docs);
     const keywordsDoc = docs[docs.length - 1];
-    expect(keywordsDoc.data[0].members).toEqual(
-      'instancemethoda instancemethodb instancepropertya instancepropertyb staticmethoda staticmethodb staticpropertya staticpropertyb'
-    );
+    expect(keywordsDoc.data).toEqual({
+      dictionary: ['class', 'publicexport', 'content', 'ngclass', 'instancemethoda', 'instancepropertya', 'instancemethodb', 'instancepropertyb', 'staticmethoda', 'staticpropertya', 'staticmethodb', 'staticpropertyb', 'head'],
+      pages: [
+        jasmine.objectContaining({
+          members: [4, 5, 6, 7, 8, 9, 10, 11]
+        })
+      ]
+    });
   });

   it('should add inherited member doc properties to the search terms', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+    const processor = createProcessor();
     const parentClass = {
       docType: 'class',
       name: 'ParentClass',
       members: [
         { name: 'parentMember1' },
         { name: 'parentMember2' },
       ]
     };
     const parentInterface = {
       docType: 'interface',
       name: 'ParentInterface',
       members: [
         { name: 'parentMember3' },
       ]
     };
     const childClass = {
       docType: 'class',
       name: 'Child',
       members: [
         { name: 'childMember1' },
         { name: 'childMember2' },
       ],
       extendsClauses: [{ doc: parentClass }],
       implementsClauses: [{ doc: parentInterface }]
     };
     const docs = [childClass, parentClass, parentInterface];
     processor.$process(docs);
     const keywordsDoc = docs[docs.length - 1];
-    expect(keywordsDoc.data[0].members.split(' ').sort().join(' ')).toEqual(
-      'childmember1 childmember2 parentmember1 parentmember2 parentmember3'
-    );
+    expect(keywordsDoc.data).toEqual({
+      dictionary: ['class', 'child', 'childmember1', 'childmember2', 'parentmember1', 'parentmember2', 'parentmember3', 'parentclass', 'interfac', 'parentinterfac'],
+      pages: [
+        jasmine.objectContaining({
+          title: 'Child',
+          members: [2, 3, 4, 5, 6]
+        }),
+        jasmine.objectContaining({
+          title: 'ParentClass',
+          members: [4, 5]
+        }),
+        jasmine.objectContaining({
+          title: 'ParentInterface',
+          members: [6]
+        })
+      ]
+    });
   });

-  it('should process terms prefixed with "ng" to include the term stripped of "ng"', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+  it('should include both stripped and unstripped "ng" prefixed tokens', () => {
+    const processor = createProcessor();
     const docs = [
       {
         docType: 'class',
         name: 'PublicExport',
         searchTitle: 'ngController',
         content: 'Content about ngClass',
         vFile: { headings: { h2: ['ngModel'] } }
       },
     ];
     processor.$process(docs);
     const keywordsDoc = docs[docs.length - 1];
-    expect(keywordsDoc.data[0].titleWords).toEqual('ngcontroller controller');
-    expect(keywordsDoc.data[0].headingWords).toEqual('model ngmodel');
-    expect(keywordsDoc.data[0].keywords).toContain('class');
-    expect(keywordsDoc.data[0].keywords).toContain('ngclass');
+    expect(keywordsDoc.data).toEqual({
+      dictionary: ['class', 'publicexport', 'ngcontrol', 'control', 'content', 'ngclass', 'ngmodel', 'model'],
+      pages: [
+        jasmine.objectContaining({
+          headings: [6, 7],
+          keywords: [0, 1, 2, 3, 4, 5, 0],
+        })
+      ],
+    });
   });

-  it('should generate renderedContent property', () => {
-    const processor = processorFactory(mockLogger, mockReadFilesProcessor);
+  it('should generate compressed encoded renderedContent property', () => {
+    const processor = createProcessor();
     const docs = [
       {
         docType: 'class',
         name: 'SomeClass',
         description: 'The is the documentation for the SomeClass API.',
         vFile: { headings: { h1: ['SomeClass'], h2: ['Some heading'] } }
       },
+      {
+        docType: 'class',
+        name: 'SomeClass2',
+        description: 'description',
+        members: [
+          { name: 'member1' },
+        ],
+        deprecated: true
+      },
     ];
     processor.$process(docs);
     const keywordsDoc = docs[docs.length - 1];
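+    // `deprecated` is only serialized when it is true.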
-    expect(JSON.parse(keywordsDoc.renderedContent)).toEqual(
-      [{
+    expect(JSON.parse(keywordsDoc.renderedContent)).toEqual({
+      dictionary: ['class', 'someclass', 'document', 'api', 'head', 'someclass2', 'descript', 'member1'],
+      pages: [{
         'title':'SomeClass',
         'type':'class',
-        'titleWords':'someclass',
-        'headingWords':'heading some someclass',
-        'keywords':'api class documentation for is someclass the',
-        'members':'',
-        'deprecated': false,
+        'headings': [1, 4],
+        'keywords': [0, 1, 2, 1, 3],
+      },
+      {
+        'title':'SomeClass2',
+        'type':'class',
+        'keywords': [0, 5, 6],
+        'members': [7],
+        'deprecated': true,
       }]
-    );
+    });
   });
 });
diff --git a/aio/yarn.lock b/aio/yarn.lock
index 6e5d4e71ad..bcfb0be3d0 100644
--- a/aio/yarn.lock
+++ b/aio/yarn.lock
@@ -2005,6 +2005,11 @@
   resolved "https://registry.yarnpkg.com/@types/source-list-map/-/source-list-map-0.1.2.tgz#0078836063ffaf17412349bba364087e0ac02ec9"
   integrity sha512-K5K+yml8LTo9bWJI/rECfIPrGgxdpeNbj+d53lwN4QjW1MCwlkhUms+gtdzigTeUyBr09+u8BwOIY3MXvHdcsA==

+"@types/stemmer@^1.0.2":
+  version "1.0.2"
+  resolved "https://registry.yarnpkg.com/@types/stemmer/-/stemmer-1.0.2.tgz#bd8354f50b3c9b87c351d169240e45cf1fa1f5e8"
+  integrity sha512-2gWEIFqVZjjZxo8/TcugCAl7nW9Jd9ArEDpTAc5nH7d+ZUkreHA7GzuFcLZ0sflLrA5b1PZ+2yDyHJcuP9KWWw==
+
 "@types/unist@*", "@types/unist@^2.0.0", "@types/unist@^2.0.2":
   version "2.0.3"
   resolved "https://registry.yarnpkg.com/@types/unist/-/unist-2.0.3.tgz#9c088679876f374eb5983f150d4787aa6fb32d7e"
@@ -12802,6 +12807,11 @@ static-extend@^0.1.1:
   resolved "https://registry.yarnpkg.com/statuses/-/statuses-1.5.0.tgz#161c7dac177659fd9811f43771fa99381478628c"
   integrity sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=

+stemmer@^1.0.5:
+  version "1.0.5"
+  resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-1.0.5.tgz#fd89beaf8bff5d04b6643bfffcaed0fc420deec0"
+  integrity sha512-SLq7annzSKRDStasOJJoftCSCzBCKmBmH38jC4fDtCunAqOzpTpIm9zmaHmwNJiZ8gLe9qpVdBVbEG2DC5dE2A==
+
 stream-browserify@^2.0.1:
   version "2.0.2"
   resolved "https://registry.yarnpkg.com/stream-browserify/-/stream-browserify-2.0.2.tgz#87521d38a44aa7ee91ce1cd2a47df0cb49dd660b"