diff --git a/aio/package.json b/aio/package.json index 3922a8354a..2157b78ab4 100644 --- a/aio/package.json +++ b/aio/package.json @@ -116,6 +116,7 @@ "@types/jasmine": "~3.6.0", "@types/lunr": "^2.3.2", "@types/node": "^12.7.9", + "@types/stemmer": "^1.0.2", "@types/xregexp": "^3.0.30", "@yarnpkg/lockfile": "^1.1.0", "archiver": "^1.3.0", @@ -166,6 +167,7 @@ "rimraf": "^2.6.1", "semver": "^5.3.0", "shelljs": "^0.8.4", + "stemmer": "^1.0.5", "timezone-mock": "^1.1.3", "tree-kill": "^1.1.0", "ts-node": "^8.4.1", diff --git a/aio/src/app/search/search.worker.ts b/aio/src/app/search/search.worker.ts index 5c15cd1c6b..d31a5ce935 100644 --- a/aio/src/app/search/search.worker.ts +++ b/aio/src/app/search/search.worker.ts @@ -1,10 +1,11 @@ /// -import { WebWorkerMessage } from '../shared/web-worker-message'; import * as lunr from 'lunr'; +import {WebWorkerMessage} from '../shared/web-worker-message'; + const SEARCH_TERMS_URL = '/generated/docs/app/search-data.json'; let index: lunr.Index; -const pages: SearchInfo = {}; +const pageMap: SearchInfo = {}; interface SearchInfo { [key: string]: PageInfo; @@ -13,8 +14,25 @@ interface SearchInfo { interface PageInfo { path: string; type: string; - titleWords: string; - keyWords: string; + title: string; + headings: string; + keywords: string; + members: string; + topics: string; +} + +interface EncodedPages { + dictionary: string[]; + pages: EncodedPage[]; +} + +interface EncodedPage { + path: string; + type: string; + title: string; + headings: number[]; + keywords: number[]; + members: number[]; topics: string; } @@ -24,42 +42,42 @@ addEventListener('message', handleMessage); // the path and search terms for a page function createIndex(loadIndexFn: IndexLoader): lunr.Index { // The lunr typings are missing QueryLexer so we have to add them here manually. - const queryLexer = (lunr as any as { QueryLexer: { termSeparator: RegExp } }).QueryLexer; + const queryLexer = (lunr as any as {QueryLexer: {termSeparator: RegExp}}).QueryLexer; queryLexer.termSeparator = lunr.tokenizer.separator = /\s+/; return lunr(function() { + this.pipeline.remove(lunr.stemmer); this.ref('path'); - this.field('topics', { boost: 15 }); - this.field('titleWords', { boost: 10 }); - this.field('headingWords', { boost: 5 }); - this.field('members', { boost: 4 }); - this.field('keywords', { boost: 2 }); + this.field('topics', {boost: 15}); + this.field('title', {boost: 10}); + this.field('headings', {boost: 5}); + this.field('members', {boost: 4}); + this.field('keywords', {boost: 2}); loadIndexFn(this); }); } // The worker receives a message to load the index and to query the index -function handleMessage(message: { data: WebWorkerMessage }): void { +function handleMessage(message: {data: WebWorkerMessage}): void { const type = message.data.type; const id = message.data.id; const payload = message.data.payload; switch (type) { case 'load-index': - makeRequest(SEARCH_TERMS_URL, (searchInfo: PageInfo[]) => { - index = createIndex(loadIndex(searchInfo)); - postMessage({ type, id, payload: true }); + makeRequest(SEARCH_TERMS_URL, (encodedPages: EncodedPages) => { + index = createIndex(loadIndex(encodedPages)); + postMessage({type, id, payload: true}); }); break; case 'query-index': - postMessage({ type, id, payload: { query: payload, results: queryIndex(payload) } }); + postMessage({type, id, payload: {query: payload, results: queryIndex(payload)}}); break; default: - postMessage({ type, id, payload: { error: 'invalid message type' } }); + postMessage({type, id, payload: {error: 'invalid message type'}}); } } // Use XHR to make a request to the server function makeRequest(url: string, callback: (response: any) => void): void { - // The JSON file that is loaded should be an array of PageInfo: const searchDataRequest = new XMLHttpRequest(); searchDataRequest.onload = function() { @@ -70,18 +88,29 @@ function makeRequest(url: string, callback: (response: any) => void): void { } -// Create the search index from the searchInfo which contains the information about each page to be indexed -function loadIndex(pagesData: PageInfo[]): IndexLoader { +// Create the search index from the searchInfo which contains the information about each page to be +// indexed +function loadIndex({dictionary, pages}: EncodedPages): IndexLoader { return (indexBuilder: lunr.Builder) => { // Store the pages data to be used in mapping query results back to pages // Add search terms from each page to the search index - pagesData.forEach(page => { + pages.forEach(encodedPage => { + const page = decodePage(encodedPage, dictionary); indexBuilder.add(page); - pages[page.path] = page; + pageMap[page.path] = page; }); }; } +function decodePage(encodedPage: EncodedPage, dictionary: string[]): PageInfo { + return { + ...encodedPage, + headings: encodedPage.headings?.map(i => dictionary[i]).join(' ') ?? '', + keywords: encodedPage.keywords?.map(i => dictionary[i]).join(' ') ?? '', + members: encodedPage.members?.map(i => dictionary[i]).join(' ') ?? '', + }; +} + // Query the index and return the processed results function queryIndex(query: string): PageInfo[] { // Strip off quotes @@ -105,7 +134,7 @@ function queryIndex(query: string): PageInfo[] { } // Map the hits into info about each page to be returned as results - return results.map(hit => pages[hit.ref]); + return results.map(hit => pageMap[hit.ref]); } } catch (e) { // If the search query cannot be parsed the index throws an error diff --git a/aio/tools/transforms/angular-base-package/ignore-words.json b/aio/tools/transforms/angular-base-package/ignore-words.json new file mode 100644 index 0000000000..0f56284ddb --- /dev/null +++ b/aio/tools/transforms/angular-base-package/ignore-words.json @@ -0,0 +1,705 @@ +{ + "en": [ + "a", + "able", + "about", + "above", + "abst", + "accordance", + "according", + "accordingly", + "across", + "act", + "actually", + "added", + "adj", + "adopted", + "affected", + "affecting", + "affects", + "after", + "afterwards", + "again", + "against", + "ah", + "all", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "an", + "and", + "announce", + "another", + "any", + "anybody", + "anyhow", + "anymore", + "anyone", + "anything", + "anyway", + "anyways", + "anywhere", + "apparently", + "approximately", + "are", + "aren", + "arent", + "arise", + "around", + "as", + "aside", + "ask", + "asking", + "at", + "auth", + "available", + "away", + "awfully", + "b", + "back", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "begin", + "beginning", + "beginnings", + "begins", + "behind", + "being", + "believe", + "below", + "beside", + "besides", + "between", + "beyond", + "biol", + "both", + "brief", + "briefly", + "but", + "by", + "c", + "ca", + "came", + "can", + "cannot", + "can't", + "cant", + "cause", + "causes", + "certain", + "certainly", + "co", + "com", + "come", + "comes", + "contain", + "containing", + "contains", + "could", + "couldnt", + "d", + "date", + "did", + "didn't", + "didnt", + "different", + "do", + "does", + "doesn't", + "doesnt", + "doing", + "done", + "don't", + "dont", + "down", + "downwards", + "due", + "during", + "e", + "each", + "ed", + "edu", + "effect", + "eg", + "eight", + "eighty", + "either", + "else", + "elsewhere", + "end", + "ending", + "enough", + "especially", + "et", + "et-al", + "etc", + "even", + "ever", + "every", + "everybody", + "everyone", + "everything", + "everywhere", + "ex", + "except", + "f", + "far", + "few", + "ff", + "fifth", + "first", + "five", + "fix", + "followed", + "following", + "follows", + "for", + "former", + "formerly", + "forth", + "found", + "four", + "from", + "further", + "furthermore", + "g", + "gave", + "get", + "gets", + "getting", + "give", + "given", + "gives", + "giving", + "go", + "goes", + "gone", + "got", + "gotten", + "h", + "had", + "happens", + "hardly", + "has", + "hasn't", + "hasnt", + "have", + "haven't", + "havent", + "having", + "he", + "hed", + "hence", + "her", + "here", + "hereafter", + "hereby", + "herein", + "heres", + "hereupon", + "hers", + "herself", + "hes", + "hi", + "hid", + "him", + "himself", + "his", + "hither", + "home", + "how", + "howbeit", + "however", + "hundred", + "i", + "id", + "ie", + "if", + "i'll", + "ill", + "im", + "immediate", + "immediately", + "importance", + "important", + "in", + "inc", + "indeed", + "index", + "information", + "instead", + "into", + "invention", + "inward", + "is", + "isn't", + "isnt", + "it", + "itd", + "it'll", + "itll", + "its", + "itself", + "i've", + "ive", + "j", + "just", + "k", + "keep", + "keeps", + "kept", + "keys", + "kg", + "km", + "know", + "known", + "knows", + "l", + "largely", + "last", + "lately", + "later", + "latter", + "latterly", + "least", + "less", + "lest", + "let", + "lets", + "like", + "liked", + "likely", + "line", + "little", + "'ll", + "'ll", + "look", + "looking", + "looks", + "ltd", + "m", + "made", + "mainly", + "make", + "makes", + "many", + "may", + "maybe", + "me", + "mean", + "means", + "meantime", + "meanwhile", + "merely", + "mg", + "might", + "million", + "miss", + "ml", + "more", + "moreover", + "most", + "mostly", + "mr", + "mrs", + "much", + "mug", + "must", + "my", + "myself", + "n", + "na", + "name", + "namely", + "nay", + "nd", + "near", + "nearly", + "necessarily", + "necessary", + "need", + "needs", + "neither", + "never", + "nevertheless", + "new", + "next", + "nine", + "ninety", + "no", + "nobody", + "non", + "none", + "nonetheless", + "noone", + "nor", + "normally", + "nos", + "not", + "noted", + "nothing", + "now", + "nowhere", + "o", + "obtain", + "obtained", + "obviously", + "of", + "off", + "often", + "oh", + "ok", + "okay", + "old", + "omitted", + "on", + "once", + "one", + "ones", + "only", + "onto", + "or", + "ord", + "other", + "others", + "otherwise", + "ought", + "our", + "ours", + "ourselves", + "out", + "outside", + "over", + "overall", + "owing", + "own", + "p", + "page", + "pages", + "part", + "particular", + "particularly", + "past", + "per", + "perhaps", + "placed", + "please", + "plus", + "poorly", + "possible", + "possibly", + "potentially", + "pp", + "predominantly", + "present", + "previously", + "primarily", + "probably", + "promptly", + "proud", + "provides", + "put", + "q", + "que", + "quickly", + "quite", + "qv", + "r", + "ran", + "rather", + "rd", + "re", + "readily", + "really", + "recent", + "recently", + "ref", + "refs", + "regarding", + "regardless", + "regards", + "related", + "relatively", + "research", + "respectively", + "resulted", + "resulting", + "results", + "right", + "run", + "s", + "said", + "same", + "saw", + "say", + "saying", + "says", + "sec", + "section", + "see", + "seeing", + "seem", + "seemed", + "seeming", + "seems", + "seen", + "self", + "selves", + "sent", + "seven", + "several", + "shall", + "she", + "shed", + "she'll", + "shell", + "shes", + "should", + "shouldn't", + "shouldnt", + "show", + "showed", + "shown", + "showns", + "shows", + "significant", + "significantly", + "similar", + "similarly", + "since", + "six", + "slightly", + "so", + "some", + "somebody", + "somehow", + "someone", + "somethan", + "something", + "sometime", + "sometimes", + "somewhat", + "somewhere", + "soon", + "sorry", + "specifically", + "specified", + "specify", + "specifying", + "state", + "states", + "still", + "stop", + "strongly", + "sub", + "substantially", + "successfully", + "such", + "sufficiently", + "suggest", + "sup", + "sure", + "t", + "take", + "taken", + "taking", + "tell", + "tends", + "th", + "than", + "thank", + "thanks", + "thanx", + "that", + "that'll", + "thatll", + "thats", + "that've", + "thatve", + "the", + "their", + "theirs", + "them", + "themselves", + "then", + "thence", + "there", + "thereafter", + "thereby", + "thered", + "therefore", + "therein", + "there'll", + "therell", + "thereof", + "therere", + "theres", + "thereto", + "thereupon", + "there've", + "thereve", + "these", + "they", + "theyd", + "they'll", + "theyll", + "theyre", + "they've", + "theyve", + "think", + "this", + "those", + "thou", + "though", + "thoughh", + "thousand", + "throug", + "through", + "throughout", + "thru", + "thus", + "til", + "tip", + "to", + "together", + "too", + "took", + "toward", + "towards", + "tried", + "tries", + "truly", + "try", + "trying", + "ts", + "twice", + "two", + "u", + "un", + "under", + "unfortunately", + "unless", + "unlike", + "unlikely", + "until", + "unto", + "up", + "upon", + "ups", + "us", + "use", + "used", + "useful", + "usefully", + "usefulness", + "uses", + "using", + "usually", + "v", + "value", + "various", + "'ve", + "'ve", + "very", + "via", + "viz", + "vol", + "vols", + "vs", + "w", + "want", + "wants", + "was", + "wasn't", + "wasnt", + "way", + "we", + "wed", + "welcome", + "we'll", + "well", + "went", + "were", + "weren't", + "werent", + "we've", + "weve", + "what", + "whatever", + "what'll", + "whatll", + "whats", + "when", + "whence", + "whenever", + "where", + "whereafter", + "whereas", + "whereby", + "wherein", + "wheres", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whim", + "whither", + "who", + "whod", + "whoever", + "whole", + "who'll", + "wholl", + "whom", + "whomever", + "whos", + "whose", + "why", + "widely", + "will", + "willing", + "wish", + "with", + "within", + "without", + "won't", + "wont", + "words", + "would", + "wouldn't", + "wouldnt", + "www", + "x", + "y", + "yes", + "yet", + "you", + "youd", + "you'll", + "youll", + "your", + "youre", + "yours", + "yourself", + "yourselves", + "you've", + "youve", + "z", + "zero" + ] +} diff --git a/aio/tools/transforms/angular-base-package/ignore.words b/aio/tools/transforms/angular-base-package/ignore.words deleted file mode 100644 index 82b9f2fc3f..0000000000 --- a/aio/tools/transforms/angular-base-package/ignore.words +++ /dev/null @@ -1,701 +0,0 @@ -a -able -about -above -abst -accordance -according -accordingly -across -act -actually -added -adj -adopted -affected -affecting -affects -after -afterwards -again -against -ah -all -almost -alone -along -already -also -although -always -am -among -amongst -an -and -announce -another -any -anybody -anyhow -anymore -anyone -anything -anyway -anyways -anywhere -apparently -approximately -are -aren -arent -arise -around -as -aside -ask -asking -at -auth -available -away -awfully -b -back -be -became -because -become -becomes -becoming -been -before -beforehand -begin -beginning -beginnings -begins -behind -being -believe -below -beside -besides -between -beyond -biol -both -brief -briefly -but -by -c -ca -came -can -cannot -can't -cant -cause -causes -certain -certainly -co -com -come -comes -contain -containing -contains -could -couldnt -d -date -did -didn't -didnt -different -do -does -doesn't -doesnt -doing -done -don't -dont -down -downwards -due -during -e -each -ed -edu -effect -eg -eight -eighty -either -else -elsewhere -end -ending -enough -especially -et -et-al -etc -even -ever -every -everybody -everyone -everything -everywhere -ex -except -f -far -few -ff -fifth -first -five -fix -followed -following -follows -for -former -formerly -forth -found -four -from -further -furthermore -g -gave -get -gets -getting -give -given -gives -giving -go -goes -gone -got -gotten -h -had -happens -hardly -has -hasn't -hasnt -have -haven't -havent -having -he -hed -hence -her -here -hereafter -hereby -herein -heres -hereupon -hers -herself -hes -hi -hid -him -himself -his -hither -home -how -howbeit -however -hundred -i -id -ie -if -i'll -ill -im -immediate -immediately -importance -important -in -inc -indeed -index -information -instead -into -invention -inward -is -isn't -isnt -it -itd -it'll -itll -its -itself -i've -ive -j -just -k -keep -keeps -kept -keys -kg -km -know -known -knows -l -largely -last -lately -later -latter -latterly -least -less -lest -let -lets -like -liked -likely -line -little -'ll -'ll -look -looking -looks -ltd -m -made -mainly -make -makes -many -may -maybe -me -mean -means -meantime -meanwhile -merely -mg -might -million -miss -ml -more -moreover -most -mostly -mr -mrs -much -mug -must -my -myself -n -na -name -namely -nay -nd -near -nearly -necessarily -necessary -need -needs -neither -never -nevertheless -new -next -nine -ninety -no -nobody -non -none -nonetheless -noone -nor -normally -nos -not -noted -nothing -now -nowhere -o -obtain -obtained -obviously -of -off -often -oh -ok -okay -old -omitted -on -once -one -ones -only -onto -or -ord -other -others -otherwise -ought -our -ours -ourselves -out -outside -over -overall -owing -own -p -page -pages -part -particular -particularly -past -per -perhaps -placed -please -plus -poorly -possible -possibly -potentially -pp -predominantly -present -previously -primarily -probably -promptly -proud -provides -put -q -que -quickly -quite -qv -r -ran -rather -rd -re -readily -really -recent -recently -ref -refs -regarding -regardless -regards -related -relatively -research -respectively -resulted -resulting -results -right -run -s -said -same -saw -say -saying -says -sec -section -see -seeing -seem -seemed -seeming -seems -seen -self -selves -sent -seven -several -shall -she -shed -she'll -shell -shes -should -shouldn't -shouldnt -show -showed -shown -showns -shows -significant -significantly -similar -similarly -since -six -slightly -so -some -somebody -somehow -someone -somethan -something -sometime -sometimes -somewhat -somewhere -soon -sorry -specifically -specified -specify -specifying -state -states -still -stop -strongly -sub -substantially -successfully -such -sufficiently -suggest -sup -sure -t -take -taken -taking -tell -tends -th -than -thank -thanks -thanx -that -that'll -thatll -thats -that've -thatve -the -their -theirs -them -themselves -then -thence -there -thereafter -thereby -thered -therefore -therein -there'll -therell -thereof -therere -theres -thereto -thereupon -there've -thereve -these -they -theyd -they'll -theyll -theyre -they've -theyve -think -this -those -thou -though -thoughh -thousand -throug -through -throughout -thru -thus -til -tip -to -together -too -took -toward -towards -tried -tries -truly -try -trying -ts -twice -two -u -un -under -unfortunately -unless -unlike -unlikely -until -unto -up -upon -ups -us -use -used -useful -usefully -usefulness -uses -using -usually -v -value -various -'ve -'ve -very -via -viz -vol -vols -vs -w -want -wants -was -wasn't -wasnt -way -we -wed -welcome -we'll -well -went -were -weren't -werent -we've -weve -what -whatever -what'll -whatll -whats -when -whence -whenever -where -whereafter -whereas -whereby -wherein -wheres -whereupon -wherever -whether -which -while -whim -whither -who -whod -whoever -whole -who'll -wholl -whom -whomever -whos -whose -why -widely -will -willing -wish -with -within -without -won't -wont -words -would -wouldn't -wouldnt -www -x -y -yes -yet -you -youd -you'll -youll -your -youre -yours -yourself -yourselves -you've -youve -z -zero diff --git a/aio/tools/transforms/angular-base-package/index.js b/aio/tools/transforms/angular-base-package/index.js index e62f4373fb..8c946c89ec 100644 --- a/aio/tools/transforms/angular-base-package/index.js +++ b/aio/tools/transforms/angular-base-package/index.js @@ -65,9 +65,9 @@ module.exports = new Package('angular-base', [ readFilesProcessor.sourceFiles = []; collectExamples.exampleFolders = []; - generateKeywordsProcessor.ignoreWordsFile = path.resolve(__dirname, 'ignore.words'); + generateKeywordsProcessor.ignoreWords = require(path.resolve(__dirname, 'ignore-words'))['en']; generateKeywordsProcessor.docTypesToIgnore = ['example-region']; - generateKeywordsProcessor.propertiesToIgnore = ['basePath', 'renderedContent']; + generateKeywordsProcessor.propertiesToIgnore = ['basePath', 'renderedContent', 'docType', 'searchTitle']; }) // Where do we write the output files? diff --git a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js b/aio/tools/transforms/angular-base-package/processors/generateKeywords.js index 7cad35b0db..020d460de8 100644 --- a/aio/tools/transforms/angular-base-package/processors/generateKeywords.js +++ b/aio/tools/transforms/angular-base-package/processors/generateKeywords.js @@ -1,7 +1,6 @@ 'use strict'; -var fs = require('fs'); -var path = require('canonical-path'); +const stem = require('stemmer'); /** * @dgProcessor generateKeywordsProcessor @@ -10,103 +9,98 @@ var path = require('canonical-path'); * a new document that will be rendered as a JavaScript file containing all * this data. */ -module.exports = function generateKeywordsProcessor(log, readFilesProcessor) { +module.exports = function generateKeywordsProcessor(log) { return { - ignoreWordsFile: undefined, + ignoreWords: [], propertiesToIgnore: [], docTypesToIgnore: [], outputFolder: '', $validate: { - ignoreWordsFile: {}, + ignoreWords: {}, docTypesToIgnore: {}, propertiesToIgnore: {}, outputFolder: {presence: true} }, $runAfter: ['postProcessHtml'], $runBefore: ['writing-files'], - $process: function(docs) { + $process(docs) { + + const dictionary = new Map(); // Keywords to ignore - var wordsToIgnore = []; - var propertiesToIgnore; - var docTypesToIgnore; - - // Load up the keywords to ignore, if specified in the config - if (this.ignoreWordsFile) { - var ignoreWordsPath = path.resolve(readFilesProcessor.basePath, this.ignoreWordsFile); - wordsToIgnore = fs.readFileSync(ignoreWordsPath, 'utf8').toString().split(/[,\s\n\r]+/gm); - - log.debug('Loaded ignore words from "' + ignoreWordsPath + '"'); - log.silly(wordsToIgnore); - } - - propertiesToIgnore = convertToMap(this.propertiesToIgnore); + const ignoreWords = new Set(this.ignoreWords); + log.debug('Words to ignore', ignoreWords); + const propertiesToIgnore = new Set(this.propertiesToIgnore); log.debug('Properties to ignore', propertiesToIgnore); - docTypesToIgnore = convertToMap(this.docTypesToIgnore); + const docTypesToIgnore = new Set(this.docTypesToIgnore); log.debug('Doc types to ignore', docTypesToIgnore); - var ignoreWordsMap = convertToMap(wordsToIgnore); const filteredDocs = docs // We are not interested in some docTypes - .filter(function(doc) { return !docTypesToIgnore[doc.docType]; }) + .filter(doc => !docTypesToIgnore.has(doc.docType)) // Ignore internals and private exports (indicated by the ɵ prefix) - .filter(function(doc) { return !doc.internal && !doc.privateExport; }); + .filter(doc => !doc.internal && !doc.privateExport); - filteredDocs.forEach(function(doc) { - - var words = []; - var keywordMap = Object.assign({}, ignoreWordsMap); - var members = []; - var membersMap = Object.assign({}, ignoreWordsMap); - const headingWords = []; - const headingWordMap = Object.assign({}, ignoreWordsMap); - + for(const doc of filteredDocs) { // Search each top level property of the document for search terms - Object.keys(doc).forEach(function(key) { + let mainTokens = []; + for(const key of Object.keys(doc)) { const value = doc[key]; - - if (isString(value) && !propertiesToIgnore[key]) { - extractWords(value, words, keywordMap); + if (isString(value) && !propertiesToIgnore.has(key)) { + mainTokens.push(...tokenize(value, ignoreWords, dictionary)); } - }); + } - extractMemberWords(doc, members, membersMap); + const memberTokens = extractMemberTokens(doc, ignoreWords, dictionary); // Extract all the keywords from the headings + let headingTokens = []; if (doc.vFile && doc.vFile.headings) { - Object.keys(doc.vFile.headings).forEach(function(headingTag) { - doc.vFile.headings[headingTag].forEach(function(headingText) { - extractWords(headingText, headingWords, headingWordMap); - }); - }); + for(const headingTag of Object.keys(doc.vFile.headings)) { + for(const headingText of doc.vFile.headings[headingTag]) { + headingTokens.push(...tokenize(headingText, ignoreWords, dictionary)); + } + } } + // Extract the title to use in searches doc.searchTitle = doc.searchTitle || doc.title || doc.vFile && doc.vFile.title || doc.name || ''; // Attach all this search data to the document - doc.searchTerms = { - titleWords: tokenize(doc.searchTitle).join(' '), - headingWords: headingWords.sort().join(' '), - keywords: words.sort().join(' '), - members: members.sort().join(' '), - topics: doc.searchKeywords - }; - - }); + doc.searchTerms = {}; + if (headingTokens.length > 0) { + doc.searchTerms.headings = headingTokens; + } + if (mainTokens.length > 0) { + doc.searchTerms.keywords = mainTokens; + } + if (memberTokens.length > 0) { + doc.searchTerms.members = memberTokens; + } + if (doc.searchKeywords) { + doc.searchTerms.topics = doc.searchKeywords.trim(); + } + } // Now process all the search data and collect it up to be used in creating a new document - var searchData = filteredDocs.map(function(page) { - // Copy the properties from the searchTerms object onto the search data object - return Object.assign({ - path: page.path, - title: page.searchTitle, - type: page.docType, - deprecated: !!page.deprecated, - }, page.searchTerms); - }); + const searchData = { + dictionary: Array.from(dictionary.keys()), + pages: filteredDocs.map(page => { + // Copy the properties from the searchTerms object onto the search data object + const searchObj = { + path: page.path, + title: page.searchTitle, + type: page.docType, + }; + if (page.deprecated) { + searchObj.deprecated = true; + } + return Object.assign(searchObj, page.searchTerms); + }), + }; docs.push({ docType: 'json-doc', @@ -120,63 +114,64 @@ module.exports = function generateKeywordsProcessor(log, readFilesProcessor) { }; }; - function isString(value) { return typeof value == 'string'; } -function convertToMap(collection) { - const obj = {}; - collection.forEach(key => { obj[key] = true; }); - return obj; -} - -// If the heading contains a name starting with ng, e.g. "ngController", then add the -// name without the ng to the text, e.g. "controller". -function tokenize(text) { - const rawTokens = text.split(/[\s\/]+/mg); +function tokenize(text, ignoreWords, dictionary) { + // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but reduces the unwanted tokens that are indexed). + const rawTokens = text.split(/[\s\/]+|<\/?[a-z]+(?:\s+\w+(?:="[^"]+")?)*>/img); const tokens = []; - rawTokens.forEach(token => { + for(let token of rawTokens) { + token = token.trim(); + // Strip off unwanted trivial characters - token = token - .trim() - .replace(/^[_\-"'`({[<$*)}\]>.]+/, '') - .replace(/[_\-"'`({[<$*)}\]>.]+$/, ''); - // Ignore tokens that contain weird characters - if (/^[\w.\-]+$/.test(token)) { - tokens.push(token.toLowerCase()); - const ngTokenMatch = /^[nN]g([A-Z]\w*)/.exec(token); - if (ngTokenMatch) { - tokens.push(ngTokenMatch[1].toLowerCase()); - } + token = token.replace(/^[_\-"'`({[<$*)}\]>.]+/, '').replace(/[_\-"'`({[<$*)}\]>.]+$/, ''); + + // Skip if in the ignored words list + if (ignoreWords.has(token.toLowerCase())) { + continue; } - }); + + // Skip tokens that contain weird characters + if (!/^[\w._-]+$/.test(token)) { + continue; + } + + storeToken(token, tokens, dictionary); + if (token.startsWith('ng')) { + storeToken(token.substr(2), tokens, dictionary); + } + } + return tokens; } -function extractWords(text, words, keywordMap) { - var tokens = tokenize(text); - tokens.forEach(function(token) { - if (!keywordMap[token]) { - words.push(token); - keywordMap[token] = true; - } - }); +function storeToken(token, tokens, dictionary) { + token = stem(token); + if (!dictionary.has(token)) { + dictionary.set(token, dictionary.size); + } + tokens.push(dictionary.get(token)); } -function extractMemberWords(doc, members, membersMap) { - if (!doc) return; +function extractMemberTokens(doc, ignoreWords, dictionary) { + if (!doc) return ''; + + let memberContent = []; if (doc.members) { - doc.members.forEach(member => extractWords(member.name, members, membersMap)); + doc.members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary))); } if (doc.statics) { - doc.statics.forEach(member => extractWords(member.name, members, membersMap)); + doc.statics.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary))); } if (doc.extendsClauses) { - doc.extendsClauses.forEach(clause => extractMemberWords(clause.doc, members, membersMap)); + doc.extendsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary))); } if (doc.implementsClauses) { - doc.implementsClauses.forEach(clause => extractMemberWords(clause.doc, members, membersMap)); + doc.implementsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary))); } -} \ No newline at end of file + + return memberContent; +} diff --git a/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js b/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js index e482ee4a1b..3065a1c16e 100644 --- a/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js +++ b/aio/tools/transforms/angular-base-package/processors/generateKeywords.spec.js @@ -1,12 +1,22 @@ +const path = require('canonical-path'); +const Dgeni = require('dgeni'); + const testPackage = require('../../helpers/test-package'); const mockLogger = require('dgeni/lib/mocks/log')(false); const processorFactory = require('./generateKeywords'); -const Dgeni = require('dgeni'); const mockReadFilesProcessor = { basePath: 'base/path' }; +const ignoreWords = require(path.resolve(__dirname, '../ignore-words'))['en']; + +function createProcessor() { + const processor = processorFactory(mockLogger, mockReadFilesProcessor); + processor.ignoreWords = ignoreWords; + return processor; +} + describe('generateKeywords processor', () => { it('should be available on the injector', () => { @@ -17,30 +27,81 @@ describe('generateKeywords processor', () => { }); it('should run after the correct processor', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); expect(processor.$runAfter).toEqual(['postProcessHtml']); }); it('should run before the correct processor', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); expect(processor.$runBefore).toEqual(['writing-files']); }); it('should ignore internal and private exports', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); const docs = [ { docType: 'class', name: 'PublicExport' }, { docType: 'class', name: 'PrivateExport', privateExport: true }, { docType: 'class', name: 'InternalExport', internal: true } ]; processor.$process(docs); - expect(docs[docs.length - 1].data).toEqual([ - jasmine.objectContaining({ title: 'PublicExport', type: 'class'}) + expect(docs[docs.length - 1].data.pages).toEqual([ + jasmine.objectContaining({ title: 'PublicExport', type: 'class' }) ]); }); + it('should ignore docs that are in the `docTypesToIgnore` list', () => { + const processor = createProcessor(); + processor.docTypesToIgnore = ['interface']; + const docs = [ + { docType: 'class', name: 'Class' }, + { docType: 'interface', name: 'Interface' }, + { docType: 'content', name: 'Guide' }, + ]; + processor.$process(docs); + expect(docs[docs.length - 1].data.pages).toEqual([ + jasmine.objectContaining({ title: 'Class', type: 'class' }), + jasmine.objectContaining({ title: 'Guide', type: 'content' }), + ]); + }); + + it('should not collect keywords from properties that are in the `propertiesToIgnore` list', () => { + const processor = createProcessor(); + processor.propertiesToIgnore = ['docType', 'ignore']; + const docs = [ + { docType: 'class', name: 'FooClass', ignore: 'ignore this content' }, + { docType: 'interface', name: 'BarInterface', capture: 'capture this content' }, + ]; + processor.$process(docs); + expect(docs[docs.length - 1].data).toEqual({ + dictionary: [ 'fooclass', 'barinterfac', 'captur', 'content' ], + pages: [ + jasmine.objectContaining({ title: 'FooClass', type: 'class', keywords: [0] }), + jasmine.objectContaining({ title: 'BarInterface', type: 'interface', keywords: [1, 2, 3] }), + ], + }); + }); + + it('should not collect keywords that look like HTML tags', () => { + const processor = createProcessor(); + const docs = [ + { docType: 'class', name: 'FooClass', content: ` + + + + +
Content inside a table
` }, + ]; + processor.$process(docs); + expect(docs[docs.length - 1].data).toEqual({ + dictionary: ['class', 'fooclass', 'content', 'insid', 'tabl'], + pages: [ + jasmine.objectContaining({keywords: [0, 1, 2, 3, 4] }) + ], + }); + }); + it('should compute `doc.searchTitle` from the doc properties if not already provided', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); const docs = [ { docType: 'class', name: 'A', searchTitle: 'searchTitle A', title: 'title A', vFile: { headings: { h1: ['vFile A'] } } }, { docType: 'class', name: 'B', title: 'title B', vFile: { headings: { h1: ['vFile B'] } } }, @@ -48,7 +109,7 @@ describe('generateKeywords processor', () => { { docType: 'class', name: 'D' }, ]; processor.$process(docs); - expect(docs[docs.length - 1].data).toEqual([ + expect(docs[docs.length - 1].data.pages).toEqual([ jasmine.objectContaining({ title: 'searchTitle A' }), jasmine.objectContaining({ title: 'title B' }), jasmine.objectContaining({ title: 'vFile C' }), @@ -57,34 +118,19 @@ describe('generateKeywords processor', () => { }); it('should use `doc.searchTitle` as the title in the search index', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); const docs = [ { docType: 'class', name: 'PublicExport', searchTitle: 'class PublicExport' }, ]; processor.$process(docs); const keywordsDoc = docs[docs.length - 1]; - expect(keywordsDoc.data).toEqual([ - jasmine.objectContaining({ title: 'class PublicExport', type: 'class'}) + expect(keywordsDoc.data.pages).toEqual([ + jasmine.objectContaining({ title: 'class PublicExport', type: 'class' }) ]); }); - it('should add title words to the search terms', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); - const docs = [ - { - docType: 'class', - name: 'PublicExport', - searchTitle: 'class PublicExport', - vFile: { headings: { h2: ['heading A', 'heading B'] } } - }, - ]; - processor.$process(docs); - const keywordsDoc = docs[docs.length - 1]; - expect(keywordsDoc.data[0].titleWords).toEqual('class publicexport'); - }); - it('should add heading words to the search terms', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); const docs = [ { docType: 'class', @@ -95,11 +141,16 @@ describe('generateKeywords processor', () => { ]; processor.$process(docs); const keywordsDoc = docs[docs.length - 1]; - expect(keywordsDoc.data[0].headingWords).toEqual('heading important secondary'); + expect(keywordsDoc.data).toEqual({ + dictionary: ['class', 'publicexport', 'head', 'secondari'], + pages: [ + jasmine.objectContaining({ headings: [2, 3, 2] }) + ] + }); }); it('should add member doc properties to the search terms', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); const docs = [ { docType: 'class', @@ -123,13 +174,18 @@ describe('generateKeywords processor', () => { ]; processor.$process(docs); const keywordsDoc = docs[docs.length - 1]; - expect(keywordsDoc.data[0].members).toEqual( - 'instancemethoda instancemethodb instancepropertya instancepropertyb staticmethoda staticmethodb staticpropertya staticpropertyb' - ); + expect(keywordsDoc.data).toEqual({ + dictionary: ['class', 'publicexport', 'content', 'ngclass', 'instancemethoda','instancepropertya','instancemethodb','instancepropertyb','staticmethoda','staticpropertya','staticmethodb','staticpropertyb', 'head'], + pages: [ + jasmine.objectContaining({ + members: [4, 5, 6, 7, 8, 9, 10, 11] + }) + ] + }); }); it('should add inherited member doc properties to the search terms', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + const processor = createProcessor(); const parentClass = { docType: 'class', name: 'ParentClass', @@ -163,13 +219,27 @@ describe('generateKeywords processor', () => { const docs = [childClass, parentClass, parentInterface]; processor.$process(docs); const keywordsDoc = docs[docs.length - 1]; - expect(keywordsDoc.data[0].members.split(' ').sort().join(' ')).toEqual( - 'childmember1 childmember2 parentmember1 parentmember2 parentmember3' - ); + expect(keywordsDoc.data).toEqual({ + dictionary: ['class', 'child', 'childmember1', 'childmember2', 'parentmember1', 'parentmember2', 'parentmember3', 'parentclass', 'interfac', 'parentinterfac'], + pages: [ + jasmine.objectContaining({ + title: 'Child', + members: [2, 3, 4, 5, 6] + }), + jasmine.objectContaining({ + title: 'ParentClass', + members: [4, 5] + }), + jasmine.objectContaining({ + title: 'ParentInterface', + members: [6] + }) + ] + }); }); - it('should process terms prefixed with "ng" to include the term stripped of "ng"', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + it('should include both stripped and unstripped "ng" prefixed tokens', () => { + const processor = createProcessor(); const docs = [ { docType: 'class', @@ -181,14 +251,19 @@ describe('generateKeywords processor', () => { ]; processor.$process(docs); const keywordsDoc = docs[docs.length - 1]; - expect(keywordsDoc.data[0].titleWords).toEqual('ngcontroller controller'); - expect(keywordsDoc.data[0].headingWords).toEqual('model ngmodel'); - expect(keywordsDoc.data[0].keywords).toContain('class'); - expect(keywordsDoc.data[0].keywords).toContain('ngclass'); + expect(keywordsDoc.data).toEqual({ + dictionary: ['class', 'publicexport', 'ngcontrol', 'control', 'content', 'ngclass', 'ngmodel', 'model'], + pages: [ + jasmine.objectContaining({ + headings: [6, 7], + keywords: [0, 1, 2, 3, 4, 5, 0], + }) + ], + }); }); - it('should generate renderedContent property', () => { - const processor = processorFactory(mockLogger, mockReadFilesProcessor); + it('should generate compressed encoded renderedContent property', () => { + const processor = createProcessor(); const docs = [ { docType: 'class', @@ -196,19 +271,33 @@ describe('generateKeywords processor', () => { description: 'The is the documentation for the SomeClass API.', vFile: { headings: { h1: ['SomeClass'], h2: ['Some heading'] } } }, + { + docType: 'class', + name: 'SomeClass2', + description: 'description', + members: [ + { name: 'member1' }, + ], + deprecated: true + }, ]; processor.$process(docs); const keywordsDoc = docs[docs.length - 1]; - expect(JSON.parse(keywordsDoc.renderedContent)).toEqual( - [{ + expect(JSON.parse(keywordsDoc.renderedContent)).toEqual({ + dictionary: ['class', 'someclass', 'document', 'api', 'head', 'someclass2', 'descript', 'member1'], + pages: [{ 'title':'SomeClass', 'type':'class', - 'titleWords':'someclass', - 'headingWords':'heading some someclass', - 'keywords':'api class documentation for is someclass the', - 'members':'', - 'deprecated': false, + 'headings': [1, 4], + 'keywords': [0, 1, 2, 1, 3], + }, + { + 'title':'SomeClass2', + 'type':'class', + 'keywords': [0, 5, 6], + 'members': [7], + 'deprecated': true, }] - ); + }); }); }); diff --git a/aio/yarn.lock b/aio/yarn.lock index 6e5d4e71ad..bcfb0be3d0 100644 --- a/aio/yarn.lock +++ b/aio/yarn.lock @@ -2005,6 +2005,11 @@ resolved "https://registry.yarnpkg.com/@types/source-list-map/-/source-list-map-0.1.2.tgz#0078836063ffaf17412349bba364087e0ac02ec9" integrity sha512-K5K+yml8LTo9bWJI/rECfIPrGgxdpeNbj+d53lwN4QjW1MCwlkhUms+gtdzigTeUyBr09+u8BwOIY3MXvHdcsA== +"@types/stemmer@^1.0.2": + version "1.0.2" + resolved "https://registry.yarnpkg.com/@types/stemmer/-/stemmer-1.0.2.tgz#bd8354f50b3c9b87c351d169240e45cf1fa1f5e8" + integrity sha512-2gWEIFqVZjjZxo8/TcugCAl7nW9Jd9ArEDpTAc5nH7d+ZUkreHA7GzuFcLZ0sflLrA5b1PZ+2yDyHJcuP9KWWw== + "@types/unist@*", "@types/unist@^2.0.0", "@types/unist@^2.0.2": version "2.0.3" resolved "https://registry.yarnpkg.com/@types/unist/-/unist-2.0.3.tgz#9c088679876f374eb5983f150d4787aa6fb32d7e" @@ -12802,6 +12807,11 @@ static-extend@^0.1.1: resolved "https://registry.yarnpkg.com/statuses/-/statuses-1.5.0.tgz#161c7dac177659fd9811f43771fa99381478628c" integrity sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow= +stemmer@^1.0.5: + version "1.0.5" + resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-1.0.5.tgz#fd89beaf8bff5d04b6643bfffcaed0fc420deec0" + integrity sha512-SLq7annzSKRDStasOJJoftCSCzBCKmBmH38jC4fDtCunAqOzpTpIm9zmaHmwNJiZ8gLe9qpVdBVbEG2DC5dE2A== + stream-browserify@^2.0.1: version "2.0.2" resolved "https://registry.yarnpkg.com/stream-browserify/-/stream-browserify-2.0.2.tgz#87521d38a44aa7ee91ce1cd2a47df0cb49dd660b"