refactor(docs-infra): include more info in search index data (#41368)
The AIO search index is built in a WebWorker on the browser from a set of page information that is downloaded as a JSON file (`search-data.json`). We want to keep this file as small as possible while providing enough data to generate a useful index to query against. Previously, we only included one copy of each (non-ignored) term from each doc but this prevents more subtle ranking of query results, since the number of occurrences of a term in a doc is lost. This commit changes the generated file in the following ways: - All non-ignored terms are now included in the order in which they appear in the doc. - The terms are indexed into a dictionary to avoid the text of the term being repeated in every doc that contains the term. - Each term is pre-"stemmed" using the same Porter Stemming algorithm that the Lunr search engine uses. The web-worker has been updated to decode the new format of the file. Now that all terms are included, it may enable some level of phrase based matching in the future. The size of the generated file is considerably larger than previously, but on production HTTP servers the data is sent compressed, which reduces the size dramatically. PR Close #41368
This commit is contained in:
parent
55f7f1d446
commit
fccffc647b
|
@ -116,6 +116,7 @@
|
||||||
"@types/jasmine": "~3.6.0",
|
"@types/jasmine": "~3.6.0",
|
||||||
"@types/lunr": "^2.3.2",
|
"@types/lunr": "^2.3.2",
|
||||||
"@types/node": "^12.7.9",
|
"@types/node": "^12.7.9",
|
||||||
|
"@types/stemmer": "^1.0.2",
|
||||||
"@types/xregexp": "^3.0.30",
|
"@types/xregexp": "^3.0.30",
|
||||||
"@yarnpkg/lockfile": "^1.1.0",
|
"@yarnpkg/lockfile": "^1.1.0",
|
||||||
"archiver": "^1.3.0",
|
"archiver": "^1.3.0",
|
||||||
|
@ -166,6 +167,7 @@
|
||||||
"rimraf": "^2.6.1",
|
"rimraf": "^2.6.1",
|
||||||
"semver": "^5.3.0",
|
"semver": "^5.3.0",
|
||||||
"shelljs": "^0.8.4",
|
"shelljs": "^0.8.4",
|
||||||
|
"stemmer": "^1.0.5",
|
||||||
"timezone-mock": "^1.1.3",
|
"timezone-mock": "^1.1.3",
|
||||||
"tree-kill": "^1.1.0",
|
"tree-kill": "^1.1.0",
|
||||||
"ts-node": "^8.4.1",
|
"ts-node": "^8.4.1",
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
/// <reference lib="webworker" />
|
/// <reference lib="webworker" />
|
||||||
import { WebWorkerMessage } from '../shared/web-worker-message';
|
|
||||||
import * as lunr from 'lunr';
|
import * as lunr from 'lunr';
|
||||||
|
|
||||||
|
import {WebWorkerMessage} from '../shared/web-worker-message';
|
||||||
|
|
||||||
const SEARCH_TERMS_URL = '/generated/docs/app/search-data.json';
|
const SEARCH_TERMS_URL = '/generated/docs/app/search-data.json';
|
||||||
let index: lunr.Index;
|
let index: lunr.Index;
|
||||||
const pages: SearchInfo = {};
|
const pageMap: SearchInfo = {};
|
||||||
|
|
||||||
interface SearchInfo {
|
interface SearchInfo {
|
||||||
[key: string]: PageInfo;
|
[key: string]: PageInfo;
|
||||||
|
@ -13,8 +14,25 @@ interface SearchInfo {
|
||||||
interface PageInfo {
|
interface PageInfo {
|
||||||
path: string;
|
path: string;
|
||||||
type: string;
|
type: string;
|
||||||
titleWords: string;
|
title: string;
|
||||||
keyWords: string;
|
headings: string;
|
||||||
|
keywords: string;
|
||||||
|
members: string;
|
||||||
|
topics: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface EncodedPages {
|
||||||
|
dictionary: string[];
|
||||||
|
pages: EncodedPage[];
|
||||||
|
}
|
||||||
|
|
||||||
|
interface EncodedPage {
|
||||||
|
path: string;
|
||||||
|
type: string;
|
||||||
|
title: string;
|
||||||
|
headings: number[];
|
||||||
|
keywords: number[];
|
||||||
|
members: number[];
|
||||||
topics: string;
|
topics: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -24,42 +42,42 @@ addEventListener('message', handleMessage);
|
||||||
// the path and search terms for a page
|
// the path and search terms for a page
|
||||||
function createIndex(loadIndexFn: IndexLoader): lunr.Index {
|
function createIndex(loadIndexFn: IndexLoader): lunr.Index {
|
||||||
// The lunr typings are missing QueryLexer so we have to add them here manually.
|
// The lunr typings are missing QueryLexer so we have to add them here manually.
|
||||||
const queryLexer = (lunr as any as { QueryLexer: { termSeparator: RegExp } }).QueryLexer;
|
const queryLexer = (lunr as any as {QueryLexer: {termSeparator: RegExp}}).QueryLexer;
|
||||||
queryLexer.termSeparator = lunr.tokenizer.separator = /\s+/;
|
queryLexer.termSeparator = lunr.tokenizer.separator = /\s+/;
|
||||||
return lunr(function() {
|
return lunr(function() {
|
||||||
|
this.pipeline.remove(lunr.stemmer);
|
||||||
this.ref('path');
|
this.ref('path');
|
||||||
this.field('topics', { boost: 15 });
|
this.field('topics', {boost: 15});
|
||||||
this.field('titleWords', { boost: 10 });
|
this.field('title', {boost: 10});
|
||||||
this.field('headingWords', { boost: 5 });
|
this.field('headings', {boost: 5});
|
||||||
this.field('members', { boost: 4 });
|
this.field('members', {boost: 4});
|
||||||
this.field('keywords', { boost: 2 });
|
this.field('keywords', {boost: 2});
|
||||||
loadIndexFn(this);
|
loadIndexFn(this);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// The worker receives a message to load the index and to query the index
|
// The worker receives a message to load the index and to query the index
|
||||||
function handleMessage(message: { data: WebWorkerMessage }): void {
|
function handleMessage(message: {data: WebWorkerMessage}): void {
|
||||||
const type = message.data.type;
|
const type = message.data.type;
|
||||||
const id = message.data.id;
|
const id = message.data.id;
|
||||||
const payload = message.data.payload;
|
const payload = message.data.payload;
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case 'load-index':
|
case 'load-index':
|
||||||
makeRequest(SEARCH_TERMS_URL, (searchInfo: PageInfo[]) => {
|
makeRequest(SEARCH_TERMS_URL, (encodedPages: EncodedPages) => {
|
||||||
index = createIndex(loadIndex(searchInfo));
|
index = createIndex(loadIndex(encodedPages));
|
||||||
postMessage({ type, id, payload: true });
|
postMessage({type, id, payload: true});
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
case 'query-index':
|
case 'query-index':
|
||||||
postMessage({ type, id, payload: { query: payload, results: queryIndex(payload) } });
|
postMessage({type, id, payload: {query: payload, results: queryIndex(payload)}});
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
postMessage({ type, id, payload: { error: 'invalid message type' } });
|
postMessage({type, id, payload: {error: 'invalid message type'}});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use XHR to make a request to the server
|
// Use XHR to make a request to the server
|
||||||
function makeRequest(url: string, callback: (response: any) => void): void {
|
function makeRequest(url: string, callback: (response: any) => void): void {
|
||||||
|
|
||||||
// The JSON file that is loaded should be an array of PageInfo:
|
// The JSON file that is loaded should be an array of PageInfo:
|
||||||
const searchDataRequest = new XMLHttpRequest();
|
const searchDataRequest = new XMLHttpRequest();
|
||||||
searchDataRequest.onload = function() {
|
searchDataRequest.onload = function() {
|
||||||
|
@ -70,18 +88,29 @@ function makeRequest(url: string, callback: (response: any) => void): void {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Create the search index from the searchInfo which contains the information about each page to be indexed
|
// Create the search index from the searchInfo which contains the information about each page to be
|
||||||
function loadIndex(pagesData: PageInfo[]): IndexLoader {
|
// indexed
|
||||||
|
function loadIndex({dictionary, pages}: EncodedPages): IndexLoader {
|
||||||
return (indexBuilder: lunr.Builder) => {
|
return (indexBuilder: lunr.Builder) => {
|
||||||
// Store the pages data to be used in mapping query results back to pages
|
// Store the pages data to be used in mapping query results back to pages
|
||||||
// Add search terms from each page to the search index
|
// Add search terms from each page to the search index
|
||||||
pagesData.forEach(page => {
|
pages.forEach(encodedPage => {
|
||||||
|
const page = decodePage(encodedPage, dictionary);
|
||||||
indexBuilder.add(page);
|
indexBuilder.add(page);
|
||||||
pages[page.path] = page;
|
pageMap[page.path] = page;
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function decodePage(encodedPage: EncodedPage, dictionary: string[]): PageInfo {
|
||||||
|
return {
|
||||||
|
...encodedPage,
|
||||||
|
headings: encodedPage.headings?.map(i => dictionary[i]).join(' ') ?? '',
|
||||||
|
keywords: encodedPage.keywords?.map(i => dictionary[i]).join(' ') ?? '',
|
||||||
|
members: encodedPage.members?.map(i => dictionary[i]).join(' ') ?? '',
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// Query the index and return the processed results
|
// Query the index and return the processed results
|
||||||
function queryIndex(query: string): PageInfo[] {
|
function queryIndex(query: string): PageInfo[] {
|
||||||
// Strip off quotes
|
// Strip off quotes
|
||||||
|
@ -105,7 +134,7 @@ function queryIndex(query: string): PageInfo[] {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Map the hits into info about each page to be returned as results
|
// Map the hits into info about each page to be returned as results
|
||||||
return results.map(hit => pages[hit.ref]);
|
return results.map(hit => pageMap[hit.ref]);
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
// If the search query cannot be parsed the index throws an error
|
// If the search query cannot be parsed the index throws an error
|
||||||
|
|
|
@ -0,0 +1,705 @@
|
||||||
|
{
|
||||||
|
"en": [
|
||||||
|
"a",
|
||||||
|
"able",
|
||||||
|
"about",
|
||||||
|
"above",
|
||||||
|
"abst",
|
||||||
|
"accordance",
|
||||||
|
"according",
|
||||||
|
"accordingly",
|
||||||
|
"across",
|
||||||
|
"act",
|
||||||
|
"actually",
|
||||||
|
"added",
|
||||||
|
"adj",
|
||||||
|
"adopted",
|
||||||
|
"affected",
|
||||||
|
"affecting",
|
||||||
|
"affects",
|
||||||
|
"after",
|
||||||
|
"afterwards",
|
||||||
|
"again",
|
||||||
|
"against",
|
||||||
|
"ah",
|
||||||
|
"all",
|
||||||
|
"almost",
|
||||||
|
"alone",
|
||||||
|
"along",
|
||||||
|
"already",
|
||||||
|
"also",
|
||||||
|
"although",
|
||||||
|
"always",
|
||||||
|
"am",
|
||||||
|
"among",
|
||||||
|
"amongst",
|
||||||
|
"an",
|
||||||
|
"and",
|
||||||
|
"announce",
|
||||||
|
"another",
|
||||||
|
"any",
|
||||||
|
"anybody",
|
||||||
|
"anyhow",
|
||||||
|
"anymore",
|
||||||
|
"anyone",
|
||||||
|
"anything",
|
||||||
|
"anyway",
|
||||||
|
"anyways",
|
||||||
|
"anywhere",
|
||||||
|
"apparently",
|
||||||
|
"approximately",
|
||||||
|
"are",
|
||||||
|
"aren",
|
||||||
|
"arent",
|
||||||
|
"arise",
|
||||||
|
"around",
|
||||||
|
"as",
|
||||||
|
"aside",
|
||||||
|
"ask",
|
||||||
|
"asking",
|
||||||
|
"at",
|
||||||
|
"auth",
|
||||||
|
"available",
|
||||||
|
"away",
|
||||||
|
"awfully",
|
||||||
|
"b",
|
||||||
|
"back",
|
||||||
|
"be",
|
||||||
|
"became",
|
||||||
|
"because",
|
||||||
|
"become",
|
||||||
|
"becomes",
|
||||||
|
"becoming",
|
||||||
|
"been",
|
||||||
|
"before",
|
||||||
|
"beforehand",
|
||||||
|
"begin",
|
||||||
|
"beginning",
|
||||||
|
"beginnings",
|
||||||
|
"begins",
|
||||||
|
"behind",
|
||||||
|
"being",
|
||||||
|
"believe",
|
||||||
|
"below",
|
||||||
|
"beside",
|
||||||
|
"besides",
|
||||||
|
"between",
|
||||||
|
"beyond",
|
||||||
|
"biol",
|
||||||
|
"both",
|
||||||
|
"brief",
|
||||||
|
"briefly",
|
||||||
|
"but",
|
||||||
|
"by",
|
||||||
|
"c",
|
||||||
|
"ca",
|
||||||
|
"came",
|
||||||
|
"can",
|
||||||
|
"cannot",
|
||||||
|
"can't",
|
||||||
|
"cant",
|
||||||
|
"cause",
|
||||||
|
"causes",
|
||||||
|
"certain",
|
||||||
|
"certainly",
|
||||||
|
"co",
|
||||||
|
"com",
|
||||||
|
"come",
|
||||||
|
"comes",
|
||||||
|
"contain",
|
||||||
|
"containing",
|
||||||
|
"contains",
|
||||||
|
"could",
|
||||||
|
"couldnt",
|
||||||
|
"d",
|
||||||
|
"date",
|
||||||
|
"did",
|
||||||
|
"didn't",
|
||||||
|
"didnt",
|
||||||
|
"different",
|
||||||
|
"do",
|
||||||
|
"does",
|
||||||
|
"doesn't",
|
||||||
|
"doesnt",
|
||||||
|
"doing",
|
||||||
|
"done",
|
||||||
|
"don't",
|
||||||
|
"dont",
|
||||||
|
"down",
|
||||||
|
"downwards",
|
||||||
|
"due",
|
||||||
|
"during",
|
||||||
|
"e",
|
||||||
|
"each",
|
||||||
|
"ed",
|
||||||
|
"edu",
|
||||||
|
"effect",
|
||||||
|
"eg",
|
||||||
|
"eight",
|
||||||
|
"eighty",
|
||||||
|
"either",
|
||||||
|
"else",
|
||||||
|
"elsewhere",
|
||||||
|
"end",
|
||||||
|
"ending",
|
||||||
|
"enough",
|
||||||
|
"especially",
|
||||||
|
"et",
|
||||||
|
"et-al",
|
||||||
|
"etc",
|
||||||
|
"even",
|
||||||
|
"ever",
|
||||||
|
"every",
|
||||||
|
"everybody",
|
||||||
|
"everyone",
|
||||||
|
"everything",
|
||||||
|
"everywhere",
|
||||||
|
"ex",
|
||||||
|
"except",
|
||||||
|
"f",
|
||||||
|
"far",
|
||||||
|
"few",
|
||||||
|
"ff",
|
||||||
|
"fifth",
|
||||||
|
"first",
|
||||||
|
"five",
|
||||||
|
"fix",
|
||||||
|
"followed",
|
||||||
|
"following",
|
||||||
|
"follows",
|
||||||
|
"for",
|
||||||
|
"former",
|
||||||
|
"formerly",
|
||||||
|
"forth",
|
||||||
|
"found",
|
||||||
|
"four",
|
||||||
|
"from",
|
||||||
|
"further",
|
||||||
|
"furthermore",
|
||||||
|
"g",
|
||||||
|
"gave",
|
||||||
|
"get",
|
||||||
|
"gets",
|
||||||
|
"getting",
|
||||||
|
"give",
|
||||||
|
"given",
|
||||||
|
"gives",
|
||||||
|
"giving",
|
||||||
|
"go",
|
||||||
|
"goes",
|
||||||
|
"gone",
|
||||||
|
"got",
|
||||||
|
"gotten",
|
||||||
|
"h",
|
||||||
|
"had",
|
||||||
|
"happens",
|
||||||
|
"hardly",
|
||||||
|
"has",
|
||||||
|
"hasn't",
|
||||||
|
"hasnt",
|
||||||
|
"have",
|
||||||
|
"haven't",
|
||||||
|
"havent",
|
||||||
|
"having",
|
||||||
|
"he",
|
||||||
|
"hed",
|
||||||
|
"hence",
|
||||||
|
"her",
|
||||||
|
"here",
|
||||||
|
"hereafter",
|
||||||
|
"hereby",
|
||||||
|
"herein",
|
||||||
|
"heres",
|
||||||
|
"hereupon",
|
||||||
|
"hers",
|
||||||
|
"herself",
|
||||||
|
"hes",
|
||||||
|
"hi",
|
||||||
|
"hid",
|
||||||
|
"him",
|
||||||
|
"himself",
|
||||||
|
"his",
|
||||||
|
"hither",
|
||||||
|
"home",
|
||||||
|
"how",
|
||||||
|
"howbeit",
|
||||||
|
"however",
|
||||||
|
"hundred",
|
||||||
|
"i",
|
||||||
|
"id",
|
||||||
|
"ie",
|
||||||
|
"if",
|
||||||
|
"i'll",
|
||||||
|
"ill",
|
||||||
|
"im",
|
||||||
|
"immediate",
|
||||||
|
"immediately",
|
||||||
|
"importance",
|
||||||
|
"important",
|
||||||
|
"in",
|
||||||
|
"inc",
|
||||||
|
"indeed",
|
||||||
|
"index",
|
||||||
|
"information",
|
||||||
|
"instead",
|
||||||
|
"into",
|
||||||
|
"invention",
|
||||||
|
"inward",
|
||||||
|
"is",
|
||||||
|
"isn't",
|
||||||
|
"isnt",
|
||||||
|
"it",
|
||||||
|
"itd",
|
||||||
|
"it'll",
|
||||||
|
"itll",
|
||||||
|
"its",
|
||||||
|
"itself",
|
||||||
|
"i've",
|
||||||
|
"ive",
|
||||||
|
"j",
|
||||||
|
"just",
|
||||||
|
"k",
|
||||||
|
"keep",
|
||||||
|
"keeps",
|
||||||
|
"kept",
|
||||||
|
"keys",
|
||||||
|
"kg",
|
||||||
|
"km",
|
||||||
|
"know",
|
||||||
|
"known",
|
||||||
|
"knows",
|
||||||
|
"l",
|
||||||
|
"largely",
|
||||||
|
"last",
|
||||||
|
"lately",
|
||||||
|
"later",
|
||||||
|
"latter",
|
||||||
|
"latterly",
|
||||||
|
"least",
|
||||||
|
"less",
|
||||||
|
"lest",
|
||||||
|
"let",
|
||||||
|
"lets",
|
||||||
|
"like",
|
||||||
|
"liked",
|
||||||
|
"likely",
|
||||||
|
"line",
|
||||||
|
"little",
|
||||||
|
"'ll",
|
||||||
|
"'ll",
|
||||||
|
"look",
|
||||||
|
"looking",
|
||||||
|
"looks",
|
||||||
|
"ltd",
|
||||||
|
"m",
|
||||||
|
"made",
|
||||||
|
"mainly",
|
||||||
|
"make",
|
||||||
|
"makes",
|
||||||
|
"many",
|
||||||
|
"may",
|
||||||
|
"maybe",
|
||||||
|
"me",
|
||||||
|
"mean",
|
||||||
|
"means",
|
||||||
|
"meantime",
|
||||||
|
"meanwhile",
|
||||||
|
"merely",
|
||||||
|
"mg",
|
||||||
|
"might",
|
||||||
|
"million",
|
||||||
|
"miss",
|
||||||
|
"ml",
|
||||||
|
"more",
|
||||||
|
"moreover",
|
||||||
|
"most",
|
||||||
|
"mostly",
|
||||||
|
"mr",
|
||||||
|
"mrs",
|
||||||
|
"much",
|
||||||
|
"mug",
|
||||||
|
"must",
|
||||||
|
"my",
|
||||||
|
"myself",
|
||||||
|
"n",
|
||||||
|
"na",
|
||||||
|
"name",
|
||||||
|
"namely",
|
||||||
|
"nay",
|
||||||
|
"nd",
|
||||||
|
"near",
|
||||||
|
"nearly",
|
||||||
|
"necessarily",
|
||||||
|
"necessary",
|
||||||
|
"need",
|
||||||
|
"needs",
|
||||||
|
"neither",
|
||||||
|
"never",
|
||||||
|
"nevertheless",
|
||||||
|
"new",
|
||||||
|
"next",
|
||||||
|
"nine",
|
||||||
|
"ninety",
|
||||||
|
"no",
|
||||||
|
"nobody",
|
||||||
|
"non",
|
||||||
|
"none",
|
||||||
|
"nonetheless",
|
||||||
|
"noone",
|
||||||
|
"nor",
|
||||||
|
"normally",
|
||||||
|
"nos",
|
||||||
|
"not",
|
||||||
|
"noted",
|
||||||
|
"nothing",
|
||||||
|
"now",
|
||||||
|
"nowhere",
|
||||||
|
"o",
|
||||||
|
"obtain",
|
||||||
|
"obtained",
|
||||||
|
"obviously",
|
||||||
|
"of",
|
||||||
|
"off",
|
||||||
|
"often",
|
||||||
|
"oh",
|
||||||
|
"ok",
|
||||||
|
"okay",
|
||||||
|
"old",
|
||||||
|
"omitted",
|
||||||
|
"on",
|
||||||
|
"once",
|
||||||
|
"one",
|
||||||
|
"ones",
|
||||||
|
"only",
|
||||||
|
"onto",
|
||||||
|
"or",
|
||||||
|
"ord",
|
||||||
|
"other",
|
||||||
|
"others",
|
||||||
|
"otherwise",
|
||||||
|
"ought",
|
||||||
|
"our",
|
||||||
|
"ours",
|
||||||
|
"ourselves",
|
||||||
|
"out",
|
||||||
|
"outside",
|
||||||
|
"over",
|
||||||
|
"overall",
|
||||||
|
"owing",
|
||||||
|
"own",
|
||||||
|
"p",
|
||||||
|
"page",
|
||||||
|
"pages",
|
||||||
|
"part",
|
||||||
|
"particular",
|
||||||
|
"particularly",
|
||||||
|
"past",
|
||||||
|
"per",
|
||||||
|
"perhaps",
|
||||||
|
"placed",
|
||||||
|
"please",
|
||||||
|
"plus",
|
||||||
|
"poorly",
|
||||||
|
"possible",
|
||||||
|
"possibly",
|
||||||
|
"potentially",
|
||||||
|
"pp",
|
||||||
|
"predominantly",
|
||||||
|
"present",
|
||||||
|
"previously",
|
||||||
|
"primarily",
|
||||||
|
"probably",
|
||||||
|
"promptly",
|
||||||
|
"proud",
|
||||||
|
"provides",
|
||||||
|
"put",
|
||||||
|
"q",
|
||||||
|
"que",
|
||||||
|
"quickly",
|
||||||
|
"quite",
|
||||||
|
"qv",
|
||||||
|
"r",
|
||||||
|
"ran",
|
||||||
|
"rather",
|
||||||
|
"rd",
|
||||||
|
"re",
|
||||||
|
"readily",
|
||||||
|
"really",
|
||||||
|
"recent",
|
||||||
|
"recently",
|
||||||
|
"ref",
|
||||||
|
"refs",
|
||||||
|
"regarding",
|
||||||
|
"regardless",
|
||||||
|
"regards",
|
||||||
|
"related",
|
||||||
|
"relatively",
|
||||||
|
"research",
|
||||||
|
"respectively",
|
||||||
|
"resulted",
|
||||||
|
"resulting",
|
||||||
|
"results",
|
||||||
|
"right",
|
||||||
|
"run",
|
||||||
|
"s",
|
||||||
|
"said",
|
||||||
|
"same",
|
||||||
|
"saw",
|
||||||
|
"say",
|
||||||
|
"saying",
|
||||||
|
"says",
|
||||||
|
"sec",
|
||||||
|
"section",
|
||||||
|
"see",
|
||||||
|
"seeing",
|
||||||
|
"seem",
|
||||||
|
"seemed",
|
||||||
|
"seeming",
|
||||||
|
"seems",
|
||||||
|
"seen",
|
||||||
|
"self",
|
||||||
|
"selves",
|
||||||
|
"sent",
|
||||||
|
"seven",
|
||||||
|
"several",
|
||||||
|
"shall",
|
||||||
|
"she",
|
||||||
|
"shed",
|
||||||
|
"she'll",
|
||||||
|
"shell",
|
||||||
|
"shes",
|
||||||
|
"should",
|
||||||
|
"shouldn't",
|
||||||
|
"shouldnt",
|
||||||
|
"show",
|
||||||
|
"showed",
|
||||||
|
"shown",
|
||||||
|
"showns",
|
||||||
|
"shows",
|
||||||
|
"significant",
|
||||||
|
"significantly",
|
||||||
|
"similar",
|
||||||
|
"similarly",
|
||||||
|
"since",
|
||||||
|
"six",
|
||||||
|
"slightly",
|
||||||
|
"so",
|
||||||
|
"some",
|
||||||
|
"somebody",
|
||||||
|
"somehow",
|
||||||
|
"someone",
|
||||||
|
"somethan",
|
||||||
|
"something",
|
||||||
|
"sometime",
|
||||||
|
"sometimes",
|
||||||
|
"somewhat",
|
||||||
|
"somewhere",
|
||||||
|
"soon",
|
||||||
|
"sorry",
|
||||||
|
"specifically",
|
||||||
|
"specified",
|
||||||
|
"specify",
|
||||||
|
"specifying",
|
||||||
|
"state",
|
||||||
|
"states",
|
||||||
|
"still",
|
||||||
|
"stop",
|
||||||
|
"strongly",
|
||||||
|
"sub",
|
||||||
|
"substantially",
|
||||||
|
"successfully",
|
||||||
|
"such",
|
||||||
|
"sufficiently",
|
||||||
|
"suggest",
|
||||||
|
"sup",
|
||||||
|
"sure",
|
||||||
|
"t",
|
||||||
|
"take",
|
||||||
|
"taken",
|
||||||
|
"taking",
|
||||||
|
"tell",
|
||||||
|
"tends",
|
||||||
|
"th",
|
||||||
|
"than",
|
||||||
|
"thank",
|
||||||
|
"thanks",
|
||||||
|
"thanx",
|
||||||
|
"that",
|
||||||
|
"that'll",
|
||||||
|
"thatll",
|
||||||
|
"thats",
|
||||||
|
"that've",
|
||||||
|
"thatve",
|
||||||
|
"the",
|
||||||
|
"their",
|
||||||
|
"theirs",
|
||||||
|
"them",
|
||||||
|
"themselves",
|
||||||
|
"then",
|
||||||
|
"thence",
|
||||||
|
"there",
|
||||||
|
"thereafter",
|
||||||
|
"thereby",
|
||||||
|
"thered",
|
||||||
|
"therefore",
|
||||||
|
"therein",
|
||||||
|
"there'll",
|
||||||
|
"therell",
|
||||||
|
"thereof",
|
||||||
|
"therere",
|
||||||
|
"theres",
|
||||||
|
"thereto",
|
||||||
|
"thereupon",
|
||||||
|
"there've",
|
||||||
|
"thereve",
|
||||||
|
"these",
|
||||||
|
"they",
|
||||||
|
"theyd",
|
||||||
|
"they'll",
|
||||||
|
"theyll",
|
||||||
|
"theyre",
|
||||||
|
"they've",
|
||||||
|
"theyve",
|
||||||
|
"think",
|
||||||
|
"this",
|
||||||
|
"those",
|
||||||
|
"thou",
|
||||||
|
"though",
|
||||||
|
"thoughh",
|
||||||
|
"thousand",
|
||||||
|
"throug",
|
||||||
|
"through",
|
||||||
|
"throughout",
|
||||||
|
"thru",
|
||||||
|
"thus",
|
||||||
|
"til",
|
||||||
|
"tip",
|
||||||
|
"to",
|
||||||
|
"together",
|
||||||
|
"too",
|
||||||
|
"took",
|
||||||
|
"toward",
|
||||||
|
"towards",
|
||||||
|
"tried",
|
||||||
|
"tries",
|
||||||
|
"truly",
|
||||||
|
"try",
|
||||||
|
"trying",
|
||||||
|
"ts",
|
||||||
|
"twice",
|
||||||
|
"two",
|
||||||
|
"u",
|
||||||
|
"un",
|
||||||
|
"under",
|
||||||
|
"unfortunately",
|
||||||
|
"unless",
|
||||||
|
"unlike",
|
||||||
|
"unlikely",
|
||||||
|
"until",
|
||||||
|
"unto",
|
||||||
|
"up",
|
||||||
|
"upon",
|
||||||
|
"ups",
|
||||||
|
"us",
|
||||||
|
"use",
|
||||||
|
"used",
|
||||||
|
"useful",
|
||||||
|
"usefully",
|
||||||
|
"usefulness",
|
||||||
|
"uses",
|
||||||
|
"using",
|
||||||
|
"usually",
|
||||||
|
"v",
|
||||||
|
"value",
|
||||||
|
"various",
|
||||||
|
"'ve",
|
||||||
|
"'ve",
|
||||||
|
"very",
|
||||||
|
"via",
|
||||||
|
"viz",
|
||||||
|
"vol",
|
||||||
|
"vols",
|
||||||
|
"vs",
|
||||||
|
"w",
|
||||||
|
"want",
|
||||||
|
"wants",
|
||||||
|
"was",
|
||||||
|
"wasn't",
|
||||||
|
"wasnt",
|
||||||
|
"way",
|
||||||
|
"we",
|
||||||
|
"wed",
|
||||||
|
"welcome",
|
||||||
|
"we'll",
|
||||||
|
"well",
|
||||||
|
"went",
|
||||||
|
"were",
|
||||||
|
"weren't",
|
||||||
|
"werent",
|
||||||
|
"we've",
|
||||||
|
"weve",
|
||||||
|
"what",
|
||||||
|
"whatever",
|
||||||
|
"what'll",
|
||||||
|
"whatll",
|
||||||
|
"whats",
|
||||||
|
"when",
|
||||||
|
"whence",
|
||||||
|
"whenever",
|
||||||
|
"where",
|
||||||
|
"whereafter",
|
||||||
|
"whereas",
|
||||||
|
"whereby",
|
||||||
|
"wherein",
|
||||||
|
"wheres",
|
||||||
|
"whereupon",
|
||||||
|
"wherever",
|
||||||
|
"whether",
|
||||||
|
"which",
|
||||||
|
"while",
|
||||||
|
"whim",
|
||||||
|
"whither",
|
||||||
|
"who",
|
||||||
|
"whod",
|
||||||
|
"whoever",
|
||||||
|
"whole",
|
||||||
|
"who'll",
|
||||||
|
"wholl",
|
||||||
|
"whom",
|
||||||
|
"whomever",
|
||||||
|
"whos",
|
||||||
|
"whose",
|
||||||
|
"why",
|
||||||
|
"widely",
|
||||||
|
"will",
|
||||||
|
"willing",
|
||||||
|
"wish",
|
||||||
|
"with",
|
||||||
|
"within",
|
||||||
|
"without",
|
||||||
|
"won't",
|
||||||
|
"wont",
|
||||||
|
"words",
|
||||||
|
"would",
|
||||||
|
"wouldn't",
|
||||||
|
"wouldnt",
|
||||||
|
"www",
|
||||||
|
"x",
|
||||||
|
"y",
|
||||||
|
"yes",
|
||||||
|
"yet",
|
||||||
|
"you",
|
||||||
|
"youd",
|
||||||
|
"you'll",
|
||||||
|
"youll",
|
||||||
|
"your",
|
||||||
|
"youre",
|
||||||
|
"yours",
|
||||||
|
"yourself",
|
||||||
|
"yourselves",
|
||||||
|
"you've",
|
||||||
|
"youve",
|
||||||
|
"z",
|
||||||
|
"zero"
|
||||||
|
]
|
||||||
|
}
|
|
@ -1,701 +0,0 @@
|
||||||
a
|
|
||||||
able
|
|
||||||
about
|
|
||||||
above
|
|
||||||
abst
|
|
||||||
accordance
|
|
||||||
according
|
|
||||||
accordingly
|
|
||||||
across
|
|
||||||
act
|
|
||||||
actually
|
|
||||||
added
|
|
||||||
adj
|
|
||||||
adopted
|
|
||||||
affected
|
|
||||||
affecting
|
|
||||||
affects
|
|
||||||
after
|
|
||||||
afterwards
|
|
||||||
again
|
|
||||||
against
|
|
||||||
ah
|
|
||||||
all
|
|
||||||
almost
|
|
||||||
alone
|
|
||||||
along
|
|
||||||
already
|
|
||||||
also
|
|
||||||
although
|
|
||||||
always
|
|
||||||
am
|
|
||||||
among
|
|
||||||
amongst
|
|
||||||
an
|
|
||||||
and
|
|
||||||
announce
|
|
||||||
another
|
|
||||||
any
|
|
||||||
anybody
|
|
||||||
anyhow
|
|
||||||
anymore
|
|
||||||
anyone
|
|
||||||
anything
|
|
||||||
anyway
|
|
||||||
anyways
|
|
||||||
anywhere
|
|
||||||
apparently
|
|
||||||
approximately
|
|
||||||
are
|
|
||||||
aren
|
|
||||||
arent
|
|
||||||
arise
|
|
||||||
around
|
|
||||||
as
|
|
||||||
aside
|
|
||||||
ask
|
|
||||||
asking
|
|
||||||
at
|
|
||||||
auth
|
|
||||||
available
|
|
||||||
away
|
|
||||||
awfully
|
|
||||||
b
|
|
||||||
back
|
|
||||||
be
|
|
||||||
became
|
|
||||||
because
|
|
||||||
become
|
|
||||||
becomes
|
|
||||||
becoming
|
|
||||||
been
|
|
||||||
before
|
|
||||||
beforehand
|
|
||||||
begin
|
|
||||||
beginning
|
|
||||||
beginnings
|
|
||||||
begins
|
|
||||||
behind
|
|
||||||
being
|
|
||||||
believe
|
|
||||||
below
|
|
||||||
beside
|
|
||||||
besides
|
|
||||||
between
|
|
||||||
beyond
|
|
||||||
biol
|
|
||||||
both
|
|
||||||
brief
|
|
||||||
briefly
|
|
||||||
but
|
|
||||||
by
|
|
||||||
c
|
|
||||||
ca
|
|
||||||
came
|
|
||||||
can
|
|
||||||
cannot
|
|
||||||
can't
|
|
||||||
cant
|
|
||||||
cause
|
|
||||||
causes
|
|
||||||
certain
|
|
||||||
certainly
|
|
||||||
co
|
|
||||||
com
|
|
||||||
come
|
|
||||||
comes
|
|
||||||
contain
|
|
||||||
containing
|
|
||||||
contains
|
|
||||||
could
|
|
||||||
couldnt
|
|
||||||
d
|
|
||||||
date
|
|
||||||
did
|
|
||||||
didn't
|
|
||||||
didnt
|
|
||||||
different
|
|
||||||
do
|
|
||||||
does
|
|
||||||
doesn't
|
|
||||||
doesnt
|
|
||||||
doing
|
|
||||||
done
|
|
||||||
don't
|
|
||||||
dont
|
|
||||||
down
|
|
||||||
downwards
|
|
||||||
due
|
|
||||||
during
|
|
||||||
e
|
|
||||||
each
|
|
||||||
ed
|
|
||||||
edu
|
|
||||||
effect
|
|
||||||
eg
|
|
||||||
eight
|
|
||||||
eighty
|
|
||||||
either
|
|
||||||
else
|
|
||||||
elsewhere
|
|
||||||
end
|
|
||||||
ending
|
|
||||||
enough
|
|
||||||
especially
|
|
||||||
et
|
|
||||||
et-al
|
|
||||||
etc
|
|
||||||
even
|
|
||||||
ever
|
|
||||||
every
|
|
||||||
everybody
|
|
||||||
everyone
|
|
||||||
everything
|
|
||||||
everywhere
|
|
||||||
ex
|
|
||||||
except
|
|
||||||
f
|
|
||||||
far
|
|
||||||
few
|
|
||||||
ff
|
|
||||||
fifth
|
|
||||||
first
|
|
||||||
five
|
|
||||||
fix
|
|
||||||
followed
|
|
||||||
following
|
|
||||||
follows
|
|
||||||
for
|
|
||||||
former
|
|
||||||
formerly
|
|
||||||
forth
|
|
||||||
found
|
|
||||||
four
|
|
||||||
from
|
|
||||||
further
|
|
||||||
furthermore
|
|
||||||
g
|
|
||||||
gave
|
|
||||||
get
|
|
||||||
gets
|
|
||||||
getting
|
|
||||||
give
|
|
||||||
given
|
|
||||||
gives
|
|
||||||
giving
|
|
||||||
go
|
|
||||||
goes
|
|
||||||
gone
|
|
||||||
got
|
|
||||||
gotten
|
|
||||||
h
|
|
||||||
had
|
|
||||||
happens
|
|
||||||
hardly
|
|
||||||
has
|
|
||||||
hasn't
|
|
||||||
hasnt
|
|
||||||
have
|
|
||||||
haven't
|
|
||||||
havent
|
|
||||||
having
|
|
||||||
he
|
|
||||||
hed
|
|
||||||
hence
|
|
||||||
her
|
|
||||||
here
|
|
||||||
hereafter
|
|
||||||
hereby
|
|
||||||
herein
|
|
||||||
heres
|
|
||||||
hereupon
|
|
||||||
hers
|
|
||||||
herself
|
|
||||||
hes
|
|
||||||
hi
|
|
||||||
hid
|
|
||||||
him
|
|
||||||
himself
|
|
||||||
his
|
|
||||||
hither
|
|
||||||
home
|
|
||||||
how
|
|
||||||
howbeit
|
|
||||||
however
|
|
||||||
hundred
|
|
||||||
i
|
|
||||||
id
|
|
||||||
ie
|
|
||||||
if
|
|
||||||
i'll
|
|
||||||
ill
|
|
||||||
im
|
|
||||||
immediate
|
|
||||||
immediately
|
|
||||||
importance
|
|
||||||
important
|
|
||||||
in
|
|
||||||
inc
|
|
||||||
indeed
|
|
||||||
index
|
|
||||||
information
|
|
||||||
instead
|
|
||||||
into
|
|
||||||
invention
|
|
||||||
inward
|
|
||||||
is
|
|
||||||
isn't
|
|
||||||
isnt
|
|
||||||
it
|
|
||||||
itd
|
|
||||||
it'll
|
|
||||||
itll
|
|
||||||
its
|
|
||||||
itself
|
|
||||||
i've
|
|
||||||
ive
|
|
||||||
j
|
|
||||||
just
|
|
||||||
k
|
|
||||||
keep
|
|
||||||
keeps
|
|
||||||
kept
|
|
||||||
keys
|
|
||||||
kg
|
|
||||||
km
|
|
||||||
know
|
|
||||||
known
|
|
||||||
knows
|
|
||||||
l
|
|
||||||
largely
|
|
||||||
last
|
|
||||||
lately
|
|
||||||
later
|
|
||||||
latter
|
|
||||||
latterly
|
|
||||||
least
|
|
||||||
less
|
|
||||||
lest
|
|
||||||
let
|
|
||||||
lets
|
|
||||||
like
|
|
||||||
liked
|
|
||||||
likely
|
|
||||||
line
|
|
||||||
little
|
|
||||||
'll
|
|
||||||
'll
|
|
||||||
look
|
|
||||||
looking
|
|
||||||
looks
|
|
||||||
ltd
|
|
||||||
m
|
|
||||||
made
|
|
||||||
mainly
|
|
||||||
make
|
|
||||||
makes
|
|
||||||
many
|
|
||||||
may
|
|
||||||
maybe
|
|
||||||
me
|
|
||||||
mean
|
|
||||||
means
|
|
||||||
meantime
|
|
||||||
meanwhile
|
|
||||||
merely
|
|
||||||
mg
|
|
||||||
might
|
|
||||||
million
|
|
||||||
miss
|
|
||||||
ml
|
|
||||||
more
|
|
||||||
moreover
|
|
||||||
most
|
|
||||||
mostly
|
|
||||||
mr
|
|
||||||
mrs
|
|
||||||
much
|
|
||||||
mug
|
|
||||||
must
|
|
||||||
my
|
|
||||||
myself
|
|
||||||
n
|
|
||||||
na
|
|
||||||
name
|
|
||||||
namely
|
|
||||||
nay
|
|
||||||
nd
|
|
||||||
near
|
|
||||||
nearly
|
|
||||||
necessarily
|
|
||||||
necessary
|
|
||||||
need
|
|
||||||
needs
|
|
||||||
neither
|
|
||||||
never
|
|
||||||
nevertheless
|
|
||||||
new
|
|
||||||
next
|
|
||||||
nine
|
|
||||||
ninety
|
|
||||||
no
|
|
||||||
nobody
|
|
||||||
non
|
|
||||||
none
|
|
||||||
nonetheless
|
|
||||||
noone
|
|
||||||
nor
|
|
||||||
normally
|
|
||||||
nos
|
|
||||||
not
|
|
||||||
noted
|
|
||||||
nothing
|
|
||||||
now
|
|
||||||
nowhere
|
|
||||||
o
|
|
||||||
obtain
|
|
||||||
obtained
|
|
||||||
obviously
|
|
||||||
of
|
|
||||||
off
|
|
||||||
often
|
|
||||||
oh
|
|
||||||
ok
|
|
||||||
okay
|
|
||||||
old
|
|
||||||
omitted
|
|
||||||
on
|
|
||||||
once
|
|
||||||
one
|
|
||||||
ones
|
|
||||||
only
|
|
||||||
onto
|
|
||||||
or
|
|
||||||
ord
|
|
||||||
other
|
|
||||||
others
|
|
||||||
otherwise
|
|
||||||
ought
|
|
||||||
our
|
|
||||||
ours
|
|
||||||
ourselves
|
|
||||||
out
|
|
||||||
outside
|
|
||||||
over
|
|
||||||
overall
|
|
||||||
owing
|
|
||||||
own
|
|
||||||
p
|
|
||||||
page
|
|
||||||
pages
|
|
||||||
part
|
|
||||||
particular
|
|
||||||
particularly
|
|
||||||
past
|
|
||||||
per
|
|
||||||
perhaps
|
|
||||||
placed
|
|
||||||
please
|
|
||||||
plus
|
|
||||||
poorly
|
|
||||||
possible
|
|
||||||
possibly
|
|
||||||
potentially
|
|
||||||
pp
|
|
||||||
predominantly
|
|
||||||
present
|
|
||||||
previously
|
|
||||||
primarily
|
|
||||||
probably
|
|
||||||
promptly
|
|
||||||
proud
|
|
||||||
provides
|
|
||||||
put
|
|
||||||
q
|
|
||||||
que
|
|
||||||
quickly
|
|
||||||
quite
|
|
||||||
qv
|
|
||||||
r
|
|
||||||
ran
|
|
||||||
rather
|
|
||||||
rd
|
|
||||||
re
|
|
||||||
readily
|
|
||||||
really
|
|
||||||
recent
|
|
||||||
recently
|
|
||||||
ref
|
|
||||||
refs
|
|
||||||
regarding
|
|
||||||
regardless
|
|
||||||
regards
|
|
||||||
related
|
|
||||||
relatively
|
|
||||||
research
|
|
||||||
respectively
|
|
||||||
resulted
|
|
||||||
resulting
|
|
||||||
results
|
|
||||||
right
|
|
||||||
run
|
|
||||||
s
|
|
||||||
said
|
|
||||||
same
|
|
||||||
saw
|
|
||||||
say
|
|
||||||
saying
|
|
||||||
says
|
|
||||||
sec
|
|
||||||
section
|
|
||||||
see
|
|
||||||
seeing
|
|
||||||
seem
|
|
||||||
seemed
|
|
||||||
seeming
|
|
||||||
seems
|
|
||||||
seen
|
|
||||||
self
|
|
||||||
selves
|
|
||||||
sent
|
|
||||||
seven
|
|
||||||
several
|
|
||||||
shall
|
|
||||||
she
|
|
||||||
shed
|
|
||||||
she'll
|
|
||||||
shell
|
|
||||||
shes
|
|
||||||
should
|
|
||||||
shouldn't
|
|
||||||
shouldnt
|
|
||||||
show
|
|
||||||
showed
|
|
||||||
shown
|
|
||||||
showns
|
|
||||||
shows
|
|
||||||
significant
|
|
||||||
significantly
|
|
||||||
similar
|
|
||||||
similarly
|
|
||||||
since
|
|
||||||
six
|
|
||||||
slightly
|
|
||||||
so
|
|
||||||
some
|
|
||||||
somebody
|
|
||||||
somehow
|
|
||||||
someone
|
|
||||||
somethan
|
|
||||||
something
|
|
||||||
sometime
|
|
||||||
sometimes
|
|
||||||
somewhat
|
|
||||||
somewhere
|
|
||||||
soon
|
|
||||||
sorry
|
|
||||||
specifically
|
|
||||||
specified
|
|
||||||
specify
|
|
||||||
specifying
|
|
||||||
state
|
|
||||||
states
|
|
||||||
still
|
|
||||||
stop
|
|
||||||
strongly
|
|
||||||
sub
|
|
||||||
substantially
|
|
||||||
successfully
|
|
||||||
such
|
|
||||||
sufficiently
|
|
||||||
suggest
|
|
||||||
sup
|
|
||||||
sure
|
|
||||||
t
|
|
||||||
take
|
|
||||||
taken
|
|
||||||
taking
|
|
||||||
tell
|
|
||||||
tends
|
|
||||||
th
|
|
||||||
than
|
|
||||||
thank
|
|
||||||
thanks
|
|
||||||
thanx
|
|
||||||
that
|
|
||||||
that'll
|
|
||||||
thatll
|
|
||||||
thats
|
|
||||||
that've
|
|
||||||
thatve
|
|
||||||
the
|
|
||||||
their
|
|
||||||
theirs
|
|
||||||
them
|
|
||||||
themselves
|
|
||||||
then
|
|
||||||
thence
|
|
||||||
there
|
|
||||||
thereafter
|
|
||||||
thereby
|
|
||||||
thered
|
|
||||||
therefore
|
|
||||||
therein
|
|
||||||
there'll
|
|
||||||
therell
|
|
||||||
thereof
|
|
||||||
therere
|
|
||||||
theres
|
|
||||||
thereto
|
|
||||||
thereupon
|
|
||||||
there've
|
|
||||||
thereve
|
|
||||||
these
|
|
||||||
they
|
|
||||||
theyd
|
|
||||||
they'll
|
|
||||||
theyll
|
|
||||||
theyre
|
|
||||||
they've
|
|
||||||
theyve
|
|
||||||
think
|
|
||||||
this
|
|
||||||
those
|
|
||||||
thou
|
|
||||||
though
|
|
||||||
thoughh
|
|
||||||
thousand
|
|
||||||
throug
|
|
||||||
through
|
|
||||||
throughout
|
|
||||||
thru
|
|
||||||
thus
|
|
||||||
til
|
|
||||||
tip
|
|
||||||
to
|
|
||||||
together
|
|
||||||
too
|
|
||||||
took
|
|
||||||
toward
|
|
||||||
towards
|
|
||||||
tried
|
|
||||||
tries
|
|
||||||
truly
|
|
||||||
try
|
|
||||||
trying
|
|
||||||
ts
|
|
||||||
twice
|
|
||||||
two
|
|
||||||
u
|
|
||||||
un
|
|
||||||
under
|
|
||||||
unfortunately
|
|
||||||
unless
|
|
||||||
unlike
|
|
||||||
unlikely
|
|
||||||
until
|
|
||||||
unto
|
|
||||||
up
|
|
||||||
upon
|
|
||||||
ups
|
|
||||||
us
|
|
||||||
use
|
|
||||||
used
|
|
||||||
useful
|
|
||||||
usefully
|
|
||||||
usefulness
|
|
||||||
uses
|
|
||||||
using
|
|
||||||
usually
|
|
||||||
v
|
|
||||||
value
|
|
||||||
various
|
|
||||||
've
|
|
||||||
've
|
|
||||||
very
|
|
||||||
via
|
|
||||||
viz
|
|
||||||
vol
|
|
||||||
vols
|
|
||||||
vs
|
|
||||||
w
|
|
||||||
want
|
|
||||||
wants
|
|
||||||
was
|
|
||||||
wasn't
|
|
||||||
wasnt
|
|
||||||
way
|
|
||||||
we
|
|
||||||
wed
|
|
||||||
welcome
|
|
||||||
we'll
|
|
||||||
well
|
|
||||||
went
|
|
||||||
were
|
|
||||||
weren't
|
|
||||||
werent
|
|
||||||
we've
|
|
||||||
weve
|
|
||||||
what
|
|
||||||
whatever
|
|
||||||
what'll
|
|
||||||
whatll
|
|
||||||
whats
|
|
||||||
when
|
|
||||||
whence
|
|
||||||
whenever
|
|
||||||
where
|
|
||||||
whereafter
|
|
||||||
whereas
|
|
||||||
whereby
|
|
||||||
wherein
|
|
||||||
wheres
|
|
||||||
whereupon
|
|
||||||
wherever
|
|
||||||
whether
|
|
||||||
which
|
|
||||||
while
|
|
||||||
whim
|
|
||||||
whither
|
|
||||||
who
|
|
||||||
whod
|
|
||||||
whoever
|
|
||||||
whole
|
|
||||||
who'll
|
|
||||||
wholl
|
|
||||||
whom
|
|
||||||
whomever
|
|
||||||
whos
|
|
||||||
whose
|
|
||||||
why
|
|
||||||
widely
|
|
||||||
will
|
|
||||||
willing
|
|
||||||
wish
|
|
||||||
with
|
|
||||||
within
|
|
||||||
without
|
|
||||||
won't
|
|
||||||
wont
|
|
||||||
words
|
|
||||||
would
|
|
||||||
wouldn't
|
|
||||||
wouldnt
|
|
||||||
www
|
|
||||||
x
|
|
||||||
y
|
|
||||||
yes
|
|
||||||
yet
|
|
||||||
you
|
|
||||||
youd
|
|
||||||
you'll
|
|
||||||
youll
|
|
||||||
your
|
|
||||||
youre
|
|
||||||
yours
|
|
||||||
yourself
|
|
||||||
yourselves
|
|
||||||
you've
|
|
||||||
youve
|
|
||||||
z
|
|
||||||
zero
|
|
|
@ -65,9 +65,9 @@ module.exports = new Package('angular-base', [
|
||||||
readFilesProcessor.sourceFiles = [];
|
readFilesProcessor.sourceFiles = [];
|
||||||
collectExamples.exampleFolders = [];
|
collectExamples.exampleFolders = [];
|
||||||
|
|
||||||
generateKeywordsProcessor.ignoreWordsFile = path.resolve(__dirname, 'ignore.words');
|
generateKeywordsProcessor.ignoreWords = require(path.resolve(__dirname, 'ignore-words'))['en'];
|
||||||
generateKeywordsProcessor.docTypesToIgnore = ['example-region'];
|
generateKeywordsProcessor.docTypesToIgnore = ['example-region'];
|
||||||
generateKeywordsProcessor.propertiesToIgnore = ['basePath', 'renderedContent'];
|
generateKeywordsProcessor.propertiesToIgnore = ['basePath', 'renderedContent', 'docType', 'searchTitle'];
|
||||||
})
|
})
|
||||||
|
|
||||||
// Where do we write the output files?
|
// Where do we write the output files?
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
var fs = require('fs');
|
const stem = require('stemmer');
|
||||||
var path = require('canonical-path');
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @dgProcessor generateKeywordsProcessor
|
* @dgProcessor generateKeywordsProcessor
|
||||||
|
@ -10,103 +9,98 @@ var path = require('canonical-path');
|
||||||
* a new document that will be rendered as a JavaScript file containing all
|
* a new document that will be rendered as a JavaScript file containing all
|
||||||
* this data.
|
* this data.
|
||||||
*/
|
*/
|
||||||
module.exports = function generateKeywordsProcessor(log, readFilesProcessor) {
|
module.exports = function generateKeywordsProcessor(log) {
|
||||||
return {
|
return {
|
||||||
ignoreWordsFile: undefined,
|
ignoreWords: [],
|
||||||
propertiesToIgnore: [],
|
propertiesToIgnore: [],
|
||||||
docTypesToIgnore: [],
|
docTypesToIgnore: [],
|
||||||
outputFolder: '',
|
outputFolder: '',
|
||||||
$validate: {
|
$validate: {
|
||||||
ignoreWordsFile: {},
|
ignoreWords: {},
|
||||||
docTypesToIgnore: {},
|
docTypesToIgnore: {},
|
||||||
propertiesToIgnore: {},
|
propertiesToIgnore: {},
|
||||||
outputFolder: {presence: true}
|
outputFolder: {presence: true}
|
||||||
},
|
},
|
||||||
$runAfter: ['postProcessHtml'],
|
$runAfter: ['postProcessHtml'],
|
||||||
$runBefore: ['writing-files'],
|
$runBefore: ['writing-files'],
|
||||||
$process: function(docs) {
|
$process(docs) {
|
||||||
|
|
||||||
|
const dictionary = new Map();
|
||||||
|
|
||||||
// Keywords to ignore
|
// Keywords to ignore
|
||||||
var wordsToIgnore = [];
|
const ignoreWords = new Set(this.ignoreWords);
|
||||||
var propertiesToIgnore;
|
log.debug('Words to ignore', ignoreWords);
|
||||||
var docTypesToIgnore;
|
const propertiesToIgnore = new Set(this.propertiesToIgnore);
|
||||||
|
|
||||||
// Load up the keywords to ignore, if specified in the config
|
|
||||||
if (this.ignoreWordsFile) {
|
|
||||||
var ignoreWordsPath = path.resolve(readFilesProcessor.basePath, this.ignoreWordsFile);
|
|
||||||
wordsToIgnore = fs.readFileSync(ignoreWordsPath, 'utf8').toString().split(/[,\s\n\r]+/gm);
|
|
||||||
|
|
||||||
log.debug('Loaded ignore words from "' + ignoreWordsPath + '"');
|
|
||||||
log.silly(wordsToIgnore);
|
|
||||||
}
|
|
||||||
|
|
||||||
propertiesToIgnore = convertToMap(this.propertiesToIgnore);
|
|
||||||
log.debug('Properties to ignore', propertiesToIgnore);
|
log.debug('Properties to ignore', propertiesToIgnore);
|
||||||
docTypesToIgnore = convertToMap(this.docTypesToIgnore);
|
const docTypesToIgnore = new Set(this.docTypesToIgnore);
|
||||||
log.debug('Doc types to ignore', docTypesToIgnore);
|
log.debug('Doc types to ignore', docTypesToIgnore);
|
||||||
|
|
||||||
var ignoreWordsMap = convertToMap(wordsToIgnore);
|
|
||||||
|
|
||||||
const filteredDocs = docs
|
const filteredDocs = docs
|
||||||
// We are not interested in some docTypes
|
// We are not interested in some docTypes
|
||||||
.filter(function(doc) { return !docTypesToIgnore[doc.docType]; })
|
.filter(doc => !docTypesToIgnore.has(doc.docType))
|
||||||
// Ignore internals and private exports (indicated by the ɵ prefix)
|
// Ignore internals and private exports (indicated by the ɵ prefix)
|
||||||
.filter(function(doc) { return !doc.internal && !doc.privateExport; });
|
.filter(doc => !doc.internal && !doc.privateExport);
|
||||||
|
|
||||||
|
|
||||||
filteredDocs.forEach(function(doc) {
|
for(const doc of filteredDocs) {
|
||||||
|
|
||||||
var words = [];
|
|
||||||
var keywordMap = Object.assign({}, ignoreWordsMap);
|
|
||||||
var members = [];
|
|
||||||
var membersMap = Object.assign({}, ignoreWordsMap);
|
|
||||||
const headingWords = [];
|
|
||||||
const headingWordMap = Object.assign({}, ignoreWordsMap);
|
|
||||||
|
|
||||||
// Search each top level property of the document for search terms
|
// Search each top level property of the document for search terms
|
||||||
Object.keys(doc).forEach(function(key) {
|
let mainTokens = [];
|
||||||
|
for(const key of Object.keys(doc)) {
|
||||||
const value = doc[key];
|
const value = doc[key];
|
||||||
|
if (isString(value) && !propertiesToIgnore.has(key)) {
|
||||||
if (isString(value) && !propertiesToIgnore[key]) {
|
mainTokens.push(...tokenize(value, ignoreWords, dictionary));
|
||||||
extractWords(value, words, keywordMap);
|
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
|
|
||||||
extractMemberWords(doc, members, membersMap);
|
const memberTokens = extractMemberTokens(doc, ignoreWords, dictionary);
|
||||||
|
|
||||||
// Extract all the keywords from the headings
|
// Extract all the keywords from the headings
|
||||||
|
let headingTokens = [];
|
||||||
if (doc.vFile && doc.vFile.headings) {
|
if (doc.vFile && doc.vFile.headings) {
|
||||||
Object.keys(doc.vFile.headings).forEach(function(headingTag) {
|
for(const headingTag of Object.keys(doc.vFile.headings)) {
|
||||||
doc.vFile.headings[headingTag].forEach(function(headingText) {
|
for(const headingText of doc.vFile.headings[headingTag]) {
|
||||||
extractWords(headingText, headingWords, headingWordMap);
|
headingTokens.push(...tokenize(headingText, ignoreWords, dictionary));
|
||||||
});
|
}
|
||||||
});
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Extract the title to use in searches
|
// Extract the title to use in searches
|
||||||
doc.searchTitle = doc.searchTitle || doc.title || doc.vFile && doc.vFile.title || doc.name || '';
|
doc.searchTitle = doc.searchTitle || doc.title || doc.vFile && doc.vFile.title || doc.name || '';
|
||||||
|
|
||||||
// Attach all this search data to the document
|
// Attach all this search data to the document
|
||||||
doc.searchTerms = {
|
doc.searchTerms = {};
|
||||||
titleWords: tokenize(doc.searchTitle).join(' '),
|
if (headingTokens.length > 0) {
|
||||||
headingWords: headingWords.sort().join(' '),
|
doc.searchTerms.headings = headingTokens;
|
||||||
keywords: words.sort().join(' '),
|
}
|
||||||
members: members.sort().join(' '),
|
if (mainTokens.length > 0) {
|
||||||
topics: doc.searchKeywords
|
doc.searchTerms.keywords = mainTokens;
|
||||||
};
|
}
|
||||||
|
if (memberTokens.length > 0) {
|
||||||
});
|
doc.searchTerms.members = memberTokens;
|
||||||
|
}
|
||||||
|
if (doc.searchKeywords) {
|
||||||
|
doc.searchTerms.topics = doc.searchKeywords.trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Now process all the search data and collect it up to be used in creating a new document
|
// Now process all the search data and collect it up to be used in creating a new document
|
||||||
var searchData = filteredDocs.map(function(page) {
|
const searchData = {
|
||||||
// Copy the properties from the searchTerms object onto the search data object
|
dictionary: Array.from(dictionary.keys()),
|
||||||
return Object.assign({
|
pages: filteredDocs.map(page => {
|
||||||
path: page.path,
|
// Copy the properties from the searchTerms object onto the search data object
|
||||||
title: page.searchTitle,
|
const searchObj = {
|
||||||
type: page.docType,
|
path: page.path,
|
||||||
deprecated: !!page.deprecated,
|
title: page.searchTitle,
|
||||||
}, page.searchTerms);
|
type: page.docType,
|
||||||
});
|
};
|
||||||
|
if (page.deprecated) {
|
||||||
|
searchObj.deprecated = true;
|
||||||
|
}
|
||||||
|
return Object.assign(searchObj, page.searchTerms);
|
||||||
|
}),
|
||||||
|
};
|
||||||
|
|
||||||
docs.push({
|
docs.push({
|
||||||
docType: 'json-doc',
|
docType: 'json-doc',
|
||||||
|
@ -120,63 +114,64 @@ module.exports = function generateKeywordsProcessor(log, readFilesProcessor) {
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
function isString(value) {
|
function isString(value) {
|
||||||
return typeof value == 'string';
|
return typeof value == 'string';
|
||||||
}
|
}
|
||||||
|
|
||||||
function convertToMap(collection) {
|
function tokenize(text, ignoreWords, dictionary) {
|
||||||
const obj = {};
|
// Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but reduces the unwanted tokens that are indexed).
|
||||||
collection.forEach(key => { obj[key] = true; });
|
const rawTokens = text.split(/[\s\/]+|<\/?[a-z]+(?:\s+\w+(?:="[^"]+")?)*>/img);
|
||||||
return obj;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the heading contains a name starting with ng, e.g. "ngController", then add the
|
|
||||||
// name without the ng to the text, e.g. "controller".
|
|
||||||
function tokenize(text) {
|
|
||||||
const rawTokens = text.split(/[\s\/]+/mg);
|
|
||||||
const tokens = [];
|
const tokens = [];
|
||||||
rawTokens.forEach(token => {
|
for(let token of rawTokens) {
|
||||||
|
token = token.trim();
|
||||||
|
|
||||||
// Strip off unwanted trivial characters
|
// Strip off unwanted trivial characters
|
||||||
token = token
|
token = token.replace(/^[_\-"'`({[<$*)}\]>.]+/, '').replace(/[_\-"'`({[<$*)}\]>.]+$/, '');
|
||||||
.trim()
|
|
||||||
.replace(/^[_\-"'`({[<$*)}\]>.]+/, '')
|
// Skip if in the ignored words list
|
||||||
.replace(/[_\-"'`({[<$*)}\]>.]+$/, '');
|
if (ignoreWords.has(token.toLowerCase())) {
|
||||||
// Ignore tokens that contain weird characters
|
continue;
|
||||||
if (/^[\w.\-]+$/.test(token)) {
|
|
||||||
tokens.push(token.toLowerCase());
|
|
||||||
const ngTokenMatch = /^[nN]g([A-Z]\w*)/.exec(token);
|
|
||||||
if (ngTokenMatch) {
|
|
||||||
tokens.push(ngTokenMatch[1].toLowerCase());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
});
|
|
||||||
|
// Skip tokens that contain weird characters
|
||||||
|
if (!/^[\w._-]+$/.test(token)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
storeToken(token, tokens, dictionary);
|
||||||
|
if (token.startsWith('ng')) {
|
||||||
|
storeToken(token.substr(2), tokens, dictionary);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return tokens;
|
return tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
function extractWords(text, words, keywordMap) {
|
function storeToken(token, tokens, dictionary) {
|
||||||
var tokens = tokenize(text);
|
token = stem(token);
|
||||||
tokens.forEach(function(token) {
|
if (!dictionary.has(token)) {
|
||||||
if (!keywordMap[token]) {
|
dictionary.set(token, dictionary.size);
|
||||||
words.push(token);
|
}
|
||||||
keywordMap[token] = true;
|
tokens.push(dictionary.get(token));
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function extractMemberWords(doc, members, membersMap) {
|
function extractMemberTokens(doc, ignoreWords, dictionary) {
|
||||||
if (!doc) return;
|
if (!doc) return '';
|
||||||
|
|
||||||
|
let memberContent = [];
|
||||||
|
|
||||||
if (doc.members) {
|
if (doc.members) {
|
||||||
doc.members.forEach(member => extractWords(member.name, members, membersMap));
|
doc.members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
|
||||||
}
|
}
|
||||||
if (doc.statics) {
|
if (doc.statics) {
|
||||||
doc.statics.forEach(member => extractWords(member.name, members, membersMap));
|
doc.statics.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
|
||||||
}
|
}
|
||||||
if (doc.extendsClauses) {
|
if (doc.extendsClauses) {
|
||||||
doc.extendsClauses.forEach(clause => extractMemberWords(clause.doc, members, membersMap));
|
doc.extendsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
|
||||||
}
|
}
|
||||||
if (doc.implementsClauses) {
|
if (doc.implementsClauses) {
|
||||||
doc.implementsClauses.forEach(clause => extractMemberWords(clause.doc, members, membersMap));
|
doc.implementsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
return memberContent;
|
||||||
|
}
|
||||||
|
|
|
@ -1,12 +1,22 @@
|
||||||
|
const path = require('canonical-path');
|
||||||
|
const Dgeni = require('dgeni');
|
||||||
|
|
||||||
const testPackage = require('../../helpers/test-package');
|
const testPackage = require('../../helpers/test-package');
|
||||||
const mockLogger = require('dgeni/lib/mocks/log')(false);
|
const mockLogger = require('dgeni/lib/mocks/log')(false);
|
||||||
const processorFactory = require('./generateKeywords');
|
const processorFactory = require('./generateKeywords');
|
||||||
const Dgeni = require('dgeni');
|
|
||||||
|
|
||||||
const mockReadFilesProcessor = {
|
const mockReadFilesProcessor = {
|
||||||
basePath: 'base/path'
|
basePath: 'base/path'
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const ignoreWords = require(path.resolve(__dirname, '../ignore-words'))['en'];
|
||||||
|
|
||||||
|
function createProcessor() {
|
||||||
|
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
||||||
|
processor.ignoreWords = ignoreWords;
|
||||||
|
return processor;
|
||||||
|
}
|
||||||
|
|
||||||
describe('generateKeywords processor', () => {
|
describe('generateKeywords processor', () => {
|
||||||
|
|
||||||
it('should be available on the injector', () => {
|
it('should be available on the injector', () => {
|
||||||
|
@ -17,30 +27,81 @@ describe('generateKeywords processor', () => {
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should run after the correct processor', () => {
|
it('should run after the correct processor', () => {
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
const processor = createProcessor();
|
||||||
expect(processor.$runAfter).toEqual(['postProcessHtml']);
|
expect(processor.$runAfter).toEqual(['postProcessHtml']);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should run before the correct processor', () => {
|
it('should run before the correct processor', () => {
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
const processor = createProcessor();
|
||||||
expect(processor.$runBefore).toEqual(['writing-files']);
|
expect(processor.$runBefore).toEqual(['writing-files']);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should ignore internal and private exports', () => {
|
it('should ignore internal and private exports', () => {
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
const processor = createProcessor();
|
||||||
const docs = [
|
const docs = [
|
||||||
{ docType: 'class', name: 'PublicExport' },
|
{ docType: 'class', name: 'PublicExport' },
|
||||||
{ docType: 'class', name: 'PrivateExport', privateExport: true },
|
{ docType: 'class', name: 'PrivateExport', privateExport: true },
|
||||||
{ docType: 'class', name: 'InternalExport', internal: true }
|
{ docType: 'class', name: 'InternalExport', internal: true }
|
||||||
];
|
];
|
||||||
processor.$process(docs);
|
processor.$process(docs);
|
||||||
expect(docs[docs.length - 1].data).toEqual([
|
expect(docs[docs.length - 1].data.pages).toEqual([
|
||||||
jasmine.objectContaining({ title: 'PublicExport', type: 'class'})
|
jasmine.objectContaining({ title: 'PublicExport', type: 'class' })
|
||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('should ignore docs that are in the `docTypesToIgnore` list', () => {
|
||||||
|
const processor = createProcessor();
|
||||||
|
processor.docTypesToIgnore = ['interface'];
|
||||||
|
const docs = [
|
||||||
|
{ docType: 'class', name: 'Class' },
|
||||||
|
{ docType: 'interface', name: 'Interface' },
|
||||||
|
{ docType: 'content', name: 'Guide' },
|
||||||
|
];
|
||||||
|
processor.$process(docs);
|
||||||
|
expect(docs[docs.length - 1].data.pages).toEqual([
|
||||||
|
jasmine.objectContaining({ title: 'Class', type: 'class' }),
|
||||||
|
jasmine.objectContaining({ title: 'Guide', type: 'content' }),
|
||||||
|
]);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should not collect keywords from properties that are in the `propertiesToIgnore` list', () => {
|
||||||
|
const processor = createProcessor();
|
||||||
|
processor.propertiesToIgnore = ['docType', 'ignore'];
|
||||||
|
const docs = [
|
||||||
|
{ docType: 'class', name: 'FooClass', ignore: 'ignore this content' },
|
||||||
|
{ docType: 'interface', name: 'BarInterface', capture: 'capture this content' },
|
||||||
|
];
|
||||||
|
processor.$process(docs);
|
||||||
|
expect(docs[docs.length - 1].data).toEqual({
|
||||||
|
dictionary: [ 'fooclass', 'barinterfac', 'captur', 'content' ],
|
||||||
|
pages: [
|
||||||
|
jasmine.objectContaining({ title: 'FooClass', type: 'class', keywords: [0] }),
|
||||||
|
jasmine.objectContaining({ title: 'BarInterface', type: 'interface', keywords: [1, 2, 3] }),
|
||||||
|
],
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should not collect keywords that look like HTML tags', () => {
|
||||||
|
const processor = createProcessor();
|
||||||
|
const docs = [
|
||||||
|
{ docType: 'class', name: 'FooClass', content: `
|
||||||
|
<table id="foo">
|
||||||
|
<tr class="moo" id="bar">
|
||||||
|
<td>Content inside a table</td>
|
||||||
|
</tr>
|
||||||
|
</table>` },
|
||||||
|
];
|
||||||
|
processor.$process(docs);
|
||||||
|
expect(docs[docs.length - 1].data).toEqual({
|
||||||
|
dictionary: ['class', 'fooclass', 'content', 'insid', 'tabl'],
|
||||||
|
pages: [
|
||||||
|
jasmine.objectContaining({keywords: [0, 1, 2, 3, 4] })
|
||||||
|
],
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
it('should compute `doc.searchTitle` from the doc properties if not already provided', () => {
|
it('should compute `doc.searchTitle` from the doc properties if not already provided', () => {
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
const processor = createProcessor();
|
||||||
const docs = [
|
const docs = [
|
||||||
{ docType: 'class', name: 'A', searchTitle: 'searchTitle A', title: 'title A', vFile: { headings: { h1: ['vFile A'] } } },
|
{ docType: 'class', name: 'A', searchTitle: 'searchTitle A', title: 'title A', vFile: { headings: { h1: ['vFile A'] } } },
|
||||||
{ docType: 'class', name: 'B', title: 'title B', vFile: { headings: { h1: ['vFile B'] } } },
|
{ docType: 'class', name: 'B', title: 'title B', vFile: { headings: { h1: ['vFile B'] } } },
|
||||||
|
@ -48,7 +109,7 @@ describe('generateKeywords processor', () => {
|
||||||
{ docType: 'class', name: 'D' },
|
{ docType: 'class', name: 'D' },
|
||||||
];
|
];
|
||||||
processor.$process(docs);
|
processor.$process(docs);
|
||||||
expect(docs[docs.length - 1].data).toEqual([
|
expect(docs[docs.length - 1].data.pages).toEqual([
|
||||||
jasmine.objectContaining({ title: 'searchTitle A' }),
|
jasmine.objectContaining({ title: 'searchTitle A' }),
|
||||||
jasmine.objectContaining({ title: 'title B' }),
|
jasmine.objectContaining({ title: 'title B' }),
|
||||||
jasmine.objectContaining({ title: 'vFile C' }),
|
jasmine.objectContaining({ title: 'vFile C' }),
|
||||||
|
@ -57,34 +118,19 @@ describe('generateKeywords processor', () => {
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should use `doc.searchTitle` as the title in the search index', () => {
|
it('should use `doc.searchTitle` as the title in the search index', () => {
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
const processor = createProcessor();
|
||||||
const docs = [
|
const docs = [
|
||||||
{ docType: 'class', name: 'PublicExport', searchTitle: 'class PublicExport' },
|
{ docType: 'class', name: 'PublicExport', searchTitle: 'class PublicExport' },
|
||||||
];
|
];
|
||||||
processor.$process(docs);
|
processor.$process(docs);
|
||||||
const keywordsDoc = docs[docs.length - 1];
|
const keywordsDoc = docs[docs.length - 1];
|
||||||
expect(keywordsDoc.data).toEqual([
|
expect(keywordsDoc.data.pages).toEqual([
|
||||||
jasmine.objectContaining({ title: 'class PublicExport', type: 'class'})
|
jasmine.objectContaining({ title: 'class PublicExport', type: 'class' })
|
||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should add title words to the search terms', () => {
|
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
|
||||||
const docs = [
|
|
||||||
{
|
|
||||||
docType: 'class',
|
|
||||||
name: 'PublicExport',
|
|
||||||
searchTitle: 'class PublicExport',
|
|
||||||
vFile: { headings: { h2: ['heading A', 'heading B'] } }
|
|
||||||
},
|
|
||||||
];
|
|
||||||
processor.$process(docs);
|
|
||||||
const keywordsDoc = docs[docs.length - 1];
|
|
||||||
expect(keywordsDoc.data[0].titleWords).toEqual('class publicexport');
|
|
||||||
});
|
|
||||||
|
|
||||||
it('should add heading words to the search terms', () => {
|
it('should add heading words to the search terms', () => {
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
const processor = createProcessor();
|
||||||
const docs = [
|
const docs = [
|
||||||
{
|
{
|
||||||
docType: 'class',
|
docType: 'class',
|
||||||
|
@ -95,11 +141,16 @@ describe('generateKeywords processor', () => {
|
||||||
];
|
];
|
||||||
processor.$process(docs);
|
processor.$process(docs);
|
||||||
const keywordsDoc = docs[docs.length - 1];
|
const keywordsDoc = docs[docs.length - 1];
|
||||||
expect(keywordsDoc.data[0].headingWords).toEqual('heading important secondary');
|
expect(keywordsDoc.data).toEqual({
|
||||||
|
dictionary: ['class', 'publicexport', 'head', 'secondari'],
|
||||||
|
pages: [
|
||||||
|
jasmine.objectContaining({ headings: [2, 3, 2] })
|
||||||
|
]
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should add member doc properties to the search terms', () => {
|
it('should add member doc properties to the search terms', () => {
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
const processor = createProcessor();
|
||||||
const docs = [
|
const docs = [
|
||||||
{
|
{
|
||||||
docType: 'class',
|
docType: 'class',
|
||||||
|
@ -123,13 +174,18 @@ describe('generateKeywords processor', () => {
|
||||||
];
|
];
|
||||||
processor.$process(docs);
|
processor.$process(docs);
|
||||||
const keywordsDoc = docs[docs.length - 1];
|
const keywordsDoc = docs[docs.length - 1];
|
||||||
expect(keywordsDoc.data[0].members).toEqual(
|
expect(keywordsDoc.data).toEqual({
|
||||||
'instancemethoda instancemethodb instancepropertya instancepropertyb staticmethoda staticmethodb staticpropertya staticpropertyb'
|
dictionary: ['class', 'publicexport', 'content', 'ngclass', 'instancemethoda','instancepropertya','instancemethodb','instancepropertyb','staticmethoda','staticpropertya','staticmethodb','staticpropertyb', 'head'],
|
||||||
);
|
pages: [
|
||||||
|
jasmine.objectContaining({
|
||||||
|
members: [4, 5, 6, 7, 8, 9, 10, 11]
|
||||||
|
})
|
||||||
|
]
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should add inherited member doc properties to the search terms', () => {
|
it('should add inherited member doc properties to the search terms', () => {
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
const processor = createProcessor();
|
||||||
const parentClass = {
|
const parentClass = {
|
||||||
docType: 'class',
|
docType: 'class',
|
||||||
name: 'ParentClass',
|
name: 'ParentClass',
|
||||||
|
@ -163,13 +219,27 @@ describe('generateKeywords processor', () => {
|
||||||
const docs = [childClass, parentClass, parentInterface];
|
const docs = [childClass, parentClass, parentInterface];
|
||||||
processor.$process(docs);
|
processor.$process(docs);
|
||||||
const keywordsDoc = docs[docs.length - 1];
|
const keywordsDoc = docs[docs.length - 1];
|
||||||
expect(keywordsDoc.data[0].members.split(' ').sort().join(' ')).toEqual(
|
expect(keywordsDoc.data).toEqual({
|
||||||
'childmember1 childmember2 parentmember1 parentmember2 parentmember3'
|
dictionary: ['class', 'child', 'childmember1', 'childmember2', 'parentmember1', 'parentmember2', 'parentmember3', 'parentclass', 'interfac', 'parentinterfac'],
|
||||||
);
|
pages: [
|
||||||
|
jasmine.objectContaining({
|
||||||
|
title: 'Child',
|
||||||
|
members: [2, 3, 4, 5, 6]
|
||||||
|
}),
|
||||||
|
jasmine.objectContaining({
|
||||||
|
title: 'ParentClass',
|
||||||
|
members: [4, 5]
|
||||||
|
}),
|
||||||
|
jasmine.objectContaining({
|
||||||
|
title: 'ParentInterface',
|
||||||
|
members: [6]
|
||||||
|
})
|
||||||
|
]
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should process terms prefixed with "ng" to include the term stripped of "ng"', () => {
|
it('should include both stripped and unstripped "ng" prefixed tokens', () => {
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
const processor = createProcessor();
|
||||||
const docs = [
|
const docs = [
|
||||||
{
|
{
|
||||||
docType: 'class',
|
docType: 'class',
|
||||||
|
@ -181,14 +251,19 @@ describe('generateKeywords processor', () => {
|
||||||
];
|
];
|
||||||
processor.$process(docs);
|
processor.$process(docs);
|
||||||
const keywordsDoc = docs[docs.length - 1];
|
const keywordsDoc = docs[docs.length - 1];
|
||||||
expect(keywordsDoc.data[0].titleWords).toEqual('ngcontroller controller');
|
expect(keywordsDoc.data).toEqual({
|
||||||
expect(keywordsDoc.data[0].headingWords).toEqual('model ngmodel');
|
dictionary: ['class', 'publicexport', 'ngcontrol', 'control', 'content', 'ngclass', 'ngmodel', 'model'],
|
||||||
expect(keywordsDoc.data[0].keywords).toContain('class');
|
pages: [
|
||||||
expect(keywordsDoc.data[0].keywords).toContain('ngclass');
|
jasmine.objectContaining({
|
||||||
|
headings: [6, 7],
|
||||||
|
keywords: [0, 1, 2, 3, 4, 5, 0],
|
||||||
|
})
|
||||||
|
],
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should generate renderedContent property', () => {
|
it('should generate compressed encoded renderedContent property', () => {
|
||||||
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
|
const processor = createProcessor();
|
||||||
const docs = [
|
const docs = [
|
||||||
{
|
{
|
||||||
docType: 'class',
|
docType: 'class',
|
||||||
|
@ -196,19 +271,33 @@ describe('generateKeywords processor', () => {
|
||||||
description: 'The is the documentation for the SomeClass API.',
|
description: 'The is the documentation for the SomeClass API.',
|
||||||
vFile: { headings: { h1: ['SomeClass'], h2: ['Some heading'] } }
|
vFile: { headings: { h1: ['SomeClass'], h2: ['Some heading'] } }
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
docType: 'class',
|
||||||
|
name: 'SomeClass2',
|
||||||
|
description: 'description',
|
||||||
|
members: [
|
||||||
|
{ name: 'member1' },
|
||||||
|
],
|
||||||
|
deprecated: true
|
||||||
|
},
|
||||||
];
|
];
|
||||||
processor.$process(docs);
|
processor.$process(docs);
|
||||||
const keywordsDoc = docs[docs.length - 1];
|
const keywordsDoc = docs[docs.length - 1];
|
||||||
expect(JSON.parse(keywordsDoc.renderedContent)).toEqual(
|
expect(JSON.parse(keywordsDoc.renderedContent)).toEqual({
|
||||||
[{
|
dictionary: ['class', 'someclass', 'document', 'api', 'head', 'someclass2', 'descript', 'member1'],
|
||||||
|
pages: [{
|
||||||
'title':'SomeClass',
|
'title':'SomeClass',
|
||||||
'type':'class',
|
'type':'class',
|
||||||
'titleWords':'someclass',
|
'headings': [1, 4],
|
||||||
'headingWords':'heading some someclass',
|
'keywords': [0, 1, 2, 1, 3],
|
||||||
'keywords':'api class documentation for is someclass the',
|
},
|
||||||
'members':'',
|
{
|
||||||
'deprecated': false,
|
'title':'SomeClass2',
|
||||||
|
'type':'class',
|
||||||
|
'keywords': [0, 5, 6],
|
||||||
|
'members': [7],
|
||||||
|
'deprecated': true,
|
||||||
}]
|
}]
|
||||||
);
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
|
@ -2005,6 +2005,11 @@
|
||||||
resolved "https://registry.yarnpkg.com/@types/source-list-map/-/source-list-map-0.1.2.tgz#0078836063ffaf17412349bba364087e0ac02ec9"
|
resolved "https://registry.yarnpkg.com/@types/source-list-map/-/source-list-map-0.1.2.tgz#0078836063ffaf17412349bba364087e0ac02ec9"
|
||||||
integrity sha512-K5K+yml8LTo9bWJI/rECfIPrGgxdpeNbj+d53lwN4QjW1MCwlkhUms+gtdzigTeUyBr09+u8BwOIY3MXvHdcsA==
|
integrity sha512-K5K+yml8LTo9bWJI/rECfIPrGgxdpeNbj+d53lwN4QjW1MCwlkhUms+gtdzigTeUyBr09+u8BwOIY3MXvHdcsA==
|
||||||
|
|
||||||
|
"@types/stemmer@^1.0.2":
|
||||||
|
version "1.0.2"
|
||||||
|
resolved "https://registry.yarnpkg.com/@types/stemmer/-/stemmer-1.0.2.tgz#bd8354f50b3c9b87c351d169240e45cf1fa1f5e8"
|
||||||
|
integrity sha512-2gWEIFqVZjjZxo8/TcugCAl7nW9Jd9ArEDpTAc5nH7d+ZUkreHA7GzuFcLZ0sflLrA5b1PZ+2yDyHJcuP9KWWw==
|
||||||
|
|
||||||
"@types/unist@*", "@types/unist@^2.0.0", "@types/unist@^2.0.2":
|
"@types/unist@*", "@types/unist@^2.0.0", "@types/unist@^2.0.2":
|
||||||
version "2.0.3"
|
version "2.0.3"
|
||||||
resolved "https://registry.yarnpkg.com/@types/unist/-/unist-2.0.3.tgz#9c088679876f374eb5983f150d4787aa6fb32d7e"
|
resolved "https://registry.yarnpkg.com/@types/unist/-/unist-2.0.3.tgz#9c088679876f374eb5983f150d4787aa6fb32d7e"
|
||||||
|
@ -12802,6 +12807,11 @@ static-extend@^0.1.1:
|
||||||
resolved "https://registry.yarnpkg.com/statuses/-/statuses-1.5.0.tgz#161c7dac177659fd9811f43771fa99381478628c"
|
resolved "https://registry.yarnpkg.com/statuses/-/statuses-1.5.0.tgz#161c7dac177659fd9811f43771fa99381478628c"
|
||||||
integrity sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=
|
integrity sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=
|
||||||
|
|
||||||
|
stemmer@^1.0.5:
|
||||||
|
version "1.0.5"
|
||||||
|
resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-1.0.5.tgz#fd89beaf8bff5d04b6643bfffcaed0fc420deec0"
|
||||||
|
integrity sha512-SLq7annzSKRDStasOJJoftCSCzBCKmBmH38jC4fDtCunAqOzpTpIm9zmaHmwNJiZ8gLe9qpVdBVbEG2DC5dE2A==
|
||||||
|
|
||||||
stream-browserify@^2.0.1:
|
stream-browserify@^2.0.1:
|
||||||
version "2.0.2"
|
version "2.0.2"
|
||||||
resolved "https://registry.yarnpkg.com/stream-browserify/-/stream-browserify-2.0.2.tgz#87521d38a44aa7ee91ce1cd2a47df0cb49dd660b"
|
resolved "https://registry.yarnpkg.com/stream-browserify/-/stream-browserify-2.0.2.tgz#87521d38a44aa7ee91ce1cd2a47df0cb49dd660b"
|
||||||
|
|
Loading…
Reference in New Issue