refactor(docs-infra): include more info in search index data (#41368)

The AIO search index is built in a WebWorker in the browser from a set
of page information that is downloaded as a JSON file (`search-data.json`).
We want to keep this file as small as possible while providing enough
data to generate a useful index to query against.

Previously, we included only one copy of each (non-ignored) term from each
doc, but this prevented more subtle ranking of query results, since the
number of occurrences of a term in a doc was lost.

This commit changes the generated file in the following ways:

- All non-ignored terms are now included in the order in which they appear
  in the doc.
- The terms are indexed into a dictionary to avoid the text of each term
  being repeated in every doc that contains it.
- Each term is pre-"stemmed" using the same Porter stemming algorithm that
  the Lunr search engine uses (see the sketch below).

The web worker has been updated to decode the new format of the file.
Now that all terms are included, in order, this may enable some level of
phrase-based matching in the future.
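Because each page's keyword indices preserve term order, a phrase query
could, for example, look for the query's stemmed terms occurring
consecutively. A minimal sketch of that idea, assuming the encoded
`dictionary`/`keywords` format shown earlier (nothing like this ships in
this commit):

    // Hypothetical phrase check: do the stemmed query terms appear as a
    // consecutive run in a page's ordered keyword indices?
    const stem = require('stemmer');

    function containsPhrase(
        queryWords: string[], keywords: number[], dictionary: string[]): boolean {
      const ids = queryWords.map(w => dictionary.indexOf(stem(w.toLowerCase())));
      if (ids.some(id => id === -1)) {
        return false;  // a query term never occurs in the corpus
      }
      // Try every starting position in the page's ordered term list.
      return keywords.some((_, start) =>
          ids.every((id, offset) => keywords[start + offset] === id));
    }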

The generated file is considerably larger than before, but production HTTP
servers send the data compressed, which reduces the transfer size
dramatically.

PR Close #41368
Author: Pete Bacon Darwin
Date: 2021-03-28 20:34:09 +01:00
Committer: Alex Rickabaugh
Parent: 55f7f1d446
Commit: fccffc647b
8 changed files with 1007 additions and 878 deletions

View File

@@ -116,6 +116,7 @@
"@types/jasmine": "~3.6.0",
"@types/lunr": "^2.3.2",
"@types/node": "^12.7.9",
"@types/stemmer": "^1.0.2",
"@types/xregexp": "^3.0.30",
"@yarnpkg/lockfile": "^1.1.0",
"archiver": "^1.3.0",
@@ -166,6 +167,7 @@
"rimraf": "^2.6.1",
"semver": "^5.3.0",
"shelljs": "^0.8.4",
"stemmer": "^1.0.5",
"timezone-mock": "^1.1.3",
"tree-kill": "^1.1.0",
"ts-node": "^8.4.1",

View File

@@ -1,10 +1,11 @@
/// <reference lib="webworker" />
import { WebWorkerMessage } from '../shared/web-worker-message';
import * as lunr from 'lunr';
import {WebWorkerMessage} from '../shared/web-worker-message';
const SEARCH_TERMS_URL = '/generated/docs/app/search-data.json';
let index: lunr.Index;
const pages: SearchInfo = {};
const pageMap: SearchInfo = {};
interface SearchInfo {
[key: string]: PageInfo;
@@ -13,8 +14,25 @@ interface SearchInfo {
interface PageInfo {
path: string;
type: string;
titleWords: string;
keyWords: string;
title: string;
headings: string;
keywords: string;
members: string;
topics: string;
}
interface EncodedPages {
dictionary: string[];
pages: EncodedPage[];
}
interface EncodedPage {
path: string;
type: string;
title: string;
headings: number[];
keywords: number[];
members: number[];
topics: string;
}
@@ -24,42 +42,42 @@ addEventListener('message', handleMessage);
// the path and search terms for a page
function createIndex(loadIndexFn: IndexLoader): lunr.Index {
// The lunr typings are missing QueryLexer so we have to add them here manually.
const queryLexer = (lunr as any as { QueryLexer: { termSeparator: RegExp } }).QueryLexer;
const queryLexer = (lunr as any as {QueryLexer: {termSeparator: RegExp}}).QueryLexer;
queryLexer.termSeparator = lunr.tokenizer.separator = /\s+/;
return lunr(function() {
this.pipeline.remove(lunr.stemmer);
this.ref('path');
this.field('topics', { boost: 15 });
this.field('titleWords', { boost: 10 });
this.field('headingWords', { boost: 5 });
this.field('members', { boost: 4 });
this.field('keywords', { boost: 2 });
this.field('topics', {boost: 15});
this.field('title', {boost: 10});
this.field('headings', {boost: 5});
this.field('members', {boost: 4});
this.field('keywords', {boost: 2});
loadIndexFn(this);
});
}
// The worker receives a message to load the index and to query the index
function handleMessage(message: { data: WebWorkerMessage }): void {
function handleMessage(message: {data: WebWorkerMessage}): void {
const type = message.data.type;
const id = message.data.id;
const payload = message.data.payload;
switch (type) {
case 'load-index':
makeRequest(SEARCH_TERMS_URL, (searchInfo: PageInfo[]) => {
index = createIndex(loadIndex(searchInfo));
postMessage({ type, id, payload: true });
makeRequest(SEARCH_TERMS_URL, (encodedPages: EncodedPages) => {
index = createIndex(loadIndex(encodedPages));
postMessage({type, id, payload: true});
});
break;
case 'query-index':
postMessage({ type, id, payload: { query: payload, results: queryIndex(payload) } });
postMessage({type, id, payload: {query: payload, results: queryIndex(payload)}});
break;
default:
postMessage({ type, id, payload: { error: 'invalid message type' } });
postMessage({type, id, payload: {error: 'invalid message type'}});
}
}
// Use XHR to make a request to the server
function makeRequest(url: string, callback: (response: any) => void): void {
// The JSON file that is loaded should be an array of PageInfo:
const searchDataRequest = new XMLHttpRequest();
searchDataRequest.onload = function() {
@@ -70,18 +88,29 @@ function makeRequest(url: string, callback: (response: any) => void): void {
}
// Create the search index from the searchInfo which contains the information about each page to be indexed
function loadIndex(pagesData: PageInfo[]): IndexLoader {
// Create the search index from the searchInfo which contains the information about each page to be
// indexed
function loadIndex({dictionary, pages}: EncodedPages): IndexLoader {
return (indexBuilder: lunr.Builder) => {
// Store the pages data to be used in mapping query results back to pages
// Add search terms from each page to the search index
pagesData.forEach(page => {
pages.forEach(encodedPage => {
const page = decodePage(encodedPage, dictionary);
indexBuilder.add(page);
pages[page.path] = page;
pageMap[page.path] = page;
});
};
}
function decodePage(encodedPage: EncodedPage, dictionary: string[]): PageInfo {
return {
...encodedPage,
headings: encodedPage.headings?.map(i => dictionary[i]).join(' ') ?? '',
keywords: encodedPage.keywords?.map(i => dictionary[i]).join(' ') ?? '',
members: encodedPage.members?.map(i => dictionary[i]).join(' ') ?? '',
};
}
// Query the index and return the processed results
function queryIndex(query: string): PageInfo[] {
// Strip off quotes
@@ -105,7 +134,7 @@ function queryIndex(query: string): PageInfo[] {
}
// Map the hits into info about each page to be returned as results
return results.map(hit => pages[hit.ref]);
return results.map(hit => pageMap[hit.ref]);
}
} catch (e) {
// If the search query cannot be parsed the index throws an error

View File

@@ -0,0 +1,705 @@
{
"en": [
"a",
"able",
"about",
"above",
"abst",
"accordance",
"according",
"accordingly",
"across",
"act",
"actually",
"added",
"adj",
"adopted",
"affected",
"affecting",
"affects",
"after",
"afterwards",
"again",
"against",
"ah",
"all",
"almost",
"alone",
"along",
"already",
"also",
"although",
"always",
"am",
"among",
"amongst",
"an",
"and",
"announce",
"another",
"any",
"anybody",
"anyhow",
"anymore",
"anyone",
"anything",
"anyway",
"anyways",
"anywhere",
"apparently",
"approximately",
"are",
"aren",
"arent",
"arise",
"around",
"as",
"aside",
"ask",
"asking",
"at",
"auth",
"available",
"away",
"awfully",
"b",
"back",
"be",
"became",
"because",
"become",
"becomes",
"becoming",
"been",
"before",
"beforehand",
"begin",
"beginning",
"beginnings",
"begins",
"behind",
"being",
"believe",
"below",
"beside",
"besides",
"between",
"beyond",
"biol",
"both",
"brief",
"briefly",
"but",
"by",
"c",
"ca",
"came",
"can",
"cannot",
"can't",
"cant",
"cause",
"causes",
"certain",
"certainly",
"co",
"com",
"come",
"comes",
"contain",
"containing",
"contains",
"could",
"couldnt",
"d",
"date",
"did",
"didn't",
"didnt",
"different",
"do",
"does",
"doesn't",
"doesnt",
"doing",
"done",
"don't",
"dont",
"down",
"downwards",
"due",
"during",
"e",
"each",
"ed",
"edu",
"effect",
"eg",
"eight",
"eighty",
"either",
"else",
"elsewhere",
"end",
"ending",
"enough",
"especially",
"et",
"et-al",
"etc",
"even",
"ever",
"every",
"everybody",
"everyone",
"everything",
"everywhere",
"ex",
"except",
"f",
"far",
"few",
"ff",
"fifth",
"first",
"five",
"fix",
"followed",
"following",
"follows",
"for",
"former",
"formerly",
"forth",
"found",
"four",
"from",
"further",
"furthermore",
"g",
"gave",
"get",
"gets",
"getting",
"give",
"given",
"gives",
"giving",
"go",
"goes",
"gone",
"got",
"gotten",
"h",
"had",
"happens",
"hardly",
"has",
"hasn't",
"hasnt",
"have",
"haven't",
"havent",
"having",
"he",
"hed",
"hence",
"her",
"here",
"hereafter",
"hereby",
"herein",
"heres",
"hereupon",
"hers",
"herself",
"hes",
"hi",
"hid",
"him",
"himself",
"his",
"hither",
"home",
"how",
"howbeit",
"however",
"hundred",
"i",
"id",
"ie",
"if",
"i'll",
"ill",
"im",
"immediate",
"immediately",
"importance",
"important",
"in",
"inc",
"indeed",
"index",
"information",
"instead",
"into",
"invention",
"inward",
"is",
"isn't",
"isnt",
"it",
"itd",
"it'll",
"itll",
"its",
"itself",
"i've",
"ive",
"j",
"just",
"k",
"keep",
"keeps",
"kept",
"keys",
"kg",
"km",
"know",
"known",
"knows",
"l",
"largely",
"last",
"lately",
"later",
"latter",
"latterly",
"least",
"less",
"lest",
"let",
"lets",
"like",
"liked",
"likely",
"line",
"little",
"'ll",
"'ll",
"look",
"looking",
"looks",
"ltd",
"m",
"made",
"mainly",
"make",
"makes",
"many",
"may",
"maybe",
"me",
"mean",
"means",
"meantime",
"meanwhile",
"merely",
"mg",
"might",
"million",
"miss",
"ml",
"more",
"moreover",
"most",
"mostly",
"mr",
"mrs",
"much",
"mug",
"must",
"my",
"myself",
"n",
"na",
"name",
"namely",
"nay",
"nd",
"near",
"nearly",
"necessarily",
"necessary",
"need",
"needs",
"neither",
"never",
"nevertheless",
"new",
"next",
"nine",
"ninety",
"no",
"nobody",
"non",
"none",
"nonetheless",
"noone",
"nor",
"normally",
"nos",
"not",
"noted",
"nothing",
"now",
"nowhere",
"o",
"obtain",
"obtained",
"obviously",
"of",
"off",
"often",
"oh",
"ok",
"okay",
"old",
"omitted",
"on",
"once",
"one",
"ones",
"only",
"onto",
"or",
"ord",
"other",
"others",
"otherwise",
"ought",
"our",
"ours",
"ourselves",
"out",
"outside",
"over",
"overall",
"owing",
"own",
"p",
"page",
"pages",
"part",
"particular",
"particularly",
"past",
"per",
"perhaps",
"placed",
"please",
"plus",
"poorly",
"possible",
"possibly",
"potentially",
"pp",
"predominantly",
"present",
"previously",
"primarily",
"probably",
"promptly",
"proud",
"provides",
"put",
"q",
"que",
"quickly",
"quite",
"qv",
"r",
"ran",
"rather",
"rd",
"re",
"readily",
"really",
"recent",
"recently",
"ref",
"refs",
"regarding",
"regardless",
"regards",
"related",
"relatively",
"research",
"respectively",
"resulted",
"resulting",
"results",
"right",
"run",
"s",
"said",
"same",
"saw",
"say",
"saying",
"says",
"sec",
"section",
"see",
"seeing",
"seem",
"seemed",
"seeming",
"seems",
"seen",
"self",
"selves",
"sent",
"seven",
"several",
"shall",
"she",
"shed",
"she'll",
"shell",
"shes",
"should",
"shouldn't",
"shouldnt",
"show",
"showed",
"shown",
"showns",
"shows",
"significant",
"significantly",
"similar",
"similarly",
"since",
"six",
"slightly",
"so",
"some",
"somebody",
"somehow",
"someone",
"somethan",
"something",
"sometime",
"sometimes",
"somewhat",
"somewhere",
"soon",
"sorry",
"specifically",
"specified",
"specify",
"specifying",
"state",
"states",
"still",
"stop",
"strongly",
"sub",
"substantially",
"successfully",
"such",
"sufficiently",
"suggest",
"sup",
"sure",
"t",
"take",
"taken",
"taking",
"tell",
"tends",
"th",
"than",
"thank",
"thanks",
"thanx",
"that",
"that'll",
"thatll",
"thats",
"that've",
"thatve",
"the",
"their",
"theirs",
"them",
"themselves",
"then",
"thence",
"there",
"thereafter",
"thereby",
"thered",
"therefore",
"therein",
"there'll",
"therell",
"thereof",
"therere",
"theres",
"thereto",
"thereupon",
"there've",
"thereve",
"these",
"they",
"theyd",
"they'll",
"theyll",
"theyre",
"they've",
"theyve",
"think",
"this",
"those",
"thou",
"though",
"thoughh",
"thousand",
"throug",
"through",
"throughout",
"thru",
"thus",
"til",
"tip",
"to",
"together",
"too",
"took",
"toward",
"towards",
"tried",
"tries",
"truly",
"try",
"trying",
"ts",
"twice",
"two",
"u",
"un",
"under",
"unfortunately",
"unless",
"unlike",
"unlikely",
"until",
"unto",
"up",
"upon",
"ups",
"us",
"use",
"used",
"useful",
"usefully",
"usefulness",
"uses",
"using",
"usually",
"v",
"value",
"various",
"'ve",
"'ve",
"very",
"via",
"viz",
"vol",
"vols",
"vs",
"w",
"want",
"wants",
"was",
"wasn't",
"wasnt",
"way",
"we",
"wed",
"welcome",
"we'll",
"well",
"went",
"were",
"weren't",
"werent",
"we've",
"weve",
"what",
"whatever",
"what'll",
"whatll",
"whats",
"when",
"whence",
"whenever",
"where",
"whereafter",
"whereas",
"whereby",
"wherein",
"wheres",
"whereupon",
"wherever",
"whether",
"which",
"while",
"whim",
"whither",
"who",
"whod",
"whoever",
"whole",
"who'll",
"wholl",
"whom",
"whomever",
"whos",
"whose",
"why",
"widely",
"will",
"willing",
"wish",
"with",
"within",
"without",
"won't",
"wont",
"words",
"would",
"wouldn't",
"wouldnt",
"www",
"x",
"y",
"yes",
"yet",
"you",
"youd",
"you'll",
"youll",
"your",
"youre",
"yours",
"yourself",
"yourselves",
"you've",
"youve",
"z",
"zero"
]
}

View File

@@ -1,701 +0,0 @@
a
able
about
above
abst
accordance
according
accordingly
across
act
actually
added
adj
adopted
affected
affecting
affects
after
afterwards
again
against
ah
all
almost
alone
along
already
also
although
always
am
among
amongst
an
and
announce
another
any
anybody
anyhow
anymore
anyone
anything
anyway
anyways
anywhere
apparently
approximately
are
aren
arent
arise
around
as
aside
ask
asking
at
auth
available
away
awfully
b
back
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
beginnings
begins
behind
being
believe
below
beside
besides
between
beyond
biol
both
brief
briefly
but
by
c
ca
came
can
cannot
can't
cant
cause
causes
certain
certainly
co
com
come
comes
contain
containing
contains
could
couldnt
d
date
did
didn't
didnt
different
do
does
doesn't
doesnt
doing
done
don't
dont
down
downwards
due
during
e
each
ed
edu
effect
eg
eight
eighty
either
else
elsewhere
end
ending
enough
especially
et
et-al
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
except
f
far
few
ff
fifth
first
five
fix
followed
following
follows
for
former
formerly
forth
found
four
from
further
furthermore
g
gave
get
gets
getting
give
given
gives
giving
go
goes
gone
got
gotten
h
had
happens
hardly
has
hasn't
hasnt
have
haven't
havent
having
he
hed
hence
her
here
hereafter
hereby
herein
heres
hereupon
hers
herself
hes
hi
hid
him
himself
his
hither
home
how
howbeit
however
hundred
i
id
ie
if
i'll
ill
im
immediate
immediately
importance
important
in
inc
indeed
index
information
instead
into
invention
inward
is
isn't
isnt
it
itd
it'll
itll
its
itself
i've
ive
j
just
k
keep
keeps
kept
keys
kg
km
know
known
knows
l
largely
last
lately
later
latter
latterly
least
less
lest
let
lets
like
liked
likely
line
little
'll
'll
look
looking
looks
ltd
m
made
mainly
make
makes
many
may
maybe
me
mean
means
meantime
meanwhile
merely
mg
might
million
miss
ml
more
moreover
most
mostly
mr
mrs
much
mug
must
my
myself
n
na
name
namely
nay
nd
near
nearly
necessarily
necessary
need
needs
neither
never
nevertheless
new
next
nine
ninety
no
nobody
non
none
nonetheless
noone
nor
normally
nos
not
noted
nothing
now
nowhere
o
obtain
obtained
obviously
of
off
often
oh
ok
okay
old
omitted
on
once
one
ones
only
onto
or
ord
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
owing
own
p
page
pages
part
particular
particularly
past
per
perhaps
placed
please
plus
poorly
possible
possibly
potentially
pp
predominantly
present
previously
primarily
probably
promptly
proud
provides
put
q
que
quickly
quite
qv
r
ran
rather
rd
re
readily
really
recent
recently
ref
refs
regarding
regardless
regards
related
relatively
research
respectively
resulted
resulting
results
right
run
s
said
same
saw
say
saying
says
sec
section
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sent
seven
several
shall
she
shed
she'll
shell
shes
should
shouldn't
shouldnt
show
showed
shown
showns
shows
significant
significantly
similar
similarly
since
six
slightly
so
some
somebody
somehow
someone
somethan
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specifically
specified
specify
specifying
state
states
still
stop
strongly
sub
substantially
successfully
such
sufficiently
suggest
sup
sure
t
take
taken
taking
tell
tends
th
than
thank
thanks
thanx
that
that'll
thatll
thats
that've
thatve
the
their
theirs
them
themselves
then
thence
there
thereafter
thereby
thered
therefore
therein
there'll
therell
thereof
therere
theres
thereto
thereupon
there've
thereve
these
they
theyd
they'll
theyll
theyre
they've
theyve
think
this
those
thou
though
thoughh
thousand
throug
through
throughout
thru
thus
til
tip
to
together
too
took
toward
towards
tried
tries
truly
try
trying
ts
twice
two
u
un
under
unfortunately
unless
unlike
unlikely
until
unto
up
upon
ups
us
use
used
useful
usefully
usefulness
uses
using
usually
v
value
various
've
've
very
via
viz
vol
vols
vs
w
want
wants
was
wasn't
wasnt
way
we
wed
welcome
we'll
well
went
were
weren't
werent
we've
weve
what
whatever
what'll
whatll
whats
when
whence
whenever
where
whereafter
whereas
whereby
wherein
wheres
whereupon
wherever
whether
which
while
whim
whither
who
whod
whoever
whole
who'll
wholl
whom
whomever
whos
whose
why
widely
will
willing
wish
with
within
without
won't
wont
words
would
wouldn't
wouldnt
www
x
y
yes
yet
you
youd
you'll
youll
your
youre
yours
yourself
yourselves
you've
youve
z
zero

View File

@@ -65,9 +65,9 @@ module.exports = new Package('angular-base', [
readFilesProcessor.sourceFiles = [];
collectExamples.exampleFolders = [];
generateKeywordsProcessor.ignoreWordsFile = path.resolve(__dirname, 'ignore.words');
generateKeywordsProcessor.ignoreWords = require(path.resolve(__dirname, 'ignore-words'))['en'];
generateKeywordsProcessor.docTypesToIgnore = ['example-region'];
generateKeywordsProcessor.propertiesToIgnore = ['basePath', 'renderedContent'];
generateKeywordsProcessor.propertiesToIgnore = ['basePath', 'renderedContent', 'docType', 'searchTitle'];
})
// Where do we write the output files?

View File

@@ -1,7 +1,6 @@
'use strict';
var fs = require('fs');
var path = require('canonical-path');
const stem = require('stemmer');
/**
* @dgProcessor generateKeywordsProcessor
@@ -10,103 +9,98 @@ var path = require('canonical-path');
* a new document that will be rendered as a JavaScript file containing all
* this data.
*/
module.exports = function generateKeywordsProcessor(log, readFilesProcessor) {
module.exports = function generateKeywordsProcessor(log) {
return {
ignoreWordsFile: undefined,
ignoreWords: [],
propertiesToIgnore: [],
docTypesToIgnore: [],
outputFolder: '',
$validate: {
ignoreWordsFile: {},
ignoreWords: {},
docTypesToIgnore: {},
propertiesToIgnore: {},
outputFolder: {presence: true}
},
$runAfter: ['postProcessHtml'],
$runBefore: ['writing-files'],
$process: function(docs) {
$process(docs) {
const dictionary = new Map();
// Keywords to ignore
var wordsToIgnore = [];
var propertiesToIgnore;
var docTypesToIgnore;
// Load up the keywords to ignore, if specified in the config
if (this.ignoreWordsFile) {
var ignoreWordsPath = path.resolve(readFilesProcessor.basePath, this.ignoreWordsFile);
wordsToIgnore = fs.readFileSync(ignoreWordsPath, 'utf8').toString().split(/[,\s\n\r]+/gm);
log.debug('Loaded ignore words from "' + ignoreWordsPath + '"');
log.silly(wordsToIgnore);
}
propertiesToIgnore = convertToMap(this.propertiesToIgnore);
const ignoreWords = new Set(this.ignoreWords);
log.debug('Words to ignore', ignoreWords);
const propertiesToIgnore = new Set(this.propertiesToIgnore);
log.debug('Properties to ignore', propertiesToIgnore);
docTypesToIgnore = convertToMap(this.docTypesToIgnore);
const docTypesToIgnore = new Set(this.docTypesToIgnore);
log.debug('Doc types to ignore', docTypesToIgnore);
var ignoreWordsMap = convertToMap(wordsToIgnore);
const filteredDocs = docs
// We are not interested in some docTypes
.filter(function(doc) { return !docTypesToIgnore[doc.docType]; })
.filter(doc => !docTypesToIgnore.has(doc.docType))
// Ignore internals and private exports (indicated by the ɵ prefix)
.filter(function(doc) { return !doc.internal && !doc.privateExport; });
.filter(doc => !doc.internal && !doc.privateExport);
filteredDocs.forEach(function(doc) {
var words = [];
var keywordMap = Object.assign({}, ignoreWordsMap);
var members = [];
var membersMap = Object.assign({}, ignoreWordsMap);
const headingWords = [];
const headingWordMap = Object.assign({}, ignoreWordsMap);
for(const doc of filteredDocs) {
// Search each top level property of the document for search terms
Object.keys(doc).forEach(function(key) {
let mainTokens = [];
for(const key of Object.keys(doc)) {
const value = doc[key];
if (isString(value) && !propertiesToIgnore[key]) {
extractWords(value, words, keywordMap);
if (isString(value) && !propertiesToIgnore.has(key)) {
mainTokens.push(...tokenize(value, ignoreWords, dictionary));
}
});
}
extractMemberWords(doc, members, membersMap);
const memberTokens = extractMemberTokens(doc, ignoreWords, dictionary);
// Extract all the keywords from the headings
let headingTokens = [];
if (doc.vFile && doc.vFile.headings) {
Object.keys(doc.vFile.headings).forEach(function(headingTag) {
doc.vFile.headings[headingTag].forEach(function(headingText) {
extractWords(headingText, headingWords, headingWordMap);
});
});
for(const headingTag of Object.keys(doc.vFile.headings)) {
for(const headingText of doc.vFile.headings[headingTag]) {
headingTokens.push(...tokenize(headingText, ignoreWords, dictionary));
}
}
}
// Extract the title to use in searches
doc.searchTitle = doc.searchTitle || doc.title || doc.vFile && doc.vFile.title || doc.name || '';
// Attach all this search data to the document
doc.searchTerms = {
titleWords: tokenize(doc.searchTitle).join(' '),
headingWords: headingWords.sort().join(' '),
keywords: words.sort().join(' '),
members: members.sort().join(' '),
topics: doc.searchKeywords
};
});
doc.searchTerms = {};
if (headingTokens.length > 0) {
doc.searchTerms.headings = headingTokens;
}
if (mainTokens.length > 0) {
doc.searchTerms.keywords = mainTokens;
}
if (memberTokens.length > 0) {
doc.searchTerms.members = memberTokens;
}
if (doc.searchKeywords) {
doc.searchTerms.topics = doc.searchKeywords.trim();
}
}
// Now process all the search data and collect it up to be used in creating a new document
var searchData = filteredDocs.map(function(page) {
// Copy the properties from the searchTerms object onto the search data object
return Object.assign({
path: page.path,
title: page.searchTitle,
type: page.docType,
deprecated: !!page.deprecated,
}, page.searchTerms);
});
const searchData = {
dictionary: Array.from(dictionary.keys()),
pages: filteredDocs.map(page => {
// Copy the properties from the searchTerms object onto the search data object
const searchObj = {
path: page.path,
title: page.searchTitle,
type: page.docType,
};
if (page.deprecated) {
searchObj.deprecated = true;
}
return Object.assign(searchObj, page.searchTerms);
}),
};
docs.push({
docType: 'json-doc',
@@ -120,63 +114,64 @@ module.exports = function generateKeywordsProcessor(log, readFilesProcessor) {
};
};
function isString(value) {
return typeof value == 'string';
}
function convertToMap(collection) {
const obj = {};
collection.forEach(key => { obj[key] = true; });
return obj;
}
// If the heading contains a name starting with ng, e.g. "ngController", then add the
// name without the ng to the text, e.g. "controller".
function tokenize(text) {
const rawTokens = text.split(/[\s\/]+/mg);
function tokenize(text, ignoreWords, dictionary) {
// Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but reduces the unwanted tokens that are indexed).
const rawTokens = text.split(/[\s\/]+|<\/?[a-z]+(?:\s+\w+(?:="[^"]+")?)*>/img);
const tokens = [];
rawTokens.forEach(token => {
for(let token of rawTokens) {
token = token.trim();
// Strip off unwanted trivial characters
token = token
.trim()
.replace(/^[_\-"'`({[<$*)}\]>.]+/, '')
.replace(/[_\-"'`({[<$*)}\]>.]+$/, '');
// Ignore tokens that contain weird characters
if (/^[\w.\-]+$/.test(token)) {
tokens.push(token.toLowerCase());
const ngTokenMatch = /^[nN]g([A-Z]\w*)/.exec(token);
if (ngTokenMatch) {
tokens.push(ngTokenMatch[1].toLowerCase());
}
token = token.replace(/^[_\-"'`({[<$*)}\]>.]+/, '').replace(/[_\-"'`({[<$*)}\]>.]+$/, '');
// Skip if in the ignored words list
if (ignoreWords.has(token.toLowerCase())) {
continue;
}
});
// Skip tokens that contain weird characters
if (!/^[\w._-]+$/.test(token)) {
continue;
}
storeToken(token, tokens, dictionary);
if (token.startsWith('ng')) {
storeToken(token.substr(2), tokens, dictionary);
}
}
return tokens;
}
function extractWords(text, words, keywordMap) {
var tokens = tokenize(text);
tokens.forEach(function(token) {
if (!keywordMap[token]) {
words.push(token);
keywordMap[token] = true;
}
});
function storeToken(token, tokens, dictionary) {
token = stem(token);
if (!dictionary.has(token)) {
dictionary.set(token, dictionary.size);
}
tokens.push(dictionary.get(token));
}
function extractMemberWords(doc, members, membersMap) {
if (!doc) return;
function extractMemberTokens(doc, ignoreWords, dictionary) {
if (!doc) return '';
let memberContent = [];
if (doc.members) {
doc.members.forEach(member => extractWords(member.name, members, membersMap));
doc.members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
}
if (doc.statics) {
doc.statics.forEach(member => extractWords(member.name, members, membersMap));
doc.statics.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
}
if (doc.extendsClauses) {
doc.extendsClauses.forEach(clause => extractMemberWords(clause.doc, members, membersMap));
doc.extendsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
}
if (doc.implementsClauses) {
doc.implementsClauses.forEach(clause => extractMemberWords(clause.doc, members, membersMap));
doc.implementsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
}
}
return memberContent;
}

View File

@@ -1,12 +1,22 @@
const path = require('canonical-path');
const Dgeni = require('dgeni');
const testPackage = require('../../helpers/test-package');
const mockLogger = require('dgeni/lib/mocks/log')(false);
const processorFactory = require('./generateKeywords');
const Dgeni = require('dgeni');
const mockReadFilesProcessor = {
basePath: 'base/path'
};
const ignoreWords = require(path.resolve(__dirname, '../ignore-words'))['en'];
function createProcessor() {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
processor.ignoreWords = ignoreWords;
return processor;
}
describe('generateKeywords processor', () => {
it('should be available on the injector', () => {
@@ -17,30 +27,81 @@ describe('generateKeywords processor', () => {
});
it('should run after the correct processor', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
const processor = createProcessor();
expect(processor.$runAfter).toEqual(['postProcessHtml']);
});
it('should run before the correct processor', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
const processor = createProcessor();
expect(processor.$runBefore).toEqual(['writing-files']);
});
it('should ignore internal and private exports', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
const processor = createProcessor();
const docs = [
{ docType: 'class', name: 'PublicExport' },
{ docType: 'class', name: 'PrivateExport', privateExport: true },
{ docType: 'class', name: 'InternalExport', internal: true }
];
processor.$process(docs);
expect(docs[docs.length - 1].data).toEqual([
jasmine.objectContaining({ title: 'PublicExport', type: 'class'})
expect(docs[docs.length - 1].data.pages).toEqual([
jasmine.objectContaining({ title: 'PublicExport', type: 'class' })
]);
});
it('should ignore docs that are in the `docTypesToIgnore` list', () => {
const processor = createProcessor();
processor.docTypesToIgnore = ['interface'];
const docs = [
{ docType: 'class', name: 'Class' },
{ docType: 'interface', name: 'Interface' },
{ docType: 'content', name: 'Guide' },
];
processor.$process(docs);
expect(docs[docs.length - 1].data.pages).toEqual([
jasmine.objectContaining({ title: 'Class', type: 'class' }),
jasmine.objectContaining({ title: 'Guide', type: 'content' }),
]);
});
it('should not collect keywords from properties that are in the `propertiesToIgnore` list', () => {
const processor = createProcessor();
processor.propertiesToIgnore = ['docType', 'ignore'];
const docs = [
{ docType: 'class', name: 'FooClass', ignore: 'ignore this content' },
{ docType: 'interface', name: 'BarInterface', capture: 'capture this content' },
];
processor.$process(docs);
expect(docs[docs.length - 1].data).toEqual({
dictionary: [ 'fooclass', 'barinterfac', 'captur', 'content' ],
pages: [
jasmine.objectContaining({ title: 'FooClass', type: 'class', keywords: [0] }),
jasmine.objectContaining({ title: 'BarInterface', type: 'interface', keywords: [1, 2, 3] }),
],
});
});
it('should not collect keywords that look like HTML tags', () => {
const processor = createProcessor();
const docs = [
{ docType: 'class', name: 'FooClass', content: `
<table id="foo">
<tr class="moo" id="bar">
<td>Content inside a table</td>
</tr>
</table>` },
];
processor.$process(docs);
expect(docs[docs.length - 1].data).toEqual({
dictionary: ['class', 'fooclass', 'content', 'insid', 'tabl'],
pages: [
jasmine.objectContaining({keywords: [0, 1, 2, 3, 4] })
],
});
});
it('should compute `doc.searchTitle` from the doc properties if not already provided', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
const processor = createProcessor();
const docs = [
{ docType: 'class', name: 'A', searchTitle: 'searchTitle A', title: 'title A', vFile: { headings: { h1: ['vFile A'] } } },
{ docType: 'class', name: 'B', title: 'title B', vFile: { headings: { h1: ['vFile B'] } } },
@@ -48,7 +109,7 @@ describe('generateKeywords processor', () => {
{ docType: 'class', name: 'D' },
];
processor.$process(docs);
expect(docs[docs.length - 1].data).toEqual([
expect(docs[docs.length - 1].data.pages).toEqual([
jasmine.objectContaining({ title: 'searchTitle A' }),
jasmine.objectContaining({ title: 'title B' }),
jasmine.objectContaining({ title: 'vFile C' }),
@@ -57,34 +118,19 @@ describe('generateKeywords processor', () => {
});
it('should use `doc.searchTitle` as the title in the search index', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
const processor = createProcessor();
const docs = [
{ docType: 'class', name: 'PublicExport', searchTitle: 'class PublicExport' },
];
processor.$process(docs);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data).toEqual([
jasmine.objectContaining({ title: 'class PublicExport', type: 'class'})
expect(keywordsDoc.data.pages).toEqual([
jasmine.objectContaining({ title: 'class PublicExport', type: 'class' })
]);
});
it('should add title words to the search terms', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
const docs = [
{
docType: 'class',
name: 'PublicExport',
searchTitle: 'class PublicExport',
vFile: { headings: { h2: ['heading A', 'heading B'] } }
},
];
processor.$process(docs);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data[0].titleWords).toEqual('class publicexport');
});
it('should add heading words to the search terms', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
const processor = createProcessor();
const docs = [
{
docType: 'class',
@@ -95,11 +141,16 @@ describe('generateKeywords processor', () => {
];
processor.$process(docs);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data[0].headingWords).toEqual('heading important secondary');
expect(keywordsDoc.data).toEqual({
dictionary: ['class', 'publicexport', 'head', 'secondari'],
pages: [
jasmine.objectContaining({ headings: [2, 3, 2] })
]
});
});
it('should add member doc properties to the search terms', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
const processor = createProcessor();
const docs = [
{
docType: 'class',
@@ -123,13 +174,18 @@ describe('generateKeywords processor', () => {
];
processor.$process(docs);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data[0].members).toEqual(
'instancemethoda instancemethodb instancepropertya instancepropertyb staticmethoda staticmethodb staticpropertya staticpropertyb'
);
expect(keywordsDoc.data).toEqual({
dictionary: ['class', 'publicexport', 'content', 'ngclass', 'instancemethoda','instancepropertya','instancemethodb','instancepropertyb','staticmethoda','staticpropertya','staticmethodb','staticpropertyb', 'head'],
pages: [
jasmine.objectContaining({
members: [4, 5, 6, 7, 8, 9, 10, 11]
})
]
});
});
it('should add inherited member doc properties to the search terms', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
const processor = createProcessor();
const parentClass = {
docType: 'class',
name: 'ParentClass',
@@ -163,13 +219,27 @@ describe('generateKeywords processor', () => {
const docs = [childClass, parentClass, parentInterface];
processor.$process(docs);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data[0].members.split(' ').sort().join(' ')).toEqual(
'childmember1 childmember2 parentmember1 parentmember2 parentmember3'
);
expect(keywordsDoc.data).toEqual({
dictionary: ['class', 'child', 'childmember1', 'childmember2', 'parentmember1', 'parentmember2', 'parentmember3', 'parentclass', 'interfac', 'parentinterfac'],
pages: [
jasmine.objectContaining({
title: 'Child',
members: [2, 3, 4, 5, 6]
}),
jasmine.objectContaining({
title: 'ParentClass',
members: [4, 5]
}),
jasmine.objectContaining({
title: 'ParentInterface',
members: [6]
})
]
});
});
it('should process terms prefixed with "ng" to include the term stripped of "ng"', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
it('should include both stripped and unstripped "ng" prefixed tokens', () => {
const processor = createProcessor();
const docs = [
{
docType: 'class',
@@ -181,14 +251,19 @@ describe('generateKeywords processor', () => {
];
processor.$process(docs);
const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data[0].titleWords).toEqual('ngcontroller controller');
expect(keywordsDoc.data[0].headingWords).toEqual('model ngmodel');
expect(keywordsDoc.data[0].keywords).toContain('class');
expect(keywordsDoc.data[0].keywords).toContain('ngclass');
expect(keywordsDoc.data).toEqual({
dictionary: ['class', 'publicexport', 'ngcontrol', 'control', 'content', 'ngclass', 'ngmodel', 'model'],
pages: [
jasmine.objectContaining({
headings: [6, 7],
keywords: [0, 1, 2, 3, 4, 5, 0],
})
],
});
});
it('should generate renderedContent property', () => {
const processor = processorFactory(mockLogger, mockReadFilesProcessor);
it('should generate compressed encoded renderedContent property', () => {
const processor = createProcessor();
const docs = [
{
docType: 'class',
@@ -196,19 +271,33 @@ describe('generateKeywords processor', () => {
description: 'The is the documentation for the SomeClass API.',
vFile: { headings: { h1: ['SomeClass'], h2: ['Some heading'] } }
},
{
docType: 'class',
name: 'SomeClass2',
description: 'description',
members: [
{ name: 'member1' },
],
deprecated: true
},
];
processor.$process(docs);
const keywordsDoc = docs[docs.length - 1];
expect(JSON.parse(keywordsDoc.renderedContent)).toEqual(
[{
expect(JSON.parse(keywordsDoc.renderedContent)).toEqual({
dictionary: ['class', 'someclass', 'document', 'api', 'head', 'someclass2', 'descript', 'member1'],
pages: [{
'title':'SomeClass',
'type':'class',
'titleWords':'someclass',
'headingWords':'heading some someclass',
'keywords':'api class documentation for is someclass the',
'members':'',
'deprecated': false,
'headings': [1, 4],
'keywords': [0, 1, 2, 1, 3],
},
{
'title':'SomeClass2',
'type':'class',
'keywords': [0, 5, 6],
'members': [7],
'deprecated': true,
}]
);
});
});
});

View File

@@ -2005,6 +2005,11 @@
resolved "https://registry.yarnpkg.com/@types/source-list-map/-/source-list-map-0.1.2.tgz#0078836063ffaf17412349bba364087e0ac02ec9"
integrity sha512-K5K+yml8LTo9bWJI/rECfIPrGgxdpeNbj+d53lwN4QjW1MCwlkhUms+gtdzigTeUyBr09+u8BwOIY3MXvHdcsA==
"@types/stemmer@^1.0.2":
version "1.0.2"
resolved "https://registry.yarnpkg.com/@types/stemmer/-/stemmer-1.0.2.tgz#bd8354f50b3c9b87c351d169240e45cf1fa1f5e8"
integrity sha512-2gWEIFqVZjjZxo8/TcugCAl7nW9Jd9ArEDpTAc5nH7d+ZUkreHA7GzuFcLZ0sflLrA5b1PZ+2yDyHJcuP9KWWw==
"@types/unist@*", "@types/unist@^2.0.0", "@types/unist@^2.0.2":
version "2.0.3"
resolved "https://registry.yarnpkg.com/@types/unist/-/unist-2.0.3.tgz#9c088679876f374eb5983f150d4787aa6fb32d7e"
@@ -12802,6 +12807,11 @@ static-extend@^0.1.1:
resolved "https://registry.yarnpkg.com/statuses/-/statuses-1.5.0.tgz#161c7dac177659fd9811f43771fa99381478628c"
integrity sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=
stemmer@^1.0.5:
version "1.0.5"
resolved "https://registry.yarnpkg.com/stemmer/-/stemmer-1.0.5.tgz#fd89beaf8bff5d04b6643bfffcaed0fc420deec0"
integrity sha512-SLq7annzSKRDStasOJJoftCSCzBCKmBmH38jC4fDtCunAqOzpTpIm9zmaHmwNJiZ8gLe9qpVdBVbEG2DC5dE2A==
stream-browserify@^2.0.1:
version "2.0.2"
resolved "https://registry.yarnpkg.com/stream-browserify/-/stream-browserify-2.0.2.tgz#87521d38a44aa7ee91ce1cd2a47df0cb49dd660b"