Pete Bacon Darwin fccffc647b refactor(docs-infra): include more info in search index data (#41368)
The AIO search index is built in a WebWorker in the browser from a set
of page information that is downloaded as a JSON file (`search-data.json`).
We want to keep this file as small as possible while providing enough
data to generate a useful index to query against.

Previously, we only included one copy of each (non-ignored) term from each
doc, but this prevented more subtle ranking of query results, since the number
of occurrences of a term in a doc was lost.

This commit changes the generated file in the following ways:

- All non-ignored terms are now included in the order in which they appear
  in the doc.
- The terms are indexed into a dictionary to avoid the text of the term being
  repeated in every doc that contains it (see the example after this list).
- Each term is pre-"stemmed" using the same Porter Stemming algorithm that the
  Lunr search engine uses.
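
As a rough illustration of the new shape (the property names come from the
processor below; the concrete paths, terms and indices here are made up), an
entry in `search-data.json` now looks something like:

    {
      "dictionary": ["router", "navig", "guard"],
      "pages": [
        {
          "path": "guide/router",
          "title": "Router",
          "type": "guide",
          "headings": [0, 1],
          "keywords": [0, 1, 1, 2]
        }
      ]
    }

Each value in `keywords`, `headings` and `members` is an index into the shared
`dictionary` of pre-stemmed terms, and repeated indices record repeated
occurrences of the same term.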

The web worker has been updated to decode the new format of the file, as
sketched below. Now that all terms are included, this may enable some level of
phrase-based matching in the future.
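
Decoding in the worker amounts to mapping each index back to its stemmed term
before the page is added to the Lunr index; a minimal sketch (not the actual
worker code, and using made-up helper names) could look like:

    // Rebuild space-separated term strings from the dictionary-indexed arrays.
    function decodePage(page, dictionary) {
      const decode = indices => (indices || []).map(i => dictionary[i]).join(' ');
      return Object.assign({}, page, {
        headings: decode(page.headings),
        keywords: decode(page.keywords),
        members: decode(page.members),
      });
    }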

The size of the generated file is considerably larger than before, but
on production HTTP servers the data is sent compressed, which reduces the
size dramatically.

PR Close #41368
2021-04-01 12:02:37 -07:00

'use strict';
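// The `stemmer` package implements the Porter stemming algorithm, matching the
// stemming that the Lunr search engine applies at query time, so terms can be
// pre-stemmed at build time.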
const stem = require('stemmer');
/**
 * @dgProcessor generateKeywordsProcessor
 * @description
 * This processor extracts all the keywords from each document and creates
 * a new document that will be rendered as a JavaScript file containing all
 * this data.
 */
module.exports = function generateKeywordsProcessor(log) {
  return {
    ignoreWords: [],
    propertiesToIgnore: [],
    docTypesToIgnore: [],
    outputFolder: '',
    $validate: {
      ignoreWords: {},
      docTypesToIgnore: {},
      propertiesToIgnore: {},
      outputFolder: {presence: true}
    },
    $runAfter: ['postProcessHtml'],
    $runBefore: ['writing-files'],
    $process(docs) {
      const dictionary = new Map();

      // Keywords to ignore
      const ignoreWords = new Set(this.ignoreWords);
      log.debug('Words to ignore', ignoreWords);

      const propertiesToIgnore = new Set(this.propertiesToIgnore);
      log.debug('Properties to ignore', propertiesToIgnore);

      const docTypesToIgnore = new Set(this.docTypesToIgnore);
      log.debug('Doc types to ignore', docTypesToIgnore);

      const filteredDocs = docs
          // We are not interested in some docTypes
          .filter(doc => !docTypesToIgnore.has(doc.docType))
          // Ignore internals and private exports (indicated by the ɵ prefix)
          .filter(doc => !doc.internal && !doc.privateExport);

      for (const doc of filteredDocs) {
        // Search each top level property of the document for search terms
        let mainTokens = [];
        for (const key of Object.keys(doc)) {
          const value = doc[key];
          if (isString(value) && !propertiesToIgnore.has(key)) {
            mainTokens.push(...tokenize(value, ignoreWords, dictionary));
          }
        }

        const memberTokens = extractMemberTokens(doc, ignoreWords, dictionary);

        // Extract all the keywords from the headings
        let headingTokens = [];
        if (doc.vFile && doc.vFile.headings) {
          for (const headingTag of Object.keys(doc.vFile.headings)) {
            for (const headingText of doc.vFile.headings[headingTag]) {
              headingTokens.push(...tokenize(headingText, ignoreWords, dictionary));
            }
          }
        }

        // Extract the title to use in searches
        doc.searchTitle = doc.searchTitle || doc.title || doc.vFile && doc.vFile.title || doc.name || '';

        // Attach all this search data to the document
        doc.searchTerms = {};
        if (headingTokens.length > 0) {
          doc.searchTerms.headings = headingTokens;
        }
        if (mainTokens.length > 0) {
          doc.searchTerms.keywords = mainTokens;
        }
        if (memberTokens.length > 0) {
          doc.searchTerms.members = memberTokens;
        }
        if (doc.searchKeywords) {
          doc.searchTerms.topics = doc.searchKeywords.trim();
        }
      }

      // Now process all the search data and collect it up to be used in creating a new document
      const searchData = {
        dictionary: Array.from(dictionary.keys()),
        pages: filteredDocs.map(page => {
          // Copy the properties from the searchTerms object onto the search data object
          const searchObj = {
            path: page.path,
            title: page.searchTitle,
            type: page.docType,
          };
          if (page.deprecated) {
            searchObj.deprecated = true;
          }
          return Object.assign(searchObj, page.searchTerms);
        }),
      };

      docs.push({
        docType: 'json-doc',
        id: 'search-data-json',
        path: this.outputFolder + '/search-data.json',
        outputPath: this.outputFolder + '/search-data.json',
        data: searchData,
        renderedContent: JSON.stringify(searchData)
      });
    }
  };
};
function isString(value) {
  return typeof value === 'string';
}
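
// Split the given text into tokens, skip ignored and malformed tokens, and return the
// dictionary index of each remaining (stemmed) token, in the order it appears in the text.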
function tokenize(text, ignoreWords, dictionary) {
  // Split on whitespace and things that are likely to be HTML tags (this is not exhaustive but
  // reduces the unwanted tokens that are indexed).
  const rawTokens = text.split(/[\s\/]+|<\/?[a-z]+(?:\s+\w+(?:="[^"]+")?)*>/img);
  const tokens = [];
  for (let token of rawTokens) {
    token = token.trim();

    // Strip off unwanted trivial characters
    token = token
        .replace(/^[_\-"'`({[<$*)}\]>.]+/, '')
        .replace(/[_\-"'`({[<$*)}\]>.]+$/, '');

    // Skip if in the ignored words list
    if (ignoreWords.has(token.toLowerCase())) {
      continue;
    }

    // Skip tokens that contain weird characters
    if (!/^[\w._-]+$/.test(token)) {
      continue;
    }

    storeToken(token, tokens, dictionary);
    if (token.startsWith('ng')) {
      storeToken(token.substr(2), tokens, dictionary);
    }
  }
  return tokens;
}
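
// Stem the token, add it to the dictionary if it is not already there, and push its
// dictionary index onto the list of tokens.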
function storeToken(token, tokens, dictionary) {
  token = stem(token);
  if (!dictionary.has(token)) {
    dictionary.set(token, dictionary.size);
  }
  tokens.push(dictionary.get(token));
}
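
// Collect the tokens for the names of all members and static members of an API doc,
// including those inherited via its extends and implements clauses.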
function extractMemberTokens(doc, ignoreWords, dictionary) {
  if (!doc) return [];

  let memberContent = [];

  if (doc.members) {
    doc.members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
  }
  if (doc.statics) {
    doc.statics.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
  }
  if (doc.extendsClauses) {
    doc.extendsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
  }
  if (doc.implementsClauses) {
    doc.implementsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
  }

  return memberContent;
}