// angular-docs-cn/aio/tools/transforms/angular-base-package/processors/generateKeywords.js

'use strict';
/**
 * @dgProcessor generateKeywordsProcessor
 * @description
 * This processor extracts all the keywords from each document and creates
 * a new document that will be rendered as a JavaScript file containing all
 * this data.
 */
module.exports = function generateKeywordsProcessor(log) {
  return {
    ignoreWords: [],
    propertiesToIgnore: [],
    docTypesToIgnore: [],
    outputFolder: '',
    $validate: {
      ignoreWords: {},
      docTypesToIgnore: {},
      propertiesToIgnore: {},
      outputFolder: {presence: true}
    },
    $runAfter: ['postProcessHtml'],
    $runBefore: ['writing-files'],
    async $process(docs) {
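      // `stemmer` is an ESM-only module, so it has to be loaded with a dynamic
      // `import()` rather than `require()`; this is also why `$process` is async.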
      const {stemmer: stem} = await import('stemmer');

      const dictionary = new Map();

      // Keywords to ignore
      const ignoreWords = new Set(this.ignoreWords);
      log.debug('Words to ignore', ignoreWords);
      const propertiesToIgnore = new Set(this.propertiesToIgnore);
      log.debug('Properties to ignore', propertiesToIgnore);
      const docTypesToIgnore = new Set(this.docTypesToIgnore);
      log.debug('Doc types to ignore', docTypesToIgnore);

      const filteredDocs = docs
          // We are not interested in some docTypes
          .filter(doc => !docTypesToIgnore.has(doc.docType))
          // Ignore internals and private exports (indicated by the ɵ prefix)
          .filter(doc => !doc.internal && !doc.privateExport);

      for (const doc of filteredDocs) {
        // Search each top-level property of the document for search terms
        let mainTokens = [];
        for (const key of Object.keys(doc)) {
          const value = doc[key];
          if (isString(value) && !propertiesToIgnore.has(key)) {
            mainTokens.push(...tokenize(value, ignoreWords, dictionary));
          }
        }

        const memberTokens = extractMemberTokens(doc, ignoreWords, dictionary);

        // Extract all the keywords from the headings
        let headingTokens = [];
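        // `doc.vFile.headings`, when present, maps heading tag names to the list of
        // heading texts under that tag, e.g. `{h2: ['Overview', 'API']}` (an
        // illustrative shape, inferred from the iteration below).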
        if (doc.vFile && doc.vFile.headings) {
          for (const headingTag of Object.keys(doc.vFile.headings)) {
            for (const headingText of doc.vFile.headings[headingTag]) {
              headingTokens.push(...tokenize(headingText, ignoreWords, dictionary));
            }
          }
        }

        // Extract the title to use in searches
        doc.searchTitle = doc.searchTitle || doc.title || (doc.vFile && doc.vFile.title) || doc.name || '';

        // Attach all this search data to the document
        doc.searchTerms = {};
        if (headingTokens.length > 0) {
          doc.searchTerms.headings = headingTokens;
        }
        if (mainTokens.length > 0) {
          doc.searchTerms.keywords = mainTokens;
        }
        if (memberTokens.length > 0) {
          doc.searchTerms.members = memberTokens;
        }
        if (doc.searchKeywords) {
          doc.searchTerms.topics = doc.searchKeywords.trim();
        }
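
        // At this point `doc.searchTerms` is a sparse object of token-index arrays,
        // e.g. `{headings: [0, 4], keywords: [1, 2, 3], topics: 'forms'}`
        // (an illustrative value, not real data).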
      }

      // Now process all the search data and collect it up to be used in creating a new document
      const searchData = {
        dictionary: Array.from(dictionary.keys()).join(' '),
        pages: filteredDocs.map(page => {
          // Copy the properties from the searchTerms object onto the search data object
          const searchObj = {
            path: page.path,
            title: page.searchTitle,
            type: page.docType,
          };
          if (page.deprecated) {
            searchObj.deprecated = true;
          }
          return Object.assign(searchObj, page.searchTerms);
        }),
      };
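
      // Emit the collected data as a JSON document. `dictionary` is a single
      // space-separated string of stemmed words; the numeric tokens in each page's
      // `headings`/`keywords`/`members` arrays are indices into that string.
      // (Presumably this compact payload is what the site's search index fetches.)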
      docs.push({
        docType: 'json-doc',
        id: 'search-data-json',
        path: this.outputFolder + '/search-data.json',
        outputPath: this.outputFolder + '/search-data.json',
        data: searchData,
        renderedContent: JSON.stringify(searchData)
      });

      return docs;

      // Helpers
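
      // Split the given text into stemmed tokens, registering each one in the shared
      // dictionary and returning its dictionary index. For example (illustrative),
      // tokenizing '<code>ngModel</code> binding' yields indices for the stems of
      // 'ngModel', 'Model' (the `ng`-stripped variant) and 'binding'.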
      function tokenize(text, ignoreWords, dictionary) {
        // Split on whitespace and things that are likely to be HTML tags (this is
        // not exhaustive but reduces the unwanted tokens that are indexed).
        const rawTokens = text.split(new RegExp(
            '[\\s/]+' +                                // whitespace
            '|' +                                      // or
            '</?[a-z]+(?:\\s+\\w+(?:="[^"]+")?)*/?>',  // simple HTML tags (e.g. <td>, <hr/>, </table>, etc.)
            'ig'));
        const tokens = [];
        for (let token of rawTokens) {
          token = token.trim();

          // Trim unwanted trivia characters from the start and end of the token
          const TRIVIA_CHARS = '[\\s_"\'`({[<$*)}\\]>.,-]';
          // Tokens can contain letters, numbers, underscore, dot or hyphen, but not at
          // the start or end. The leading TRIVIA_CHARS will capture any leading `.`, `-`
          // or `_`, so we don't have to exclude them in this regular expression; we just
          // need to ensure that such characters are not captured at the end of the token.
          const POSSIBLE_TOKEN = '[a-z0-9_.-]*[a-z0-9]';
          token = token.replace(new RegExp(`^${TRIVIA_CHARS}*(${POSSIBLE_TOKEN})${TRIVIA_CHARS}*$`, 'i'), '$1');

          // Skip if blank or in the ignored words list
          if (token === '' || ignoreWords.has(token.toLowerCase())) {
            continue;
          }

          // Skip tokens that contain weird characters
          if (!/^\w[\w.-]*$/.test(token)) {
            continue;
          }

          storeToken(token, tokens, dictionary);
          if (token.startsWith('ng')) {
            // Also index the token with its `ng`, `ng-`, `ng1`, `ng2`, etc prefix
            // stripped, so that e.g. `ngModel` can also be found by searching `model`.
            const stripped = token.replace(/^ng[-12]*/, '');
            // Guard against tokens that consist only of the prefix (e.g. `ng` itself),
            // which would otherwise store an empty token in the dictionary.
            if (stripped !== '') {
              storeToken(stripped, tokens, dictionary);
            }
          }
        }
        return tokens;
      }
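
      // Stem the token and record it in the shared dictionary (if not already
      // present), then push its dictionary index onto `tokens`. Storing indices
      // rather than strings keeps the generated JSON small.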
      function storeToken(token, tokens, dictionary) {
        token = stem(token);
        if (!dictionary.has(token)) {
          dictionary.set(token, dictionary.size);
        }
        tokens.push(dictionary.get(token));
      }
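
      // Collect tokens from the names of the doc's members and static members, and
      // recurse into the docs referenced by its `extends`/`implements` clauses, so
      // that inherited members are searchable on the extending doc's page too.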
      function extractMemberTokens(doc, ignoreWords, dictionary) {
        if (!doc) return [];

        let memberContent = [];

        if (doc.members) {
          doc.members.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
        }
        if (doc.statics) {
          doc.statics.forEach(member => memberContent.push(...tokenize(member.name, ignoreWords, dictionary)));
        }
        if (doc.extendsClauses) {
          doc.extendsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
        }
        if (doc.implementsClauses) {
          doc.implementsClauses.forEach(clause => memberContent.push(...extractMemberTokens(clause.doc, ignoreWords, dictionary)));
        }

        return memberContent;
      }
    }
  };
};

function isString(value) {
  return typeof value === 'string';
}
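
// Example wiring (a minimal sketch; the package name, dependency list and config
// values below are hypothetical, not taken from this repo):
//
//   const Package = require('dgeni').Package;
//
//   module.exports = new Package('my-docs-package', [/* base packages */])
//     .processor(require('./generateKeywords'))
//     .config(function(generateKeywordsProcessor) {
//       generateKeywordsProcessor.ignoreWords = ['a', 'an', 'the'];
//       generateKeywordsProcessor.docTypesToIgnore = ['example-region'];
//       generateKeywordsProcessor.outputFolder = 'app';
//     });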