Add indexing of documentation for custom-search
Signed-off-by: Miki <mehranb@amazon.com>
This commit is contained in:
parent
1ae31bfccc
commit
16d851768b
|
@ -0,0 +1,118 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
require "jekyll/hooks"
|
||||
require "jekyll/document"
|
||||
require "json"
|
||||
|
||||
##
# This singleton facilitates production of an indexable JSON representation of the content to populate a data source
# to provide search functionality.
#
# Usage: call +init+ with the site, +add+ once per rendered Page/Document, then +save+ to write the index.
module Jekyll
  module ContentIndexer
    ##
    # The collection that will get stored as the output
    @data = []

    ##
    # Pattern to identify documents that should be excluded based on their URL
    @excluded_paths = /\.(css|js|json|map|xml|txt|yml)$/i.freeze

    ##
    # Pattern to identify block HTML tags (not comprehensive)
    @html_block_tags = /\s*<[?\/]?(article|blockquote|d[dlt]|div|fieldset|form|h|li|main|nav|[ou]l|p|section|table|t[rd]).*?>\s*/im.freeze

    ##
    # Pattern to identify certain HTML tags whose content should be excluded from indexing
    @html_excluded_tags = /\s*<(head|style|script|h1).*?>.*?<\/\1>/im.freeze

    ##
    # Initializes the singleton by recording the site and starting a fresh collection.
    #
    # The collection is reset here so that calling +init+ again in the same process
    # does not carry over (and duplicate) entries gathered before the previous +save+.
    def self.init(site)
      @site = site
      @data = []
    end

    ##
    # Processes a Document or Page and adds it to the collection.
    #
    # Nothing is added when the URL matches the excluded-paths pattern or when no
    # text is left after the HTML has been stripped.
    def self.add(page)
      return if @excluded_paths.match?(page.url)

      content = extract_text(page.content.to_s)
      return if content.empty?

      @data.push(
        {
          # Interpolation turns a missing (nil) "baseurl" into an empty prefix instead of raising
          url: "#{@site.config['baseurl']}#{page.url}",
          title: page.data["title"],
          content: content,
          ancestors: breadcrumb(page),
          type: "DOCS"
        }
      )
    end

    ##
    # Saves the collection as pretty-printed JSON in "search-index.json" inside the
    # directory named by the site's "destination" setting.
    def self.save
      path = File.join(@site.config["destination"], "search-index.json")
      File.write(path, "#{JSON.pretty_generate(@data)}\n")
    end

    ##
    # Reduces HTML to plain text suitable for indexing.
    def self.extract_text(html)
      html
        .gsub(@html_excluded_tags, " ")            # Strip certain HTML blocks
        .gsub(@html_block_tags, "\n")              # Strip some block HTML tags, replacing with newline
        .gsub(/\s*<[?\/!]?[a-z]+.*?>\s*/im, " ")   # Strip all remaining HTML tags
        .gsub(/\s*[\r\n]+\s*/, "\n")               # Clean line-breaks
        .gsub(/\s{2,}/, " ")                       # Trim long spaces
        .gsub(/\s+([.:;,)!\]?])/, '\1')            # Remove spaces before some punctuations
        .strip                                     # Trim leading and trailing whitespaces
    end
    private_class_method :extract_text

    ##
    # Produces the breadcrumb for a Document or Page: the configured collection name
    # (Documents only), then "grand_parent", then "parent".
    #
    # Missing or empty labels are skipped, and a label equal to an earlier one
    # (e.g. a parent that repeats the collection name) is only listed once.
    def self.breadcrumb(page)
      labels = []
      if page.instance_of?(Jekyll::Document)
        labels << @site.config.dig("just_the_docs", "collections", page.collection&.label, "name")
      end
      labels << page.data["grand_parent"] << page.data["parent"]

      labels.reject { |label| label.nil? || label.to_s.empty? }.uniq
    end
    private_class_method :breadcrumb
  end
end
|
||||
|
||||
# Hand the site to the ContentIndexer before any Document or Page is processed
Jekyll::Hooks.register(:site, :pre_render) { |site| Jekyll::ContentIndexer.init(site) }

# Index each Page as soon as its content is ready
Jekyll::Hooks.register(:pages, :post_convert) { |page| Jekyll::ContentIndexer.add(page) }

# Index each Document as soon as its content is ready
Jekyll::Hooks.register(:documents, :post_convert) { |document| Jekyll::ContentIndexer.add(document) }

# Write out the produced collection once Jekyll is done writing all its stuff
Jekyll::Hooks.register(:site, :post_write) { |_site| Jekyll::ContentIndexer.save }
|
Loading…
Reference in New Issue