rework the search indexing process

- reconfigure the docsearch scraper to store component and version for each res
- switch from the Docker image to a reusable GitHub Action
- add publish-docsearch-config extension to transform the Handlebars template into JSON
This commit is contained in:
Dan Allen 2022-12-21 12:57:51 -07:00 committed by Marcus Hert Da Coregio
parent 2fb91e6266
commit 409bd29abd
8 changed files with 159 additions and 47 deletions

View File

@ -1,20 +0,0 @@
{
"index_name": "security-docs",
"start_urls": [
"https://docs.spring.io/spring-security/reference/"
],
"selectors": {
"lvl0": {
"selector": "//nav[@class='crumbs']//li[@class='crumb'][last()-1]",
"type": "xpath",
"global": true,
"default_value": "Home"
},
"lvl1": ".doc h1",
"lvl2": ".doc h2",
"lvl3": ".doc h3",
"lvl4": ".doc h4",
"text": ".doc p, .doc td.content, .doc th.tableblock"
}
}

View File

@ -1,21 +0,0 @@
#!/bin/bash
###
# Docs
# config.json https://docsearch.algolia.com/docs/config-file
# Run the crawler https://docsearch.algolia.com/docs/run-your-own/#run-the-crawl-from-the-docker-image
#
# Runs the algolia/docsearch-scraper Docker image, passing it the Algolia
# credentials and the contents of the given JSON config file.
### USAGE
if [ "$#" -ne 3 ]; then
  echo -e "not enough arguments USAGE:\n\n$0 \$ALGOLIA_APPLICATION_ID \$ALGOLIA_API_KEY \$CONFIG_FILE\n\n" >&2
  exit 1
fi
# Script Parameters
APPLICATION_ID=$1
API_KEY=$2
CONFIG_FILE=$3
#### Script
# Stop before starting the container if the config cannot be read; otherwise
# docker would be launched with an empty CONFIG value.
if [ ! -r "$CONFIG_FILE" ]; then
  echo "config file not found or not readable: $CONFIG_FILE" >&2
  exit 1
fi
# `jq -r tostring` emits the whole config as one compact JSON string. The path
# is quoted so that file names containing spaces are handled correctly.
CONFIG=$(jq -r tostring "$CONFIG_FILE") || exit 1
docker run -e "APPLICATION_ID=$APPLICATION_ID" -e "API_KEY=$API_KEY" -e "CONFIG=$CONFIG" algolia/docsearch-scraper

View File

@ -0,0 +1,67 @@
{{!-- Handlebars template for the docsearch scraper config. It is rendered with `components`, `site` and `stopPages`. --}}
{
"index_name": "spring-security-docs",
"start_urls": [
{{!-- Emits one start URL object per version of every component; the separating comma is omitted only after the last version of the last component. --}}
{{#each components}}
{{#each versions}}
{
"url": "{{{@root.site.url}}}/{{#if (eq ./activeVersionSegment '')}}(?:$|index.html$|[a-z].*){{else}}{{{./activeVersionSegment}}}/{{/if}}",
"extra_attributes": {
"component": "{{#if (eq ./name 'ROOT')}}spring-security{{else}}{{{./name}}}{{/if}}",
"version": "{{{./version}}}",
{{!-- version_rank is 1 when this version is the component's `latest`, otherwise 2; customRanking below sorts on it ascending. --}}
"version_rank": {{#if (eq this ../latest)}}1{{else}}2{{/if}}
}
}{{#unless (and @last @../last)}},{{/unless}}
{{/each}}
{{/each}}
],
"sitemap_urls": [
"{{{site.url}}}/sitemap.xml"
],
"scrape_start_urls": true,
"stop_urls": [
{{!-- Every entry of `stopPages` becomes a stop URL built from the site URL and the page's published URL. --}}
{{#each stopPages}}
"{{{@root.site.url}}}{{{./pub.url}}}"{{#unless @last}},{{/unless}}
{{/each}}
],
"selectors": {
"default": {
"lvl0": {
"global": true,
"selector": ".nav-panel-explore .context .title, .nav-panel-explore .context .version"
},
"lvl1": ".doc > h1.page",
"lvl2": ".doc .sect1 > h2:first-child",
"lvl3": ".doc .sect2 > h3:first-child",
"lvl4": ".doc .sect3 > h4:first-child",
"text": ".doc p, .doc dt, .doc td.content, .doc th.tableblock"
}
},
"selectors_exclude": [
"#section-summary"
],
"min_indexed_level": 1,
"custom_settings": {
"advancedSyntax": true,
"attributesForFaceting": [
"component",
"version"
],
"attributesToRetrieve": [
"anchor",
"content",
"hierarchy",
"url",
"component",
"version"
],
"attributesToSnippet": [
"content:25"
],
"customRanking": [
"desc(weight.page_rank)",
"asc(version_rank)",
"desc(weight.level)",
"asc(weight.position)"
]
}
}

View File

@ -8,8 +8,21 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
with:
fetch-depth: 5
- name: Run Docsearch Scraper
run: $GITHUB_WORKSPACE/.github/actions/algolia-docsearch-scraper.sh "${{ secrets.ALGOLIA_APPLICATION_ID }}" "${{ secrets.ALGOLIA_WRITE_API_KEY }}" $GITHUB_WORKSPACE/.github/actions/algolia-config.json
- name: Configure Indexer
run: |
CONFIG_FILE=.github/actions/docsearch-config.json
if [ ! -f $CONFIG_FILE ]; then
curl -sL -o $CONFIG_FILE $(node -p "require('fs').readFileSync('antora-playbook.yml', 'utf8').match(/^ url: (.*)/m)[1]")/docsearch-config.json
fi
INDEX_NAME=$(node -p "JSON.parse(require('fs').readFileSync('$CONFIG_FILE')).index_name")
echo "CONFIG_FILE=${CONFIG_FILE}" >> $GITHUB_ENV
echo "INDEX_NAME_TMP=${INDEX_NAME}-${GITHUB_RUN_ID}" >> $GITHUB_ENV
- name: Run Indexer
uses: darrenjennings/algolia-docsearch-action@master
with:
algolia_application_id: ${{ secrets.ALGOLIA_APP_ID }}
algolia_api_key: ${{ secrets.ALGOLIA_API_KEY }}
file: ${{ env.CONFIG_FILE }}

View File

@ -0,0 +1,43 @@
# Antora playbook: registers the pipeline extensions and configures the site,
# content sources, AsciiDoc processing, URL handling and UI bundle.
antora:
extensions:
- '@springio/antora-extensions/partial-build-extension'
- ./lib/antora/extensions/inject-collector-config.js
- '@antora/collector-extension'
- ./lib/antora/extensions/version-fix.js
- '@antora/atlas-extension'
- '@opendevise/antora-release-line-extension'
- require: '@springio/antora-extensions/tabs-migration-extension'
# uncomment this option to save the migrated content to the worktree
#save_result: true
unwrap_example_block: always
# Renders the Handlebars template at template_path and publishes the result
# with the site as docsearch-config.json.
- id: publish-docsearch-config
require: ./lib/antora/extensions/publish-docsearch-config
template_path: ./.github/actions/docsearch-config.json.hbs
site:
title: Spring Security
# The docsearch config template prefixes its start, sitemap and stop URLs with this value.
url: https://docs.spring.io/spring-security/reference
robots: allow
git:
ensure_git_suffix: false
content:
sources:
- url: https://github.com/spring-projects/spring-security
branches: main
tags: 6.0.1
start_path: docs
asciidoc:
attributes:
page-pagination: ''
hide-uri-scheme: '@'
tabs-sync-option: '@'
extensions:
- '@asciidoctor/tabs'
- '@springio/asciidoctor-extensions'
urls:
latest_version_segment_strategy: redirect:to
latest_version_segment: ''
redirect_facility: httpd
ui:
bundle:
url: https://github.com/spring-io/antora-ui-spring/releases/download/latest/ui-bundle.zip
snapshot: true

View File

@ -10,6 +10,8 @@ antora:
# uncomment this option to save the migrated content to the worktree
#save_result: true
unwrap_example_block: always
- require: ./lib/antora/extensions/publish-docsearch-config
template_path: ./.github/actions/docsearch-config.json.hbs
site:
title: Spring Security
url: https://docs.spring.io/spring-security/reference

View File

@ -4,13 +4,13 @@ plugins {
}
antora {
version = '3.2.0-alpha.2'
version = '3.2.0-alpha.2'
options = ['--clean', '--fetch', '--stacktrace']
environment = [
'ALGOLIA_API_KEY': '82c7ead946afbac3cf98c32446154691',
'ALGOLIA_APP_ID': '244V8V9FGG',
'ALGOLIA_INDEX_NAME': 'security-docs',
]
'ALGOLIA_INDEX_NAME': 'spring-security-docs',
]
dependencies = [
'@antora/atlas-extension': '1.0.0-alpha.1',
'@antora/collector-extension': '1.0.0-alpha.2',

View File

@ -0,0 +1,28 @@
'use strict'
const fsp = require('node:fs/promises')
const ospath = require('node:path')
/**
 * An Antora extension that generates the docsearch config file from a Handlebars template and publishes it with the
 * site, where the scraper job can retrieve it.
 *
 * On every `beforePublish` event the template is read, compiled and rendered with:
 * - `components`: the components sorted by name, excluding any whose latest version has an empty `version`
 * - `site`: the `site` section of the playbook
 * - `stopPages`: publishable pages that declare the `page-archived` or `page-noindex` AsciiDoc attribute
 *
 * The rendered text is added to the site catalog at the output path `docsearch-config.json`.
 *
 * @param {Object} context - The object Antora passes to the register function.
 * @param {Object} context.config - The configuration of this extension.
 * @param {string} [context.config.templatePath='./docsearch/config.json.hbs'] - Path of the Handlebars template. It
 *   is expanded with `@antora/expand-path-helper`, using the playbook directory as the `dot` base.
 */
module.exports.register = function ({ config: { templatePath = './docsearch/config.json.hbs' } }) {
  const expandPath = this.require('@antora/expand-path-helper')
  const handlebars = this.require('handlebars').create()
  // Comparison helpers the template relies on for its conditionals.
  handlebars.registerHelper('eq', (a, b) => a === b)
  handlebars.registerHelper('and', (a, b) => a && b)
  this.on('beforePublish', async ({ playbook, contentCatalog, siteCatalog }) => {
    // Keep the expanded path in a local so the configured value is never overwritten; each run then resolves the
    // template against the playbook it was actually given.
    const resolvedTemplatePath = expandPath(templatePath, { dot: playbook.dir })
    const templateSrc = await fsp.readFile(resolvedTemplatePath, 'utf8')
    const templateBasename = ospath.basename(resolvedTemplatePath)
    const template = handlebars.compile(templateSrc, { noEscape: true, preventIndent: true, srcName: templateBasename })
    const components = contentCatalog.getComponentsSortedBy('name').filter((component) => component.latest.version)
    const stopPages = contentCatalog.getPages((page) => {
      if (!page.out) return false
      // A page without AsciiDoc metadata cannot declare either marker attribute, so skip it instead of throwing.
      const attributes = page.asciidoc?.attributes
      return attributes != null && ('page-archived' in attributes || 'page-noindex' in attributes)
    })
    const compiled = template({ components, site: playbook.site, stopPages })
    siteCatalog.addFile({ contents: Buffer.from(compiled), out: { path: 'docsearch-config.json' } })
  })
}