2021-08-10 14:54:40 -04:00
# frozen_string_literal: true
2023-04-04 15:49:27 -04:00
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: BSD-3-Clause
require 'net/http'
require 'jekyll/hooks'
require 'jekyll/document'
require 'json'
require 'set'
require 'uri'
require 'pathname'
require 'typhoeus'
require 'ruby-link-checker'
2023-08-09 12:37:10 -04:00
require 'ruby-enum'
2021-08-10 14:54:40 -04:00
##
# This singleton checks links during build to warn or fail upon finding dead links.
#
# `JEKYLL_LINK_CHECKER`, set on the environment, enables link verification during the build.
# Valid values: internal, external, all.
# Usage: `JEKYLL_LINK_CHECKER=internal bundle exec jekyll build --trace`
#
# `JEKYLL_FATAL_LINK_CHECKER`, set on the environment, is the same as `JEKYLL_LINK_CHECKER`
# except that it fails the build if there are broken links. It takes the same valid values.
# Usage: `JEKYLL_FATAL_LINK_CHECKER=internal bundle exec jekyll build --trace`
module Jekyll::LinkChecker
2023-08-09 12:37:10 -04:00
# Enumeration of the link-check modes that `init` accepts from the
# `JEKYLL_LINK_CHECKER` / `JEKYLL_FATAL_LINK_CHECKER` environment variables.
class CheckTypes
  include Ruby::Enum

  define :INTERNAL, 'internal'
  define :EXTERNAL, 'external'
  define :ALL, 'all'
end
2023-04-04 15:49:27 -04:00
##
# Collected links: maps each href to the Set of relative paths of the pages
# that link to it. Filled by `process`, read by `verify`, reset by `init`.
@urls = {}

##
# Pattern to identify documents that should be excluded based on their path
@excluded_paths = %r{(\.(css|js|json|map|xml|txt|yml)$|/version-selector\.tpl$)}i.freeze

##
# Pattern to extract links from `<a>` tags: group 1 is the quote character
# around the `href` value, group 2 is the `href` value itself
@href_matcher = /<a[^>]+href=(['"])(.+?)\1/im.freeze

##
# Pattern to check for external URLs
@external_matcher = %r{^https?://}.freeze

##
# List of domains to ignore
# playground.opensearch.org is causing an infinite redirect
# LinkedIn mostly fails with 999 status codes
@ignored_domains = [
  'localhost',
  'playground.opensearch.org', # infinite redirect, https://github.com/opensearch-project/dashboards-anywhere/issues/172
  'crates.io', # 404s on bots
  'www.cloudflare.com', # 403s on bots
  'openai.com', # 403s on bots
  'example.issue.link' # a fake example link from the template
].freeze

##
# Pattern of local paths to ignore
@ignored_paths = %r{(^/javadocs|^mailto:)}.freeze

##
# Holds the list of failures (one human-readable line per dead link)
@failures = []

##
# Build flags driven by environment variables; all off until `init` enables them
@check_internal_links = false # Enables checking internal links
@check_external_links = false # Enables checking external links
@fail_on_error = false # Indicates the need to fail the build for dead links
2023-04-04 15:49:27 -04:00
##
# Priority used for every Jekyll hook this plugin registers.
# It is deliberately very low so that the hooks run after any
# content-modifying hook.
#
# Returns the Integer priority.
def self.priority
  10
end
2021-08-10 14:54:40 -04:00
2023-08-09 12:37:10 -04:00
# Returns true when at least one kind of link checking (external or internal)
# is enabled, false otherwise.
def self.check_links?
  return true if check_external_links?

  check_internal_links?
end
# Returns true when external links are to be checked; always a strict Boolean,
# whatever value the underlying flag holds.
def self.check_external_links?
  @check_external_links ? true : false
end
# Returns true when internal links are to be checked; always a strict Boolean,
# whatever value the underlying flag holds.
def self.check_internal_links?
  @check_internal_links ? true : false
end
# Returns true when dead links must fail the build; always a strict Boolean,
# whatever value the underlying flag holds.
def self.fail_on_error?
  @fail_on_error ? true : false
end
2023-04-04 15:49:27 -04:00
##
# Initializes the singleton for a build of +site+.
#
# Records the site, resets the collected URLs and failures, then reads the
# check mode from the environment: `JEKYLL_FATAL_LINK_CHECKER` wins when it is
# set (and makes dead links fatal), otherwise `JEKYLL_LINK_CHECKER` is used.
# When the mode is missing or not one of `CheckTypes.values`, a notice is
# logged and nothing else happens. Otherwise the external link checker is
# created, the internal/external flags are set and the Jekyll hooks that
# collect and verify links are registered.
#
# Any StandardError raised during set-up is logged and re-raised.
def self.init(site)
  @site = site
  @urls = {}
  @failures = []

  @fail_on_error = true if ENV.key?('JEKYLL_FATAL_LINK_CHECKER')
  env_name = fail_on_error? ? 'JEKYLL_FATAL_LINK_CHECKER' : 'JEKYLL_LINK_CHECKER'
  check_flag = ENV[env_name]

  unless check_flag
    Jekyll.logger.info 'LinkChecker:', 'disabled. Enable with JEKYLL_LINK_CHECKER on the environment'
    return
  end

  unless CheckTypes.values.include?(check_flag)
    Jekyll.logger.info "LinkChecker: [Notice] Could not initialize, Valid values for #{env_name} are #{CheckTypes.values}"
    return
  end

  @external_link_checker = LinkChecker::Typhoeus::Hydra::Checker.new(
    logger: Jekyll.logger,
    hydra: { max_concurrency: 2 },
    retries: 3,
    user_agent: 'OpenSearch Documentation Website Link Checker/1.0'
  )
  # External results arrive asynchronously; record the dead ones as they come in
  @external_link_checker.on :failure, :error do |result|
    @failures << "#{result}, linked to in #{result.options[:location]}"
  end

  @check_external_links = [CheckTypes::EXTERNAL, CheckTypes::ALL].include?(check_flag)
  @check_internal_links = [CheckTypes::INTERNAL, CheckTypes::ALL].include?(check_flag)

  register_hooks

  if check_links?
    Jekyll.logger.info "LinkChecker: [Notice] Initialized successfully and will check #{check_flag} links"
  end
  Jekyll.logger.info 'LinkChecker: [Notice] The build will fail if a dead link is found' if fail_on_error?
rescue StandardError
  Jekyll.logger.error 'LinkChecker: [Error] Failed to initialize Link Checker'
  raise
end

##
# Registers the hooks that feed `process` and `verify`.
# `init` runs once per build, so the registration is guarded: registering
# again on a later build in the same process would make every page be
# processed, and every link be verified, once per previous build.
def self.register_hooks
  return if @hooks_registered

  # Process a Page as soon as its content is ready
  Jekyll::Hooks.register :pages, :post_convert, priority: priority do |page|
    process(page)
  end

  # Process a Document as soon as its content is ready
  Jekyll::Hooks.register :documents, :post_convert, priority: priority do |document|
    process(document)
  end

  # Verify gathered links after Jekyll is done writing all its stuff
  Jekyll::Hooks.register :site, :post_write, priority: priority do |site|
    verify(site)
  end

  @hooks_registered = true
end
2023-04-04 15:49:27 -04:00
##
# Processes a Document or Page and adds the links to a collection.
#
# Every `href` found by `@href_matcher` in `page.content` is handled as follows:
# * `#` alone is skipped;
# * `#anchor` is checked right away against the `id`/`name` attributes of the
#   same content, and recorded in `@failures` when no element carries it;
# * anything else is stored in `@urls` together with the page's relative path,
#   to be verified later by `verify`.
#
# Does nothing when link checking is disabled or when `page.path` matches
# `@excluded_paths`.
def self.process(page)
  return unless check_links?
  return if @excluded_paths.match?(page.path)

  # Absolute paths are reported relative to the working directory
  relative_path = page.path.start_with?('/') ? Pathname.new(page.path).relative_path_from(Dir.getwd) : page.path

  page.content.scan(@href_matcher).each do |(_quote, href)|
    next if href == '#'

    unless href.start_with?('#')
      (@urls[href] ||= Set[]) << relative_path
      next
    end

    # The anchor is matched literally: escaping keeps characters such as
    # `.`, `+` or `(` from being read as regex syntax
    anchor_pattern = /<[a-z0-9-]+[^>]+(?:id|name)="#{Regexp.escape(href[1..])}"/i
    next if page.content.match?(anchor_pattern)

    Jekyll.logger.info relative_path
    @failures << "#{href}, linked in ./#{relative_path}"
  end
end
2023-04-04 15:49:27 -04:00
##
# Verifies every link gathered by `process` and reports the outcome.
#
# Builds `@base_url_matcher` (used by `check` to turn absolute links to this
# site back into local paths), runs `check` on each collected URL in random
# order, then runs the external link checker. Every URL for which `check`
# returns a falsy value is added to `@failures`.
#
# When failures were recorded they are logged as an error and raised as a
# RuntimeError if `fail_on_error?`, or logged as a warning otherwise; with no
# failures a success message is logged. Does nothing when link checking is
# disabled.
def self.verify(_site)
  return unless check_links?

  # The site address is matched literally, so it is escaped before being
  # embedded; interpolating first also copes with an unset url or baseurl
  site_root = Regexp.escape("#{@site.config['url']}#{@site.baseurl}")
  @base_url_matcher = %r{^#{site_root}(/.*)$}.freeze

  @urls.to_a.shuffle.each do |url, pages|
    location = "./#{pages.to_a.join(', ./')}"
    @failures << "#{url}, linked to in #{location}" unless check(url, location)
  end

  @external_link_checker.run

  if @failures.empty?
    Jekyll.logger.info "\nLinkChecker: [Success] No broken links!\n".green
    return
  end

  msg = "Found #{@failures.size} dead link#{@failures.size > 1 ? 's' : ''}:\n#{@failures.join("\n")}"
  unless fail_on_error?
    Jekyll.logger.warn "\nLinkChecker: [Warning] #{msg}\n".red
    return
  end

  Jekyll.logger.error "\nLinkChecker: [Error] #{msg}\n".red
  raise msg
end
2023-04-04 15:49:27 -04:00
##
# Check if URL is accessible.
#
# Absolute links into this site (matched by `@base_url_matcher`) are reduced
# to their local path first. Paths starting with `/docs/` are prefixed with
# the configured site URL, which makes them external links. The URL is then
# handed to `check_external` or `check_internal` depending on whether it
# matches `@external_matcher`; when that kind of check is disabled the link is
# reported as fine.
#
# Returns true when the link is fine or not checked, otherwise whatever the
# specific check returns.
def self.check(url, location)
  local = @base_url_matcher.match(url)
  url = local[1] if local

  # Interpolation keeps the path untouched when no site URL is configured,
  # in which case the link is simply checked as an internal one
  url = "#{@site.config['url']}#{url}" if url.start_with?('/docs/')

  if @external_matcher.match?(url)
    check_external_links? ? check_external(url, location) : true
  else
    check_internal_links? ? check_internal(url, location) : true
  end
end
2023-04-04 15:49:27 -04:00
##
# Check if an external URL is accessible.
#
# The URL is parsed into a URI when possible. Links whose host is listed in
# `@ignored_domains` are reported as fine without being requested; every
# other link, including one that cannot be parsed, is handed to the external
# link checker together with its +location+.
#
# Returns true for an ignored domain, otherwise whatever the external link
# checker's `check` returns.
def self.check_external(url, location)
  target = begin
    URI(url)
  rescue StandardError
    url # unparseable: let the link checker deal with the raw string
  end

  # Host names are case-insensitive, the ignore list is written in lower case
  return true if target.is_a?(URI) && @ignored_domains.include?(target.host&.downcase)

  @external_link_checker.check(target, { location: location })
end
2023-04-04 15:49:27 -04:00
##
# Check if an internal link is accessible.
#
# The link is resolved to a file below the site's `destination` directory; a
# path without a file extension is treated as a directory and resolved to its
# `index.html`. The link is fine when that file exists and, if the link has a
# fragment, an element of the file carries it as `id` or `name`.
#
# A file containing `<title>Redirecting` is treated as a redirect page: the
# first link found in it (plus the original fragment) is passed to `check`.
# When no link can be found a warning is logged and the link is accepted.
#
# Returns true/false, or the result of `check` for a redirect. Links matching
# `@ignored_paths` are always true.
def self.check_internal(url, location)
  Jekyll.logger.info "LinkChecker: [Info] Checking #{url}".cyan
  return true if @ignored_paths.match?(url)

  # Split on the first `#` only, so nothing after it is silently dropped
  path, hash = url.split('#', 2)
  path = path.to_s
  unless path.match?(%r{\.[^/]{2,}$})
    path = "#{path}/" unless path.end_with?('/')
    path = "#{path}index.html"
  end

  filename = File.join(@site.config['destination'], path)
  return false unless File.file?(filename)

  content = File.read(filename)
  fragment = hash.nil? || hash.empty? ? nil : hash

  unless content.include?('<title>Redirecting')
    return true unless fragment

    # Matched literally (escaped) and against `id` or `name`, the same way
    # same-page anchors are matched in `process`
    return content.match?(/<[a-z0-9-]+[^>]+(?:id|name)="#{Regexp.escape(fragment)}"/i)
  end

  target = content.match(@href_matcher)
  if target.nil?
    Jekyll.logger.warn "LinkChecker: [Warning] Cannot check #{url} due to an unfollowable redirect"
    return true
  end

  redirect = fragment ? "#{target[2]}##{fragment}" : target[2]
  check(redirect, location)
end
end
# Before any Document or Page is processed, initialize the LinkChecker.
# Registered when this file is loaded; `init` then runs at the start of each
# site render with the site being built.
Jekyll::Hooks.register :site, :pre_render, priority: Jekyll::LinkChecker.priority do |site|
  Jekyll::LinkChecker.init(site)
end