# frozen_string_literal: true

# Copyright OpenSearch Contributors
# SPDX-License-Identifier: BSD-3-Clause

require 'net/http'
require 'jekyll/hooks'
require 'jekyll/document'
require 'json'
require 'set'
require 'uri'
require 'pathname'
require 'typhoeus'
require 'ruby-link-checker'
require 'ruby-enum'

##
# This singleton checks links during build to warn or fail upon finding dead links.
#
# `JEKYLL_LINK_CHECKER`, set on the environment, enables link verification.
# Valid values: internal, external, all.
# Usage: `JEKYLL_LINK_CHECKER=internal bundle exec jekyll build --trace`
#
# `JEKYLL_FATAL_LINK_CHECKER`, set on the environment, is the same as `JEKYLL_LINK_CHECKER`
# except that it fails the build if there are broken links. It takes the same valid values.
# Usage: `JEKYLL_FATAL_LINK_CHECKER=internal bundle exec jekyll build --trace`
#
# NOTE: the compact `module Jekyll::LinkChecker` form is deliberate. With it, the bare
# `LinkChecker::Typhoeus` reference in `init` resolves to the top-level constant rather
# than to this module; nesting `module Jekyll; module LinkChecker` would change that.

module Jekyll::LinkChecker
  ##
  # Accepted values for the environment flags
  class CheckTypes
    include Ruby::Enum

    define :INTERNAL, 'internal'
    define :EXTERNAL, 'external'
    define :ALL, 'all'
  end

  ##
  # Maps every collected link target (href) to the Set of files that link to it.
  # Reset by `init` on every build.
  @urls = {}

  ##
  # Pattern to identify documents that should be skipped based on their path
  @excluded_paths = %r{(\.(css|js|json|map|xml|txt|yml)$|/version-selector\.tpl$)}i.freeze

  ##
  # Pattern to extract link targets from <a> tags:
  # capture 1 is the quote character, capture 2 is the href value
  @href_matcher = /<a[^>]+href=(['"])(.+?)\1/im.freeze

  ##
  # Pattern to check for external URLs
  @external_matcher = %r{^https?://}.freeze

  ##
  # List of domains to ignore
  # NOTE(review): an older note here mentioned LinkedIn failing with 999 status codes,
  # but no LinkedIn domain is present in this list.
  @ignored_domains = [
    'localhost',
    'playground.opensearch.org', # infinite redirect, https://github.com/opensearch-project/dashboards-anywhere/issues/172
    'crates.io', # 404s on bots
    'www.cloudflare.com', # 403s on bots
    'platform.openai.com', # 403s on bots
    'example.issue.link' # a fake example link from the template
  ].freeze

  ##
  # Pattern of local paths to ignore
  @ignored_paths = %r{(^/javadocs|^mailto:)}.freeze

  ##
  # Holds the list of failures (human-readable strings). Reset by `init` on every build.
  @failures = []

  ##
  # Build flags driven by environment variables
  @check_internal_links = false # Enables checking internal links
  @check_external_links = false # Enables checking external links
  @fail_on_error = false # Indicates the need to fail the build for dead links

  ##
  # Tracks whether the Jekyll hooks have been registered, so that repeated calls to
  # `init` (one per `:site, :pre_render`) do not register them again
  @hooks_registered = false

  ##
  # Defines the priority of the plugin
  # The hooks are registered with a very low priority to make sure they run after any content modifying hook
  def self.priority
    10
  end

  ##
  # True when at least one kind of link (internal or external) is being checked
  def self.check_links?
    check_external_links? || check_internal_links?
  end

  ##
  # True when external links are being checked
  def self.check_external_links?
    !!@check_external_links
  end

  ##
  # True when internal links are being checked
  def self.check_internal_links?
    !!@check_internal_links
  end

  ##
  # True when dead links should fail the build
  def self.fail_on_error?
    !!@fail_on_error
  end

  ##
  # Initializes the singleton by recording the site, resetting the collected state,
  # reading the environment flags, creating the external link checker and
  # registering the processing hooks.
  #
  # Any StandardError raised while initializing is logged and re-raised.
  def self.init(site)
    @site = site
    @urls = {}
    @failures = []

    @fail_on_error = true if ENV.key?('JEKYLL_FATAL_LINK_CHECKER')
    flag_name = fail_on_error? ? 'JEKYLL_FATAL_LINK_CHECKER' : 'JEKYLL_LINK_CHECKER'
    check_flag = ENV[flag_name]

    unless check_flag
      return Jekyll.logger.info 'LinkChecker:', 'disabled. Enable with JEKYLL_LINK_CHECKER on the environment'
    end

    unless CheckTypes.values.include?(check_flag)
      Jekyll.logger.info "LinkChecker: [Notice] Could not initialize, Valid values for #{flag_name} are #{CheckTypes.values}"
      return
    end

    @external_link_checker = LinkChecker::Typhoeus::Hydra::Checker.new(
      logger: Jekyll.logger,
      hydra: { max_concurrency: 2 },
      retries: 3,
      user_agent: 'OpenSearch Documentation Website Link Checker/1.0'
    )

    # External failures are reported through this callback rather than through a return value
    @external_link_checker.on :failure, :error do |result|
      @failures << "#{result}, linked to in #{result.options[:location]}"
    end

    @check_external_links = [CheckTypes::EXTERNAL, CheckTypes::ALL].include?(check_flag)
    @check_internal_links = [CheckTypes::INTERNAL, CheckTypes::ALL].include?(check_flag)

    register_hooks

    if check_links?
      Jekyll.logger.info "LinkChecker: [Notice] Initialized successfully and will check #{check_flag} links"
    end
    Jekyll.logger.info 'LinkChecker: [Notice] The build will fail if a dead link is found' if fail_on_error?
  rescue StandardError
    Jekyll.logger.error 'LinkChecker: [Error] Failed to initialize Link Checker'
    raise
  end

  ##
  # Registers the page/document/site hooks exactly once.
  # `init` runs on every `:site, :pre_render`; without this guard each additional
  # run would add another copy of every hook.
  def self.register_hooks
    return if @hooks_registered

    # Process a Page as soon as its content is ready
    Jekyll::Hooks.register :pages, :post_convert, priority: priority do |page|
      process(page)
    end

    # Process a Document as soon as its content is ready
    Jekyll::Hooks.register :documents, :post_convert, priority: priority do |document|
      process(document)
    end

    # Verify gathered links after Jekyll is done writing all its stuff
    Jekyll::Hooks.register :site, :post_write, priority: priority do |site|
      verify(site)
    end

    @hooks_registered = true
  end
  private_class_method :register_hooks

  ##
  # Tells whether `content` contains a tag carrying the given anchor in one of the
  # given attributes (a regex alternation such as 'id' or 'id|name').
  # The anchor is escaped so that regex metacharacters in it are matched literally
  # instead of raising a RegexpError or matching unintended text.
  def self.anchor_defined?(content, anchor, attributes: 'id')
    content.match?(/<[a-z0-9-]+[^>]+(?:#{attributes})="#{Regexp.escape(anchor)}"/i)
  end
  private_class_method :anchor_defined?

  ##
  # Processes a Document or Page and adds the links to a collection
  # It also checks for anchors to parts of the same page/doc
  def self.process(page)
    return unless check_links?
    return if @excluded_paths.match?(page.path)

    # Absolute paths are reported relative to the working directory
    relative_path = page.path.start_with?('/') ? Pathname.new(page.path).relative_path_from(Dir.getwd) : page.path

    page.content.scan(@href_matcher).each do |(_, href)|
      next if href == '#'

      if href.start_with?('#')
        # Same-page anchor: it must exist as an id or a name in this very content
        anchor = href[1..]
        next if anchor_defined?(page.content, anchor, attributes: 'id|name')

        Jekyll.logger.info relative_path
        @failures << "##{anchor}, linked in ./#{relative_path}"
      else
        # Anything else is verified later, once the whole site has been written
        (@urls[href] ||= Set[]) << relative_path
      end
    end
  end

  ##
  # Verifies every collected link, runs the queued external checks and reports the
  # outcome. Raises (RuntimeError) when dead links are found and `fail_on_error?` is set.
  def self.verify(_site)
    return unless check_links?

    # Escaped so that characters such as "." in the site URL only match themselves
    base = Regexp.escape("#{@site.config['url']}#{@site.baseurl}")
    @base_url_matcher = %r{^#{base}(/.*)$}.freeze

    # Links are visited in random order
    @urls.to_a.shuffle.each do |url, pages|
      location = "./#{pages.to_a.join(', ./')}"
      @failures << "#{url}, linked to in #{location}" unless check(url, location)
    end

    @external_link_checker.run

    if @failures.empty?
      Jekyll.logger.info "\nLinkChecker: [Success] No broken links!\n".green
      return
    end

    msg = "Found #{@failures.size} dead link#{@failures.size > 1 ? 's' : ''}:\n#{@failures.join("\n")}"

    if fail_on_error?
      Jekyll.logger.error "\nLinkChecker: [Error] #{msg}\n".red
      raise msg
    else
      Jekyll.logger.warn "\nLinkChecker: [Warning] #{msg}\n".red
    end
  end

  ##
  # Check if URL is accessible.
  # Absolute links into this site are reduced to their path; paths under `/docs/`
  # are prefixed with the site URL, so they are handled as external links.
  # Returns true when the link kind is not being checked.
  def self.check(url, location)
    match = @base_url_matcher.match(url)
    url = match[1] unless match.nil?

    url = @site.config['url'] + url if url.start_with? '/docs/'

    if @external_matcher =~ url
      return true unless check_external_links?

      check_external(url, location)
    else
      return true unless check_internal_links?

      check_internal(url, location)
    end
  end

  ##
  # Check if an external URL is accessible.
  # Returns true for ignored domains; otherwise hands the URL to the external checker
  # and returns whatever its `check` returns (failures arrive via the `init` callback).
  def self.check_external(url, location)
    url = begin
      URI(url)
    rescue StandardError
      # Unparsable URLs are passed on as plain strings
      url
    end
    return true if url.is_a?(URI) && @ignored_domains.include?(url.host)

    @external_link_checker.check(url, { location: location })
  end

  ##
  # Check if an internal link is accessible, i.e. whether the file exists in the
  # site destination and, when the link has a fragment, whether the file defines
  # an element with that id. Generated redirect pages are followed.
  def self.check_internal(url, location)
    Jekyll.logger.info "LinkChecker: [Info] Checking #{url}".cyan
    return true if @ignored_paths =~ url

    path, hash = url.split('#')
    # `split` yields no path at all for a URL made only of "#"
    path = path.to_s

    # Paths without a file extension are directories served through their index.html
    unless path.match?(%r{\.[^/]{2,}$})
      path = "#{path}/" unless path.end_with?('/')
      path = "#{path}index.html" unless path.end_with?('index.html')
    end

    filename = File.join(@site.config['destination'], path)

    return false unless File.file?(filename)

    content = File.read(filename)
    unless content.include? '<title>Redirecting'
      return true if hash.nil? || hash.empty?

      return anchor_defined?(content, hash)
    end

    # Redirect page: follow the first link found in it
    match = content.match(@href_matcher)
    if match.nil?
      Jekyll.logger.warn "LinkChecker: [Warning] Cannot check #{url} due to an unfollowable redirect"
      return true
    end

    redirect = match[2]
    redirect = "#{redirect}##{hash}" unless hash.nil? || hash.empty?
    check(redirect, location)
  end
end

# Before any Document or Page is processed, initialize the LinkChecker
Jekyll::Hooks.register :site, :pre_render, priority: Jekyll::LinkChecker.priority do |site|
  Jekyll::LinkChecker.init(site)
end