From 52313c51acb0486312c78c3d104917064a2c1637 Mon Sep 17 00:00:00 2001 From: Victoria Lim Date: Mon, 8 Jan 2024 15:19:05 -0800 Subject: [PATCH] docs: Anchor link checker (#15624) Co-authored-by: 317brian <53799971+317brian@users.noreply.github.com> --- .github/workflows/static-checks.yml | 1 + docs/_bin/broken-link-check.py | 101 ---------------------- docs/data-management/delete.md | 4 +- docs/data-management/index.md | 2 +- docs/data-management/schema-changes.md | 2 +- docs/data-management/update.md | 2 +- docs/development/docs-contribute.md | 10 ++- docs/development/experimental-features.md | 2 +- docs/ingestion/index.md | 2 +- docs/ingestion/ingestion-spec.md | 8 +- docs/multi-stage-query/concepts.md | 2 +- docs/operations/basic-cluster-tuning.md | 2 +- docs/operations/web-console.md | 2 +- docs/querying/sql-data-types.md | 2 +- docs/querying/sql-metadata-tables.md | 6 +- docs/release-info/upgrade-notes.md | 2 +- website/package.json | 2 +- website/script/link-lint.js | 97 +++++++++++++++++++++ 18 files changed, 126 insertions(+), 123 deletions(-) delete mode 100755 docs/_bin/broken-link-check.py create mode 100644 website/script/link-lint.js diff --git a/.github/workflows/static-checks.yml b/.github/workflows/static-checks.yml index 28535b334b0..838291d4e09 100644 --- a/.github/workflows/static-checks.yml +++ b/.github/workflows/static-checks.yml @@ -168,6 +168,7 @@ jobs: (cd website && npm install) cd website npm run build + npm run link-lint npm run spellcheck - name: web console diff --git a/docs/_bin/broken-link-check.py b/docs/_bin/broken-link-check.py deleted file mode 100755 index 0d8e9b98748..00000000000 --- a/docs/_bin/broken-link-check.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import re -import sys - -# -# Checks for broken redirects (in _redirects.json) and links from markdown files to -# nonexistent pages. Does _not_ check for links to anchors that don't exist. -# - -# Targets to these 'well known' pages are OK. -WELL_KNOWN_PAGES = ["/libraries.html", "/downloads.html", "/community/", "/thanks.html"] - -def normalize_link(source, target): - dirname = os.path.dirname(source) - normalized = os.path.normpath(os.path.join(dirname, target)) - return normalized - -def verify_redirects(docs_directory, redirect_json): - ok = True - - with open(redirect_json, 'r') as f: - redirects = json.loads(f.read()) - - for redirect in redirects: - if redirect["target"] in WELL_KNOWN_PAGES: - continue - - # Replace .html and named anchors with .md, and check the file on the filesystem. - target = re.sub(r'\.html(#.*)?$', '.md', normalize_link(redirect["source"], redirect["target"])) - if not os.path.exists(os.path.join(docs_directory, target)): - sys.stderr.write('Redirect [' + redirect["source"] + '] target does not exist: ' + redirect["target"] + "\n") - ok = False - - return ok - -def verify_markdown(docs_directory): - ok = True - - # Get list of markdown files. - markdowns = [] - for root, dirs, files in os.walk(docs_directory): - for name in files: - if name.endswith('.md'): - markdowns.append(os.path.join(root, name)) - - for markdown_file in markdowns: - with open(markdown_file, 'r') as f: - content = f.read() - - for m in re.finditer(r'\[([^\[]*?)\]\((.*?)(?: \"[^\"]+\")?\)', content): - target = m.group(2) - - if target in WELL_KNOWN_PAGES: - continue - - if markdown_file.endswith("/druid-kerberos.md") and target in ['regexp', 'druid@EXAMPLE.COM']: - # Hack to support the fact that rule examples in druid-kerberos docs look sort of like markdown links. - continue - - target = re.sub(r'^/docs/VERSION/', '', target) - target = re.sub(r'#.*$', '', target) - target = re.sub(r'\.html$', '.md', target) - target = re.sub(r'/$', '/index.md', target) - if target and not (target.startswith('http://') or target.startswith('https://')): - target_normalized = normalize_link(markdown_file, target) - - if not os.path.exists(target_normalized): - sys.stderr.write('Page [' + markdown_file + '] target does not exist: ' + m.group(2) + "\n") - ok = False - - return ok - -def main(): - if len(sys.argv) != 3: - sys.stderr.write('usage: program \n') - sys.exit(1) - - ok = verify_redirects(sys.argv[1], sys.argv[2]) - ok = verify_markdown(sys.argv[1]) and ok - if not ok: - sys.exit(1) - -main() diff --git a/docs/data-management/delete.md b/docs/data-management/delete.md index 9e59c751bc2..6acd2fc782b 100644 --- a/docs/data-management/delete.md +++ b/docs/data-management/delete.md @@ -24,12 +24,12 @@ title: "Data deletion" ## By time range, manually -Apache Druid stores data [partitioned by time chunk](../design/architecture.md#datasources-and-segments) and supports +Apache Druid stores data [partitioned by time chunk](../design/storage.md) and supports deleting data for time chunks by dropping segments. This is a fast, metadata-only operation. Deletion by time range happens in two steps: -1. Segments to be deleted must first be marked as ["unused"](../design/architecture.md#segment-lifecycle). This can +1. Segments to be deleted must first be marked as ["unused"](../design/storage.md#segment-lifecycle). This can happen when a segment is dropped by a [drop rule](../operations/rule-configuration.md) or when you manually mark a segment unused through the Coordinator API or web console. This is a soft delete: the data is not available for querying, but the segment files remains in deep storage, and the segment records remains in the metadata store. diff --git a/docs/data-management/index.md b/docs/data-management/index.md index 410cb0c3fde..0e0e09ac890 100644 --- a/docs/data-management/index.md +++ b/docs/data-management/index.md @@ -23,7 +23,7 @@ sidebar_label: "Overview" ~ under the License. --> -Apache Druid stores data [partitioned by time chunk](../design/architecture.md#datasources-and-segments) in immutable +Apache Druid stores data [partitioned by time chunk](../design/storage.md) in immutable files called [segments](../design/segments.md). Data management operations involving replacing, or deleting, these segments include: diff --git a/docs/data-management/schema-changes.md b/docs/data-management/schema-changes.md index 2dc535e3bb1..0771da3ce2c 100644 --- a/docs/data-management/schema-changes.md +++ b/docs/data-management/schema-changes.md @@ -28,7 +28,7 @@ title: "Schema changes" Apache Druid allows you to provide a new schema for new data without the need to update the schema of any existing data. It is sufficient to update your supervisor spec, if using [streaming ingestion](../ingestion/index.md#streaming), or to provide the new schema the next time you do a [batch ingestion](../ingestion/index.md#batch). This is made possible by -the fact that each [segment](../design/architecture.md#datasources-and-segments), at the time it is created, stores a +the fact that each [segment](../design/segments.md), at the time it is created, stores a copy of its own schema. Druid reconciles all of these individual segment schemas automatically at query time. ## For existing data diff --git a/docs/data-management/update.md b/docs/data-management/update.md index cb36a9d6da4..a8c75a5d349 100644 --- a/docs/data-management/update.md +++ b/docs/data-management/update.md @@ -24,7 +24,7 @@ title: "Data updates" ## Overwrite -Apache Druid stores data [partitioned by time chunk](../design/architecture.md#datasources-and-segments) and supports +Apache Druid stores data [partitioned by time chunk](../design/storage.md) and supports overwriting existing data using time ranges. Data outside the replacement time range is not touched. Overwriting of existing data is done using the same mechanisms as [batch ingestion](../ingestion/index.md#batch). diff --git a/docs/development/docs-contribute.md b/docs/development/docs-contribute.md index 2c4bfab494f..d50b86b4478 100644 --- a/docs/development/docs-contribute.md +++ b/docs/development/docs-contribute.md @@ -101,7 +101,8 @@ Now you're up to date, and you can make your changes. git checkout -b MY-BRANCH ``` -Provide a name for your feature branch in `MY-BRANCH`. + Provide a name for your feature branch in `MY-BRANCH`. + 2. Find the file that you want to make changes to. All the source files for the docs are written in Markdown and located in the `docs` directory. The URL for the page includes the subdirectory the source file is in. For example, the SQL-based ingestion tutorial found at `https://druid.apache.org/docs/latest/tutorials/tutorial-msq-extern.html` is in the `tutorials` subdirectory. If you're adding a page, create a new Markdown file in the appropriate subdirectory. Then, copy the front matter and Apache license from an existing file. Update the `title` and `id` fields. Don't forget to add it to `website/sidebars.json` so that your new page shows up in the navigation. @@ -111,6 +112,11 @@ Provide a name for your feature branch in `MY-BRANCH`. 5. Use the following commands to run the link and spellcheckers locally: ```bash + cd website + # You only need to install once + npm install + npm run build + npm run spellcheck npm run link-lint ``` @@ -216,4 +222,4 @@ Before publishing new content or updating an existing topic, you can audit your * When American spelling is different from Commonwealth/"British" spelling, use the American spelling. * Don’t use terms considered disrespectful. Refer to a list like Google’s [Word list](https://developers.google.com/style/word-list) for guidance and alternatives. * Use straight quotation marks and straight apostrophes instead of the curly versions. -* Introduce a list, a table, or a procedure with an introductory sentence that prepares the reader for what they're about to read. \ No newline at end of file +* Introduce a list, a table, or a procedure with an introductory sentence that prepares the reader for what they're about to read. diff --git a/docs/development/experimental-features.md b/docs/development/experimental-features.md index 9e5252e9fe7..302db71f7ec 100644 --- a/docs/development/experimental-features.md +++ b/docs/development/experimental-features.md @@ -47,7 +47,7 @@ Note that this document does not track the status of contrib extensions, all of - [Configuration reference](../configuration/index.md#overlord-operations) - [Task reference](../ingestion/tasks.md#locking) -- [Design](../design/architecture.md#availability-and-consistency) +- [Design](../design/storage.md#availability-and-consistency) ## Front coding diff --git a/docs/ingestion/index.md b/docs/ingestion/index.md index fe3e6e4ec5b..26c71d1ec0b 100644 --- a/docs/ingestion/index.md +++ b/docs/ingestion/index.md @@ -24,7 +24,7 @@ sidebar_label: Overview --> Loading data in Druid is called _ingestion_ or _indexing_. When you ingest data into Druid, Druid reads the data from -your source system and stores it in data files called [_segments_](../design/architecture.md#datasources-and-segments). +your source system and stores it in data files called [_segments_](../design/segments.md). In general, segment files contain a few million rows each. For most ingestion methods, the Druid [MiddleManager](../design/middlemanager.md) processes or the diff --git a/docs/ingestion/ingestion-spec.md b/docs/ingestion/ingestion-spec.md index 017b4f38bec..4aec1de80c5 100644 --- a/docs/ingestion/ingestion-spec.md +++ b/docs/ingestion/ingestion-spec.md @@ -149,7 +149,7 @@ An example `dataSchema` is: ### `dataSource` The `dataSource` is located in `dataSchema` → `dataSource` and is simply the name of the -[datasource](../design/architecture.md#datasources-and-segments) that data will be written to. An example +[datasource](../design/storage.md) that data will be written to. An example `dataSource` is: ``` @@ -304,7 +304,7 @@ An example `metricsSpec` is: The `granularitySpec` is located in `dataSchema` → `granularitySpec` and is responsible for configuring the following operations: -1. Partitioning a datasource into [time chunks](../design/architecture.md#datasources-and-segments) (via `segmentGranularity`). +1. Partitioning a datasource into [time chunks](../design/storage.md) (via `segmentGranularity`). 2. Truncating the timestamp, if desired (via `queryGranularity`). 3. Specifying which time chunks of segments should be created, for batch ingestion (via `intervals`). 4. Specifying whether ingestion-time [rollup](./rollup.md) should be used or not (via `rollup`). @@ -329,7 +329,7 @@ A `granularitySpec` can have the following components: | Field | Description | Default | |-------|-------------|---------| | type |`uniform`| `uniform` | -| segmentGranularity | [Time chunking](../design/architecture.md#datasources-and-segments) granularity for this datasource. Multiple segments can be created per time chunk. For example, when set to `day`, the events of the same day fall into the same time chunk which can be optionally further partitioned into multiple segments based on other configurations and input size. Any [granularity](../querying/granularities.md) can be provided here. Note that all segments in the same time chunk should have the same segment granularity.

Avoid `WEEK` granularity for data partitioning because weeks don't align neatly with months and years, making it difficult to change partitioning by coarser granularity. Instead, opt for other partitioning options such as `DAY` or `MONTH`, which offer more flexibility.| `day` | +| segmentGranularity | [Time chunking](../design/storage.md) granularity for this datasource. Multiple segments can be created per time chunk. For example, when set to `day`, the events of the same day fall into the same time chunk which can be optionally further partitioned into multiple segments based on other configurations and input size. Any [granularity](../querying/granularities.md) can be provided here. Note that all segments in the same time chunk should have the same segment granularity.

Avoid `WEEK` granularity for data partitioning because weeks don't align neatly with months and years, making it difficult to change partitioning by coarser granularity. Instead, opt for other partitioning options such as `DAY` or `MONTH`, which offer more flexibility.| `day` | | queryGranularity | The resolution of timestamp storage within each segment. This must be equal to, or finer, than `segmentGranularity`. This will be the finest granularity that you can query at and still receive sensible results, but note that you can still query at anything coarser than this granularity. E.g., a value of `minute` will mean that records will be stored at minutely granularity, and can be sensibly queried at any multiple of minutes (including minutely, 5-minutely, hourly, etc).

Any [granularity](../querying/granularities.md) can be provided here. Use `none` to store timestamps as-is, without any truncation. Note that `rollup` will be applied if it is set even when the `queryGranularity` is set to `none`. | `none` | | rollup | Whether to use ingestion-time [rollup](./rollup.md) or not. Note that rollup is still effective even when `queryGranularity` is set to `none`. Your data will be rolled up if they have the exactly same timestamp. | `true` | | intervals | A list of intervals defining time chunks for segments. Specify interval values using ISO8601 format. For example, `["2021-12-06T21:27:10+00:00/2021-12-07T00:00:00+00:00"]`. If you omit the time, the time defaults to "00:00:00".

Druid breaks the list up and rounds off the list values based on the `segmentGranularity`.

If `null` or not provided, batch ingestion tasks generally determine which time chunks to output based on the timestamps found in the input data.

If specified, batch ingestion tasks may be able to skip a determining-partitions phase, which can result in faster ingestion. Batch ingestion tasks may also be able to request all their locks up-front instead of one by one. Batch ingestion tasks throw away any records with timestamps outside of the specified intervals.

Ignored for any form of streaming ingestion. | `null` | @@ -529,4 +529,4 @@ You can enable front coding with all types of ingestion. For information on defi ::: Beyond these properties, each ingestion method has its own specific tuning properties. See the documentation for each -[ingestion method](./index.md#ingestion-methods) for details. \ No newline at end of file +[ingestion method](./index.md#ingestion-methods) for details. diff --git a/docs/multi-stage-query/concepts.md b/docs/multi-stage-query/concepts.md index a7e59caf19c..7100e14d01c 100644 --- a/docs/multi-stage-query/concepts.md +++ b/docs/multi-stage-query/concepts.md @@ -34,7 +34,7 @@ sidebar_label: "Key concepts" The `druid-multi-stage-query` extension adds a multi-stage query (MSQ) task engine that executes SQL statements as batch tasks in the indexing service, which execute on [Middle Managers](../design/architecture.md#druid-services). [INSERT](reference.md#insert) and [REPLACE](reference.md#replace) tasks publish -[segments](../design/architecture.md#datasources-and-segments) just like [all other forms of batch +[segments](../design/storage.md) just like [all other forms of batch ingestion](../ingestion/index.md#batch). Each query occupies at least two task slots while running: one controller task, and at least one worker task. As an experimental feature, the MSQ task engine also supports running SELECT queries as batch tasks. The behavior and result format of plain SELECT (without INSERT or REPLACE) is subject to change. diff --git a/docs/operations/basic-cluster-tuning.md b/docs/operations/basic-cluster-tuning.md index 538ae33d75f..7bc7c1e2761 100644 --- a/docs/operations/basic-cluster-tuning.md +++ b/docs/operations/basic-cluster-tuning.md @@ -123,7 +123,7 @@ Be sure to check out [segment size optimization](./segment-optimization.md) to h The biggest contributions to heap usage on Brokers are: - Partial unmerged query results from Historicals and Tasks -- The segment timeline: this consists of location information (which Historical/Task is serving a segment) for all currently [available](../design/architecture.md#segment-lifecycle) segments. +- The segment timeline: this consists of location information (which Historical/Task is serving a segment) for all currently [available](../design/storage.md#segment-lifecycle) segments. - Cached segment metadata: this consists of metadata, such as per-segment schemas, for all currently available segments. The Broker heap requirements scale based on the number of segments in the cluster, and the total data size of the segments. diff --git a/docs/operations/web-console.md b/docs/operations/web-console.md index ae4c30142a9..db25792d3e0 100644 --- a/docs/operations/web-console.md +++ b/docs/operations/web-console.md @@ -26,7 +26,7 @@ Druid includes a web console for loading data, managing datasources and tasks, a You can also run SQL and native Druid queries in the console. Enable the following cluster settings to use the web console. Note that these settings are enabled by default. -- Enable the Router's [management proxy](../design/router.md#enabling-the-management-proxy). +- Enable the Router's [management proxy](../design/router.md#enable-the-management-proxy). - Enable [Druid SQL](../configuration/index.md#sql) for the Broker processes in the cluster. The [Router](../design/router.md) service hosts the web console. diff --git a/docs/querying/sql-data-types.md b/docs/querying/sql-data-types.md index c49ac293f5e..6ef91a6b9fd 100644 --- a/docs/querying/sql-data-types.md +++ b/docs/querying/sql-data-types.md @@ -77,7 +77,7 @@ When `druid.generic.useDefaultValueForNull = true` (legacy mode), Druid instead ## Arrays -Druid supports [`ARRAY` types](arrays.md), which behave as standard SQL arrays, where results are grouped by matching entire arrays. The [`UNNEST` operator](./sql-array-functions.md#unn) can be used to perform operations on individual array elements, translating each element into a separate row. +Druid supports [`ARRAY` types](arrays.md), which behave as standard SQL arrays, where results are grouped by matching entire arrays. The [`UNNEST` operator](./sql.md#unnest) can be used to perform operations on individual array elements, translating each element into a separate row. `ARRAY` typed columns can be stored in segments with JSON-based ingestion using the 'auto' typed dimension schema shared with [schema auto-discovery](../ingestion/schema-design.md#schema-auto-discovery-for-dimensions) to detect and ingest arrays as ARRAY typed columns. For [SQL based ingestion](../multi-stage-query/index.md), the query context parameter `arrayIngestMode` must be specified as `"array"` to ingest ARRAY types. In Druid 28, the default mode for this parameter is `"mvd"` for backwards compatibility, which instead can only handle `ARRAY` which it stores in [multi-value string columns](#multi-value-strings). diff --git a/docs/querying/sql-metadata-tables.md b/docs/querying/sql-metadata-tables.md index 8e9bce9fad9..331774d34ab 100644 --- a/docs/querying/sql-metadata-tables.md +++ b/docs/querying/sql-metadata-tables.md @@ -155,10 +155,10 @@ Segments table provides details on all Druid segments, whether they are publishe |num_replicas|BIGINT|Number of replicas of this segment currently being served| |num_rows|BIGINT|Number of rows in this segment, or zero if the number of rows is not known.

This row count is gathered by the Broker in the background. It will be zero if the Broker has not gathered a row count for this segment yet. For segments ingested from streams, the reported row count may lag behind the result of a `count(*)` query because the cached `num_rows` on the Broker may be out of date. This will settle shortly after new rows stop being written to that particular segment.| |is_active|BIGINT|True for segments that represent the latest state of a datasource.

Equivalent to `(is_published = 1 AND is_overshadowed = 0) OR is_realtime = 1`. In steady state, when no ingestion or data management operations are happening, `is_active` will be equivalent to `is_available`. However, they may differ from each other when ingestion or data management operations have executed recently. In these cases, Druid will load and unload segments appropriately to bring actual availability in line with the expected state given by `is_active`.| -|is_published|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment has been published to the metadata store and is marked as used. See the [segment lifecycle documentation](../design/architecture.md#segment-lifecycle) for more details.| -|is_available|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment is currently being served by any data serving process, like a Historical or a realtime ingestion task. See the [segment lifecycle documentation](../design/architecture.md#segment-lifecycle) for more details.| +|is_published|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment has been published to the metadata store and is marked as used. See the [segment lifecycle documentation](../design/storage.md#segment-lifecycle) for more details.| +|is_available|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment is currently being served by any data serving process, like a Historical or a realtime ingestion task. See the [segment lifecycle documentation](../design/storage.md#segment-lifecycle) for more details.| |is_realtime|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment is _only_ served by realtime tasks, and 0 if any Historical process is serving this segment.| -|is_overshadowed|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment is published and is _fully_ overshadowed by some other published segments. Currently, `is_overshadowed` is always 0 for unpublished segments, although this may change in the future. You can filter for segments that "should be published" by filtering for `is_published = 1 AND is_overshadowed = 0`. Segments can briefly be both published and overshadowed if they were recently replaced, but have not been unpublished yet. See the [segment lifecycle documentation](../design/architecture.md#segment-lifecycle) for more details.| +|is_overshadowed|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment is published and is _fully_ overshadowed by some other published segments. Currently, `is_overshadowed` is always 0 for unpublished segments, although this may change in the future. You can filter for segments that "should be published" by filtering for `is_published = 1 AND is_overshadowed = 0`. Segments can briefly be both published and overshadowed if they were recently replaced, but have not been unpublished yet. See the [segment lifecycle documentation](../design/storage.md#segment-lifecycle) for more details.| |shard_spec|VARCHAR|JSON-serialized form of the segment `ShardSpec`| |dimensions|VARCHAR|JSON-serialized form of the segment dimensions| |metrics|VARCHAR|JSON-serialized form of the segment metrics| diff --git a/docs/release-info/upgrade-notes.md b/docs/release-info/upgrade-notes.md index 48040145b66..46e5ed6fc1a 100644 --- a/docs/release-info/upgrade-notes.md +++ b/docs/release-info/upgrade-notes.md @@ -413,7 +413,7 @@ To restore old behavior, you can set `sqlFinalizeOuterSketches=true` in the quer #### Kill tasks mark segments as unused only if specified -When you issue a kill task, Druid marks the underlying segments as unused only if explicitly specified. For more information, see the [API reference](https://druid.apache.org/docs/latest/operations/api-reference.html#coordinator). +When you issue a kill task, Druid marks the underlying segments as unused only if explicitly specified. For more information, see the [API reference](https://druid.apache.org/docs/latest/api-reference/data-management-api). [#13104](https://github.com/apache/druid/pull/13104) diff --git a/website/package.json b/website/package.json index 89a2b15b08b..c618537a18c 100644 --- a/website/package.json +++ b/website/package.json @@ -9,7 +9,7 @@ "version": "docusaurus-version", "rename-version": "docusaurus-rename-version", "compile-scss": "sass scss/custom.scss > static/css/custom.css", - "link-lint": "npm run build && node script/link-lint.js", + "link-lint": "node script/link-lint.js", "spellcheck": "mdspell --en-us --ignore-numbers --report '../docs/**/*.md' || (./script/notify-spellcheck-issues && false)", "swizzle": "docusaurus swizzle", "deploy": "docusaurus deploy", diff --git a/website/script/link-lint.js b/website/script/link-lint.js new file mode 100644 index 00000000000..37c2623fc46 --- /dev/null +++ b/website/script/link-lint.js @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +const path = require('path'); +const fg = require('fast-glob'); +const fs = require('fs-extra'); + +const entries = fg.sync(['./build/docs/**/*.html']); + +function hasAnchor(html, anchor) { + anchor = anchor.replace('#', ''); + return html.includes(`name="${anchor}"`) || html.includes(`id="${anchor}"`); +} + +const issues = []; +entries.forEach((entry) => { + const cnt = fs.readFileSync(entry, 'utf-8'); + const links = cnt.match(/href="([^"#]+)?(#[^"]+)?"/g); + if (!links) return; + + links.forEach(link => { + if (link === `href=""`) return; + const match = link.match(/^href="([^"#]+)?(#[^"]+)?"$/); + if (!match) throw new Error(`something went wrong for: ${link}`); + + const url = match[1]; + const anchor = match[2]; + + if (url) { + // Ignore external links + if (url.includes('://')) return; + + // Ignore external doc links + if (url.startsWith('/') && !url.startsWith('/docs/')) return; + + // Ignore mailto links + if (url.startsWith('mailto:')) return; + + // This one will get created externally + if (url === '/docs/latest') return; + + let target = url.startsWith('/') + ? './build' + url + : path.resolve(path.dirname(entry), url); + + if (target.endsWith('/')) { + target += 'index.html'; + } else { + target += '/index.html'; + } + + + let targetHtml; + try { + targetHtml = fs.readFileSync(target, 'utf-8'); + } catch (e) { + issues.push(`Could not find '${url}' linked from '${entry}' [${target}]`); + return; + } + + if (anchor && !hasAnchor(targetHtml, anchor)) { + issues.push(`Could not find anchor '${anchor}' in '${url}' linked from '${entry}'`) + } + } else { + if (anchor) { + if (!hasAnchor(cnt, anchor)) { + issues.push(`Could not find self anchor '${anchor}' in '${entry}'`) + } + } else { + throw new Error(`should not get here with: ${link} in '${entry}'`); + } + } + }); +}); + +if (issues.length) { + issues.push(`There are ${issues.length} issues`); + console.error(issues.join('\n')); + process.exit(1); +} else { + console.log('No link-lint issues found'); +}