docs: Anchor link checker (#15624)

Co-authored-by: 317brian <53799971+317brian@users.noreply.github.com>
Victoria Lim 2024-01-08 15:19:05 -08:00 committed by GitHub
parent df5bcd1367
commit 52313c51ac
18 changed files with 126 additions and 123 deletions

View File

@@ -168,6 +168,7 @@ jobs:
 (cd website && npm install)
 cd website
 npm run build
+npm run link-lint
 npm run spellcheck
 - name: web console

View File

@@ -1,101 +0,0 @@
#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import re
import sys

#
# Checks for broken redirects (in _redirects.json) and links from markdown files to
# nonexistent pages. Does _not_ check for links to anchors that don't exist.
#

# Targets to these 'well known' pages are OK.
WELL_KNOWN_PAGES = ["/libraries.html", "/downloads.html", "/community/", "/thanks.html"]

def normalize_link(source, target):
    dirname = os.path.dirname(source)
    normalized = os.path.normpath(os.path.join(dirname, target))
    return normalized

def verify_redirects(docs_directory, redirect_json):
    ok = True

    with open(redirect_json, 'r') as f:
        redirects = json.loads(f.read())

    for redirect in redirects:
        if redirect["target"] in WELL_KNOWN_PAGES:
            continue

        # Replace .html and named anchors with .md, and check the file on the filesystem.
        target = re.sub(r'\.html(#.*)?$', '.md', normalize_link(redirect["source"], redirect["target"]))
        if not os.path.exists(os.path.join(docs_directory, target)):
            sys.stderr.write('Redirect [' + redirect["source"] + '] target does not exist: ' + redirect["target"] + "\n")
            ok = False

    return ok

def verify_markdown(docs_directory):
    ok = True

    # Get list of markdown files.
    markdowns = []
    for root, dirs, files in os.walk(docs_directory):
        for name in files:
            if name.endswith('.md'):
                markdowns.append(os.path.join(root, name))

    for markdown_file in markdowns:
        with open(markdown_file, 'r') as f:
            content = f.read()

        for m in re.finditer(r'\[([^\[]*?)\]\((.*?)(?: \"[^\"]+\")?\)', content):
            target = m.group(2)

            if target in WELL_KNOWN_PAGES:
                continue

            if markdown_file.endswith("/druid-kerberos.md") and target in ['regexp', 'druid@EXAMPLE.COM']:
                # Hack to support the fact that rule examples in druid-kerberos docs look sort of like markdown links.
                continue

            target = re.sub(r'^/docs/VERSION/', '', target)
            target = re.sub(r'#.*$', '', target)
            target = re.sub(r'\.html$', '.md', target)
            target = re.sub(r'/$', '/index.md', target)

            if target and not (target.startswith('http://') or target.startswith('https://')):
                target_normalized = normalize_link(markdown_file, target)
                if not os.path.exists(target_normalized):
                    sys.stderr.write('Page [' + markdown_file + '] target does not exist: ' + m.group(2) + "\n")
                    ok = False

    return ok

def main():
    if len(sys.argv) != 3:
        sys.stderr.write('usage: program <docs dir> <redirect.json>\n')
        sys.exit(1)

    ok = verify_redirects(sys.argv[1], sys.argv[2])
    ok = verify_markdown(sys.argv[1]) and ok

    if not ok:
        sys.exit(1)

main()
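
The removed script worked on Markdown sources only: it rewrote each internal link to the `.md` file it should resolve to, checked that the file exists, and deliberately skipped anchors. Purely as an illustration (not part of this commit), the same rewrite rules expressed in JavaScript:

```js
// Illustration only: mirrors the rewrite rules of the removed Python checker.
const path = require('path');

function markdownTarget(sourceFile, link) {
  const target = link
    .replace(/^\/docs\/VERSION\//, '') // strip the versioned URL prefix
    .replace(/#.*$/, '')               // drop anchors -- the old checker never verified them
    .replace(/\.html$/, '.md')         // a published .html page maps back to a .md source
    .replace(/\/$/, '/index.md');      // a directory link maps to its index.md
  if (!target || /^https?:\/\//.test(target)) return null; // empty or external link: not checked
  return path.normalize(path.join(path.dirname(sourceFile), target));
}

// Example: a link from docs/ingestion/index.md to ../design/storage.md#segment-lifecycle
console.log(markdownTarget('docs/ingestion/index.md', '../design/storage.md#segment-lifecycle'));
// -> docs/design/storage.md (the checker then tested that this file exists)
```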

View File

@@ -24,12 +24,12 @@ title: "Data deletion"
 ## By time range, manually
-Apache Druid stores data [partitioned by time chunk](../design/architecture.md#datasources-and-segments) and supports
+Apache Druid stores data [partitioned by time chunk](../design/storage.md) and supports
 deleting data for time chunks by dropping segments. This is a fast, metadata-only operation.
 Deletion by time range happens in two steps:
-1. Segments to be deleted must first be marked as ["unused"](../design/architecture.md#segment-lifecycle). This can
+1. Segments to be deleted must first be marked as ["unused"](../design/storage.md#segment-lifecycle). This can
 happen when a segment is dropped by a [drop rule](../operations/rule-configuration.md) or when you manually mark a
 segment unused through the Coordinator API or web console. This is a soft delete: the data is not available for
 querying, but the segment files remains in deep storage, and the segment records remains in the metadata store.

View File

@@ -23,7 +23,7 @@ sidebar_label: "Overview"
 ~ under the License.
 -->
-Apache Druid stores data [partitioned by time chunk](../design/architecture.md#datasources-and-segments) in immutable
+Apache Druid stores data [partitioned by time chunk](../design/storage.md) in immutable
 files called [segments](../design/segments.md). Data management operations involving replacing, or deleting,
 these segments include:

View File

@@ -28,7 +28,7 @@ title: "Schema changes"
 Apache Druid allows you to provide a new schema for new data without the need to update the schema of any existing data.
 It is sufficient to update your supervisor spec, if using [streaming ingestion](../ingestion/index.md#streaming), or to
 provide the new schema the next time you do a [batch ingestion](../ingestion/index.md#batch). This is made possible by
-the fact that each [segment](../design/architecture.md#datasources-and-segments), at the time it is created, stores a
+the fact that each [segment](../design/segments.md), at the time it is created, stores a
 copy of its own schema. Druid reconciles all of these individual segment schemas automatically at query time.
 ## For existing data

View File

@@ -24,7 +24,7 @@ title: "Data updates"
 ## Overwrite
-Apache Druid stores data [partitioned by time chunk](../design/architecture.md#datasources-and-segments) and supports
+Apache Druid stores data [partitioned by time chunk](../design/storage.md) and supports
 overwriting existing data using time ranges. Data outside the replacement time range is not touched. Overwriting of
 existing data is done using the same mechanisms as [batch ingestion](../ingestion/index.md#batch).

View File

@@ -101,7 +101,8 @@ Now you're up to date, and you can make your changes.
 git checkout -b MY-BRANCH
 ```
 Provide a name for your feature branch in `MY-BRANCH`.
 2. Find the file that you want to make changes to. All the source files for the docs are written in Markdown and located in the `docs` directory. The URL for the page includes the subdirectory the source file is in. For example, the SQL-based ingestion tutorial found at `https://druid.apache.org/docs/latest/tutorials/tutorial-msq-extern.html` is in the `tutorials` subdirectory.
 If you're adding a page, create a new Markdown file in the appropriate subdirectory. Then, copy the front matter and Apache license from an existing file. Update the `title` and `id` fields. Don't forget to add it to `website/sidebars.json` so that your new page shows up in the navigation.
@@ -111,6 +112,11 @@ Provide a name for your feature branch in `MY-BRANCH`.
 5. Use the following commands to run the link and spellcheckers locally:
 ```bash
+cd website
+# You only need to install once
+npm install
+npm run build
 npm run spellcheck
 npm run link-lint
 ```
@@ -216,4 +222,4 @@ Before publishing new content or updating an existing topic, you can audit your
 * When American spelling is different from Commonwealth/"British" spelling, use the American spelling.
 * Don't use terms considered disrespectful. Refer to a list like Google's [Word list](https://developers.google.com/style/word-list) for guidance and alternatives.
 * Use straight quotation marks and straight apostrophes instead of the curly versions.
 * Introduce a list, a table, or a procedure with an introductory sentence that prepares the reader for what they're about to read.

View File

@@ -47,7 +47,7 @@ Note that this document does not track the status of contrib extensions, all of
 - [Configuration reference](../configuration/index.md#overlord-operations)
 - [Task reference](../ingestion/tasks.md#locking)
-- [Design](../design/architecture.md#availability-and-consistency)
+- [Design](../design/storage.md#availability-and-consistency)
 ## Front coding

View File

@@ -24,7 +24,7 @@ sidebar_label: Overview
 -->
 Loading data in Druid is called _ingestion_ or _indexing_. When you ingest data into Druid, Druid reads the data from
-your source system and stores it in data files called [_segments_](../design/architecture.md#datasources-and-segments).
+your source system and stores it in data files called [_segments_](../design/segments.md).
 In general, segment files contain a few million rows each.
 For most ingestion methods, the Druid [MiddleManager](../design/middlemanager.md) processes or the

View File

@@ -149,7 +149,7 @@ An example `dataSchema` is:
 ### `dataSource`
 The `dataSource` is located in `dataSchema` → `dataSource` and is simply the name of the
-[datasource](../design/architecture.md#datasources-and-segments) that data will be written to. An example
+[datasource](../design/storage.md) that data will be written to. An example
 `dataSource` is:
 ```
@@ -304,7 +304,7 @@ An example `metricsSpec` is:
 The `granularitySpec` is located in `dataSchema` → `granularitySpec` and is responsible for configuring
 the following operations:
-1. Partitioning a datasource into [time chunks](../design/architecture.md#datasources-and-segments) (via `segmentGranularity`).
+1. Partitioning a datasource into [time chunks](../design/storage.md) (via `segmentGranularity`).
 2. Truncating the timestamp, if desired (via `queryGranularity`).
 3. Specifying which time chunks of segments should be created, for batch ingestion (via `intervals`).
 4. Specifying whether ingestion-time [rollup](./rollup.md) should be used or not (via `rollup`).
@@ -329,7 +329,7 @@ A `granularitySpec` can have the following components:
 | Field | Description | Default |
 |-------|-------------|---------|
 | type |`uniform`| `uniform` |
-| segmentGranularity | [Time chunking](../design/architecture.md#datasources-and-segments) granularity for this datasource. Multiple segments can be created per time chunk. For example, when set to `day`, the events of the same day fall into the same time chunk which can be optionally further partitioned into multiple segments based on other configurations and input size. Any [granularity](../querying/granularities.md) can be provided here. Note that all segments in the same time chunk should have the same segment granularity.<br /><br />Avoid `WEEK` granularity for data partitioning because weeks don't align neatly with months and years, making it difficult to change partitioning by coarser granularity. Instead, opt for other partitioning options such as `DAY` or `MONTH`, which offer more flexibility.| `day` |
+| segmentGranularity | [Time chunking](../design/storage.md) granularity for this datasource. Multiple segments can be created per time chunk. For example, when set to `day`, the events of the same day fall into the same time chunk which can be optionally further partitioned into multiple segments based on other configurations and input size. Any [granularity](../querying/granularities.md) can be provided here. Note that all segments in the same time chunk should have the same segment granularity.<br /><br />Avoid `WEEK` granularity for data partitioning because weeks don't align neatly with months and years, making it difficult to change partitioning by coarser granularity. Instead, opt for other partitioning options such as `DAY` or `MONTH`, which offer more flexibility.| `day` |
 | queryGranularity | The resolution of timestamp storage within each segment. This must be equal to, or finer, than `segmentGranularity`. This will be the finest granularity that you can query at and still receive sensible results, but note that you can still query at anything coarser than this granularity. E.g., a value of `minute` will mean that records will be stored at minutely granularity, and can be sensibly queried at any multiple of minutes (including minutely, 5-minutely, hourly, etc).<br /><br />Any [granularity](../querying/granularities.md) can be provided here. Use `none` to store timestamps as-is, without any truncation. Note that `rollup` will be applied if it is set even when the `queryGranularity` is set to `none`. | `none` |
 | rollup | Whether to use ingestion-time [rollup](./rollup.md) or not. Note that rollup is still effective even when `queryGranularity` is set to `none`. Your data will be rolled up if they have the exactly same timestamp. | `true` |
 | intervals | A list of intervals defining time chunks for segments. Specify interval values using ISO8601 format. For example, `["2021-12-06T21:27:10+00:00/2021-12-07T00:00:00+00:00"]`. If you omit the time, the time defaults to "00:00:00".<br /><br />Druid breaks the list up and rounds off the list values based on the `segmentGranularity`.<br /><br />If `null` or not provided, batch ingestion tasks generally determine which time chunks to output based on the timestamps found in the input data.<br /><br />If specified, batch ingestion tasks may be able to skip a determining-partitions phase, which can result in faster ingestion. Batch ingestion tasks may also be able to request all their locks up-front instead of one by one. Batch ingestion tasks throw away any records with timestamps outside of the specified intervals.<br /><br />Ignored for any form of streaming ingestion. | `null` |
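
Taken together, the fields in this table describe a single `granularitySpec` object inside `dataSchema`. A minimal sketch built from the defaults shown above (values are illustrative only, not part of this commit):

```js
// Illustrative granularitySpec assembled from the defaults listed in the table above.
const granularitySpec = {
  type: 'uniform',
  segmentGranularity: 'day',  // one time chunk per day; avoid WEEK per the note above
  queryGranularity: 'none',   // store timestamps as-is, without truncation
  rollup: true,               // ingestion-time rollup of rows with identical timestamps
  intervals: ['2021-12-06T21:27:10+00:00/2021-12-07T00:00:00+00:00'] // optional for batch, ignored for streaming
};
```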
@@ -529,4 +529,4 @@ You can enable front coding with all types of ingestion. For information on defi
 :::
 Beyond these properties, each ingestion method has its own specific tuning properties. See the documentation for each
 [ingestion method](./index.md#ingestion-methods) for details.

View File

@@ -34,7 +34,7 @@ sidebar_label: "Key concepts"
 The `druid-multi-stage-query` extension adds a multi-stage query (MSQ) task engine that executes SQL statements as batch
 tasks in the indexing service, which execute on [Middle Managers](../design/architecture.md#druid-services).
 [INSERT](reference.md#insert) and [REPLACE](reference.md#replace) tasks publish
-[segments](../design/architecture.md#datasources-and-segments) just like [all other forms of batch
+[segments](../design/storage.md) just like [all other forms of batch
 ingestion](../ingestion/index.md#batch). Each query occupies at least two task slots while running: one controller task,
 and at least one worker task. As an experimental feature, the MSQ task engine also supports running SELECT queries as
 batch tasks. The behavior and result format of plain SELECT (without INSERT or REPLACE) is subject to change.

View File

@@ -123,7 +123,7 @@ Be sure to check out [segment size optimization](./segment-optimization.md) to h
 The biggest contributions to heap usage on Brokers are:
 - Partial unmerged query results from Historicals and Tasks
-- The segment timeline: this consists of location information (which Historical/Task is serving a segment) for all currently [available](../design/architecture.md#segment-lifecycle) segments.
+- The segment timeline: this consists of location information (which Historical/Task is serving a segment) for all currently [available](../design/storage.md#segment-lifecycle) segments.
 - Cached segment metadata: this consists of metadata, such as per-segment schemas, for all currently available segments.
 The Broker heap requirements scale based on the number of segments in the cluster, and the total data size of the segments.

View File

@@ -26,7 +26,7 @@ Druid includes a web console for loading data, managing datasources and tasks, a
 You can also run SQL and native Druid queries in the console.
 Enable the following cluster settings to use the web console. Note that these settings are enabled by default.
-- Enable the Router's [management proxy](../design/router.md#enabling-the-management-proxy).
+- Enable the Router's [management proxy](../design/router.md#enable-the-management-proxy).
 - Enable [Druid SQL](../configuration/index.md#sql) for the Broker processes in the cluster.
 The [Router](../design/router.md) service hosts the web console.

View File

@@ -77,7 +77,7 @@ When `druid.generic.useDefaultValueForNull = true` (legacy mode), Druid instead
 ## Arrays
-Druid supports [`ARRAY` types](arrays.md), which behave as standard SQL arrays, where results are grouped by matching entire arrays. The [`UNNEST` operator](./sql-array-functions.md#unn) can be used to perform operations on individual array elements, translating each element into a separate row.
+Druid supports [`ARRAY` types](arrays.md), which behave as standard SQL arrays, where results are grouped by matching entire arrays. The [`UNNEST` operator](./sql.md#unnest) can be used to perform operations on individual array elements, translating each element into a separate row.
 `ARRAY` typed columns can be stored in segments with JSON-based ingestion using the 'auto' typed dimension schema shared with [schema auto-discovery](../ingestion/schema-design.md#schema-auto-discovery-for-dimensions) to detect and ingest arrays as ARRAY typed columns. For [SQL based ingestion](../multi-stage-query/index.md), the query context parameter `arrayIngestMode` must be specified as `"array"` to ingest ARRAY types. In Druid 28, the default mode for this parameter is `"mvd"` for backwards compatibility, which instead can only handle `ARRAY<STRING>` which it stores in [multi-value string columns](#multi-value-strings).

View File

@@ -155,10 +155,10 @@ Segments table provides details on all Druid segments, whether they are publishe
 |num_replicas|BIGINT|Number of replicas of this segment currently being served|
 |num_rows|BIGINT|Number of rows in this segment, or zero if the number of rows is not known.<br /><br />This row count is gathered by the Broker in the background. It will be zero if the Broker has not gathered a row count for this segment yet. For segments ingested from streams, the reported row count may lag behind the result of a `count(*)` query because the cached `num_rows` on the Broker may be out of date. This will settle shortly after new rows stop being written to that particular segment.|
 |is_active|BIGINT|True for segments that represent the latest state of a datasource.<br /><br />Equivalent to `(is_published = 1 AND is_overshadowed = 0) OR is_realtime = 1`. In steady state, when no ingestion or data management operations are happening, `is_active` will be equivalent to `is_available`. However, they may differ from each other when ingestion or data management operations have executed recently. In these cases, Druid will load and unload segments appropriately to bring actual availability in line with the expected state given by `is_active`.|
-|is_published|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment has been published to the metadata store and is marked as used. See the [segment lifecycle documentation](../design/architecture.md#segment-lifecycle) for more details.|
+|is_published|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment has been published to the metadata store and is marked as used. See the [segment lifecycle documentation](../design/storage.md#segment-lifecycle) for more details.|
-|is_available|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment is currently being served by any data serving process, like a Historical or a realtime ingestion task. See the [segment lifecycle documentation](../design/architecture.md#segment-lifecycle) for more details.|
+|is_available|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment is currently being served by any data serving process, like a Historical or a realtime ingestion task. See the [segment lifecycle documentation](../design/storage.md#segment-lifecycle) for more details.|
 |is_realtime|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment is _only_ served by realtime tasks, and 0 if any Historical process is serving this segment.|
-|is_overshadowed|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment is published and is _fully_ overshadowed by some other published segments. Currently, `is_overshadowed` is always 0 for unpublished segments, although this may change in the future. You can filter for segments that "should be published" by filtering for `is_published = 1 AND is_overshadowed = 0`. Segments can briefly be both published and overshadowed if they were recently replaced, but have not been unpublished yet. See the [segment lifecycle documentation](../design/architecture.md#segment-lifecycle) for more details.|
+|is_overshadowed|BIGINT|Boolean represented as long type where 1 = true, 0 = false. 1 if this segment is published and is _fully_ overshadowed by some other published segments. Currently, `is_overshadowed` is always 0 for unpublished segments, although this may change in the future. You can filter for segments that "should be published" by filtering for `is_published = 1 AND is_overshadowed = 0`. Segments can briefly be both published and overshadowed if they were recently replaced, but have not been unpublished yet. See the [segment lifecycle documentation](../design/storage.md#segment-lifecycle) for more details.|
 |shard_spec|VARCHAR|JSON-serialized form of the segment `ShardSpec`|
 |dimensions|VARCHAR|JSON-serialized form of the segment dimensions|
 |metrics|VARCHAR|JSON-serialized form of the segment metrics|
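
The `is_published = 1 AND is_overshadowed = 0` filter described above can be run directly against `sys.segments` through the Druid SQL API. A rough sketch using Node's built-in `fetch`; the Router address is an assumption, so adjust it for your cluster:

```js
// Sketch: list segments that "should be published", per the filter described above.
// Assumes a Router reachable at localhost:8888 and Node 18+ (global fetch).
const query = `
  SELECT segment_id, num_rows, is_available, is_realtime
  FROM sys.segments
  WHERE is_published = 1 AND is_overshadowed = 0
  LIMIT 10
`;

fetch('http://localhost:8888/druid/v2/sql', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ query, resultFormat: 'object' }),
})
  .then((r) => r.json())
  .then((rows) => console.table(rows))
  .catch((e) => console.error(e));
```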

View File

@@ -413,7 +413,7 @@ To restore old behavior, you can set `sqlFinalizeOuterSketches=true` in the quer
 #### Kill tasks mark segments as unused only if specified
-When you issue a kill task, Druid marks the underlying segments as unused only if explicitly specified. For more information, see the [API reference](https://druid.apache.org/docs/latest/operations/api-reference.html#coordinator).
+When you issue a kill task, Druid marks the underlying segments as unused only if explicitly specified. For more information, see the [API reference](https://druid.apache.org/docs/latest/api-reference/data-management-api).
 [#13104](https://github.com/apache/druid/pull/13104)

View File

@@ -9,7 +9,7 @@
 "version": "docusaurus-version",
 "rename-version": "docusaurus-rename-version",
 "compile-scss": "sass scss/custom.scss > static/css/custom.css",
-"link-lint": "npm run build && node script/link-lint.js",
+"link-lint": "node script/link-lint.js",
 "spellcheck": "mdspell --en-us --ignore-numbers --report '../docs/**/*.md' || (./script/notify-spellcheck-issues && false)",
 "swizzle": "docusaurus swizzle",
 "deploy": "docusaurus deploy",

View File

@@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
const path = require('path');
const fg = require('fast-glob');
const fs = require('fs-extra');

const entries = fg.sync(['./build/docs/**/*.html']);

function hasAnchor(html, anchor) {
  anchor = anchor.replace('#', '');
  return html.includes(`name="${anchor}"`) || html.includes(`id="${anchor}"`);
}

const issues = [];

entries.forEach((entry) => {
  const cnt = fs.readFileSync(entry, 'utf-8');
  const links = cnt.match(/href="([^"#]+)?(#[^"]+)?"/g);
  if (!links) return;

  links.forEach(link => {
    if (link === `href=""`) return;
    const match = link.match(/^href="([^"#]+)?(#[^"]+)?"$/);
    if (!match) throw new Error(`something went wrong for: ${link}`);

    const url = match[1];
    const anchor = match[2];

    if (url) {
      // Ignore external links
      if (url.includes('://')) return;

      // Ignore external doc links
      if (url.startsWith('/') && !url.startsWith('/docs/')) return;

      // Ignore mailto links
      if (url.startsWith('mailto:')) return;

      // This one will get created externally
      if (url === '/docs/latest') return;

      let target = url.startsWith('/')
        ? './build' + url
        : path.resolve(path.dirname(entry), url);

      // Docusaurus emits each page as a directory containing an index.html, so resolve to that file
      if (target.endsWith('/')) {
        target += 'index.html';
      } else {
        target += '/index.html';
      }

      let targetHtml;
      try {
        targetHtml = fs.readFileSync(target, 'utf-8');
      } catch (e) {
        issues.push(`Could not find '${url}' linked from '${entry}' [${target}]`);
        return;
      }

      if (anchor && !hasAnchor(targetHtml, anchor)) {
        issues.push(`Could not find anchor '${anchor}' in '${url}' linked from '${entry}'`);
      }
    } else {
      if (anchor) {
        if (!hasAnchor(cnt, anchor)) {
          issues.push(`Could not find self anchor '${anchor}' in '${entry}'`);
        }
      } else {
        throw new Error(`should not get here with: ${link} in '${entry}'`);
      }
    }
  });
});

if (issues.length) {
  issues.push(`There are ${issues.length} issues`);
  console.error(issues.join('\n'));
  process.exit(1);
} else {
  console.log('No link-lint issues found');
}