Breaks site

This commit is contained in:
aetter 2021-05-28 10:48:19 -07:00
parent 6150e42eec
commit 2729ce3ccb
186 changed files with 30908 additions and 51 deletions

View File

@ -1,18 +1,3 @@
# Welcome to Jekyll!
#
# This config file is meant for settings that affect your whole blog, values
# which you are expected to set up once and rarely edit after that. If you find
# yourself editing this file very often, consider using Jekyll's data files
# feature for the data you need to update frequently.
#
# For technical reasons, this file is *NOT* reloaded automatically when you use
# 'bundle exec jekyll serve'. If you change this file, please restart the server process.
# Site settings
# These are used to personalize your new site. If you look in the HTML files,
# you will see them accessed via {{ site.title }}, {{ site.email }}, and so on.
# You can create any custom variable you would like, and they will be accessible
# in the templates via {{ site.myvariable }}.
title: OpenSearch documentation
description: >- # this means to ignore newlines until "baseurl:"
Documentation for OpenSearch, the Apache 2.0 search, analytics, and visualization suite with advanced security, alerting, SQL support, automated index management, deep performance analysis, and more.
@ -39,6 +24,43 @@ aux_links:
- "https://opensearch.org"
color_scheme: opensearch
# Define Jekyll collections
collections:
# Define a collection named "tests", its documents reside in the "_tests" directory
opensearch_docs:
permalink: "/:collection/:path/"
output: true
opensearch_dashboards_docs:
permalink: "/:collection/:path/"
output: true
opensearch_plugins_docs:
permalink: "/:collection/:path/"
output: true
external_links:
permalink: "/:collection/:path/"
output: true
just_the_docs:
# Define the collections used in the theme
collections:
opensearch_docs:
name: OpenSearch
# nav_exclude: true
# nav_fold: true
# search_exclude: true
opensearch_dashboards_docs:
name: OpenSearch Dashboards
#nav_fold: true
opensearch_plugins_docs:
name: Plugins
#nav_fold: true
opensearch_troubleshooting_docs:
name: Troubleshooting
#nav_fold: true
external_links:
name: External links
# Enable or disable the site search
# Supports true (default) or false
search_enabled: true

View File

@ -0,0 +1,7 @@
---
layout: default
title: Javadoc
nav_order: 1
permalink: /javadoc/
redirect_to: https://opensearch.org/docs/javadocs/
---

View File

@ -60,7 +60,7 @@
{%- for node in pages_list -%}
{%- if node.parent == nil -%}
{%- unless node.nav_exclude -%}
<li class="nav-list-item{% if page.url == node.url or page.parent == node.title or page.grand_parent == node.title %} active{% endif %}">
<li class="nav-list-item{% if page.collection == include.key and page.url == node.url or page.parent == node.title or page.grand_parent == node.title %} active{% endif %}">
{%- if node.has_children -%}
<a href="#" class="nav-list-expander"><svg viewBox="0 0 24 24"><use xlink:href="#svg-arrow-right"></use></svg></a>
{%- endif -%}
@ -90,14 +90,36 @@
</li>
{%- endunless -%}
{%- endfor -%}
</ul>
{%- endif -%}
</li>
{%- endunless -%}
{%- endif -%}
{%- endfor -%}
<li class="nav-list-item">
<a href="https://opensearch.org/docs/javadocs/" target="_blank" class="nav-list-link">Javadoc <svg class="external-arrow" width="16" height="16" fill="#002A3A"><use xlink:href="#external-arrow"></use></svg></a>
</li>
</ul>
{%- if page.collection == include.key -%}
{%- for node in pages_list -%}
{%- if node.parent == nil -%}
{%- if page.parent == node.title or page.grand_parent == node.title -%}
{%- assign first_level_url = node.url | absolute_url -%}
{%- endif -%}
{%- if node.has_children -%}
{%- assign children_list = pages_list | where: "parent", node.title -%}
{%- for child in children_list -%}
{%- if child.has_children -%}
{%- if page.url == child.url or page.parent == child.title and page.grand_parent == child.parent -%}
{%- assign second_level_url = child.url | absolute_url -%}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{% if page.has_children == true and page.has_toc != false %}
{%- assign toc_list = pages_list | where: "parent", page.title | where: "grand_parent", page.parent -%}
{%- endif -%}
{%- endif -%}

View File

@ -38,13 +38,6 @@ layout: table_wrappers
<path d="M13 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V9z"></path><polyline points="13 2 13 9 20 9"></polyline>
</svg>
</symbol>
<symbol id="external-arrow" viewBox="0 0 16 16">
<title>External</title>
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" viewBox="0 0 16 16">
<path fill-rule="evenodd" d="M8.636 3.5a.5.5 0 0 0-.5-.5H1.5A1.5 1.5 0 0 0 0 4.5v10A1.5 1.5 0 0 0 1.5 16h10a1.5 1.5 0 0 0 1.5-1.5V7.864a.5.5 0 0 0-1 0V14.5a.5.5 0 0 1-.5.5h-10a.5.5 0 0 1-.5-.5v-10a.5.5 0 0 1 .5-.5h6.636a.5.5 0 0 0 .5-.5z"/>
<path fill-rule="evenodd" d="M16 .5a.5.5 0 0 0-.5-.5h-5a.5.5 0 0 0 0 1h3.793L6.146 9.146a.5.5 0 1 0 .708.708L15 1.707V5.5a.5.5 0 0 0 1 0v-5z"/>
</svg>
</symbol>
</svg>
<div class="side-bar">
@ -55,6 +48,14 @@ layout: table_wrappers
</a>
</div>
<nav role="navigation" aria-label="Main" id="site-nav" class="site-nav">
{% assign pages_top_size = site.html_pages
| where_exp:"item", "item.title != nil"
| where_exp:"item", "item.parent == nil"
| where_exp:"item", "item.nav_exclude != true"
| size %}
{% if pages_top_size > 0 %}
{% include nav.html pages=site.html_pages key=nil %}
{% endif %}
{% if site.just_the_docs.collections %}
{% assign collections_size = site.just_the_docs.collections | size %}
{% for collection_entry in site.just_the_docs.collections %}
@ -62,14 +63,26 @@ layout: table_wrappers
{% assign collection_value = collection_entry[1] %}
{% assign collection = site[collection_key] %}
{% if collection_value.nav_exclude != true %}
{% if collections_size > 1 %}
{% if collections_size > 1 or pages_top_size > 0 %}
{% if collection_value.nav_fold == true %}
<ul class="nav-list nav-category-list">
<li class="nav-list-item{% if page.collection == collection_key %} active{% endif %}">
{%- if collection.size > 0 -%}
<a href="#" class="nav-list-expander"><svg viewBox="0 0 24 24"><use xlink:href="#svg-arrow-right"></use></svg></a>
{%- endif -%}
<div class="nav-category">{{ collection_value.name }}</div>
{% include nav.html pages=collection key=collection_key %}
</li>
</ul>
{% else %}
<div class="nav-category">{{ collection_value.name }}</div>
{% include nav.html pages=collection key=collection_key %}
{% endif %}
{% else %}
{% include nav.html pages=collection key=collection_key %}
{% endif %}
{% include nav.html pages=collection %}
{% endif %}
{% endfor %}
{% else %}
{% include nav.html pages=site.html_pages %}
{% endif %}
</nav>
<footer class="site-footer">
@ -110,21 +123,6 @@ layout: table_wrappers
<div id="main-content-wrap" class="main-content-wrap">
{% unless page.url == "/" %}
{% if page.parent %}
{%- for node in pages_list -%}
{%- if node.parent == nil -%}
{%- if page.parent == node.title or page.grand_parent == node.title -%}
{%- assign first_level_url = node.url | absolute_url -%}
{%- endif -%}
{%- if node.has_children -%}
{%- assign children_list = pages_list | where: "parent", node.title -%}
{%- for child in children_list -%}
{%- if page.url == child.url or page.parent == child.title -%}
{%- assign second_level_url = child.url | absolute_url -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
<nav aria-label="Breadcrumb" class="breadcrumb-nav">
<ol class="breadcrumb-nav-list">
{% if page.grand_parent %}
@ -142,7 +140,6 @@ layout: table_wrappers
{% if site.heading_anchors != false %}
{% include vendor/anchor_headings.html html=content beforeHeading="true" anchorBody="<svg viewBox=\"0 0 16 16\" aria-hidden=\"true\"><use xlink:href=\"#svg-link\"></use></svg>" anchorClass="anchor-heading" anchorAttrs="aria-labelledby=\"%html_id%\"" %}
{% else %}
<p class="warning" style="margin-top: 0">Like OpenSearch itself, this documentation is a beta. It has content gaps and might contain bugs.</p>
{{ content }}
{% endif %}
@ -150,8 +147,7 @@ layout: table_wrappers
<hr>
<h2 class="text-delta">Table of contents</h2>
<ul>
{%- assign children_list = pages_list | where: "parent", page.title | where: "grand_parent", page.parent -%}
{% for child in children_list %}
{% for child in toc_list %}
<li>
<a href="{{ child.url | absolute_url }}">{{ child.title }}</a>{% if child.summary %} - {{ child.summary }}{% endif %}
</li>

View File

@ -0,0 +1,26 @@
---
layout: default
title: Gantt charts
nav_order: 10
has_toc: false
---
# Gantt charts
OpenSearch Dashboards includes a Gantt chart visualization. Gantt charts show the start, end, and duration of unique events in a sequence. Gantt charts are useful in trace analytics, telemetry, and anomaly detection use cases, where you want to understand interactions and dependencies between various events in a schedule.
For example, consider an index of log data. The fields in a typical set of log data, especially audit logs, contain a specific operation or event with a start time and duration.
To create a Gantt chart, perform the following steps:
1. In the visualizations menu, choose **Create visualization** and **Gantt Chart**.
1. Choose a source for the chart (e.g. some log data).
1. Under **Metrics**, choose **Event**. For log data, each log is an event.
1. Select the **Start Time** and **Duration** fields from your data set. The start time is the timestamp for the begining of an event. The duration is the amount of time to add to the start time.
1. Under **Results**, choose the number of events to display on the chart. Gantt charts sequence events from earliest to latest based on start time.
1. Choose **Panel settings** to adjust axis labels, time format, and colors.
1. Choose **Update**.
![Gantt Chart](../../images/gantt-chart.png)
This Gantt chart displays the ID of each log on the y-axis. Each bar is a unique event that spans some amount of time. Hover over a bar to see the duration of that event.

View File

@ -0,0 +1,18 @@
---
layout: default
title: Introduction to Dashboards
nav_order: 1
---
# About OpenSearch Dashboards
OpenSearch Dashboards is the default visualization tool for data in OpenSearch. It also serves as a user interface for many of the OpenSearch plugins, including security, alerting, Index State Management, SQL, and more.
## Get started with OpenSearch Dashboards
1. After starting OpenSearch Dashboards, you can access it at port 5601. For example, http://localhost:5601.
1. Log in with the default username `admin` and password `admin`.
1. Choose **Try our sample data** and add the sample flight data.
1. Choose **Discover** and search for a few flights.
1. Choose **Dashboard**, **[Flights] Global Flight Dashboard**, and wait for the dashboard to load.

View File

@ -0,0 +1,23 @@
---
layout: default
title: Docker
parent: Install OpenSearch Dashboards
nav_order: 1
---
# Run OpenSearch Dashboards using Docker
You *can* start OpenSearch Dashboards using `docker run` after [creating a Docker network](https://docs.docker.com/engine/reference/commandline/network_create/) and starting OpenSearch, but the process of connecting OpenSearch Dashboards to OpenSearch is significantly easier with a Docker Compose file.
1. Run `docker pull opensearchproject/opensearch-dashboards:{{site.opensearch_version}}`.
1. Create a [`docker-compose.yml`](https://docs.docker.com/compose/compose-file/) file appropriate for your environment. A sample file that includes OpenSearch Dashboards is available on the OpenSearch [Docker installation page](../opensearch/docker/#sample-docker-compose-file).
Just like `opensearch.yml`, you can pass a custom `opensearch_dashboards.yml` to the container in the Docker Compose file.
{: .tip }
1. Run `docker-compose up`.
Wait for the containers to start. Then see the [OpenSearch Dashboards documentation](../../../opensearch-dashboards/).
1. When finished, run `docker-compose down`.

View File

@ -0,0 +1,10 @@
---
layout: default
title: Install OpenSearch Dashboards
nav_order: 2
has_children: true
---
# Install and configure OpenSearch Dashboards
OpenSearch Dashboards has two installation options at this time: Docker images and tarballs.

View File

@ -0,0 +1,205 @@
---
layout: default
title: OpenSearch Dashboards plugins
parent: Install OpenSearch Dashboards
nav_order: 50
---
# Standalone plugin install
If you don't want to use the all-in-one installation options, you can install the various plugins for OpenSearch Dashboards individually.
---
#### Table of contents
1. TOC
{:toc}
---
## Plugin compatibility
<table>
<thead style="text-align: left">
<tr>
<th>OpenSearch Dashboards version</th>
<th>Plugin versions</th>
</tr>
</thead>
<tbody>
<tr>
<td>1.0.0-beta1</td>
<td>
<pre>
alertingDashboards 1.0.0.0-beta1
anomalyDetectionDashboards 1.0.0.0-beta1
ganttChartDashboards 1.0.0.0-beta1
indexManagementDashboards 1.0.0.0-beta1
notebooksDashboards 1.0.0.0-beta1
queryWorkbenchDashboards 1.0.0.0-beta1
reportsDashboards 1.0.0.0-beta1
securityDashboards 1.0.0.0-beta1
traceAnalyticsDashboards 1.0.0.0-beta1
</pre>
</td>
</tr>
</tbody>
</table>
## Prerequisites
- A compatible OpenSearch cluster
- The corresponding OpenSearch plugins [installed on that cluster](../../install/plugins)
- The corresponding version of [OpenSearch Dashboards](../) (e.g. OpenSearch Dashboards 1.0.0 works with OpenSearch 1.0.0)
## Install
Navigate to the OpenSearch Dashboards home directory (likely `/usr/share/opensearch-dashboards`) and run the install command for each plugin.
#### Security OpenSearch Dashboards
```bash
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-security/opensearchSecurityOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.0.1.zip
```
This plugin provides a user interface for managing users, roles, mappings, action groups, and tenants.
#### Alerting OpenSearch Dashboards
```bash
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-alerting/opensearchAlertingOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.0.0.zip
```
This plugin provides a user interface for creating monitors and managing alerts.
#### Index State Management OpenSearch Dashboards
```bash
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-index-management/opensearchIndexManagementOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.0.1.zip
```
This plugin provides a user interface for managing policies.
#### Anomaly Detection OpenSearch Dashboards
```bash
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-anomaly-detection/opensearchAnomalyDetectionOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.0.0.zip
```
This plugin provides a user interface for adding detectors.
#### Query Workbench OpenSearch Dashboards
```bash
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-query-workbench/opensearchQueryWorkbenchOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.0.0.zip
```
This plugin provides a user interface for using SQL queries to explore your data.
#### Trace Analytics
```bash
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-trace-analytics/opensearchTraceAnalyticsOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.2.0.zip
```
This plugin uses distributed trace data (indexed in OpenSearch using Data Prepper) to display latency trends, error rates, and more.
#### Notebooks OpenSearch Dashboards
```bash
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-notebooks/opensearchNotebooksOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.2.0.zip
```
This plugin lets you combine OpenSearch Dashboards visualizations and narrative text in a single interface.
#### Reports OpenSearch Dashboards
```bash
# x86 Linux
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-reports/linux/x64/opensearchReportsOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.2.0-linux-x64.zip
# ARM64 Linux
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-reports/linux/arm64/opensearchReportsOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.2.0-linux-arm64.zip
# x86 Windows
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-reports/windows/x64/opensearchReportsOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.2.0-windows-x64.zip
```
This plugin lets you export and share reports from OpenSearch Dashboards dashboards, visualizations, and saved searches.
#### Gantt Chart OpenSearch Dashboards
```bash
sudo bin/opensearch-dashboards-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-dashboards-plugins/opensearch-gantt-chart/opensearchGanttChartOpenSearch Dashboards-{{site.opensearch_major_minor_version}}.0.0.zip
```
This plugin adds a new Gantt chart visualization.
## List installed plugins
To check your installed plugins:
```bash
sudo bin/opensearch-dashboards-plugin list
```
## Remove plugins
To remove a plugin:
```bash
sudo bin/opensearch-dashboards-plugin remove <plugin-name>
```
For certain plugins, you must also remove the "optimze" bundle. This is a sample command for the Anomaly Detection plugin:
```bash
sudo rm /usr/share/opensearch-dashboards/optimize/bundles/opensearch-anomaly-detection-opensearch-dashboards.*
```
Then restart OpenSearch Dashboards. After you remove any plugin, OpenSearch Dashboards performs an optimize operation the next time you start it. This operation takes several minutes even on fast machines, so be patient.
## Update plugins
OpenSearch Dashboards doesnt update plugins. Instead, you have to remove the old version and its optimized bundle, reinstall them, and restart OpenSearch Dashboards:
1. Remove the old version:
```bash
sudo bin/opensearch-dashboards-plugin remove <plugin-name>
```
1. Remove the optimized bundle:
```bash
sudo rm /usr/share/opensearch-dashboards/optimize/bundles/<bundle-name>
```
1. Reinstall the new version:
```bash
sudo bin/opensearch-dashboards-plugin install <plugin-name>
```
1. Restart OpenSearch Dashboards.
For example, to remove and reinstall the anomaly detection plugin:
```bash
sudo bin/opensearch-plugin remove opensearch-anomaly-detection
sudo rm /usr/share/opensearch-dashboards/optimize/bundles/opensearch-anomaly-detection-opensearch-dashboards.*
sudo bin/opensearch-dashboards-plugin install <AD OpenSearch Dashboards plugin artifact URL>
```

View File

@ -0,0 +1,30 @@
---
layout: default
title: Tarball
parent: Install OpenSearch Dashboards
nav_order: 30
---
# Run OpenSearch Dashboards using the tarball
1. Download the tarball from the [OpenSearch downloads page](https://opensearch.org/downloads.html){:target='\_blank'}.
1. Extract the TAR file to a directory and change to that directory:
```bash
# x64
tar -zxf opensearch-dashboards-{{site.opensearch_version}}-linux-x64.tar.gz
cd opensearch-dashboards{% comment %}# ARM64
tar -zxf opensearch-dashboards-{{site.opensearch_version}}-linux-arm64.tar.gz
cd opensearch-dashboards{% endcomment %}
```
1. If desired, modify `config/opensearch_dashboards.yml`.
1. Run OpenSearch Dashboards:
```bash
./bin/opensearch-dashboards
```
1. See the [OpenSearch Dashboards documentation](../../opensearch-dashboards/).

View File

@ -0,0 +1,29 @@
---
layout: default
title: WMS map server
nav_order: 5
---
# Configure WMS map server
OpenSearch Dashboards includes default map tiles, but if you need more specialized maps, you can configure OpenSearch Dashboards to use a WMS map server:
1. Open OpenSearch Dashboards at `https://<host>:<port>`. For example, [https://localhost:5601](https://localhost:5601).
1. If necessary, log in.
1. Choose **Management** and **Advanced Settings**.
1. Locate `visualization:tileMap:WMSdefaults`.
1. Change `enabled` to true and add the URL of a valid WMS map server:
```json
{
"enabled": true,
"url": "<wms-map-server-url>",
"options": {
"format": "image/png",
"transparent": true
}
}
```
Map services often have licensing fees or restrictions. You're responsible for all such considerations on any map server that you specify.
{: .note }

View File

@ -0,0 +1,63 @@
---
layout: default
title: Notebooks (experimental)
nav_order: 50
redirect_from: /docs/notebooks/
---
# OpenSearch Dashboards notebooks (experimental)
Notebooks have a known issue with [tenants](../../security/access-control/multi-tenancy/). If you open a notebook and can't see its visualizations, you might be under the wrong tenant, or you might not have access to the tenant at all.
{: .warning }
An OpenSearch Dashboards notebook is an interface that lets you easily combine live visualizations and narrative text in a single notebook interface.
Notebooks let you interactively explore data by running different visualizations that you can share with team members to collaborate on a project.
A notebook is a document composed of two elements: OpenSearch Dashboards visualizations and paragraphs (Markdown). Choose multiple timelines to compare and contrast visualizations.
Common use cases include creating postmortem reports, designing runbooks, building live infrastructure reports, and writing documentation.
## Get Started with notebooks
To get started, choose **OpenSearch Dashboards Notebooks** within OpenSearch Dashboards.
### Step 1: Create a notebook
A notebook is an interface for creating reports.
1. Choose **Create notebook** and enter a descriptive name.
1. Choose **Create**.
Choose **Notebook actions** to rename, duplicate, or delete a notebook.
### Step 2: Add a paragraph
Paragraphs combine text and visualizations for describing data.
#### Add a markdown paragraph
1. To add text, choose **Add markdown paragraph**.
1. Add rich text with markdown syntax.
![Markdown paragraph](../../images/markdown-notebook.png)
#### Add a visualization paragraph
1. To add a visualization, choose **Add OpenSearch Dashboards visualization paragraph**.
1. In **Title**, select your visualization and choose a date range. You can choose multiple timelines to compare and contrast visualizations.
1. To run and save a paragraph, choose **Run**.
You can perform the following actions on paragraphs:
- Add a new paragraph to the top of a report.
- Add a new paragraph to the bottom of a report.
- Run all the paragraphs at the same time.
- Clear the outputs of all paragraphs.
- Delete all the paragraphs.
- Move paragraphs up and down.

View File

@ -0,0 +1,54 @@
---
layout: default
title: Reporting
nav_order: 20
---
# Reporting
You can use OpenSearch Dashboards to create PNG, PDF, and CSV reports. To create reports, you must have the correct permissions. For a summary of the predefined roles and the permissions they grant, see the [security plugin](../../security/access-control/users-roles/#predefined-roles).
## Create reports from Discovery, Visualize, or Dashboard
Quickly generate an on-demand report from the current view.
1. From the top menu bar, choose **Reporting**.
1. For dashboards or visualizations, choose **Download PDF** or **Download PNG**. From the Discover page, choose **Download CSV**.
Reports generate asynchronously in the background and might take a few minutes, depending on the size of the report. A notification appears when your report is ready to download.
1. To create a schedule-based report, choose **Create report definition**. Then proceed to [Create reports using a definition](#create-reports-using-a-definition). This option pre-fills many of the fields for you based on the visualization, dashboard, or data you were viewing.
## Create reports using a definition
Definitions let you generate reports on a periodic schedule.
1. From the navigation panel, choose **Reporting**.
1. Choose **Create**.
1. Under **Report settings**, enter a name and optional description for your report.
1. Choose the **Report Source** (i.e. the page from which the report is generated). You can generate reports from the **Dashboard**, **Visualize**, or **Discover** pages.
1. Select your dashboard, visualization, or saved search. Then choose a time range for the report.
1. Choose an appropriate file format for the report.
1. (Optional) Add a header or footer to the report. Headers and footers are only available for dashboard or visualization reports.
1. Under **Report trigger**, choose either **On-demand** or **Schedule**.
For scheduled reports, select either **Recurring** or **Cron based**. You can receive reports daily or at some other time interval. Cron expressions give you even more flexiblity. See [Cron expression reference](../../alerting/cron/) for more information.
1. Choose **Create**.
## Troubleshooting
### Chromium fails to launch with OpenSearch Dashboards
While creating a report for dashboards or visualizations, you might see a the following error:
![OpenSearch Dashboards reporting pop-up error message](../../images/reporting-error.png)
This problem can occur for two reasons:
- You don't have the correct version of `headless-chrome` to match the operating system on which OpenSearch Dashboards is running. Download the correct version [here](https://github.com/opensearch-project/dashboards-reports/releases/tag/chromium-1.12.0.0).
- You're missing additional dependencies. Install the required dependencies for your operating system from the [additional libraries](https://github.com/opensearch-project/dashboards-reports/blob/main/dashboards-reports/rendering-engine/headless-chrome/README.md#additional-libaries) section.

View File

@ -0,0 +1,160 @@
---
layout: default
title: Aggregations
nav_order: 13
has_children: true
---
# Aggregations
OpenSearch isnt just for search. Aggregations let you tap into OpenSearch's powerful analytics engine to analyze your data and extract statistics from it.
The use cases of aggregations vary from analyzing data in real time to take some action to using OpenSearch Dashboards to create a visualization dashboard.
OpenSearch can perform aggregations on massive datasets in milliseconds. Compared to queries, aggregations consume more CPU cycles and memory.
## Aggregations on text fields
By default, OpenSearch doesn't support aggregations on a text field.
Because text fields are tokenized, an aggregation on a text field has to reverse the tokenization process back to its original string and then formulate an aggregation based on that. Such an operation consumes significant memory and degrades cluster performance.
While you can enable aggregations on text fields by setting the `fielddata` parameter to `true` in the mapping, the aggregations are still based on the tokenized words and not on the raw text.
We recommend keeping a raw version of the text field as a `keyword` field that you can aggregate on.
In this case, you can perform aggregations on the `title.raw` field, instead of the `title` field:
```json
PUT movies
{
"mappings": {
"properties": {
"title": {
"type": "text",
"fielddata": true,
"fields": {
"raw": {
"type": "keyword"
}
}
}
}
}
}
```
## General aggregation structure
The structure of an aggregation query is as follows:
```json
GET _search
{
"size": 0,
"aggs": {
"NAME": {
"AGG_TYPE": {}
}
}
}
```
If youre only interested in the aggregation result and not in the results of the query, set `size` to 0.
In the `aggs` property (you can use `aggregations` if you want), you can define any number of aggregations.
Each aggregation is defined by its name and one of the types of aggregations that OpenSearch supports.
The name of the aggregation helps you to distinguish between different aggregations in the response.
The `AGG_TYPE` property is where you specify the type of aggregation.
## Sample aggregation
This section uses the OpenSearch Dashboards sample e-commerce data and web log data. To add the sample data, log in to OpenSearch Dashboards, choose **Home** and **Try our sample data**. For **Sample eCommerce orders** and **Sample web logs**, choose **Add data**.
### avg
To find the average value of the `taxful_total_price` field:
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"avg_taxful_total_price": {
"avg": {
"field": "taxful_total_price"
}
}
}
}
```
#### Sample response
```json
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4675,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"avg_taxful_total_price" : {
"value" : 75.05542864304813
}
}
}
```
The aggregation block in the response shows the average value for the `taxful_total_price` field.
## Types of aggregations
There are three main types of aggregations:
- Metric aggregations - Calculate metrics such as `sum`, `min`, `max`, and `avg` on numeric fields.
- Bucket aggregations - Sort query results into groups based on some criteria.
- Pipeline aggregations - Pipe the output of one aggregation as an input to another.
## Nested aggregations
Aggregations within aggregations are called nested or sub aggregations.
Metric aggregations produce simple results and can't contain nested aggregations.
Bucket aggregations produce buckets of documents that you can nest in other aggregations. You can perform complex analysis on your data by nesting metric and bucket aggregations within bucket aggregations.
### General nested aggregation syntax
```json
{
"aggs": {
"name": {
"type": {
"data"
},
"aggs": {
"nested": {
"type": {
"data"
}
}
}
}
}
}
```
The inner `aggs` keyword begins a new nested aggregation. The syntax of the parent aggregation and the nested aggregation is the same. Nested aggregations run in the context of the preceding parent aggregations.
You can also pair your aggregations with search queries to narrow down things youre trying to analyze before aggregating. If you don't add a query, OpenSearch implicitly uses the `match_all` query.

File diff suppressed because it is too large Load Diff

256
_opensearch_docs/catapis.md Normal file
View File

@ -0,0 +1,256 @@
---
layout: default
title: CAT API
nav_order: 20
---
# cat API
You can get essential statistics about your cluster in an easy-to-understand, tabular format using the compact and aligned text (CAT) API. The cat API is a human-readable interface that returns plain text instead of traditional JSON.
Using the cat API, you can answer questions like which node is the elected master, what state is the cluster in, how many documents are in each index, and so on.
To see the available operations in the cat API, use the following command:
```
GET _cat
```
You can also use the following string parameters with your query.
Parameter | Description
:--- | :--- |
`?v` | Makes the output more verbose by adding headers to the columns. It also adds some formatting to help align each of the columns together. All examples on this page include the `v` parameter.
`?help` | Lists the default and other available headers for a given operation.
`?h` | Limits the output to specific headers.
`?format` | Outputs the result in JSON, YAML, or CBOR formats.
`?sort` | Sorts the output by the specified columns.
To see what each column represents, use the `?v` parameter:
```
GET _cat/<operation_name>?v
```
To see all the available headers, use the `?help` parameter:
```
GET _cat/<operation_name>?help
```
To limit the output to a subset of headers, use the `?h` parameter:
```
GET _cat/<operation_name>?h=<header_name_1>,<header_name_2>&v
```
Typically, for any operation you can find out what headers are available using the `?help` parameter, and then use the `?h` parameter to limit the output to only the headers that you care about.
---
#### Table of contents
1. TOC
{:toc}
---
## Aliases
Lists the mapping of aliases to indices, plus routing and filtering information.
```
GET _cat/aliases?v
```
To limit the information to a specific alias, add the alias name after your query.
```
GET _cat/aliases/<alias>?v
```
## Allocation
Lists the allocation of disk space for indices and the number of shards on each node.
Default request:
```
GET _cat/allocation?v
```
## Count
Lists the number of documents in your cluster.
```
GET _cat/count?v
```
To see the number of documents in a specific index, add the index name after your query.
```
GET _cat/count/<index>?v
```
## Field data
Lists the memory size used by each field per node.
```
GET _cat/fielddata?v
```
To limit the information to a specific field, add the field name after your query.
```
GET _cat/fielddata/<fields>?v
```
## Health
Lists the status of the cluster, how long the cluster has been up, the number of nodes, and other useful information that helps you analyze the health of your cluster.
```
GET _cat/health?v
```
## Indices
Lists information related to indices—how much disk space they are using, how many shards they have, their health status, and so on.
```
GET _cat/indices?v
```
To limit the information to a specific index, add the index name after your query.
```
GET _cat/indices/<index>?v
```
## Master
Lists information that helps identify the elected master node.
```
GET _cat/master?v
```
## Node attributes
Lists the attributes of custom nodes.
```
GET _cat/nodeattrs?v
```
## Nodes
Lists node-level information, including node roles and load metrics.
A few important node metrics are `pid`, `name`, `master`, `ip`, `port`, `version`, `build`, `jdk`, along with `disk`, `heap`, `ram`, and `file_desc`.
```
GET _cat/nodes?v
```
## Pending tasks
Lists the progress of all pending tasks, including task priority and time in queue.
```
GET _cat/pending_tasks?v
```
## Plugins
Lists the names, components, and versions of the installed plugins.
```
GET _cat/plugins?v
```
## Recovery
Lists all completed and ongoing index and shard recoveries.
```
GET _cat/recovery?v
```
To see only the recoveries of a specific index, add the index name after your query.
```
GET _cat/recovery/<index>?v
```
## Repositories
Lists all snapshot repositories and their types.
```
GET _cat/repositories?v
```
## Segments
Lists Lucene segment-level information for each index.
```
GET _cat/segments?v
```
To see only the information about segments of a specific index, add the index name after your query.
```
GET _cat/segments/<index>?v
```
## Shards
Lists the state of all primary and replica shards and how they are distributed.
```
GET _cat/shards?v
```
To see only the information about shards of a specific index, add the index name after your query.
```
GET _cat/shards/<index>?v
```
## Snapshots
Lists all snapshots for a repository.
```
GET _cat/snapshots/<repository>?v
```
## Tasks
Lists the progress of all tasks currently running on your cluster.
```
GET _cat/tasks?v
```
## Templates
Lists the names, patterns, order numbers, and version numbers of index templates.
```
GET _cat/templates?v
```
## Thread pool
Lists the active, queued, and rejected threads of different thread pools on each node.
```
GET _cat/thread_pool?v
```
To limit the information to a specific thread pool, add the thread pool name after your query.
```
GET _cat/thread_pool/<thread_pool>?v
```

338
_opensearch_docs/cluster.md Normal file
View File

@ -0,0 +1,338 @@
---
layout: default
title: Cluster formation
nav_order: 7
---
# Cluster formation
Before diving into OpenSearch and searching and aggregating data, you first need to create an OpenSearch cluster.
OpenSearch can operate as a single-node or multi-node cluster. The steps to configure both are, in general, quite similar. This page demonstrates how to create and configure a multi-node cluster, but with only a few minor adjustments, you can follow the same steps to create a single-node cluster.
To create and deploy an OpenSearch cluster according to your requirements, its important to understand how node discovery and cluster formation work and what settings govern them.
There are many ways to design a cluster. The following illustration shows a basic architecture:
![multi-node cluster architecture diagram](../../images/cluster.png)
This is a four-node cluster that has one dedicated master node, one dedicated coordinating node, and two data nodes that are master-eligible and also used for ingesting data.
The following table provides brief descriptions of the node types:
Node type | Description | Best practices for production
:--- | :--- | :-- |
`Master` | Manages the overall operation of a cluster and keeps track of the cluster state. This includes creating and deleting indices, keeping track of the nodes that join and leave the cluster, checking the health of each node in the cluster (by running ping requests), and allocating shards to nodes. | Three dedicated master nodes in three different zones is the right approach for almost all production use cases. This configuration ensures your cluster never loses quorum. Two nodes will be idle for most of the time except when one node goes down or needs some maintenance.
`Master-eligible` | Elects one node among them as the master node through a voting process. | For production clusters, make sure you have dedicated master nodes. The way to achieve a dedicated node type is to mark all other node types as false. In this case, you have to mark all the other nodes as not master-eligible.
`Data` | Stores and searches data. Performs all data-related operations (indexing, searching, aggregating) on local shards. These are the worker nodes of your cluster and need more disk space than any other node type. | As you add data nodes, keep them balanced between zones. For example, if you have three zones, add data nodes in multiples of three, one for each zone. We recommend using storage and RAM-heavy nodes.
`Ingest` | Preprocesses data before storing it in the cluster. Runs an ingest pipeline that transforms your data before adding it to an index. | If you plan to ingest a lot of data and run complex ingest pipelines, we recommend you use dedicated ingest nodes. You can also optionally offload your indexing from the data nodes so that your data nodes are used exclusively for searching and aggregating.
`Coordinating` | Delegates client requests to the shards on the data nodes, collects and aggregates the results into one final result, and sends this result back to the client. | A couple of dedicated coordinating-only nodes is appropriate to prevent bottlenecks for search-heavy workloads. We recommend using CPUs with as many cores as you can.
By default, each node is a master-eligible, data, ingest, and coordinating node. Deciding on the number of nodes, assigning node types, and choosing the hardware for each node type depends on your use case. You must take into account factors like the amount of time you want to hold on to your data, the average size of your documents, your typical workload (indexing, searches, aggregations), your expected price-performance ratio, your risk tolerance, and so on.
After you assess all these requirements, we recommend you use a benchmark testing tool like Rally to provision a small sample cluster and run tests with varying workloads and configurations. Compare and analyze the system and query metrics for these tests to design an optimum architecture. To get started with Rally, see the [Rally documentation](https://esrally.readthedocs.io/en/stable/).
This page demonstrates how to work with the different node types. It assumes that you have a four-node cluster similar to the preceding illustration.
## Prerequisites
Before you get started, you must install and configure OpenSearch on all of your nodes. For information about the available options, see [Install and configure OpenSearch](../../install/).
After you're done, use SSH to connect to each node, then open the `config/opensearch.yml` file. You can set all configurations for your cluster in this file.
## Step 1: Name a cluster
Specify a unique name for the cluster. If you don't specify a cluster name, it's set to `opensearch` by default. Setting a descriptive cluster name is important, especially if you want to run multiple clusters inside a single network.
To specify the cluster name, change the following line:
```yml
#cluster.name: my-application
```
to
```yml
cluster.name: opensearch-cluster
```
Make the same change on all the nodes to make sure that they'll join to form a cluster.
## Step 2: Set node attributes for each node in a cluster
After you name the cluster, set node attributes for each node in your cluster.
#### Master node
Give your master node a name. If you don't specify a name, OpenSearch assigns a machine-generated name that makes the node difficult to monitor and troubleshoot.
```yml
node.name: opensearch-master
```
You can also explicitly specify that this node is a master node. This is already true by default, but adding it makes it easier to identify the master node:
```yml
node.master: true
```
Then make the node a dedicated master that wont perform double-duty as a data node:
```yml
node.data: false
```
Specify that this node will not be used for ingesting data:
```yml
node.ingest: false
```
#### Data nodes
Change the name of two nodes to `opensearch-d1` and `opensearch-d2`, respectively:
```yml
node.name: opensearch-d1
```
```yml
node.name: opensearch-d2
```
You can make them master-eligible data nodes that will also be used for ingesting data:
```yml
node.master: true
node.data: true
node.ingest: true
```
You can also specify any other attributes that you'd like to set for the data nodes.
#### Coordinating node
Change the name of the coordinating node to `opensearch-c1`:
```yml
node.name: opensearch-c1
```
Every node is a coordinating node by default, so to make this node a dedicated coordinating node, set `node.master`, `node.data`, and `node.ingest` to `false`:
```yml
node.master: false
node.data: false
node.ingest: false
```
## Step 3: Bind a cluster to specific IP addresses
`network_host` defines the IP address used to bind the node. By default, OpenSearch listens on a local host, which limits the cluster to a single node. You can also use `_local_` and `_site_` to bind to any loopback or site-local address, whether IPv4 or IPv6:
```yml
network.host: [_local_, _site_]
```
To form a multi-node cluster, specify the IP address of the node:
```yml
network.host: <IP address of the node>
```
Make sure to configure these settings on all of your nodes.
## Step 4: Configure discovery hosts for a cluster
Now that you've configured the network hosts, you need to configure the discovery hosts.
Zen Discovery is the built-in, default mechanism that uses [unicast](https://en.wikipedia.org/wiki/Unicast) to find other nodes in the cluster.
You can generally just add all your master-eligible nodes to the `discovery.seed_hosts` array. When a node starts up, it finds the other master-eligible nodes, determines which one is the master, and asks to join the cluster.
For example, for `opensearch-master` the line looks something like this:
```yml
discovery.seed_hosts: ["<private IP of opensearch-d1>", "<private IP of opensearch-d2>", "<private IP of opensearch-c1>"]
```
## Step 5: Start the cluster
After you set the configurations, start OpenSearch on all nodes:
```bash
sudo systemctl start opensearch.service
```
Then go to the logs file to see the formation of the cluster:
```bash
less /var/log/opensearch/opensearch-cluster.log
```
Perform the following `_cat` query on any node to see all the nodes formed as a cluster:
```bash
curl -XGET https://<private-ip>:9200/_cat/nodes?v -u 'admin:admin' --insecure
```
```
ip heap.percent ram.percent cpu load_1m load_5m load_15m node.role master name
x.x.x.x 13 61 0 0.02 0.04 0.05 mi * opensearch-master
x.x.x.x 16 60 0 0.06 0.05 0.05 md - opensearch-d1
x.x.x.x 34 38 0 0.12 0.07 0.06 md - opensearch-d2
x.x.x.x 23 38 0 0.12 0.07 0.06 md - opensearch-c1
```
To better understand and monitor your cluster, use the [cat API](../catapis/).
## (Advanced) Step 6: Configure shard allocation awareness or forced awareness
If your nodes are spread across several geographical zones, you can configure shard allocation awareness to allocate all replica shards to a zone thats different from their primary shard.
With shard allocation awareness, if the nodes in one of your zones fail, you can be assured that your replica shards are spread across your other zones. It adds a layer of fault tolerance to ensure your data survives a zone failure beyond just individual node failures.
To configure shard allocation awareness, add zone attributes to `opensearch-d1` and `opensearch-d2`, respectively:
```yml
node.attr.zone: zoneA
```
```yml
node.attr.zone: zoneB
```
Update the cluster settings:
```json
PUT _cluster/settings
{
"persistent": {
"cluster.routing.allocation.awareness.attributes": "zone"
}
}
```
You can either use `persistent` or `transient` settings. We recommend the `persistent` setting because it persists through a cluster reboot. Transient settings don't persist through a cluster reboot.
Shard allocation awareness attempts to separate primary and replica shards across multiple zones. However, if only one zone is available (such as after a zone failure), OpenSearch allocates replica shards to the only remaining zone.
Another option is to require that primary and replica shards are never allocated to the same zone. This is called forced awareness.
To configure forced awareness, specify all the possible values for your zone attributes:
```json
PUT _cluster/settings
{
"persistent": {
"cluster.routing.allocation.awareness.attributes": "zone",
"cluster.routing.allocation.awareness.force.zone.values":["zoneA", "zoneB"]
}
}
```
Now, if a data node fails, forced awareness doesn't allocate the replicas to a node in the same zone. Instead, the cluster enters a yellow state and only allocates the replicas when nodes in another zone come online.
In our two-zone architecture, we can use allocation awareness if `opensearch-d1` and `opensearch-d2` are less than 50% utilized, so that each of them have the storage capacity to allocate replicas in the same zone.
If that is not the case, and `opensearch-d1` and `opensearch-d2` do not have the capacity to contain all primary and replica shards, we can use forced awareness. This approach helps to make sure that, in the event of a failure, OpenSearch doesn't overload your last remaining zone and lock up your cluster due to lack of storage.
Choosing allocation awareness or forced awareness depends on how much space you might need in each zone to balance your primary and replica shards.
## (Advanced) Step 7: Set up a hot-warm architecture
You can design a hot-warm architecture where you first index your data to hot nodes---fast and expensive---and after a certain period of time move them to warm nodes---slow and cheap.
If you analyze time series data that you rarely update and want the older data to go onto cheaper storage, this architecture can be a good fit.
This architecture helps save money on storage costs. Rather than increasing the number of hot nodes and using fast, expensive storage, you can add warm nodes for data that you don't access as frequently.
To configure a hot-warm storage architecture, add `temp` attributes to `opensearch-d1` and `opensearch-d2`, respectively:
```yml
node.attr.temp: hot
```
```yml
node.attr.temp: warm
```
You can set the attribute name and value to whatever you want as long as its consistent for all your hot and warm nodes.
To add an index `newindex` to the hot node:
```json
PUT newindex
{
"settings": {
"index.routing.allocation.require.temp": "hot"
}
}
```
Take a look at the following shard allocation for `newindex`:
```json
GET _cat/shards/newindex?v
index shard prirep state docs store ip node
new_index 2 p STARTED 0 230b 10.0.0.225 opensearch-d1
new_index 2 r UNASSIGNED
new_index 3 p STARTED 0 230b 10.0.0.225 opensearch-d1
new_index 3 r UNASSIGNED
new_index 4 p STARTED 0 230b 10.0.0.225 opensearch-d1
new_index 4 r UNASSIGNED
new_index 1 p STARTED 0 230b 10.0.0.225 opensearch-d1
new_index 1 r UNASSIGNED
new_index 0 p STARTED 0 230b 10.0.0.225 opensearch-d1
new_index 0 r UNASSIGNED
```
In this example, all primary shards are allocated to `opensearch-d1`, which is our hot node. All replica shards are unassigned, because we're forcing this index to allocate only to hot nodes.
To add an index `oldindex` to the warm node:
```json
PUT oldindex
{
"settings": {
"index.routing.allocation.require.temp": "warm"
}
}
```
The shard allocation for `oldindex`:
```json
GET _cat/shards/oldindex?v
index shard prirep state docs store ip node
old_index 2 p STARTED 0 230b 10.0.0.74 opensearch-d2
old_index 2 r UNASSIGNED
old_index 3 p STARTED 0 230b 10.0.0.74 opensearch-d2
old_index 3 r UNASSIGNED
old_index 4 p STARTED 0 230b 10.0.0.74 opensearch-d2
old_index 4 r UNASSIGNED
old_index 1 p STARTED 0 230b 10.0.0.74 opensearch-d2
old_index 1 r UNASSIGNED
old_index 0 p STARTED 0 230b 10.0.0.74 opensearch-d2
old_index 0 r UNASSIGNED
```
In this case, all primary shards are allocated to `opensearch-d2`. Again, all replica shards are unassigned because we only have one warm node.
A popular approach is to configure your [index templates](../index-templates/) to set the `index.routing.allocation.require.temp` value to `hot`. This way, OpenSearch stores your most recent data on your hot nodes.
You can then use the [Index State Management (ISM)](../../ism/index/) plugin to periodically check the age of an index and specify actions to take on it. For example, when the index reaches a specific age, change the `index.routing.allocation.require.temp` setting to `warm` to automatically move your data from hot nodes to warm nodes.
## Next steps
If you are using the security plugin, the previous request to `_cat/nodes?v` might have failed with an initialization error. To initialize the plugin, run `opensearch/plugins/opensearch-security/tools/securityadmin.sh`. A sample command that uses the demo certificates might look like this:
```bash
sudo ./securityadmin.sh -cd ../securityconfig/ -icl -nhnv -cacert /etc/opensearch/root-ca.pem -cert /etc/opensearch/kirk.pem -key /etc/opensearch/kirk-key.pem -h <private-ip>
```
For full guidance around configuration options, see [Security configuration](../../security/configuration).

View File

@ -0,0 +1,17 @@
---
layout: default
title: Common REST Parameters
nav_order: 93
---
# Common REST parameters
OpenSearch supports the following parameters for all REST operations:
Option | Description | Example
:--- | :--- | :---
Human-readable output | To convert output units to human-readable values (for example, `1h` for 1 hour and `1kb` for 1,024 bytes), add `?human=true` to the request URL. | `GET <index_name>/_search?human=true`
Pretty result | To get back JSON responses in a readable format, add `?pretty=true` to the request URL. | `GET <index_name>/_search?pretty=true`
Content type | To specify the type of content in the request body, use the `Content-Type` key name in the request header. Most operations support JSON, YAML, and CBOR formats. | `POST _scripts/<template_name> -H 'Content-Type: application/json`
Request body in query string | If the client library does not accept a request body for non-POST requests, use the `source` query string parameter to pass the request body. Also, specify the `source_content_type` parameter with a supported media type such as `application/json`. | `GET _search?source_content_type=application/json&source={"query":{"match_all":{}}}`
Stack traces | To include the error stack trace in the response when an exception is raised, add `error_trace=true` to the request URL. | `GET <index_name>/_search?error_trace=true`

View File

@ -0,0 +1,68 @@
---
layout: default
title: Configuration
nav_order: 5
---
# OpenSearch configuration
Most OpenSearch configuration can take place in the cluster settings API. Certain operations require you to modify `opensearch.yml` and restart the cluster.
Whenever possible, use the cluster settings API instead; `opensearch.yml` is local to each node, whereas the API applies the setting to all nodes in the cluster.
## Cluster settings API
The first step in changing a setting is to view the current settings:
```
GET _cluster/settings?include_defaults=true
```
For a more concise summary of non-default settings:
```
GET _cluster/settings
```
Three categories of setting exist in the cluster settings API: persistent, transient, and default. Persistent settings, well, persist after a cluster restart. After a restart, OpenSearch clears transient settings.
If you specify the same setting in multiple places, OpenSearch uses the following precedence:
1. Transient settings
2. Persistent settings
3. Settings from `opensearch.yml`
4. Default settings
To change a setting, just specify the new one as either persistent or transient. This example shows the flat settings form:
```json
PUT /_cluster/settings
{
"persistent" : {
"action.auto_create_index" : false
}
}
```
You can also use the expanded form, which lets you copy and paste from the GET response and change existing values:
```json
PUT /_cluster/settings
{
"persistent": {
"action": {
"auto_create_index": false
}
}
}
```
---
## Configuration file
You can find `opensearch.yml` in `/usr/share/opensearch/config/opensearch.yml` (Docker) or `/etc/opensearch/opensearch.yml` (RPM and DEB) on each node.
The demo configuration includes a number of settings for the security plugin that you should modify before using OpenSearch for a production workload. To learn more, see [Security](../../security/).

View File

@ -0,0 +1,202 @@
---
layout: default
title: Index aliases
nav_order: 12
---
# Index aliases
An alias is a virtual index name that can point to one or more indices.
If your data is spread across multiple indices, rather than keeping track of which indices to query, you can create an alias and query it instead.
For example, if youre storing logs into indices based on the month and you frequently query the logs for the previous two months, you can create a `last_2_months` alias and update the indices it points to each month.
Because you can change the indices an alias points to at any time, referring to indices using aliases in your applications allows you to reindex your data without any downtime.
---
#### Table of contents
1. TOC
{:toc}
---
## Create aliases
To create an alias, use a POST request:
```json
POST _aliases
```
Use the `actions` method to specify the list of actions that you want to perform. This command creates an alias named `alias1` and adds `index-1` to this alias:
```json
POST _aliases
{
"actions": [
{
"add": {
"index": "index-1",
"alias": "alias1"
}
}
]
}
```
You should see the following response:
```json
{
"acknowledged": true
}
```
If this request fails, make sure the index that you're adding to the alias already exists.
To check if `alias1` refers to `index-1`, run the following command:
```json
GET alias1
```
## Add or remove indices
You can perform multiple actions in the same `_aliases` operation.
For example, the following command removes `index-1` and adds `index-2` to `alias1`:
```json
POST _aliases
{
"actions": [
{
"remove": {
"index": "index-1",
"alias": "alias1"
}
},
{
"add": {
"index": "index-2",
"alias": "alias1"
}
}
]
}
```
The `add` and `remove` actions occur atomically, which means that at no point will `alias1` point to both `index-1` and `index-2`.
You can also add indices based on an index pattern:
```json
POST _aliases
{
"actions": [
{
"add": {
"index": "index*",
"alias": "alias1"
}
}
]
}
```
## Manage aliases
To list the mapping of aliases to indices, run the following command:
```json
GET _cat/aliases?v
```
#### Sample response
```json
alias index filter routing.index routing.search
alias1 index-1 * - -
```
To check which indices an alias points to, run the following command:
```json
GET _alias/alias1
```
#### Sample response
```json
{
"index-2": {
"aliases": {
"alias1": {}
}
}
}
```
Conversely, to find which alias points to a specific index, run the following command:
```json
GET /index-2/_alias/*
```
To check if an alias exists, run the following command:
```json
HEAD /alias1/_alias/
```
## Add aliases at index creation
You can add an index to an alias as you create the index:
```json
PUT index-1
{
"aliases": {
"alias1": {}
}
}
```
## Create filtered aliases
You can create a filtered alias to access a subset of documents or fields from the underlying indices.
This command adds only a specific timestamp field to `alias1`:
```json
POST _aliases
{
"actions": [
{
"add": {
"index": "index-1",
"alias": "alias1",
"filter": {
"term": {
"timestamp": "1574641891142"
}
}
}
}
]
}
```
## Index alias options
You can specify the options shown in the following table.
Option | Valid values | Description | Required
:--- | :--- | :---
`index` | String | The name of the index that the alias points to. | Yes
`alias` | String | The name of the alias. | No
`filter` | Object | Add a filter to the alias. | No
`routing` | String | Limit search to an associated shard value. You can specify `search_routing` and `index_routing` independently. | No
`is_write_index` | String | Specify the index that accepts any write operations to the alias. If this value is not specified, then no write operations are allowed. | No

View File

@ -0,0 +1,271 @@
---
layout: default
title: Index data
nav_order: 10
---
# Index data
You index data using the OpenSearch REST API. Two APIs exist: the index API and the `_bulk` API.
For situations in which new data arrives incrementally (for example, customer orders from a small business), you might use the index API to add documents individually as they arrive. For situations in which the flow of data is less frequent (for example, weekly updates to a marketing website), you might prefer to generate a file and send it to the `_bulk` API. For large numbers of documents, lumping requests together and using the `_bulk` API offers superior performance. If your documents are enormous, however, you might need to index them individually.
## Introduction to indexing
Before you can search data, you must *index* it. Indexing is the method by which search engines organize data for fast retrieval. The resulting structure is called, fittingly, an index.
In OpenSearch, the basic unit of data is a JSON *document*. Within an index, OpenSearch identifies each document using a unique ID.
A request to the index API looks like this:
```json
PUT <index>/_doc/<id>
{ "A JSON": "document" }
```
A request to the `_bulk` API looks a little different, because you specify the index and ID in the bulk data:
```json
POST _bulk
{ "index": { "_index": "<index>", "_id": "<id>" } }
{ "A JSON": "document" }
```
Bulk data must conform to a specific format, which requires a newline character (`\n`) at the end of every line, including the last line. This is the basic format:
```
Action and metadata\n
Optional document\n
Action and metadata\n
Optional document\n
```
The document is optional, because `delete` actions don't require a document. The other actions (`index`, `create`, and `update`) all require a document. If you specifically want the action to fail if the document already exists, use the `create` action instead of the `index` action.
{: .note }
To index bulk data using the `curl` command, navigate to the folder where you have your file saved and run the following command:
```json
curl -H "Content-Type: application/x-ndjson" -POST https://localhost:9200/data/_bulk -u 'admin:admin' --insecure --data-binary "@data.json"
```
If any one of the actions in the `_bulk` API fail, OpenSearch continues to execute the other actions. Examine the `items` array in the response to figure out what went wrong. The entries in the `items` array are in the same order as the actions specified in the request.
OpenSearch automatically creates an index when you add a document to an index that doesn't already exist. It also automatically generates an ID if you don't specify an ID in the request. This simple example automatically creates the movies index, indexes the document, and assigns it a unique ID:
```json
POST movies/_doc
{ "title": "Spirited Away" }
```
Automatic ID generation has a clear downside: because the indexing request didn't specify a document ID, you can't easily update the document at a later time. Also, if you run this request 10 times, OpenSearch indexes this document as 10 different documents with unique IDs. To specify an ID of 1, use the following request (note the use of PUT instead of POST):
```json
PUT movies/_doc/1
{ "title": "Spirited Away" }
```
Because you must specify an ID, if you run this command 10 times, you still have just one document indexed with the `_version` field incremented to 10.
Indices default to one primary shard and one replica. If you want to specify non-default settings, create the index before adding documents:
```json
PUT more-movies
{ "settings": { "number_of_shards": 6, "number_of_replicas": 2 } }
```
## Naming restrictions for indices
OpenSearch indices have the following naming restrictions:
- All letters must be lowercase.
- Index names can't begin with underscores (`_`) or hyphens (`-`).
- Index names can't contain spaces, commas, or the following characters:
`:`, `"`, `*`, `+`, `/`, `\`, `|`, `?`, `#`, `>`, or `<`
## Read data
After you index a document, you can retrieve it by sending a GET request to the same endpoint that you used for indexing:
```json
GET movies/_doc/1
{
"_index" : "movies",
"_type" : "_doc",
"_id" : "1",
"_version" : 1,
"_seq_no" : 0,
"_primary_term" : 1,
"found" : true,
"_source" : {
"title" : "Spirited Away"
}
}
```
You can see the document in the `_source` object. If the document is not found, the `found` key is `false` and the `_source` object is not part of the response.
To retrieve multiple documents with a single command, use the `_mget` operation.
The format for retrieving multiple documents is similar to the `_bulk` operation, where you must specify the index and ID in the request body:
```json
GET _mget
{
"docs": [
{
"_index": "<index>",
"_id": "<id>"
},
{
"_index": "<index>",
"_id": "<id>"
}
]
}
```
To only return specific fields in a document:
```json
GET _mget
{
"docs": [
{
"_index": "<index>",
"_id": "<id>",
"_source": "field1"
},
{
"_index": "<index>",
"_id": "<id>",
"_source": "field2"
}
]
}
```
To check if a document exists:
```json
HEAD movies/_doc/<doc-id>
```
If the document exists, you get back a `200 OK` response, and if it doesn't, you get back a `404 - Not Found` error.
## Update data
To update existing fields or to add new fields, send a POST request to the `_update` operation with your changes in a `doc` object:
```json
POST movies/_update/1
{
"doc": {
"title": "Castle in the Sky",
"genre": ["Animation", "Fantasy"]
}
}
```
Note the updated `title` field and new `genre` field:
```json
GET movies/_doc/1
{
"_index" : "movies",
"_type" : "_doc",
"_id" : "1",
"_version" : 2,
"_seq_no" : 1,
"_primary_term" : 1,
"found" : true,
"_source" : {
"title" : "Castle in the Sky",
"genre" : [
"Animation",
"Fantasy"
]
}
}
```
The document also has an incremented `_version` field. Use this field to keep track of how many times a document is updated.
POST requests make partial updates to documents. To altogether replace a document, use a PUT request:
```json
PUT movies/_doc/1
{
"title": "Spirited Away"
}
```
The document with ID of 1 will contain only the `title` field, because the entire document will be replaced with the document indexed in this PUT request.
Use the `upsert` object to conditionally update documents based on whether they already exist. Here, if the document exists, its `title` field changes to `Castle in the Sky`. If it doesn't, OpenSearch indexes the document in the `upsert` object.
```json
POST movies/_update/2
{
"doc": {
"title": "Castle in the Sky"
},
"upsert": {
"title": "Only Yesterday",
"genre": ["Animation", "Fantasy"],
"date": 1993
}
}
```
#### Sample response
```json
{
"_index" : "movies",
"_type" : "_doc",
"_id" : "2",
"_version" : 2,
"result" : "updated",
"_shards" : {
"total" : 2,
"successful" : 1,
"failed" : 0
},
"_seq_no" : 3,
"_primary_term" : 1
}
```
Each update operation for a document has a unique combination of the `_seq_no` and `_primary_term` values.
OpenSearch first writes your updates to the primary shard and then sends this change to all the replica shards. An uncommon issue can occur if multiple users of your OpenSearch-based application make updates to existing documents in the same index. In this situation, another user can read and update a document from a replica before it receives your update from the primary shard. Your update operation then ends up updating an older version of the document. In the best case, you and the other user make the same changes, and the document remains accurate. In the worst case, the document now contains out-of-date information.
To prevent this situation, use the `_seq_no` and `_primary_term` values in the request header:
```json
POST movies/_update/2?if_seq_no=3&if_primary_term=1
{
"doc": {
"title": "Castle in the Sky",
"genre": ["Animation", "Fantasy"]
}
}
```
If the document is updated after we retrieved it, the `_seq_no` and `_primary_term` values are different and our update operation fails with a `409 — Conflict` error.
When using the `_bulk` API, specify the `_seq_no` and `_primary_term` values within the action metadata.
## Delete data
To delete a document from an index, use a DELETE request:
```json
DELETE movies/_doc/1
```
The DELETE operation increments the `_version` field. If you add the document back to the same ID, the `_version` field increments again. This behavior occurs because OpenSearch deletes the document `_source`, but retains its metadata.

View File

@ -0,0 +1,202 @@
---
layout: default
title: Index templates
nav_order: 14
---
# Index templates
Index templates let you initialize new indices with predefined mappings and settings. For example, if you continuously index log data, you can define an index template so that all of these indices have the same number of shards and replicas.
---
#### Table of contents
1. TOC
{:toc}
---
## Create a template
To create an index template, use a POST request:
```json
POST _index_template
```
This command creates a template named `daily_logs` and applies it to any new index whose name matches the regular expression `logs-2020-01-*` and also adds it to the `my_logs` alias:
```json
PUT _index_template/daily_logs
{
"index_patterns": [
"logs-2020-01-*"
],
"template": {
"aliases": {
"my_logs": {}
},
"settings": {
"number_of_shards": 2,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
},
"value": {
"type": "double"
}
}
}
}
}
```
You should see the following response:
```json
{
"acknowledged": true
}
```
If you create an index named `logs-2020-01-01`, you can see that it has the mappings and settings from the template:
```json
PUT logs-2020-01-01
GET logs-2020-01-01
```
```json
{
"logs-2020-01-01": {
"aliases": {
"my_logs": {}
},
"mappings": {
"properties": {
"timestamp": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
},
"value": {
"type": "double"
}
}
},
"settings": {
"index": {
"creation_date": "1578107970779",
"number_of_shards": "2",
"number_of_replicas": "1",
"uuid": "U1vMDMOHSAuS2IzPcPHpOA",
"version": {
"created": "7010199"
},
"provided_name": "logs-2020-01-01"
}
}
}
}
```
Any additional indices that match this pattern---`logs-2020-01-02`, `logs-2020-01-03`, and so on---will inherit the same mappings and settings.
## Retrieve a template
To list all index templates:
```json
GET _cat/templates
```
To find a template by its name:
```json
GET _index_template/daily_logs
```
To get a list of all your templates:
```json
GET _index_template/daily_logs
```
To get a list of all templates that match a pattern:
```json
GET _index_template/daily*
```
To check if a specific template exists:
```json
HEAD _index_template/<name>
```
## Configure multiple templates
You can create multiple index templates for your indices. If the index name matches more than one template, OpenSearch merges all mappings and settings from all matching templates and applies them to the index.
The settings from the more recently created index templates override the settings of older index templates. So, you can first define a few common settings in a generic template that can act as a catch-all and then add more specialized settings as required.
An even better approach is to explicitly specify template priority using the `order` parameter. OpenSearch applies templates with lower priority numbers first and then overrides them with templates with higher priority numbers.
For example, say you have the following two templates that both match the `logs-2020-01-02` index and theres a conflict in the `number_of_shards` field:
#### Template 1
```json
PUT _index_template/template-01
{
"index_patterns": [
"logs*"
],
"priority": 0,
"template": {
"settings": {
"number_of_shards": 2
}
}
}
```
#### Template 2
```json
PUT _index_template/template-02
{
"index_patterns": [
"logs-2020-01-*"
],
"priority": 1,
"template": {
"settings": {
"number_of_shards": 3
}
}
}
```
Because `template-02` has a higher `priority` value, it takes precedence over `template-01` . The `logs-2020-01-02` index would have the `number_of_shards` value as 3.
## Delete a template
You can delete an index template using its name:
```json
DELETE _index_template/daily_logs
```
## Index template options
You can specify the following template options:
Option | Type | Description | Required
:--- | :--- | :--- | :---
`priority` | `Number` | The priority of the index template. | No
`create` | `Boolean` | Whether this index template should replace an existing one. | No

89
_opensearch_docs/index.md Normal file
View File

@ -0,0 +1,89 @@
---
layout: default
title: Introduction to OpenSearch
nav_order: 1
---
# Introduction to OpenSearch
OpenSearch is a distributed search and analytics engine based on [Apache Lucene](https://lucene.apache.org/). After adding your data to OpenSearch, you can perform full-text searches on it with all of the features you might expect: search by field, search multiple indices, boost fields, rank results by score, sort results by field, and aggregate results.
Unsurprisingly, people often use OpenSearch as the backend for a search application---think [Wikipedia](https://en.wikipedia.org/wiki/Wikipedia:FAQ/Technical#What_software_is_used_to_run_Wikipedia?) or an online store. It offers excellent performance and can scale up and down as the needs of the application grow or shrink.
An equally popular, but less obvious use case is log analytics, in which you take the logs from an application, feed them into OpenSearch, and use the rich search and visualization functionality to identify issues. For example, a malfunctioning web server might throw a 500 error 0.5% of the time, which can be hard to notice unless you have a real-time graph of all HTTP status codes that the server has thrown in the past four hours. You can use [OpenSearch Dashboards](../opensearch-dashboards/) to build these sorts of visualizations from data in OpenSearch.
## Clusters and nodes
Its distributed design means that you interact with OpenSearch *clusters*. Each cluster is a collection of one or more *nodes*, servers that store your data and process search requests.
You can run OpenSearch locally on a laptop---its system requirements are minimal---but you can also scale a single cluster to hundreds of powerful machines in a data center.
In a single node cluster, such as a laptop, one machine has to do everything: manage the state of the cluster, index and search data, and perform any preprocessing of data prior to indexing it. As a cluster grows, however, you can subdivide responsibilities. Nodes with fast disks and plenty of RAM might be great at indexing and searching data, whereas a node with plenty of CPU power and a tiny disk could manage cluster state. For more information on setting node types, see [Cluster formation](cluster/).
## Indices and documents
OpenSearch organizes data into *indices*. Each index is a collection of JSON *documents*. If you have a set of raw encyclopedia articles or log lines that you want to add to OpenSearch, you must first convert them to [JSON](https://www.json.org/). A simple JSON document for a movie might look like this:
```json
{
"title": "The Wind Rises",
"release_date": "2013-07-20"
}
```
When you add the document to an index, OpenSearch adds some metadata, such as the unique document *ID*:
```json
{
"_index": "<index-name>",
"_type": "_doc",
"_id": "<document-id>",
"_version": 1,
"_source": {
"title": "The Wind Rises",
"release_date": "2013-07-20"
}
}
```
Indices also contain mappings and settings:
- A *mapping* is the collection of *fields* that documents in the index have. In this case, those fields are `title` and `release_date`.
- Settings include data like the index name, creation date, and number of shards.
## Primary and replica shards
OpenSearch splits indices into *shards* for even distribution across nodes in a cluster. For example, a 400 GB index might be too large for any single node in your cluster to handle, but split into ten shards, each one 40 GB, OpenSearch can distribute the shards across ten nodes and work with each shard individually.
By default, OpenSearch creates a *replica* shard for each *primary* shard. If you split your index into ten shards, for example, OpenSearch also creates ten replica shards. These replica shards act as backups in the event of a node failure---OpenSearch distributes replica shards to different nodes than their corresponding primary shards---but they also improve the speed and rate at which the cluster can process search requests. You might specify more than one replica per index for a search-heavy workload.
Despite being a piece of an OpenSearch index, each shard is actually a full Lucene index---confusing, we know. This detail is important, though, because each instance of Lucene is a running process that consumes CPU and memory. More shards is not necessarily better. Splitting a 400 GB index into 1,000 shards, for example, would place needless strain on your cluster. A good rule of thumb is to keep shard size between 10--50 GB.
## REST API
You interact with OpenSearch clusters using the REST API, which offers a lot of flexibility. You can use clients like [curl](https://curl.haxx.se/) or any programming language that can send HTTP requests. To add a JSON document to an OpenSearch index (i.e. index a document), you send an HTTP request:
```json
PUT https://<host>:<port>/<index-name>/_doc/<document-id>
{
"title": "The Wind Rises",
"release_date": "2013-07-20"
}
```
To run a search for the document:
```
GET https://<host>:<port>/<index-name>/_search?q=wind
```
To delete the document:
```
DELETE https://<host>:<port>/<index-name>/_doc/<document-id>
```
You can change most OpenSearch settings using the REST API, modify indices, check the health of the cluster, get statistics---almost everything.

View File

@ -0,0 +1,178 @@
---
layout: default
title: Docker security configuration
parent: Install OpenSearch
nav_order: 5
---
# Docker security configuration
Before deploying to a production environment, you should replace the demo security certificates and configuration YAML files with your own. With the tarball, you have direct access to the file system, but the Docker image requires modifying the Docker storage volumes include the replacement files.
Additionally, you can set the Docker environment variable `DISABLE_INSTALL_DEMO_CONFIG` to `true`. This change completely disables the demo installer.
## Sample Docker Compose file
```yml
version: '3'
services:
opensearch-node1:
image: opensearchproject/opensearch:{{site.opensearch_version}}
container_name: opensearch-node1
environment:
- cluster.name=opensearch-cluster
- node.name=opensearch-node1
- discovery.seed_hosts=opensearch-node1,opensearch-node2
- cluster.initial_master_nodes=opensearch-node1,opensearch-node2
- bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
- network.host=0.0.0.0 # required if not using the demo security configuration
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
hard: 65536
volumes:
- opensearch-data1:/usr/share/opensearch/data
- ./root-ca.pem:/usr/share/opensearch/config/root-ca.pem
- ./node.pem:/usr/share/opensearch/config/node.pem
- ./node-key.pem:/usr/share/opensearch/config/node-key.pem
- ./admin.pem:/usr/share/opensearch/config/admin.pem
- ./admin-key.pem:/usr/share/opensearch/config/admin-key.pem
- ./custom-opensearch.yml:/usr/share/opensearch/config/opensearch.yml
- ./internal_users.yml:/usr/share/opensearch/plugins/opensearch-security/securityconfig/internal_users.yml
- ./roles_mapping.yml:/usr/share/opensearch/plugins/opensearch-security/securityconfig/roles_mapping.yml
- ./tenants.yml:/usr/share/opensearch/plugins/opensearch-security/securityconfig/tenants.yml
- ./roles.yml:/usr/share/opensearch/plugins/opensearch-security/securityconfig/roles.yml
- ./action_groups.yml:/usr/share/opensearch/plugins/opensearch-security/securityconfig/action_groups.yml
ports:
- 9200:9200
- 9600:9600 # required for Performance Analyzer
networks:
- opensearch-net
opensearch-node2:
image: opensearchproject/opensearch:{{site.opensearch_version}}
container_name: opensearch-node2
environment:
- cluster.name=opensearch-cluster
- node.name=opensearch-node2
- discovery.seed_hosts=opensearch-node1,opensearch-node2
- cluster.initial_master_nodes=opensearch-node1,opensearch-node2
- bootstrap.memory_lock=true
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
- network.host=0.0.0.0
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536
hard: 65536
volumes:
- opensearch-data2:/usr/share/opensearch/data
- ./root-ca.pem:/usr/share/opensearch/config/root-ca.pem
- ./node.pem:/usr/share/opensearch/config/node.pem
- ./node-key.pem:/usr/share/opensearch/config/node-key.pem
- ./admin.pem:/usr/share/opensearch/config/admin.pem
- ./admin-key.pem:/usr/share/opensearch/config/admin-key.pem
- ./custom-opensearch.yml:/usr/share/opensearch/config/opensearch.yml
- ./internal_users.yml:/usr/share/opensearch/plugins/opensearch-security/securityconfig/internal_users.yml
- ./roles_mapping.yml:/usr/share/opensearch/plugins/opensearch-security/securityconfig/roles_mapping.yml
- ./tenants.yml:/usr/share/opensearch/plugins/opensearch-security/securityconfig/tenants.yml
- ./roles.yml:/usr/share/opensearch/plugins/opensearch-security/securityconfig/roles.yml
- ./action_groups.yml:/usr/share/opensearch/plugins/opensearch-security/securityconfig/action_groups.yml
networks:
- opensearch-net
opensearch-dashboards
image: opensearchproject/opensearch-dashboards:{{site.opensearch_version}}
container_name: opensearch-dashboards
ports:
- 5601:5601
expose:
- "5601"
environment:
OPENSEARCH_URL: https://opensearch-node1:9200
OPENSEARCH_HOSTS: https://opensearch-node1:9200
volumes:
- ./custom-opensearch_dashboards.yml:/usr/share/opensearch-dashboards/config/opensearch_dashboards.yml
networks:
- opensearch-net
volumes:
opensearch-data1:
opensearch-data2:
networks:
opensearch-net:
```
Then make your changes to `opensearch.yml`. For a full list of settings, see [Security](../../../security/configuration/). This example adds (extremely) verbose audit logging:
```yml
opensearch_security.ssl.transport.pemcert_filepath: node.pem
opensearch_security.ssl.transport.pemkey_filepath: node-key.pem
opensearch_security.ssl.transport.pemtrustedcas_filepath: root-ca.pem
opensearch_security.ssl.transport.enforce_hostname_verification: false
opensearch_security.ssl.http.enabled: true
opensearch_security.ssl.http.pemcert_filepath: node.pem
opensearch_security.ssl.http.pemkey_filepath: node-key.pem
opensearch_security.ssl.http.pemtrustedcas_filepath: root-ca.pem
opensearch_security.allow_default_init_securityindex: true
opensearch_security.authcz.admin_dn:
- CN=A,OU=UNIT,O=ORG,L=TORONTO,ST=ONTARIO,C=CA
opensearch_security.nodes_dn:
- 'CN=N,OU=UNIT,O=ORG,L=TORONTO,ST=ONTARIO,C=CA'
opensearch_security.audit.type: internal_opensearch
opensearch_security.enable_snapshot_restore_privilege: true
opensearch_security.check_snapshot_restore_write_privileges: true
opensearch_security.restapi.roles_enabled: ["all_access", "security_rest_api_access"]
cluster.routing.allocation.disk.threshold_enabled: false
opensearch_security.audit.config.disabled_rest_categories: NONE
opensearch_security.audit.config.disabled_transport_categories: NONE
```
Use this same override process to specify new [authentication settings](../../../security/configuration/configuration/) in `/usr/share/opensearch/plugins/opensearch-security/securityconfig/config.yml`, as well as new default [internal users, roles, mappings, action groups, and tenants](../../../security/configuration/yaml/).
To start the cluster, run `docker-compose up`.
If you encounter any `File /usr/share/opensearch/config/opensearch.yml has insecure file permissions (should be 0600)` messages, you can use `chmod` to set file permissions before running `docker-compose up`. Docker Compose passes files to the container as-is.
{: .note }
Finally, you can reach OpenSearch Dashboards at http://localhost:5601, sign in, and use the **Security** panel to perform other management tasks.
## Using certificates with Docker
To use your own certificates in your configuration, add all of the necessary certificates to the volumes section of the Docker Compose file:
```yml
volumes:
- ./root-ca.pem:/full/path/to/certificate.pem
- ./admin.pem:/full/path/to/certificate.pem
- ./admin-key.pem:/full/path/to/certificate.pem
#Add other certificates
```
After replacing the demo certificates with your own, you must also include a custom `opensearch.yml` in your setup, which you need to specify in the volumes section.
```yml
volumes:
#Add certificates here
- ./custom-opensearch.yml: /full/path/to/custom-opensearch.yml
```
Remember that the certificates you specify in your Docker Compose file must be the same as the certificates listed in your custom `opensearch.yml` file. At a minimum, you should replace the root, admin, and node certificates with your own. For more information about adding and using certificates, see [Configure TLS certificates](../security/configuration/tls.md).
```yml
opensearch_security.ssl.transport.pemcert_filepath: new-node-cert.pem
opensearch_security.ssl.transport.pemkey_filepath: new-node-cert-key.pem
opensearch_security.ssl.transport.pemtrustedcas_filepath: new-root-ca.pem
opensearch_security.ssl.http.pemcert_filepath: new-node-cert.pem
opensearch_security.ssl.http.pemkey_filepath: new-node-cert-key.pem
opensearch_security.ssl.http.pemtrustedcas_filepath: new-root-ca.pem
opensearch_security.authcz.admin_dn:
- CN=admin,OU=SSL,O=Test,L=Test,C=DE
```
To start the cluster, run `docker-compose up` as usual.

View File

@ -0,0 +1,324 @@
---
layout: default
title: Docker
parent: Install OpenSearch
nav_order: 1
---
# Docker image
You can pull the OpenSearch Docker image just like any other image:
```bash
docker pull opensearchproject/opensearch:{{site.opensearch_version}}
docker pull opensearchproject/opensearch-dashboards:{{site.opensearch_version}}
```
To check available versions, see [Docker Hub](https://hub.docker.com/u/opensearchproject).
OpenSearch images use `centos:7` as the base image. If you run Docker locally, we recommend allowing Docker to use at least 4 GB of RAM in **Preferences** > **Resources**.
---
#### Table of contents
1. TOC
{:toc}
---
## Run the image
To run the image for local development:
```bash
docker run -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:{{site.opensearch_version}}
```
Then send requests to the server to verify that OpenSearch is up and running:
```bash
curl -XGET https://localhost:9200 -u 'admin:admin' --insecure
curl -XGET https://localhost:9200/_cat/nodes?v -u 'admin:admin' --insecure
curl -XGET https://localhost:9200/_cat/plugins?v -u 'admin:admin' --insecure
```
To find the container ID:
```bash
docker ps
```
Then you can stop the container using:
```bash
docker stop <container-id>
```
## Start a cluster
To deploy multiple nodes and simulate a more realistic deployment, create a [docker-compose.yml](https://docs.docker.com/compose/compose-file/) file appropriate for your environment and run:
```bash
docker-compose up
```
To stop the cluster, run:
```bash
docker-compose down
```
To stop the cluster and delete all data volumes, run:
```bash
docker-compose down -v
```
#### Sample Docker Compose file
This sample file starts two data nodes and a container for OpenSearch Dashboards.
```yml
version: '3'
services:
opensearch-node1:
image: opensearchproject/opensearch:{{site.opensearch_version}}
container_name: opensearch-node1
environment:
- cluster.name=opensearch-cluster
- node.name=opensearch-node1
- discovery.seed_hosts=opensearch-node1,opensearch-node2
- cluster.initial_master_nodes=opensearch-node1,opensearch-node2
- bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
hard: 65536
volumes:
- opensearch-data1:/usr/share/opensearch/data
ports:
- 9200:9200
- 9600:9600 # required for Performance Analyzer
networks:
- opensearch-net
opensearch-node2:
image: opensearchproject/opensearch:{{site.opensearch_version}}
container_name: opensearch-node2
environment:
- cluster.name=opensearch-cluster
- node.name=opensearch-node2
- discovery.seed_hosts=opensearch-node1,opensearch-node2
- cluster.initial_master_nodes=opensearch-node1,opensearch-node2
- bootstrap.memory_lock=true
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
ulimits:
memlock:
soft: -1
hard: -1
nofile:
soft: 65536
hard: 65536
volumes:
- opensearch-data2:/usr/share/opensearch/data
networks:
- opensearch-net
opensearch-dashboards:
image: opensearchproject/opensearch-dashboards:{{site.opensearch_version}}
container_name: opensearch-dashboards
ports:
- 5601:5601
expose:
- "5601"
environment:
OPENSEARCH_HOSTS: https://opensearch-node1:9200
networks:
- opensearch-net
volumes:
opensearch-data1:
opensearch-data2:
networks:
opensearch-net:
```
If you override `opensearch_dashboards.yml` settings using environment variables, as seen above, use all uppercase letters and periods in place of underscores (e.g. for `opensearch.url`, specify `OPENSEARCH_URL`).
{: .note}
## Configure OpenSearch
You can pass a custom `opensearch.yml` file to the Docker container using the [`-v` flag](https://docs.docker.com/engine/reference/commandline/run/#mount-volume--v---read-only) for `docker run`:
```bash
docker run \
-p 9200:9200 -p 9600:9600 \
-e "discovery.type=single-node" \
-v /<full-path-to>/custom-opensearch.yml:/usr/share/opensearch/config/opensearch.yml \
opensearchproject/opensearch:{{site.opensearch_version}}
```
You can perform the same operation in `docker-compose.yml` using a relative path:
```yml
services:
opensearch-node1:
volumes:
- opensearch-data1:/usr/share/opensearch/data
- ./custom-opensearch.yml:/usr/share/opensearch/config/opensearch.yml
opensearch-node2:
volumes:
- opensearch-data2:/usr/share/opensearch/data
- ./custom-opensearch.yml:/usr/share/opensearch/config/opensearch.yml
opensearch-dashboards
volumes:
- ./custom-opensearch_dashboards.yml:/usr/share/opensearch-dashboards/config/opensearch_dashboards.yml
```
You can also configure `docker-compose.yml` and `opensearch.yml` [to take your own certificates](../docker-security/) for use with the [Security](../../security/configuration/) plugin.
### (Optional) Set up Performance Analyzer
1. Enable the Performance Analyzer plugin:
```bash
curl -XPOST localhost:9200/_opensearch/_performanceanalyzer/cluster/config -H 'Content-Type: application/json' -d '{"enabled": true}'
```
If you receive the `curl: (52) Empty reply from server` error, you are likely protecting your cluster with the security plugin and you need to provide credentials. Modify the following command to use your username and password:
```bash
curl -XPOST https://localhost:9200/_opensearch/_performanceanalyzer/cluster/config -H 'Content-Type: application/json' -d '{"enabled": true}' -u 'admin:admin' -k
```
1. Enable the Root Cause Analyzer (RCA) framework
```bash
curl -XPOST localhost:9200/_opensearch/_performanceanalyzer/rca/cluster/config -H 'Content-Type: application/json' -d '{"enabled": true}'
```
Similar to step 1, if you run into `curl: (52) Empty reply from server`, run the command below to enable RCA
```bash
curl -XPOST https://localhost:9200/_opensearch/_performanceanalyzer/rca/cluster/config -H 'Content-Type: application/json' -d '{"enabled": true}' -u 'admin:admin' -k
```
1. By default, Performance Analyzer's endpoints are not accessible from outside the Docker container.
To edit this behavior, open a shell session in the container and modify the configuration:
```bash
docker ps # Look up the container id
docker exec -it <container-id> /bin/bash
# Inside container
cd plugins/opensearch_performance_analyzer/pa_config/
vi performance-analyzer.properties
```
Uncomment the line `#webservice-bind-host` and set it to `0.0.0.0`:
```
# ======================== OpenSearch performance analyzer plugin config =========================
# NOTE: this is an example for Linux. Please modify the config accordingly if you are using it under other OS.
# WebService bind host; default to all interfaces
webservice-bind-host = 0.0.0.0
# Metrics data location
metrics-location = /dev/shm/performanceanalyzer/
# Metrics deletion interval (minutes) for metrics data.
# Interval should be between 1 to 60.
metrics-deletion-interval = 1
# If set to true, the system cleans up the files behind it. So at any point, we should expect only 2
# metrics-db-file-prefix-path files. If set to false, no files are cleaned up. This can be useful, if you are archiving
# the files and wouldn't like for them to be cleaned up.
cleanup-metrics-db-files = true
# WebService exposed by App's port
webservice-listener-port = 9600
# Metric DB File Prefix Path location
metrics-db-file-prefix-path = /tmp/metricsdb_
https-enabled = false
#Setup the correct path for certificates
certificate-file-path = specify_path
private-key-file-path = specify_path
# Plugin Stats Metadata file name, expected to be in the same location
plugin-stats-metadata = plugin-stats-metadata
# Agent Stats Metadata file name, expected to be in the same location
agent-stats-metadata = agent-stats-metadata
```
1. Then restart the Performance Analyzer agent:
```bash
kill $(ps aux | grep -i 'PerformanceAnalyzerApp' | grep -v grep | awk '{print $2}')
```
## Bash access to containers
To create an interactive Bash session in a container, run `docker ps` to find the container ID. Then run:
```bash
docker exec -it <container-id> /bin/bash
```
## Customize the Docker image
To run the image with a custom plugin, first create a [`Dockerfile`](https://docs.docker.com/engine/reference/builder/):
```
FROM opensearchproject/opensearch:{{site.opensearch_version}}
RUN /usr/share/opensearch/bin/opensearch-plugin install --batch <plugin-name-or-url>
```
Then run the following commands:
```bash
docker build --tag=opensearch-custom-plugin .
docker run -p 9200:9200 -p 9600:9600 -v /usr/share/opensearch/data opensearch-custom-plugin
```
You can also use a `Dockerfile` to pass your own certificates for use with the [Security](../../../security/) plugin, similar to the `-v` argument in [Configure OpenSearch](#configure-opensearch):
```
FROM opensearchproject/opensearch:{{site.opensearch_version}}
COPY --chown=opensearch:opensearch opensearch.yml /usr/share/opensearch/config/
COPY --chown=opensearch:opensearch my-key-file.pem /usr/share/opensearch/config/
COPY --chown=opensearch:opensearch my-certificate-chain.pem /usr/share/opensearch/config/
COPY --chown=opensearch:opensearch my-root-cas.pem /usr/share/opensearch/config/
```
Alternately, you might want to remove a plugin. This `Dockerfile` removes the security plugin:
```
FROM opensearchproject/opensearch:{{site.opensearch_version}}
RUN /usr/share/opensearch/bin/opensearch-plugin remove opensearch_security
COPY --chown=opensearch:opensearch opensearch.yml /usr/share/opensearch/config/
```
In this case, `opensearch.yml` is a "vanilla" version of the file with no OpenSearch entries. It might look like this:
```yml
cluster.name: "docker-cluster"
network.host: 0.0.0.0
```

View File

@ -0,0 +1,40 @@
---
layout: default
title: Important settings
parent: Install OpenSearch
nav_order: 70
---
# Important settings
For production workloads, make sure the [Linux setting](https://www.kernel.org/doc/Documentation/sysctl/vm.txt) `vm.max_map_count` is set to at least 262144. On the OpenSearch Docker image, this setting is the default. To check, start a Bash session in the container and run:
```bash
cat /proc/sys/vm/max_map_count
```
To increase this value, you have to modify the Docker image. For other install types, add this setting to the host machine's `/etc/sysctl.conf` file with the following line:
```
vm.max_map_count=262144
```
Then run `sudo sysctl -p` to reload.
The [sample docker-compose.yml](../docker/#sample-docker-compose-file) file also contains several key settings:
- `bootstrap.memory_lock=true`
Disbles swapping (along with `memlock`). Swapping can dramatically decrease performance and stability, so you should ensure it is disabled on production clusters.
- `OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m`
Sets the size of the Java heap (we recommend half of system RAM).
- `nofile 65536`
Sets a limit of 65536 open files for the OpenSearch user.
- `port 9600`
Allows you to access Performance Analyzer on port 9600.

View File

@ -0,0 +1,11 @@
---
layout: default
title: Install OpenSearch
nav_order: 2
redirect_from: /docs/install/
has_children: true
---
# Install and configure OpenSearch
OpenSearch has two installation options at this time: Docker images and tarballs.

View File

@ -0,0 +1,263 @@
---
layout: default
title: OpenSearch plugins
parent: Install OpenSearch
nav_order: 90
---
# Standalone OpenSearch plugin installation
If you don't want to use the all-in-one OpenSearch installation options, you can install the individual plugins on a compatible OpenSearch cluster, just like any other plugin.
---
#### Table of contents
1. TOC
{:toc}
---
## Plugin compatibility
<table>
<thead style="text-align: left">
<tr>
<th>OpenSearch version</th>
<th>Plugin versions</th>
</tr>
</thead>
<tbody>
<tr>
<td>1.0.0-beta1</td>
<td>
<pre>opensearch-alerting 1.0.0.0-beta1
opensearch-anomaly-detection 1.0.0.0-beta1
opensearch-asynchronous-search 1.0.0.0-beta1
opensearch-index-management 1.0.0.0-beta1
opensearch-job-scheduler 1.0.0.0-beta1
opensearch-knn 1.0.0.0-beta1
opensearch-performance-analyzer 1.0.0.0-beta1
opensearch-reports-scheduler 1.0.0.0-beta1
opensearch-security 1.0.0.0-beta1
opensearch-sql 1.0.0.0-beta1
</pre>
</td>
</tr>
</tbody>
</table>
To install plugins manually, you must have the exact version of OpenSearch installed, down to the minor version.
{% comment %}
To get a list of available OpenSearch versions on CentOS 7 and Amazon Linux 2, run the following command:
```bash
sudo yum list opensearch-oss --showduplicates
```
Then you can specify the version that you need:
```bash
sudo yum install opensearch-oss-6.7.1
```
{% endcomment %}
## Install plugins
Navigate to the OpenSearch home directory (most likely, it is `/usr/share/opensearch`), and run the install command for each plugin.
### Security
```bash
sudo bin/opensearch-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-plugins/opensearch-security/opensearch-security-{{site.opensearch_major_minor_version}}.1.0.zip
```
After installing the security plugin, you can run `sudo sh /usr/share/opensearch/plugins/opensearch-security/tools/install_demo_configuration.sh` to quickly get started with demo certificates. Otherwise, you must configure it manually and run [securityadmin.sh](../../../security/configuration/security-admin/).
The security plugin has a corresponding [OpenSearch Dashboards plugin](../../../opensearch-dashboards/install/plugins) that you probably want to install as well.
### Job scheduler
```bash
sudo bin/opensearch-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-plugins/opensearch-job-scheduler/opensearch-job-scheduler-{{site.opensearch_major_minor_version}}.0.0.zip
```
### Alerting
```bash
sudo bin/opensearch-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-plugins/opensearch-alerting/opensearch-alerting-{{site.opensearch_major_minor_version}}.1.0.zip
```
To install Alerting, you must first install the Job Scheduler plugin. Alerting has a corresponding [OpenSearch Dashboards plugin](../../../opensearch-dashboards/install/plugins/) that you probably want to install as well.
### SQL
```bash
sudo bin/opensearch-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-plugins/opensearch-sql/opensearch-sql-{{site.opensearch_major_minor_version}}.2.0.zip
```
### Reports scheduler
```bash
sudo bin/opensearch-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-plugins/opensearch-reports-scheduler/opensearch-reports-scheduler-{{site.opensearch_major_minor_version}}.0.0.zip
```
### Index State Management
```bash
sudo bin/opensearch-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-plugins/opensearch-index-management/opensearch-index-management-{{site.opensearch_major_minor_version}}.2.0.zip
```
To install Index State Management, you must first install the Job Scheduler plugin. ISM has a corresponding [OpenSearch Dashboards plugin](../../../opensearch-dashboards/install/plugins/) that you probably want to install as well.
### k-NN
k-NN is only available as part of the all-in-one installs: Docker, RPM, and Debian.
### Anomaly detection
```bash
sudo bin/opensearch-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-plugins/opensearch-anomaly-detection/opensearch-anomaly-detection-{{site.opensearch_major_minor_version}}.0.0.zip
```
### Asynchronous search
```bash
sudo bin/opensearch-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-plugins/opensearch-asynchronous-search/opensearch-asynchronous-search-{{site.opensearch_major_minor_version}}.0.1.zip
```
### Performance Analyzer
```bash
sudo bin/opensearch-plugin install https://d3g5vo6xdbdb9a.cloudfront.net/downloads/opensearch-plugins/performance-analyzer/opensearch-performance-analyzer-{{site.opensearch_major_minor_version}}.0.0.zip
```
Performance Analyzer requires some manual configuration after installing the plugin:
1. Create `/usr/lib/systemd/system/opensearch-performance-analyzer.service` based on [this file](https://github.com/opensearch-project/performance-analyzer/blob/master/packaging/opensearch-performance-analyzer.service).
1. Make the CLI executable:
```bash
sudo chmod +x /usr/share/opensearch/bin/performance-analyzer-agent-cli
```
1. Run the appropriate `postinst` script for your Linux distribution:
```bash
# Debian-based distros
sudo sh /usr/share/opensearch/plugins/opensearch-performance-analyzer/install/deb/postinst.sh 1
# RPM distros
sudo sh /usr/share/opensearch/plugins/opensearch-performance-analyzer/install/rpm/postinst.sh 1
```
1. Make Performance Analyzer accessible outside of the host machine
```bash
cd /usr/share/opensearch # navigate to the OpenSearch home directory
cd plugins/opensearch_performance_analyzer/pa_config/
vi performance-analyzer.properties
```
Uncomment the line `#webservice-bind-host` and set it to `0.0.0.0`:
```bash
# ======================== OpenSearch performance analyzer plugin config =========================
# NOTE: this is an example for Linux. Please modify the config accordingly if you are using it under other OS.
# WebService bind host; default to all interfaces
webservice-bind-host = 0.0.0.0
# Metrics data location
metrics-location = /dev/shm/performanceanalyzer/
# Metrics deletion interval (minutes) for metrics data.
# Interval should be between 1 to 60.
metrics-deletion-interval = 1
# If set to true, the system cleans up the files behind it. So at any point, we should expect only 2
# metrics-db-file-prefix-path files. If set to false, no files are cleaned up. This can be useful, if you are archiving
# the files and wouldn't like for them to be cleaned up.
cleanup-metrics-db-files = true
# WebService exposed by App's port
webservice-listener-port = 9600
# Metric DB File Prefix Path location
metrics-db-file-prefix-path = /tmp/metricsdb_
https-enabled = false
#Setup the correct path for certificates
certificate-file-path = specify_path
private-key-file-path = specify_path
# Plugin Stats Metadata file name, expected to be in the same location
plugin-stats-metadata = plugin-stats-metadata
# Agent Stats Metadata file name, expected to be in the same location
agent-stats-metadata = agent-stats-metadata
```
1. Start the OpenSearch service:
```bash
sudo systemctl start opensearch.service
```
1. Send a test request:
```bash
curl -XGET "localhost:9600/_opensearch/_performanceanalyzer/metrics?metrics=Latency,CPU_Utilization&agg=avg,max&dim=ShardID&nodes=all"
```
## List installed plugins
To check your installed plugins:
```bash
sudo bin/opensearch-plugin list
```
## Remove plugins
If you are removing Performance Analyzer, see below. Otherwise, you can remove the plugin with a single command:
```bash
sudo bin/opensearch-plugin remove <plugin-name>
```
Then restart OpenSearch on the node:
```bash
sudo systemctl restart opensearch.service
```
## Update plugins
OpenSearch doesn't update plugins. Instead, you have to remove and reinstall them:
```bash
sudo bin/opensearch-plugin remove <plugin-name>
sudo bin/opensearch-plugin install <plugin-name>
```

View File

@ -0,0 +1,147 @@
---
layout: default
title: Tarball
parent: Install OpenSearch
nav_order: 50
---
# Tarball
The tarball installation provides a self-contained directory with everything you need to run OpenSearch, including an integrated Java Development Kit (JDK). The tarball is a good option for testing and development.
The tarball supports most Linux distributions, including CentOS 7, Amazon Linux 2, and Ubuntu 18.04. If you have your own Java installation and set `JAVA_HOME` in the terminal, macOS works, as well.
1. Download the tarball from the [OpenSearch downloads page](https://opensearch.org/downloads.html){:target='\_blank'}.
1. Extract the TAR file to a directory and change to that directory:
```bash
# x64
tar -zxf opensearch-{{site.opensearch_version}}-linux-x64.tar.gz
cd opensearch-{{site.opensearch_version}}{% comment %}# ARM64
tar -zxf opensearch-{{site.opensearch_version}}-linux-arm64.tar.gz
cd opensearch-{{site.opensearch_version}}{% endcomment %}
```
1. Run OpenSearch:
```bash
./opensearch-tar-install.sh
```
1. Open a second terminal session, and send requests to the server to verify that OpenSearch is up and running:
```bash
curl -XGET https://localhost:9200 -u 'admin:admin' --insecure
curl -XGET https://localhost:9200/_cat/plugins?v -u 'admin:admin' --insecure
```
## Configuration
You can modify `config/opensearch.yml` or specify environment variables as arguments using `-E`:
```bash
./opensearch-tar-install.sh -Ecluster.name=opensearch-cluster -Enode.name=opensearch-node1 -Ehttp.host=0.0.0.0 -Ediscovery.type=single-node
```
For other settings, see [Important settings](../important-settings/).
### (Optional) Set up Performance Analyzer
In a tarball installation, Performance Analyzer collects data when it is enabled. But in order to read that data using the REST API on port 9600, you must first manually launch the associated reader agent process:
1. Make Performance Analyzer accessible outside of the host machine
```bash
cd /usr/share/opensearch # navigate to the OpenSearch home directory
cd plugins/opensearch_performance_analyzer/pa_config/
vi performance-analyzer.properties
```
Uncomment the line `#webservice-bind-host` and set it to `0.0.0.0`:
```
# ======================== OpenSearch performance analyzer plugin config =========================
# NOTE: this is an example for Linux. Please modify the config accordingly if you are using it under other OS.
# WebService bind host; default to all interfaces
webservice-bind-host = 0.0.0.0
# Metrics data location
metrics-location = /dev/shm/performanceanalyzer/
# Metrics deletion interval (minutes) for metrics data.
# Interval should be between 1 to 60.
metrics-deletion-interval = 1
# If set to true, the system cleans up the files behind it. So at any point, we should expect only 2
# metrics-db-file-prefix-path files. If set to false, no files are cleaned up. This can be useful, if you are archiving
# the files and wouldn't like for them to be cleaned up.
cleanup-metrics-db-files = true
# WebService exposed by App's port
webservice-listener-port = 9600
# Metric DB File Prefix Path location
metrics-db-file-prefix-path = /tmp/metricsdb_
https-enabled = false
#Setup the correct path for certificates
certificate-file-path = specify_path
private-key-file-path = specify_path
# Plugin Stats Metadata file name, expected to be in the same location
plugin-stats-metadata = plugin-stats-metadata
# Agent Stats Metadata file name, expected to be in the same location
agent-stats-metadata = agent-stats-metadata
```
1. Make the CLI executable:
```bash
sudo chmod +x ./bin/performance-analyzer-agent-cli
```
1. Launch the agent CLI:
```bash
ES_HOME="$PWD" ./bin/performance-analyzer-agent-cli
```
1. In a separate window, enable the Performance Analyzer plugin:
```bash
curl -XPOST localhost:9200/_opensearch/_performanceanalyzer/cluster/config -H 'Content-Type: application/json' -d '{"enabled": true}'
```
If you receive the `curl: (52) Empty reply from server` error, you are likely protecting your cluster with the security plugin and you need to provide credentials. Modify the following command to use your username and password:
```bash
curl -XPOST https://localhost:9200/_opensearch/_performanceanalyzer/cluster/config -H 'Content-Type: application/json' -d '{"enabled": true}' -u 'admin:admin' -k
```
1. Finally, enable the Root Cause Analyzer (RCA) framework
```bash
curl -XPOST localhost:9200/_opensearch/_performanceanalyzer/rca/cluster/config -H 'Content-Type: application/json' -d '{"enabled": true}'
```
Similar to step 4, if you run into `curl: (52) Empty reply from server`, run the command below to enable RCA
```bash
curl -XPOST https://localhost:9200/_opensearch/_performanceanalyzer/rca/cluster/config -H 'Content-Type: application/json' -d '{"enabled": true}' -u 'admin:admin' -k
```
{% comment %}
### (Optional) Removing Performance Analyzer
See [Clean up Performance Analyzer files](../plugins/#optional-clean-up-performance-analyzer-files).
{% endcomment %}

174
_opensearch_docs/logs.md Normal file
View File

@ -0,0 +1,174 @@
---
layout: default
title: Logs
nav_order: 60
---
# Logs
The OpenSearch logs include valuable information for monitoring cluster operations and troubleshooting issues. The location of the logs differs based on the installation type:
- On Docker, OpenSearch writes most logs to the console and stores the remainder in `opensearch/logs/`. The tarball installation also uses `opensearch/logs/`.
- On the RPM and Debian installations, OpenSearch writes logs to `/var/log/opensearch/`.
Logs are available as `.log` (plain text) and `.json` files.
## Application logs
For its application logs, OpenSearch uses [Apache Log4j 2](https://logging.apache.org/log4j/2.x/) and its built-in log levels (from least to most severe) of TRACE, DEBUG, INFO, WARN, ERROR, and FATAL. The default OpenSearch log level is INFO.
Rather than changing the default log level (`logger.level`), you change the log level for individual OpenSearch modules:
```json
PUT /_cluster/settings
{
"persistent" : {
"logger.org.opensearch.index.reindex" : "DEBUG"
}
}
```
The easiest way to identify modules is not from the logs, which abbreviate the path (for example, `o.o.i.r`), but from the [OpenSearch source code](https://github.com/opensearch-project/opensearch/tree/master/server/src/main/java/org/opensearch).
{: .tip }
After this sample change, OpenSearch emits much more detailed logs during reindex operations:
```
[2019-10-18T16:52:51,184][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: starting
[2019-10-18T16:52:51,186][DEBUG][o.o.i.r.TransportReindexAction] [node1] executing initial scroll against [some-index]
[2019-10-18T16:52:51,291][DEBUG][o.o.i.r.TransportReindexAction] [node1] scroll returned [3] documents with a scroll id of [DXF1Z==]
[2019-10-18T16:52:51,292][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: got scroll response with [3] hits
[2019-10-18T16:52:51,294][DEBUG][o.o.i.r.WorkerBulkByScrollTaskState] [node1] [1626]: preparing bulk request for [0s]
[2019-10-18T16:52:51,297][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: preparing bulk request
[2019-10-18T16:52:51,299][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: sending [3] entry, [222b] bulk request
[2019-10-18T16:52:51,310][INFO ][o.e.c.m.MetaDataMappingService] [node1] [some-new-index/R-j3adc6QTmEAEb-eAie9g] create_mapping [_doc]
[2019-10-18T16:52:51,383][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: got scroll response with [0] hits
[2019-10-18T16:52:51,384][DEBUG][o.o.i.r.WorkerBulkByScrollTaskState] [node1] [1626]: preparing bulk request for [0s]
[2019-10-18T16:52:51,385][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: preparing bulk request
[2019-10-18T16:52:51,386][DEBUG][o.o.i.r.TransportReindexAction] [node1] [1626]: finishing without any catastrophic failures
[2019-10-18T16:52:51,395][DEBUG][o.o.i.r.TransportReindexAction] [node1] Freed [1] contexts
```
The DEBUG and TRACE levels are extremely verbose. If you enable either one to troubleshoot a problem, disable it after you finish.
There are other ways to change log levels:
1. Add lines to `opensearch.yml`:
```yml
logger.org.opensearch.index.reindex: debug
```
Modifying `opensearch.yml` makes the most sense if you want to reuse your logging configuration across multiple clusters or debug startup issues with a single node.
2. Modify `log4j2.properties`:
```
# Define a new logger with unique ID of reindex
logger.reindex.name = org.opensearch.index.reindex
# Set the log level for that ID
logger.reindex.level = debug
```
This approach is extremely flexible, but requires familiarity with the [Log4j 2 property file syntax](https://logging.apache.org/log4j/2.x/manual/configuration.html#Properties). In general, the other options offer a simpler configuration experience.
If you examine the default `log4j2.properties` file in the configuration directory, you can see a few OpenSearch-specific variables:
```
appender.console.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] [%node_name]%marker %m%n
appender.rolling_old.fileName = ${sys:os.logs.base_path}${sys:file.separator}${sys:os.logs.cluster_name}.log
```
- `${sys:os.logs.base_path}` is the directory for logs (for example, `/var/log/opensearch/`).
- `${sys:os.logs.cluster_name}` is the name of the cluster.
- `[%node_name]` is the name of the node.
## Slow logs
OpenSearch has two *slow logs*, logs that help you identify performance issues: the search slow log and the indexing slow log.
These logs rely on thresholds to define what qualifies as a "slow" search or indexing operation. For example, you might decide that a query is slow if it takes more than 15 seconds to complete. Unlike application logs, which you configure for modules, you configure slow logs for indices. By default, both logs are disabled (all thresholds are set to `-1`):
```json
GET <some-index>/_settings?include_defaults=true
{
"indexing": {
"slowlog": {
"reformat": "true",
"threshold": {
"index": {
"warn": "-1",
"trace": "-1",
"debug": "-1",
"info": "-1"
}
},
"source": "1000",
"level": "TRACE"
}
},
"search": {
"slowlog": {
"level": "TRACE",
"threshold": {
"fetch": {
"warn": "-1",
"trace": "-1",
"debug": "-1",
"info": "-1"
},
"query": {
"warn": "-1",
"trace": "-1",
"debug": "-1",
"info": "-1"
}
}
}
}
}
```
To enable these logs, increase one or more thresholds:
```json
PUT <some-index>/_settings
{
"indexing": {
"slowlog": {
"threshold": {
"index": {
"warn": "15s",
"trace": "750ms",
"debug": "3s",
"info": "10s"
}
},
"source": "500",
"level": "INFO"
}
}
}
```
In this example, OpenSearch logs indexing operations that take 15 seconds or longer at the WARN level and operations that take between 10 and 14.*x* seconds at the INFO level. If you set a threshold to 0 seconds, OpenSearch logs all operations, which can be useful for testing whether slow logs are indeed enabled.
- `reformat` specifies whether to log the document `_source` field as a single line (`true`) or let it span multiple lines (`false`).
- `source` is the number of characters of the document `_source` field to log.
- `level` is the minimum log level to include.
A line from `opensearch_index_indexing_slowlog.log` might look like this:
```
node1 | [2019-10-24T19:48:51,012][WARN][i.i.s.index] [node1] [some-index/i86iF5kyTyy-PS8zrdDeAA] took[3.4ms], took_millis[3], type[_doc], id[1], routing[], source[{"title":"Your Name", "Director":"Makoto Shinkai"}]
```
Slow logs can consume considerable disk space if you set thresholds or levels too low. Consider enabling them temporarily for troubleshooting or performance tuning. To disable slow logs, return all thresholds to `-1`.
## Deprecation logs
Deprecation logs record when clients make deprecated API calls to your cluster. These logs can help you identify and fix issues prior to upgrading to a new major version. By default, OpenSearch logs deprecated API calls at the WARN level, which works well for almost all use cases. If desired, configure `logger.deprecation.level` using `_cluster/settings`, `opensearch.yml`, or `log4j2.properties`.

View File

@ -0,0 +1,635 @@
---
layout: default
title: Metric aggregations
parent: Aggregations
nav_order: 1
---
# Metric Aggregations
Metric aggregations let you perform simple calculations such as finding the minimum, maximum, and average values of a field.
## Types of metric aggregations
Metric aggregations are of two types: single-value metric aggregations and multi-value metric aggregations.
### Single-value metric aggregations
Single-value metric aggregations return a single metric. For example, `sum`, `min`, `max`, `avg`, `cardinality`, and `value_count`.
### Multi-value metric aggregations
Multi-value metric aggregations return more than one metric. For example, `stats`, `extended_stats`, `matrix_stats`, `percentile`, `percentile_ranks`, `geo_bound`, `top_hits`, and `scripted_metric`.
## sum, min, max, avg
The `sum`, `min`, `max`, and `avg` metrics are single-value metric aggregations that return the sum, minimum, maximum, and average values of a field, respectively.
The following example calculates the total sum of the `taxful_total_price` field:
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"sum_taxful_total_price": {
"sum": {
"field": "taxful_total_price"
}
}
}
}
```
#### Sample Response
```json
...
"aggregations" : {
"sum_taxful_total_price" : {
"value" : 350884.12890625
}
}
}
```
In a similar fashion, you can find the minimum, maximum, and average values of a field.
## cardinality
The `cardinality` metric is a single-value metric aggregation that counts the number of unique or distinct values of a field.
The following example finds the number of unique products in an eCommerce store:
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"unique_products": {
"cardinality": {
"field": "products.product_id"
}
}
}
}
```
#### Sample response
```json
...
"aggregations" : {
"unique_products" : {
"value" : 7033
}
}
}
```
The cardinality count is approximate.
If you had tens of thousands of products in your store, an accurate cardinality calculation requires loading all the values into a hash set and returning its size. This approach doesn't scale well because it requires more memory and causes high latency.
You can control the trade-off between memory and accuracy with the `precision_threshold` setting. This setting defines the threshold below which counts are expected to be close to accurate. Above this value, counts might become a bit less accurate. The default value of `precision_threshold` is 3,000. The maximum supported value is 40,000.
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"unique_products": {
"cardinality": {
"field": "products.product_id",
"precision_threshold": 10000
}
}
}
}
```
## value_count
The `value_count` metric is a single-value metric aggregation that calculates the number of values that an aggregation is based on.
For example, you can use the `value_count` metric with the `avg` metric to find how many numbers the aggregation uses to calculate an average value.
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"number_of_values": {
"value_count": {
"field": "taxful_total_price"
}
}
}
}
```
#### Sample response
```json
...
"aggregations" : {
"number_of_values" : {
"value" : 4675
}
}
}
```
## stats, extended_stats, matrix_stats
The `stats` metric is a multi-value metric aggregation that returns all basic metrics such as `min`, `max`, `sum`, `avg`, and `value_count` in one aggregation query.
The following example returns the basic stats for the `taxful_total_price` field:
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"stats_taxful_total_price": {
"stats": {
"field": "taxful_total_price"
}
}
}
}
```
#### Sample response
```json
...
"aggregations" : {
"stats_taxful_total_price" : {
"count" : 4675,
"min" : 6.98828125,
"max" : 2250.0,
"avg" : 75.05542864304813,
"sum" : 350884.12890625
}
}
}
```
The `extended_stats` aggregation is an extended version of the `stats` aggregation. Apart from including basic stats, `extended_stats` also returns stats such as `sum_of_squares`, `variance`, and `std_deviation`.
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"extended_stats_taxful_total_price": {
"extended_stats": {
"field": "taxful_total_price"
}
}
}
}
```
#### Sample Response
```json
...
"aggregations" : {
"extended_stats_taxful_total_price" : {
"count" : 4675,
"min" : 6.98828125,
"max" : 2250.0,
"avg" : 75.05542864304813,
"sum" : 350884.12890625,
"sum_of_squares" : 3.9367749294174194E7,
"variance" : 2787.59157113862,
"variance_population" : 2787.59157113862,
"variance_sampling" : 2788.187974983536,
"std_deviation" : 52.79764740155209,
"std_deviation_population" : 52.79764740155209,
"std_deviation_sampling" : 52.80329511482722,
"std_deviation_bounds" : {
"upper" : 180.6507234461523,
"lower" : -30.53986616005605,
"upper_population" : 180.6507234461523,
"lower_population" : -30.53986616005605,
"upper_sampling" : 180.66201887270256,
"lower_sampling" : -30.551161586606312
}
}
}
}
```
The `std_deviation_bounds` object provides a visual variance of the data with an interval of plus/minus two standard deviations from the mean.
To set the standard deviation to a different value, say 3, set `sigma` to 3:
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"extended_stats_taxful_total_price": {
"extended_stats": {
"field": "taxful_total_price",
"sigma": 3
}
}
}
}
```
The `matrix_stats` aggregation generates advanced stats for multiple fields in a matrix form.
The following example returns advanced stats in a matrix form for the `taxful_total_price` and `products.base_price` fields:
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"matrix_stats_taxful_total_price": {
"matrix_stats": {
"fields": ["taxful_total_price", "products.base_price"]
}
}
}
}
```
#### Sample response
```json
...
"aggregations" : {
"matrix_stats_taxful_total_price" : {
"doc_count" : 4675,
"fields" : [
{
"name" : "products.base_price",
"count" : 4675,
"mean" : 34.994239430147196,
"variance" : 360.5035285833703,
"skewness" : 5.530161335032702,
"kurtosis" : 131.16306324042148,
"covariance" : {
"products.base_price" : 360.5035285833703,
"taxful_total_price" : 846.6489362233166
},
"correlation" : {
"products.base_price" : 1.0,
"taxful_total_price" : 0.8444765264325268
}
},
{
"name" : "taxful_total_price",
"count" : 4675,
"mean" : 75.05542864304839,
"variance" : 2788.1879749835402,
"skewness" : 15.812149139924037,
"kurtosis" : 619.1235507385902,
"covariance" : {
"products.base_price" : 846.6489362233166,
"taxful_total_price" : 2788.1879749835402
},
"correlation" : {
"products.base_price" : 0.8444765264325268,
"taxful_total_price" : 1.0
}
}
]
}
}
}
```
Statistic | Description
:--- | :---
`count` | The number of samples measured.
`mean` | The average value of the field measured from the sample.
`variance` | How far the values of the field measured are spread out from its mean value. The larger the variance, the more it's spread from its mean value.
`skewness` | An asymmetric measure of the distribution of the field's values around the mean.
`kurtosis` | A measure of the tail heaviness of a distribution. As the tail becomes lighter, kurtosis decreases. As the tail becomes heavier, kurtosis increases. To learn about kurtosis, see [Wikipedia](https://en.wikipedia.org/wiki/Kurtosis).
`covariance` | A measure of the joint variability between two fields. A positive value means their values move in the same direction and vice versa.
`correlation` | A measure of the strength of the relationship between two fields. The valid values are between [-1, 1]. A value of -1 means that the value is negatively correlated and a value of 1 means that it's positively correlated. A value of 0 means that there's no identifiable relationship between them.
## percentile, percentile_ranks
Percentile is the percentage of the data that's at or below a certain threshold value.
The `percentile` metric is a multi-value metric aggregation that lets you find outliers in your data or figure out the distribution of your data.
Like the `cardinality` metric, the `percentile` metric is also approximate.
The following example calculates the percentile in relation to the `taxful_total_price` field:
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"percentile_taxful_total_price": {
"percentiles": {
"field": "taxful_total_price"
}
}
}
}
```
#### Sample response
```json
...
"aggregations" : {
"percentile_taxful_total_price" : {
"values" : {
"1.0" : 21.984375,
"5.0" : 27.984375,
"25.0" : 44.96875,
"50.0" : 64.22061688311689,
"75.0" : 93.0,
"95.0" : 156.0,
"99.0" : 222.0
}
}
}
}
```
Percentile rank is the percentile of values at or below a threshold grouped by a specified value. For example, if a value is greater than or equal to 80% of the values, it has a percentile rank of 80.
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"percentile_rank_taxful_total_price": {
"percentile_ranks": {
"field": "taxful_total_price",
"values": [
10,
15
]
}
}
}
}
```
#### Sample response
```json
...
"aggregations" : {
"percentile_rank_taxful_total_price" : {
"values" : {
"10.0" : 0.055096056411283456,
"15.0" : 0.0830092961834656
}
}
}
}
```
## geo_bound
The `geo_bound` metric is a multi-value metric aggregation that calculates the bounding box in terms of latitude and longitude around a `geo_point` field.
The following example returns the `geo_bound` metrics for the `geoip.location` field:
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"geo": {
"geo_bounds": {
"field": "geoip.location"
}
}
}
}
```
#### Sample response
```json
"aggregations" : {
"geo" : {
"bounds" : {
"top_left" : {
"lat" : 52.49999997206032,
"lon" : -118.20000001229346
},
"bottom_right" : {
"lat" : 4.599999985657632,
"lon" : 55.299999956041574
}
}
}
}
}
```
## top_hits
The `top_hits` metric is a multi-value metric aggregation that ranks the matching documents based on a relevance score for the field that's being aggregated.
You can specify the following options:
- `from`: The starting position of the hit.
- `size`: The maximum size of hits to return. The default value is 3.
- `sort`: How the matching hits are sorted. By default, the hits are sorted by the relevance score of the aggregation query.
The following example returns the top 5 products in your eCommerce data:
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
{
"size": 0,
"aggs": {
"top_hits_products": {
"top_hits": {
"size": 5
}
}
}
}
```
#### Sample response
```json
...
"aggregations" : {
"top_hits_products" : {
"hits" : {
"total" : {
"value" : 4675,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "opensearch_dashboards_sample_data_ecommerce",
"_type" : "_doc",
"_id" : "glMlwXcBQVLeQPrkHPtI",
"_score" : 1.0,
"_source" : {
"category" : [
"Women's Accessories",
"Women's Clothing"
],
"currency" : "EUR",
"customer_first_name" : "rania",
"customer_full_name" : "rania Evans",
"customer_gender" : "FEMALE",
"customer_id" : 24,
"customer_last_name" : "Evans",
"customer_phone" : "",
"day_of_week" : "Sunday",
"day_of_week_i" : 6,
"email" : "rania@evans-family.zzz",
"manufacturer" : [
"Tigress Enterprises"
],
"order_date" : "2021-02-28T14:16:48+00:00",
"order_id" : 583581,
"products" : [
{
"base_price" : 10.99,
"discount_percentage" : 0,
"quantity" : 1,
"manufacturer" : "Tigress Enterprises",
"tax_amount" : 0,
"product_id" : 19024,
"category" : "Women's Accessories",
"sku" : "ZO0082400824",
"taxless_price" : 10.99,
"unit_discount_amount" : 0,
"min_price" : 5.17,
"_id" : "sold_product_583581_19024",
"discount_amount" : 0,
"created_on" : "2016-12-25T14:16:48+00:00",
"product_name" : "Snood - white/grey/peach",
"price" : 10.99,
"taxful_price" : 10.99,
"base_unit_price" : 10.99
},
{
"base_price" : 32.99,
"discount_percentage" : 0,
"quantity" : 1,
"manufacturer" : "Tigress Enterprises",
"tax_amount" : 0,
"product_id" : 19260,
"category" : "Women's Clothing",
"sku" : "ZO0071900719",
"taxless_price" : 32.99,
"unit_discount_amount" : 0,
"min_price" : 17.15,
"_id" : "sold_product_583581_19260",
"discount_amount" : 0,
"created_on" : "2016-12-25T14:16:48+00:00",
"product_name" : "Cardigan - grey",
"price" : 32.99,
"taxful_price" : 32.99,
"base_unit_price" : 32.99
}
],
"sku" : [
"ZO0082400824",
"ZO0071900719"
],
"taxful_total_price" : 43.98,
"taxless_total_price" : 43.98,
"total_quantity" : 2,
"total_unique_products" : 2,
"type" : "order",
"user" : "rani",
"geoip" : {
"country_iso_code" : "EG",
"location" : {
"lon" : 31.3,
"lat" : 30.1
},
"region_name" : "Cairo Governorate",
"continent_name" : "Africa",
"city_name" : "Cairo"
},
"event" : {
"dataset" : "sample_ecommerce"
}
}
...
}
]
}
}
}
}
```
## scripted_metric
The `scripted_metric` metric is a multi-value metric aggregation that returns metrics calculated from a specified script.
A script has four stages: the initial stage, the map stage, the combine stage, and the reduce stage.
* `init_script`: (OPTIONAL) Sets the initial state and executes before any collection of documents.
* `map_script`: Checks the value of the `type` field and executes the aggregation on the collected documents.
* `combine_script`: Aggregates the state returned from every shard. The aggregated value is returned to the coordinating node.
* `reduce_script`: Provides access to the variable states; this variable combines the results from the `combine_script` on each shard into an array.
The following example aggregates the different HTTP response types in web log data:
```json
GET opensearch_dashboards_sample_data_logs/_search
{
"size": 0,
"aggregations": {
"responses.counts": {
"scripted_metric": {
"init_script": "state.responses = ['error':0L,'success':0L,'other':0L]",
"map_script": """
def code = doc['response.keyword'].value;
if (code.startsWith('5') || code.startsWith('4')) {
state.responses.error += 1 ;
} else if(code.startsWith('2')) {
state.responses.success += 1;
} else {
state.responses.other += 1;
}
""",
"combine_script": "state.responses",
"reduce_script": """
def counts = ['error': 0L, 'success': 0L, 'other': 0L];
for (responses in states) {
counts.error += responses['error'];
counts.success += responses['success'];
counts.other += responses['other'];
}
return counts;
"""
}
}
}
}
```
#### Sample Response
```json
...
"aggregations" : {
"responses.counts" : {
"value" : {
"other" : 0,
"success" : 12832,
"error" : 1242
}
}
}
}
```

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,189 @@
---
layout: default
title: Popular APIs
nav_order: 96
---
# Popular APIs
This page contains sample requests for popular OpenSearch APIs.
---
#### Table of contents
1. TOC
{:toc}
---
## Create index with non-default settings
```json
PUT my-logs
{
"settings": {
"number_of_shards": 4,
"number_of_replicas": 2
},
"mappings": {
"properties": {
"title": {
"type": "text"
},
"year": {
"type": "integer"
}
}
}
}
```
## Index a document with a random ID
```json
POST my-logs/_doc
{
"title": "Your Name",
"year": "2016"
}
```
## Index a document with a specific ID
```json
PUT my-logs/_doc/1
{
"title": "Weathering with You",
"year": "2019"
}
```
## Index several documents at once
The blank line at the end of the request body is required. If you omit the `_id` field, OpenSearch generates a random ID.
```json
POST _bulk
{ "index": { "_index": "my-logs", "_id": "2" } }
{ "title": "The Garden of Words", "year": 2013 }
{ "index" : { "_index": "my-logs", "_id" : "3" } }
{ "title": "5 Centimeters Per Second", "year": 2007 }
```
## List all indices
```
GET _cat/indices?v
```
## Open or close all indices that match a pattern
```
POST my-logs*/_open
POST my-logs*/_close
```
## Delete all indices that match a pattern
```
DELETE my-logs*
```
## Create an index alias
This request creates the alias `my-logs-today` for the index `my-logs-2019-11-13`.
```
PUT my-logs-2019-11-13/_alias/my-logs-today
```
## List all aliases
```
GET _cat/aliases?v
```
## Search an index or all indices that match a pattern
```
GET my-logs/_search?q=test
GET my-logs*/_search?q=test
```
## Get cluster settings, including defaults
```
GET _cluster/settings?include_defaults=true
```
## Change disk watermarks (or other cluster settings)
```json
PUT _cluster/settings
{
"transient": {
"cluster.routing.allocation.disk.watermark.low": "80%",
"cluster.routing.allocation.disk.watermark.high": "85%"
}
}
```
## Get cluster health
```
GET _cluster/health
```
## List nodes in the cluster
```
GET _cat/nodes?v
```
## Get node statistics
```
GET _nodes/stats
```
## Get snapshots in a repository
```
GET _snapshot/my-repository/_all
```
## Take a snapshot
```
PUT _snapshot/my-repository/my-snapshot
```
## Restore a snapshot
```json
POST _snapshot/my-repository/my-snapshot/_restore
{
"indices": "-.opensearch_security",
"include_global_state": false
}
```

View File

@ -0,0 +1,290 @@
---
layout: default
title: Boolean queries
parent: Query DSL
nav_order: 45
---
# Boolean queries
The `bool` query lets you combine multiple search queries with boolean logic. You can use boolean logic between queries to either narrow or broaden your search results.
The `bool` query is a go-to query because it allows you to construct an advanced query by chaining together several simple ones.
Use the following clauses (subqueries) within the `bool` query:
Clause | Behavior
:--- | :---
`must` | The results must match the queries in this clause. If you have multiple queries, every single one must match. Acts as an `and` operator.
`must_not` | This is the anti-must clause. All matches are excluded from the results. Acts as a `not` operator.
`should` | The results should, but don't have to, match the queries. Each matching `should` clause increases the relevancy score. As an option, you can require one or more queries to match the value of the `minimum_number_should_match` parameter (default is 1).
`filter` | Filters reduce your dataset before applying the queries. A query within a filter clause is a yes-no option, where if a document matches the query it's included in the results. Otherwise, it's not. Filter queries do not affect the relevancy score that the results are sorted by. The results of a filter query are generally cached so they tend to run faster. Use the filter query to filter the results based on exact matches, ranges, dates, numbers, and so on.
The structure of a `bool` query is as follows:
```json
GET _search
{
"query": {
"bool": {
"must": [
{}
],
"must_not": [
{}
],
"should": [
{}
],
"filter": {}
}
}
}
```
For example, assume you have the complete works of Shakespeare indexed in an OpenSearch cluster. You want to construct a single query that meets the following requirements:
1. The `text_entry` field must contain the word `love` and should contain either `life` or `grace`.
2. The `speaker` field must not contain `ROMEO`.
3. Filter these results to the play `Romeo and Juliet` without affecting the relevancy score.
Use the following query:
```json
GET shakespeare/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"text_entry": "love"
}
}
],
"should": [
{
"match": {
"text_entry": "life"
}
},
{
"match": {
"text_entry": "grace"
}
}
],
"minimum_should_match": 1,
"must_not": [
{
"match": {
"speaker": "ROMEO"
}
}
],
"filter": {
"term": {
"play_name": "Romeo and Juliet"
}
}
}
}
}
```
#### Sample output
```json
{
"took": 12,
"timed_out": false,
"_shards": {
"total": 4,
"successful": 4,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 11.356054,
"hits": [
{
"_index": "shakespeare",
"_type": "_doc",
"_id": "88020",
"_score": 11.356054,
"_source": {
"type": "line",
"line_id": 88021,
"play_name": "Romeo and Juliet",
"speech_number": 19,
"line_number": "4.5.61",
"speaker": "PARIS",
"text_entry": "O love! O life! not life, but love in death!"
}
}
]
}
}
```
If you want to identify which of these clauses actually caused the matching results, name each query with the `_name` parameter.
To add the `_name` parameter, change the field name in the `match` query to an object:
```json
GET shakespeare/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"text_entry": {
"query": "love",
"_name": "love-must"
}
}
}
],
"should": [
{
"match": {
"text_entry": {
"query": "life",
"_name": "life-should"
}
}
},
{
"match": {
"text_entry": {
"query": "grace",
"_name": "grace-should"
}
}
}
],
"minimum_should_match": 1,
"must_not": [
{
"match": {
"speaker": {
"query": "ROMEO",
"_name": "ROMEO-must-not"
}
}
}
],
"filter": {
"term": {
"play_name": "Romeo and Juliet"
}
}
}
}
}
```
OpenSearch returns a `matched_queries` array that lists the queries that matched these results:
```json
"matched_queries": [
"love-must",
"life-should"
]
```
If you remove the queries not in this list, you will still see the exact same result.
By examining which `should` clause matched, you can better understand the relevancy score of the results.
You can also construct complex boolean expressions by nesting `bool` queries.
For example, to find a `text_entry` field that matches (`love` OR `hate`) AND (`life` OR `grace`) in the play `Romeo and Juliet`:
```json
GET shakespeare/_search
{
"query": {
"bool": {
"must": [
{
"bool": {
"should": [
{
"match": {
"text_entry": "love"
}
},
{
"match": {
"text": "hate"
}
}
]
}
},
{
"bool": {
"should": [
{
"match": {
"text_entry": "life"
}
},
{
"match": {
"text": "grace"
}
}
]
}
}
],
"filter": {
"term": {
"play_name": "Romeo and Juliet"
}
}
}
}
}
```
#### Sample output
```json
{
"took": 10,
"timed_out": false,
"_shards": {
"total": 2,
"successful": 2,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 11.37006,
"hits": [
{
"_index": "shakespeare",
"_type": "doc",
"_id": "88020",
"_score": 11.37006,
"_source": {
"type": "line",
"line_id": 88021,
"play_name": "Romeo and Juliet",
"speech_number": 19,
"line_number": "4.5.61",
"speaker": "PARIS",
"text_entry": "O love! O life! not life, but love in death!"
}
}
]
}
}
```

View File

@ -0,0 +1,435 @@
---
layout: default
title: Full-text queries
parent: Query DSL
nav_order: 40
---
# Full-text queries
This page lists all full-text query types and common options. Given the sheer number of options and subtle behaviors, the best method of ensuring useful search results is to test different queries against representative indices and verify the output.
---
#### Table of contents
1. TOC
{:toc}
---
## Match
Creates a [boolean query](https://lucene.apache.org/core/8_4_0/core/org/apache/lucene/search/BooleanQuery.html) that returns results if the search term is present in the field.
The most basic form of the query provides only a field (`title`) and a term (`wind`):
```json
GET _search
{
"query": {
"match": {
"title": "wind"
}
}
}
```
For an example that uses [curl](https://curl.haxx.se/), try:
```bash
curl --insecure -XGET -u 'admin:admin' https://<host>:<port>/<index>/_search \
-H "content-type: application/json" \
-d '{
"query": {
"match": {
"title": "wind"
}
}
}'
```
The query accepts the following options. For descriptions of each, see [Options](#options).
```json
GET _search
{
"query": {
"match": {
"title": {
"query": "wind",
"fuzziness": "AUTO",
"fuzzy_transpositions": true,
"operator": "or",
"minimum_should_match": 1,
"analyzer": "standard",
"zero_terms_query": "none",
"lenient": false,
"cutoff_frequency": 0.01,
"prefix_length": 0,
"max_expansions": 50,
"boost": 1
}
}
}
}
```
## Multi match
Similar to [match](#match), but searches multiple fields.
The `^` lets you "boost" certain fields. Boosts are multipliers that weigh matches in one field more heavily than matches in other fields. In the following example, a match for "wind" in the title field influences `_score` four times as much as a match in the plot field. The result is that films like *The Wind Rises* and *Gone with the Wind* are near the top of the search results, and films like *Twister* and *Sharknado*, which presumably have "wind" in their plot summaries, are near the bottom.
```json
GET _search
{
"query": {
"multi_match": {
"query": "wind",
"fields": ["title^4", "plot"]
}
}
}
```
The query accepts the following options. For descriptions of each, see [Options](#options).
```json
GET _search
{
"query": {
"multi_match": {
"query": "wind",
"fields": ["title^4", "description"],
"type": "most_fields",
"operator": "and",
"minimum_should_match": 3,
"tie_breaker": 0.0,
"analyzer": "standard",
"boost": 1,
"fuzziness": "AUTO",
"fuzzy_transpositions": true,
"lenient": false,
"prefix_length": 0,
"max_expansions": 50,
"auto_generate_synonyms_phrase_query": true,
"cutoff_frequency": 0.01,
"zero_terms_query": "none"
}
}
}
```
## Match boolean prefix
Similar to [match](#match), but creates a [prefix query](https://lucene.apache.org/core/8_4_0/core/org/apache/lucene/search/PrefixQuery.html) out of the last term in the query string.
```json
GET _search
{
"query": {
"match_bool_prefix": {
"title": "rises wi"
}
}
}
```
The query accepts the following options. For descriptions of each, see [Options](#options).
```json
GET _search
{
"query": {
"match_bool_prefix": {
"title": {
"query": "rises wi",
"fuzziness": "AUTO",
"fuzzy_transpositions": true,
"max_expansions": 50,
"prefix_length": 0,
"operator": "or",
"minimum_should_match": 2,
"analyzer": "standard"
}
}
}
}
```
## Match phrase
Creates a [phrase query](https://lucene.apache.org/core/8_4_0/core/org/apache/lucene/search/PhraseQuery.html) that matches a sequence of terms.
```json
GET _search
{
"query": {
"match_phrase": {
"title": "the wind rises"
}
}
}
```
The query accepts the following options. For descriptions of each, see [Options](#options).
```json
GET _search
{
"query": {
"match_phrase": {
"title": {
"query": "wind rises the",
"slop": 3,
"analyzer": "standard",
"zero_terms_query": "none"
}
}
}
}
```
## Match phrase prefix
Similar to [match phrase](#match-phrase), but creates a [prefix query](https://lucene.apache.org/core/8_4_0/core/org/apache/lucene/search/PrefixQuery.html) out of the last term in the query string.
```json
GET _search
{
"query": {
"match_phrase_prefix": {
"title": "the wind ri"
}
}
}
```
The query accepts the following options. For descriptions of each, see [Options](#options).
```json
GET _search
{
"query": {
"match_phrase_prefix": {
"title": {
"query": "the wind ri",
"analyzer": "standard",
"max_expansions": 50,
"slop": 3
}
}
}
}
```
## Common terms
The common terms query separates the query string into high- and low-frequency terms based on number of occurrences on the shard. Low-frequency terms are weighed more heavily in the results, and high-frequency terms are considered only for documents that already matched one or more low-frequency terms. In that sense, you can think of this query as having a built-in, ever-changing list of stop words.
```json
GET _search
{
"query": {
"common": {
"title": {
"query": "the wind rises"
}
}
}
}
```
The query accepts the following options. For descriptions of each, see [Options](#options).
```json
GET _search
{
"query": {
"common": {
"title": {
"query": "the wind rises",
"cutoff_frequency": 0.002,
"low_freq_operator": "or",
"boost": 1,
"analyzer": "standard",
"minimum_should_match": {
"low_freq" : 2,
"high_freq" : 3
}
}
}
}
}
```
## Query string
The query string query splits text based on operators and analyzes each individually.
If you search using the HTTP request parameters (i.e. `_search?q=wind`), OpenSearch creates a query string query.
{: .note }
```json
GET _search
{
"query": {
"query_string": {
"query": "the wind AND (rises OR rising)"
}
}
}
```
The query accepts the following options. For descriptions of each, see [Options](#options).
```json
GET _search
{
"query": {
"query_string": {
"query": "the wind AND (rises OR rising)",
"default_field": "title",
"type": "best_fields",
"fuzziness": "AUTO",
"fuzzy_transpositions": true,
"fuzzy_max_expansions": 50,
"fuzzy_prefix_length": 0,
"minimum_should_match": 1,
"default_operator": "or",
"analyzer": "standard",
"lenient": false,
"boost": 1,
"allow_leading_wildcard": true,
"enable_position_increments": true,
"phrase_slop": 3,
"max_determinized_states": 10000,
"time_zone": "-08:00",
"quote_field_suffix": "",
"quote_analyzer": "standard",
"analyze_wildcard": false,
"auto_generate_synonyms_phrase_query": true
}
}
}
```
## Simple query string
The simple query string query is like the query string query, but it lets advanced users specify many arguments directly in the query string. The query discards any invalid portions of the query string.
```json
GET _search
{
"query": {
"simple_query_string": {
"query": "\"rises wind the\"~4 | *ising~2",
"fields": ["title"]
}
}
}
```
Special character | Behavior
:--- | :---
`+` | Acts as the `and` operator.
`|` | Acts as the `or` operator.
`*` | Acts as a wildcard.
`""` | Wraps several terms into a phrase.
`()` | Wraps a clause for precedence.
`~n` | When used after a term (e.g. `wnid~3`), sets `fuzziness`. When used after a phrase, sets `slop`. See [Options](#options).
`-` | Negates the term.
The query accepts the following options. For descriptions of each, see [Options](#options).
```json
GET _search
{
"query": {
"simple_query_string": {
"query": "\"rises wind the\"~4 | *ising~2",
"fields": ["title"],
"flags": "ALL",
"fuzzy_transpositions": true,
"fuzzy_max_expansions": 50,
"fuzzy_prefix_length": 0,
"minimum_should_match": 1,
"default_operator": "or",
"analyzer": "standard",
"lenient": false,
"quote_field_suffix": "",
"analyze_wildcard": false,
"auto_generate_synonyms_phrase_query": true
}
}
}
```
## Match all
Matches all documents. Can be useful for testing.
```json
GET _search
{
"query": {
"match_all": {}
}
}
```
## Match none
Matches no documents. Rarely useful.
```json
GET _search
{
"query": {
"match_none": {}
}
}
```
## Options
Option | Valid values | Description
:--- | :--- | :---
`allow_leading_wildcard` | Boolean | Whether `*` and `?` are allowed as the first character of a search term. The default is true.
`analyze_wildcard` | Boolean | Whether OpenSearch should attempt to analyze wildcard terms. Some analyzers do a poor job at this task, so the default is false.
`analyzer` | `standard, simple, whitespace, stop, keyword, pattern, <language>, fingerprint` | The analyzer you want to use for the query. Different analyzers have different character filters, tokenizers, and token filters. The `stop` analyzer, for example, removes stop words (e.g. "an," "but," "this") from the query string.
`auto_generate_synonyms_phrase_query` | Boolean | A value of true (default) automatically generates [phrase queries](https://lucene.apache.org/core/8_4_0/core/org/apache/lucene/search/PhraseQuery.html) for multi-term synonyms. For example, if you have the synonym `"ba, batting average"` and search for "ba," OpenSearch searches for `ba OR "batting average"` (if this option is true) or `ba OR (batting AND average)` (if this option is false).
`boost` | Floating-point | Boosts the clause by the given multiplier. Useful for weighing clauses in compound queries. The default is 1.0.
`cutoff_frequency` | Between `0.0` and `1.0` or a positive integer | This value lets you define high and low frequency terms based on number of occurrences in the index. Numbers between 0 and 1 are treated as a percentage. For example, 0.10 is 10%. This value means that if a word occurs within the search field in more than 10% of the documents on the shard, OpenSearch considers the word "high frequency" and deemphasizes it when calculating search score.<br /><br />Because this setting is *per shard*, testing its impact on search results can be challenging unless a cluster has many documents.
`enable_position_increments` | Boolean | When true, result queries are aware of position increments. This setting is useful when the removal of stop words leaves an unwanted "gap" between terms. The default is true.
`fields` | String array | The list of fields to search (e.g. `"fields": ["title^4", "description"]`). If unspecified, defaults to the `index.query.default_field` setting, which defaults to `["*"]`.
`flags` | String | A `|`-delimited string of [flags](#simple-query-string) to enable (e.g. `AND|OR|NOT`). The default is `ALL`.
`fuzziness` | `AUTO`, `0`, or a positive integer | The number of character edits (insert, delete, substitute) that it takes to change one word to another when determining whether a term matched a value. For example, the distance between `wined` and `wind` is 1. The default, `AUTO`, chooses a value based on the length of each term and is a good choice for most use cases.
`fuzzy_transpositions` | Boolean | Setting `fuzzy_transpositions` to true (default) adds swaps of adjacent characters to the insert, delete, and substitute operations of the `fuzziness` option. For example, the distance between `wind` and `wnid` is 1 if `fuzzy_transpositions` is true (swap "n" and "i") and 2 if it is false (delete "n", insert "n"). <br /><br />If `fuzzy_transpositions` is false, `rewind` and `wnid` have the same distance (2) from `wind`, despite the more human-centric opinion that `wnid` is an obvious typo. The default is a good choice for most use cases.
`lenient` | Boolean | Setting `lenient` to true lets you ignore data type mismatches between the query and the document field. For example, a query string of "8.2" could match a field of type `float`. The default is false.
`low_freq_operator` | `and, or` | The operator for low-frequency terms. The default is `or`. See [Common terms](#common-terms) queries and `operator` in this table.
`max_determinized_states` | Positive integer | The maximum number of "[states](https://lucene.apache.org/core/8_4_0/core/org/apache/lucene/util/automaton/Operations.html#DEFAULT_MAX_DETERMINIZED_STATES)" (a measure of complexity) that Lucene can create for query strings that contain regular expressions (e.g. `"query": "/wind.+?/"`). Larger numbers allow for queries that use more memory. The default is 10,000.
`max_expansions` | Positive integer | Fuzzy queries "expand to" a number of matching terms that are within the distance specified in `fuzziness`. Then OpenSearch tries to match those terms against its indices. `max_expansions` specifies the maximum number of terms that the fuzzy query expands to. The default is 50.
`minimum_should_match` | Positive or negative integer, positive or negative percentage, combination | If the query string contains multiple search terms and you used the `or` operator, the number of terms that need to match for the document to be considered a match. For example, if `minimum_should_match` is 2, "wind often rising" does not match "The Wind Rises." If `minimum_should_match` is 1, it matches. This option also has `low_freq` and `high_freq` properties for [Common terms](#common-terms) queries.
`operator` | `or, and` | If the query string contains multiple search terms, whether all terms need to match (`and`) or only one term needs to match (`or`) for a document to be considered a match.
`phrase_slop` | `0` (default) or a positive integer | See `slop`.
`prefix_length` | `0` (default) or a positive integer | The number of leading characters that are not considered in fuzziness.
`quote_field_suffix` | String | This option lets you search different fields depending on whether terms are wrapped in quotes. For example, if `quote_field_suffix` is `".exact"` and you search for `"lightly"` (in quotes) in the `title` field, OpenSearch searches the `title.exact` field. This second field might use a different type (e.g. `keyword` rather than `text`) or a different analyzer. The default is null.
`rewrite` | `constant_score, scoring_boolean, constant_score_boolean, top_terms_N, top_terms_boost_N, top_terms_blended_freqs_N` | Determines how OpenSearch rewrites and scores multi-term queries. The default is `constant_score`.
`slop` | `0` (default) or a positive integer | Controls the degree to which words in a query can be misordered and still be considered a match. From the [Lucene documentation](https://lucene.apache.org/core/8_4_0/core/org/apache/lucene/search/PhraseQuery.html#getSlop--): "The number of other words permitted between words in query phrase. For example, to switch the order of two words requires two moves (the first move places the words atop one another), so to permit re-orderings of phrases, the slop must be at least two. A value of zero requires an exact match."
`tie_breaker` | `0.0` (default) to `1.0` | Changes the way OpenSearch scores searches. For example, a `type` of `best_fields` typically uses the highest score from any one field. If you specify a `tie_breaker` value between 0.0 and 1.0, the score changes to highest score + `tie_breaker` * score for all other matching fields. If you specify a value of 1.0, OpenSearch adds together the scores for all matching fields (effectively defeating the purpose of `best_fields`).
`time_zone` | UTC offset | The time zone to use (e.g. `-08:00`) if the query string contains a date range (e.g. `"query": "wind rises release_date[2012-01-01 TO 2014-01-01]"`). The default is `UTC`.
`type` | `best_fields, most_fields, cross-fields, phrase, phrase_prefix` | Determines how OpenSearch executes the query and scores the results. The default is `best_fields`.
`zero_terms_query` | `none, all` | If the analyzer removes all terms from a query string, whether to match no documents (default) or all documents. For example, the `stop` analyzer removes all terms from the string "an but this."

View File

@ -0,0 +1,120 @@
---
layout: default
title: Query DSL
nav_order: 27
has_children: true
---
# Query DSL
While you can use HTTP request parameters to perform simple searches, you can also use the OpenSearch query domain-specific language (DSL), which provides a wider range of search options. The query DSL uses the HTTP request body, so you can more easily customize your queries to get the exact results that you want.
For example, the following request performs a simple search to search for a `speaker` field that has a value of `queen`.
**Sample request**
```json
GET _search?q=speaker:queen
```
**Sample response**
```
{
"took": 87,
"timed_out": false,
"_shards": {
"total": 68,
"successful": 68,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4080,
"relation": "eq"
},
"max_score": 4.4368687,
"hits": [
{
"_index": "new_shakespeare",
"_type": "_doc",
"_id": "28559",
"_score": 4.4368687,
"_source": {
"type": "line",
"line_id": 28560,
"play_name": "Cymbeline",
"speech_number": 20,
"line_number": "1.1.81",
"speaker": "QUEEN",
"text_entry": "No, be assured you shall not find me, daughter,"
}
}
```
With query DSL, however, you can include an HTTP request body to look for results more tailored to your needs. The following example shows how to search for `speaker` and `text_entry` fields that have a value of `QUEEN`.
**Sample request**
```json
{
"query": {
"multi_match": {
"query": "QUEEN",
"fields": ["speaker", "text_entry"]
}
}
}
```
**Sample Response**
```json
{
"took": 39,
"timed_out": false,
"_shards": {
"total": 68,
"successful": 68,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5837,
"relation": "eq"
},
"max_score": 7.8623476,
"hits": [
{
"_index": "new_shakespeare",
"_type": "_doc",
"_id": "100763",
"_score": 7.8623476,
"_source": {
"type": "line",
"line_id": 100764,
"play_name": "Troilus and Cressida",
"speech_number": 43,
"line_number": "3.1.68",
"speaker": "PANDARUS",
"text_entry": "Sweet queen, sweet queen! thats a sweet queen, i faith."
}
},
{
"_index": "shakespeare",
"_type": "_doc",
"_id": "28559",
"_score": 5.8923807,
"_source": {
"type": "line",
"line_id": 28560,
"play_name": "Cymbeline",
"speech_number": 20,
"line_number": "1.1.81",
"speaker": "QUEEN",
"text_entry": "No, be assured you shall not find me, daughter,"
}
}
]
}
}
```
The OpenSearch query DSL comes in three varieties: term-level queries, full-text queries, and boolean queries. You can even perform more complicated searches by using different elements from each variety to find whatever data you need.

View File

@ -0,0 +1,450 @@
---
layout: default
title: Term-level queries
parent: Query DSL
nav_order: 30
---
# Term-level queries
OpenSearch supports two types of queries when you search for data: term-level queries and full-text queries.
The following table describes the differences between them:
| | Term-level queries | Full-text queries
:--- | :--- | :---
*Description* | Term-level queries answer which documents match a query. | Full-text queries answer how well the documents match a query.
*Analyzer* | The search term isn't analyzed. This means that the term query searches for your search term as it is. | The search term is analyzed by the same analyzer that was used for the specific field of the document at the time it was indexed. This means that your search term goes through the same analysis process that the document's field did.
*Relevance* | Term-level queries simply return documents that match without sorting them based on the relevance score. They still calculate the relevance score, but this score is the same for all the documents that are returned. | Full-text queries calculate a relevance score for each match and sort the results by decreasing order of relevance.
*Use Case* | Use term-level queries when you want to match exact values such as numbers, dates, tags, and so on, and don't need the matches to be sorted by relevance. | Use full-text queries to match text fields and sort by relevance after taking into account factors like casing and stemming variants.
OpenSearch uses a probabilistic ranking framework called Okapi BM25 to calculate relevance scores. To learn more about Okapi BM25, see [Wikipedia](https://en.wikipedia.org/wiki/Okapi_BM25).
{: .note }
Assume that you have the complete works of Shakespeare indexed in an OpenSearch cluster. We use a term-level query to search for the phrase "To be, or not to be" in the `text_entry` field:
```json
GET shakespeare/_search
{
"query": {
"term": {
"text_entry": "To be, or not to be"
}
}
}
```
#### Sample response
```json
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
```
We dont get back any matches (`hits`). This is because the term “To be, or not to be” is searched literally in the inverted index, where only the analyzed values of the text fields are stored. Term-level queries aren't suited for searching on analyzed text fields because they often yield unexpected results. When working with text data, use term-level queries only for fields mapped as keyword only.
Using a full-text query:
```json
GET shakespeare/_search
{
"query": {
"match": {
"text_entry": "To be, or not to be"
}
}
}
```
The search query “To be, or not to be” is analyzed and tokenized into an array of tokens just like the `text_entry` field of the documents. The full-text query performs an intersection of tokens between our search query and the `text_entry` fields for all the documents, and then sorts the results by relevance scores:
#### Sample response
```json
{
"took" : 19,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 10000,
"relation" : "gte"
},
"max_score" : 17.419369,
"hits" : [
{
"_index" : "shakespeare",
"_type" : "_doc",
"_id" : "34229",
"_score" : 17.419369,
"_source" : {
"type" : "line",
"line_id" : 34230,
"play_name" : "Hamlet",
"speech_number" : 19,
"line_number" : "3.1.64",
"speaker" : "HAMLET",
"text_entry" : "To be, or not to be: that is the question:"
}
},
{
"_index" : "shakespeare",
"_type" : "_doc",
"_id" : "109930",
"_score" : 14.883024,
"_source" : {
"type" : "line",
"line_id" : 109931,
"play_name" : "A Winters Tale",
"speech_number" : 23,
"line_number" : "4.4.153",
"speaker" : "PERDITA",
"text_entry" : "Not like a corse; or if, not to be buried,"
}
},
{
"_index" : "shakespeare",
"_type" : "_doc",
"_id" : "103117",
"_score" : 14.782743,
"_source" : {
"type" : "line",
"line_id" : 103118,
"play_name" : "Twelfth Night",
"speech_number" : 53,
"line_number" : "1.3.95",
"speaker" : "SIR ANDREW",
"text_entry" : "will not be seen; or if she be, its four to one"
}
}
]
}
}
...
```
For a list of all full-text queries, see [Full-text queries](../full-text/).
If you want to query for an exact term like “HAMLET” in the speaker field and don't need the results to be sorted by relevance scores, a term-level query is more efficient:
```json
GET shakespeare/_search
{
"query": {
"term": {
"speaker": "HAMLET"
}
}
}
```
#### Sample response
```json
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 1582,
"relation" : "eq"
},
"max_score" : 4.2540946,
"hits" : [
{
"_index" : "shakespeare",
"_type" : "_doc",
"_id" : "32700",
"_score" : 4.2540946,
"_source" : {
"type" : "line",
"line_id" : 32701,
"play_name" : "Hamlet",
"speech_number" : 9,
"line_number" : "1.2.66",
"speaker" : "HAMLET",
"text_entry" : "[Aside] A little more than kin, and less than kind."
}
},
{
"_index" : "shakespeare",
"_type" : "_doc",
"_id" : "32702",
"_score" : 4.2540946,
"_source" : {
"type" : "line",
"line_id" : 32703,
"play_name" : "Hamlet",
"speech_number" : 11,
"line_number" : "1.2.68",
"speaker" : "HAMLET",
"text_entry" : "Not so, my lord; I am too much i' the sun."
}
},
{
"_index" : "shakespeare",
"_type" : "_doc",
"_id" : "32709",
"_score" : 4.2540946,
"_source" : {
"type" : "line",
"line_id" : 32710,
"play_name" : "Hamlet",
"speech_number" : 13,
"line_number" : "1.2.75",
"speaker" : "HAMLET",
"text_entry" : "Ay, madam, it is common."
}
}
]
}
}
...
```
The term-level queries are exact matches. So, if you search for “Hamlet”, you dont get back any matches, because “HAMLET” is a keyword field and is stored in OpenSearch literally and not in an analyzed form.
The search query “HAMLET” is also searched literally. So, to get a match on this field, we need to enter the exact same characters.
---
## Term
Use the `term` query to search for an exact term in a field.
```json
GET shakespeare/_search
{
"query": {
"term": {
"line_id": {
"value": "61809"
}
}
}
}
```
## Terms
Use the `terms` query to search for multiple terms in the same field.
```json
GET shakespeare/_search
{
"query": {
"terms": {
"line_id": [
"61809",
"61810"
]
}
}
}
```
You get back documents that match any of the terms.
## IDs
Use the `ids` query to search for one or more document ID values.
```json
GET shakespeare/_search
{
"query": {
"ids": {
"values": [
34229,
91296
]
}
}
}
```
## Range
Use the `range` query to search for a range of values in a field.
To search for documents where the `line_id` value is >= 10 and <= 20:
```json
GET shakespeare/_search
{
"query": {
"range": {
"line_id": {
"gte": 10,
"lte": 20
}
}
}
}
```
Parameter | Behavior
:--- | :---
`gte` | Greater than or equal to.
`gt` | Greater than.
`lte` | Less than or equal to.
`lt` | Less than.
Assume that you have a `products` index and you want to find all the products that were added in the year 2019:
```json
GET products/_search
{
"query": {
"range": {
"created": {
"gte": "2019/01/01",
"lte": "2019/12/31"
}
}
}
}
```
Specify relative dates by using basic math expressions.
To subtract 1 year and 1 day from the specified date:
```json
GET products/_search
{
"query": {
"range": {
"created": {
"gte": "2019/01/01||-1y-1d"
}
}
}
}
```
The first date that we specify is the anchor date or the starting point for the date math. Add two trailing pipe symbols. You could then add one day (`+1d`) or subtract two weeks (`-2w`). This math expression is relative to the anchor date that you specify.
You could also round off dates by adding a forward slash to the date or time unit.
To find products added in the last year and rounded off by month:
```json
GET products/_search
{
"query": {
"range": {
"created": {
"gte": "now-1y/M"
}
}
}
}
```
The keyword `now` refers to the current date and time.
## Prefix
Use the `prefix` query to search for terms that begin with a specific prefix.
```json
GET shakespeare/_search
{
"query": {
"prefix": {
"speaker": "KING"
}
}
}
```
## Exists
Use the `exists` query to search for documents that contain a specific field.
```json
GET shakespeare/_search
{
"query": {
"exists": {
"field": "speaker"
}
}
}
```
## Wildcards
Use wildcard queries to search for terms that match a wildcard pattern.
Feature | Behavior
:--- | :---
`*` | Specifies all valid values.
`?` | Specifies a single valid value.
To search for terms that start with `H` and end with `Y`:
```json
GET shakespeare/_search
{
"query": {
"wildcard": {
"speaker": {
"value": "H*Y"
}
}
}
}
```
If we change `*` to `?`, we get no matches, because `?` refers to a single character.
Wildcard queries tend to be slow because they need to iterate over a lot of terms. Avoid placing wildcard characters at the beginning of a query because it could be a very expensive operation in terms of both resources and time.
## Regex
Use the `regex` query to search for terms that match a regular expression.
This regular expression matches any single uppercase or lowercase letter:
```json
GET shakespeare/_search
{
"query": {
"regexp": {
"play_name": "H[a-zA-Z]+mlet"
}
}
}
```
Regular expressions are applied to the terms in the field and not the entire value of the field.
The efficiency of your regular expression depends a lot on the patterns you write. Make sure that you write `regex` queries with either a prefix or suffix to improve performance.

View File

@ -0,0 +1,284 @@
---
layout: default
title: Reindex data
nav_order: 16
---
# Reindex data
After creating an index, you might need to make an extensive change such as adding a new field to every document or combining multiple indices to form a new one. Rather than deleting your index, making the change offline, and then indexing your data all over again, you can use the `reindex` operation.
With the `reindex` operation, you can copy all or a subset of documents that you select through a query to another index. Reindex is a `POST` operation. In its most basic form, you specify a source index and a destination index.
Reindexing can be an expensive operation depending on the size of your source index. We recommend you disable replicas in your destination index by setting `number_of_replicas` to `0` and re-enable them once the reindex process is complete.
{: .note }
---
#### Table of contents
1. TOC
{:toc}
---
## Reindex all documents
You can copy all documents from one index to another.
You first need to create a destination index with your desired field mappings and settings or you can copy the ones from your source index:
```json
PUT destination
{
"mappings":{
"Add in your desired mappings"
},
"settings":{
"Add in your desired settings"
}
}
```
This `reindex` command copies all the documents from a source index to a destination index:
```json
POST _reindex
{
"source":{
"index":"source"
},
"dest":{
"index":"destination"
}
}
```
If the destination index is not already created, the `reindex` operation creates a new destination index with default configurations.
## Reindex from a remote cluster
You can copy documents from an index in a remote cluster. Use the `remote` option to specify the remote hostname and the required login credentials.
This command reaches out to a remote cluster, logs in with the username and password, and copies all the documents from the source index in that remote cluster to the destination index in your local cluster:
```json
POST _reindex
{
"source":{
"remote":{
"host":"https://<REST_endpoint_of_remote_cluster>:9200",
"username":"YOUR_USERNAME",
"password":"YOUR_PASSWORD"
}
},
"dest":{
"index":"destination"
}
}
```
You can specify the following options:
Options | Valid values | Description | Required
:--- | :--- | :---
`host` | String | The REST endpoint of the remote cluster. | Yes
`username` | String | The username to log into the remote cluster. | No
`password` | String | The password to log into the remote cluster. | No
`socket_timeout` | Time Unit | The wait time for socket reads (default 30s). | No
`connect_timeout` | Time Unit | The wait time for remote connection timeouts (default 30s). | No
## Reindex a subset of documents
You can copy a specific set of documents that match a search query.
This command copies only a subset of documents matched by a query operation to the destination index:
```json
POST _reindex
{
"source":{
"index":"source",
"query": {
"match": {
"field_name": "text"
}
}
},
"dest":{
"index":"destination"
}
}
```
For a list of all query operations, see [Full-text queries](../full-text/).
## Combine one or more indices
You can combine documents from one or more indices by adding the source indices as a list.
This command copies all documents from two source indices to one destination index:
```json
POST _reindex
{
"source":{
"index":[
"source_1",
"source_2"
]
},
"dest":{
"index":"destination"
}
}
```
Make sure the number of shards for your source and destination indices are the same.
## Reindex only unique documents
You can copy only documents missing from a destination index by setting the `op_type` option to `create`.
In this case, if a document with the same ID already exists, the operation ignores the one from the source index.
To ignore all version conflicts of documents, set the `conflicts` option to `proceed`.
```json
POST _reindex
{
"conflicts":"proceed",
"source":{
"index":"source"
},
"dest":{
"index":"destination",
"op_type":"create"
}
}
```
## Reindex sorted documents
You can copy certain documents after sorting specific fields in the document.
This command copies the last 10 documents based on the `timestamp` field:
```json
POST _reindex
{
"size":10,
"source":{
"index":"source",
"sort":{
"timestamp":"desc"
}
},
"dest":{
"index":"destination"
}
}
```
## Transform documents during reindexing
You can transform your data during the reindexing process using the `script` option.
We recommend Painless for scripting in OpenSearch.
This command runs the source index through a Painless script that increments a `number` field inside an `account` object before copying it to the destination index:
```json
POST _reindex
{
"source":{
"index":"source"
},
"dest":{
"index":"destination"
},
"script":{
"lang":"painless",
"source":"ctx._account.number++"
}
}
```
You can also specify an ingest pipeline to transform your data during the reindexing process.
You would first have to create a pipeline with `processors` defined. You have a number of different `processors` available to use in your ingest pipeline.
Here's a sample ingest pipeline that defines a `split` processor that splits a `text` field based on a `space` separator and stores it in a new `word` field. The `script` processor is a Painless script that finds the length of the `word` field and stores it in a new `word_count` field. The `remove` processor removes the `test` field.
```json
PUT _ingest/pipeline/pipeline-test
{
"description": "Splits the text field into a list. Computes the length of the 'word' field and stores it in a new 'word_count' field. Removes the 'test' field.",
"processors": [
{
"split": {
"field": "text",
"separator": "\\s+",
"target_field": "word"
},
}
{
"script": {
"lang": "painless",
"source": "ctx.word_count = ctx.word.length"
}
},
{
"remove": {
"field": "test"
}
}
]
}
```
After creating a pipeline, you can use the `reindex` operation:
```json
POST _reindex
{
"source": {
"index": "source",
},
"dest": {
"index": "destination",
"pipeline": "pipeline-test"
}
}
```
## Update documents in the current index
To update the data in your current index itself without copying it to a different index, use the `update_by_query` operation.
The `update_by_query` operation is `POST` operation that you can perform on a single index at a time.
```json
POST <index_name>/_update_by_query
```
If you run this command with no parameters, it increments the version number for all documents in the index.
## Source index options
You can specify the following options for your source index:
Option | Valid values | Description | Required
:--- | :--- | :---
`index` | String | The name of the source index. You can provide multiple source indices as a list. | Yes
`max_docs` | Integer | The maximum number of documents to reindex. | No
`query` | Object | The search query to use for the reindex operation. | No
`size` | Integer | The number of documents to reindex. | No
`slice` | String | Specify manual or automatic slicing to parallelize reindexing. | No
`sort` | List | Sort specific fields in the document before reindexing. | No
## Destination index options
You can specify the following options for your destination index:
Option | Valid values | Description | Required
:--- | :--- | :---
`index` | String | The name of the destination index. | Yes
`version_type` | Enum | The version type for the indexing operation. Valid values: internal, external, external_gt, external_gte. | No

View File

@ -0,0 +1,169 @@
---
layout: default
title: Bulk
parent: REST API reference
nav_order: 15
---
# Bulk
The bulk operation lets you add, update, or delete many documents in a single request. Compared to individual OpenSearch indexing requests, the bulk operation has significant performance benefits. Whenever practical, we recommend batching indexing operations into bulk requests.
## Example
```json
POST _bulk
{ "delete": { "_index": "movies", "_id": "tt2229499" } }
{ "index": { "_index": "movies", "_id": "tt1979320" } }
{ "title": "Rush", "year": 2013 }
{ "create": { "_index": "movies", "_id": "tt1392214" } }
{ "title": "Prisoners", "year": 2013 }
{ "update": { "_index": "movies", "_id": "tt0816711" } }
{ "doc" : { "title": "World War Z" } }
```
## Path and HTTP methods
```
POST _bulk
POST {index}/_bulk
```
Specifying the index in the path means you don't need to include it in the [request body](#request-body).
OpenSearch also accepts PUT requests to the `_bulk` path, but we highly recommend using POST. The accepted usage of PUT---adding or replacing a single resource at a given path---doesn't make sense for bulk requests.
{: .note }
## URL parameters
All bulk URL parameters are optional.
Parameter | Type | Description
:--- | :--- | :---
pipeline | String | The pipeline ID for preprocessing documents.
refresh | Enum | Whether to refresh the affected shards after performing the indexing operations. Default is `false`. `true` makes the changes show up in search results immediately, but hurts cluster performance. `wait_for` waits for a refresh. Requests take longer to return, but cluster performance doesn't suffer.
require_alias | Boolean | Set to `true` to require that all actions target an index alias rather than an index. Default is `false`.
routing | String | Routes the request to the specified shard.
timeout | Time | How long to wait for the request to return. Default `1m`.
type | String | (Deprecated) The default document type for documents that don't specify a type. Default is `_doc`. We highly recommend ignoring this parameter and using a type of `_doc` for all indices.
wait_for_active_shards | String | Specifies the number of active shards that must be available before OpenSearch processes the bulk request. Default is 1 (only the primary shard). Set to `all` or a positive integer. Values greater than 1 require replicas. For example, if you specify a value of 3, the index must have two replicas distributed across two additional nodes for the request to succeed.
{% comment %}_source | List | asdf
_source_excludes | list | asdf
_source_includes | list | asdf{% endcomment %}
## Request body
The bulk request body follows this pattern:
```
Action and metadata\n
Optional document\n
Action and metadata\n
Optional document\n
```
The optional JSON document doesn't need to be minified---spaces are fine---but it does need to be on a single line. OpenSearch uses newline characters to parse bulk requests and requires that the request body end with a newline character.
All actions support the same metadata: `_index`, `_id`, and `_require_alias`. If you don't provide an ID, OpenSearch generates one automatically, which can make it challenging to update the document at a later time.
- Create
Creates a document if it doesn't already exist and returns an error otherwise. The next line must include a JSON document.
```json
{ "create": { "_index": "movies", "_id": "tt1392214" } }
{ "title": "Prisoners", "year": 2013 }
```
- Delete
This action deletes a document if it exists. If the document doesn't exist, OpenSearch doesn't return an error, but instead returns `not_found` under `result`. Delete actions don't require documents on the next line.
```json
{ "delete": { "_index": "movies", "_id": "tt2229499" } }
```
- Index
Index actions create a document if it doesn't yet exist and replace the document if it already exists. The next line must include a JSON document.
```json
{ "index": { "_index": "movies", "_id": "tt1979320" } }
{ "title": "Rush", "year": 2013}
```
- Update
This action updates existing documents and returns an error if the document doesn't exist. The next line must include a full or partial JSON document, depending on how much of the document you want to update. It can also include a script or upsert for more complex document updates.
```json
{ "update": { "_index": "movies", "_id": "tt0816711" } }
{ "doc" : { "title": "World War Z" } }
```
## Response
In the response, pay particular attention to the top-level `errors` boolean. If true, you can iterate over the individual actions for more detailed information.
```json
{
"took": 11,
"errors": true,
"items": [
{
"index": {
"_index": "movies",
"_type": "_doc",
"_id": "tt1979320",
"_version": 1,
"result": "created",
"_shards": {
"total": 2,
"successful": 1,
"failed": 0
},
"_seq_no": 1,
"_primary_term": 1,
"status": 201
}
},
{
"create": {
"_index": "movies",
"_type": "_doc",
"_id": "tt1392214",
"status": 409,
"error": {
"type": "version_conflict_engine_exception",
"reason": "[tt1392214]: version conflict, document already exists (current version [1])",
"index": "movies",
"shard": "0",
"index_uuid": "yhizhusbSWmP0G7OJnmcLg"
}
}
},
{
"update": {
"_index": "movies",
"_type": "_doc",
"_id": "tt0816711",
"status": 404,
"error": {
"type": "document_missing_exception",
"reason": "[_doc][tt0816711]: document missing",
"index": "movies",
"shard": "0",
"index_uuid": "yhizhusbSWmP0G7OJnmcLg"
}
}
}
]
}
```

View File

@ -0,0 +1,144 @@
---
layout: default
title: Cluster allocation explain
parent: REST API reference
nav_order: 30
---
# Cluster allocation explain
The most basic cluster allocation explain request finds an unassigned shard and explains why it can't be allocated to a node.
If you add some options, you can instead get information on a specific shard, including why OpenSearch assigned it to its current node.
## Example
```json
GET /_cluster/allocation/explain?include_yes_decisions=true
{
"index": "movies",
"shard": 0,
"primary": true
}
```
## Path and HTTP methods
```
GET _cluster/allocation/explain
POST _cluster/allocation/explain
```
## URL parameters
All cluster allocation explain parameters are optional.
Parameter | Type | Description
:--- | :--- | :---
include_yes_decisions | Boolean | OpenSearch makes a series of yes or no decisions when trying to allocate a shard to a node. If this parameter is true, OpenSearch includes the (generally more numerous) "yes" decisions in its response. Default is false.
include_disk_info | Boolean | Whether to include information about disk usage in the response. Default is false.
## Request body
All cluster allocation explain fields are optional.
Field | Type | Description
:--- | :--- | :---
current_node | String | If you only want an explanation if the shard happens to be on a particular node, specify that node name here.
index | String | The name of the shard's index.
primary | Boolean | Whether to provide an explanation for the primary shard (true) or its first replica (false), which share the same shard ID.
shard | Integer | The shard ID that you want an explanation for.
## Response
```json
{
"index": "movies",
"shard": 0,
"primary": true,
"current_state": "started",
"current_node": {
"id": "d8jRZcW1QmCBeVFlgOJx5A",
"name": "opensearch-node1",
"transport_address": "172.24.0.4:9300",
"weight_ranking": 1
},
"can_remain_on_current_node": "yes",
"can_rebalance_cluster": "yes",
"can_rebalance_to_other_node": "no",
"rebalance_explanation": "cannot rebalance as no target node exists that can both allocate this shard and improve the cluster balance",
"node_allocation_decisions": [{
"node_id": "vRxi4uPcRt2BtHlFoyCyTQ",
"node_name": "opensearch-node2",
"transport_address": "172.24.0.3:9300",
"node_decision": "no",
"weight_ranking": 1,
"deciders": [{
"decider": "max_retry",
"decision": "YES",
"explanation": "shard has no previous failures"
},
{
"decider": "replica_after_primary_active",
"decision": "YES",
"explanation": "shard is primary and can be allocated"
},
{
"decider": "enable",
"decision": "YES",
"explanation": "all allocations are allowed"
},
{
"decider": "node_version",
"decision": "YES",
"explanation": "can relocate primary shard from a node with version [1.0.0] to a node with equal-or-newer version [1.0.0]"
},
{
"decider": "snapshot_in_progress",
"decision": "YES",
"explanation": "no snapshots are currently running"
},
{
"decider": "restore_in_progress",
"decision": "YES",
"explanation": "ignored as shard is not being recovered from a snapshot"
},
{
"decider": "filter",
"decision": "YES",
"explanation": "node passes include/exclude/require filters"
},
{
"decider": "same_shard",
"decision": "NO",
"explanation": "a copy of this shard is already allocated to this node [[movies][0], node[vRxi4uPcRt2BtHlFoyCyTQ], [R], s[STARTED], a[id=x8w7QxWdQQa188HKGn0iMQ]]"
},
{
"decider": "disk_threshold",
"decision": "YES",
"explanation": "enough disk for shard on node, free: [35.9gb], shard size: [15.1kb], free after allocating shard: [35.9gb]"
},
{
"decider": "throttling",
"decision": "YES",
"explanation": "below shard recovery limit of outgoing: [0 < 2] incoming: [0 < 2]"
},
{
"decider": "shards_limit",
"decision": "YES",
"explanation": "total shard limits are disabled: [index: -1, cluster: -1] <= 0"
},
{
"decider": "awareness",
"decision": "YES",
"explanation": "allocation awareness is not enabled, set cluster setting [cluster.routing.allocation.awareness.attributes] to enable it"
}
]
}]
}
```

View File

@ -0,0 +1,10 @@
---
layout: default
title: REST API reference
nav_order: 99
has_children: true
---
# REST API reference
OpenSearch uses its REST API for most operations. This _incomplete_ section includes REST API paths, HTTP verbs, supported parameters, request body details, and sample responses.

View File

@ -0,0 +1,415 @@
---
layout: default
title: Search templates
nav_order: 50
---
# Search templates
You can convert your full-text queries into a search template to accept user input and dynamically insert it into your query.
For example, if you use OpenSearch as a backend search engine for your application or website, you can take in user queries from a search bar or a form field and pass them as parameters into a search template. That way, the syntax to create OpenSearch queries is abstracted from your end users.
When you're writing code to convert user input into OpenSearch queries, you can simplify your code with search templates. If you need to add fields to your search query, you can just modify the template without making changes to your code.
Search templates use the Mustache language. For a list of all syntax options, see the [Mustache manual](http://mustache.github.io/mustache.5.html).
{: .note }
## Create search templates
A search template has two components: the query and the parameters. Parameters are user-inputted values that get placed into variables. Variables are represented with double braces in Mustache notation. When encountering a variable like `{% raw %}{{var}}{% endraw %}` in the query, OpenSearch goes to the `params` section, looks for a parameter called `var`, and replaces it with the specified value.
You can code your application to ask your user what they want to search for and then plug that value into the `params` object at runtime.
This command defines a search template to find a play by its name. The `{% raw %}{{play_name}}{% endraw %}` in the query is replaced by the value `Henry IV`:
```json
GET _search/template
{
"source": {
"query": {
"match": {
"play_name": "{% raw %}{{play_name}}{% endraw %}"
}
}
},
"params": {
"play_name": "Henry IV"
}
}
```
This template runs the search on your entire cluster.
To run this search on a specific index, add the index name to the request:
```json
GET shakespeare/_search/template
```
Specify the `from` and `size` parameters:
```json
GET _search/template
{
"source": {
"from": "{% raw %}{{from}}{% endraw %}",
"size": "{% raw %}{{size}}{% endraw %}",
"query": {
"match": {
"play_name": "{% raw %}{{play_name}}{% endraw %}"
}
}
},
"params": {
"play_name": "Henry IV",
"from": 10,
"size": 10
}
}
```
To improve the search experience, you can define defaults so the user doesnt have to specify every possible parameter. If the parameter is not defined in the `params` section, OpenSearch uses the default value.
The syntax for defining the default value for a variable `var` is as follows:
```json
{% raw %}{{var}}{{^var}}default value{{/var}}{% endraw %}
```
This command sets the defaults for `from` as 10 and `size` as 10:
```json
GET _search/template
{
"source": {
"from": "{% raw %}{{from}}{{^from}}10{{/from}}{% endraw %}",
"size": "{% raw %}{{size}}{{^size}}10{{/size}}{% endraw %}",
"query": {
"match": {
"play_name": "{% raw %}{{play_name}}{% endraw %}"
}
}
},
"params": {
"play_name": "Henry IV"
}
}
```
## Save and execute search templates
After the search template works the way you want it to, you can save the source of that template as a script, making it reusable for different input parameters.
When saving the search template as a script, you need to specify the `lang` parameter as `mustache`:
```json
POST _scripts/play_search_template
{
"script": {
"lang": "mustache",
"source": {
"from": "{% raw %}{{from}}{{^from}}0{{/from}}{% endraw %}",
"size": "{% raw %}{{size}}{{^size}}10{{/size}}{% endraw %}",
"query": {
"match": {
"play_name": "{{play_name}}"
}
}
},
"params": {
"play_name": "Henry IV"
}
}
}
```
Now you can reuse the template by referring to its `id` parameter.
You can reuse this source template for different input values.
```json
GET _search/template
{
"id": "play_search_template",
"params": {
"play_name": "Henry IV",
"from": 0,
"size": 1
}
}
```
#### Sample output
```json
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 6,
"successful": 6,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 3205,
"relation": "eq"
},
"max_score": 3.641852,
"hits": [
{
"_index": "shakespeare",
"_type": "_doc",
"_id": "4",
"_score": 3.641852,
"_source": {
"type": "line",
"line_id": 5,
"play_name": "Henry IV",
"speech_number": 1,
"line_number": "1.1.2",
"speaker": "KING HENRY IV",
"text_entry": "Find we a time for frighted peace to pant,"
}
}
]
}
}
```
If you have a stored template and want to validate it, use the `render` operation:
```json
POST _render/template
{
"id": "play_search_template",
"params": {
"play_name": "Henry IV"
}
}
```
#### Sample output
```json
{
"template_output": {
"from": "0",
"size": "10",
"query": {
"match": {
"play_name": "Henry IV"
}
}
}
}
```
## Advanced parameter conversion with search templates
You have a lot of different syntax options in Mustache to transpose the input parameters into a query.
You can specify conditions, run loops, join arrays, convert arrays to JSON, and so on.
### Conditions
Use the section tag in Mustache to represent conditions:
```json
{% raw %}{{#var}}var{{/var}}{% endraw %}
```
When `var` is a boolean value, this syntax acts as an `if` condition. The `{% raw %}{{#var}}{% endraw %}` and `{% raw %}{{/var}}{% endraw %}` tags insert the values placed between them only if `var` evaluates to `true`.
Using section tags would make your JSON invalid, so you must write your query in a string format instead.
This command includes the `size` parameter in the query only when the `limit` parameter is set to `true`.
In the following example, the `limit` parameter is `true`, so the `size` parameter is activated. As a result, you would get back only two documents.
```json
GET _search/template
{
"source": "{% raw %}{ {{#limit}} \"size\": \"{{size}}\", {{/limit}} \"query\":{\"match\":{\"play_name\": \"{{play_name}}\"}}}{% endraw %}",
"params": {
"play_name": "Henry IV",
"limit": true,
"size": 2
}
}
```
You can also design an `if-else` condition.
This command sets `size` to `2` if `limit` is `true`. Otherwise, it sets `size` to `10`.
```json
GET _search/template
{
"source": "{% raw %}{ {{#limit}} \"size\": \"2\", {{/limit}} {{^limit}} \"size\": \"10\", {{/limit}} \"query\":{\"match\":{\"play_name\": \"{{play_name}}\"}}}{% endraw %}",
"params": {
"play_name": "Henry IV",
"limit": true
}
}
```
### Loops
You can also use the section tag to implement a foreach loop:
```
{% raw %}{{#var}}{{.}}}{{/var}}{% endraw %}
```
When `var` is an array, the search template iterates through it and creates a `terms` query.
```json
GET _search/template
{
"source": "{% raw %}{\"query\":{\"terms\":{\"play_name\":[\"{{#play_name}}\",\"{{.}}\",\"{{/play_name}}\"]}}}{% endraw %}",
"params": {
"play_name": [
"Henry IV",
"Othello"
]
}
}
```
This template is rendered as:
```json
GET _search/template
{
"source": {
"query": {
"terms": {
"play_name": [
"Henry IV",
"Othello"
]
}
}
}
}
```
### Join
You can use the `join` tag to concatenate values of an array (separated by commas):
```json
GET _search/template
{
"source": {
"query": {
"match": {
"text_entry": "{% raw %}{{#join}}{{text_entry}}{{/join}}{% endraw %}"
}
}
},
"params": {
"text_entry": [
"To be",
"or not to be"
]
}
}
```
Renders as:
```json
GET _search/template
{
"source": {
"query": {
"match": {
"text_entry": "{0=To be, 1=or not to be}"
}
}
}
}
```
### Convert to JSON
You can use the `toJson` tag to convert parameters to their JSON representation:
```json
GET _search/template
{
"source": "{\"query\":{\"bool\":{\"must\":[{\"terms\": {\"text_entries\": {% raw %}{{#toJson}}text_entries{{/toJson}}{% endraw %} }}] }}}",
"params": {
"text_entries": [
{ "term": { "text_entry" : "love" } },
{ "term": { "text_entry" : "soldier" } }
]
}
}
```
Renders as:
```json
GET _search/template
{
"source": {
"query": {
"bool": {
"must": [
{
"terms": {
"text_entries": [
{
"term": {
"text_entry": "love"
}
},
{
"term": {
"text_entry": "soldier"
}
}
]
}
}
]
}
}
}
}
```
## Multiple search templates
You can bundle multiple search templates and send them to your OpenSearch cluster in a single request using the `msearch` operation.
This saves network round trip time, so you get back the response more quickly as compared to independent requests.
```json
GET _msearch/template
{"index":"shakespeare"}
{"id":"if_search_template","params":{"play_name":"Henry IV","limit":false,"size":2}}
{"index":"shakespeare"}
{"id":"play_search_template","params":{"play_name":"Henry IV"}}
```
## Manage search templates
To list all scripts, run the following command:
```json
GET _cluster/state/metadata?pretty&filter_path=**.stored_scripts
```
To retrieve a specific search template, run the following command:
```json
GET _scripts/<name_of_search_template>
```
To delete a search template, run the following command:
```json
DELETE _scripts/<name_of_search_template>
```
---

View File

@ -0,0 +1,381 @@
---
layout: default
title: Take and restore snapshots
nav_order: 65
---
# Take and restore snapshots
Snapshots are backups of a cluster's indices and state. State includes cluster settings, node information, index settings, and shard allocation.
Snapshots have two main uses:
- **Recovering from failure**
For example, if cluster health goes red, you might restore the red indices from a snapshot.
- **Migrating from one cluster to another**
For example, if you're moving from a proof-of-concept to a production cluster, you might take a snapshot of the former and restore it on the latter.
---
#### Table of contents
1. TOC
{:toc}
---
## About snapshots
Snapshots aren't instantaneous. They take time to complete and do not represent perfect point-in-time views of the cluster. While a snapshot is in progress, you can still index documents and make other requests to the cluster, but new documents and updates to existing documents generally aren't included in the snapshot. The snapshot includes primary shards as they existed when OpenSearch initiated the snapshot. Depending on the size of your snapshot thread pool, different shards might be included in the snapshot at slightly different times.
OpenSearch snapshots are incremental, meaning that they only store data that has changed since the last successful snapshot. The difference in disk usage between frequent and infrequent snapshots is often minimal.
In other words, taking hourly snapshots for a week (for a total of 168 snapshots) might not use much more disk space than taking a single snapshot at the end of the week. Also, the more frequently you take snapshots, the less time they take to complete. Some OpenSearch users take snapshots as often as every half hour.
If you need to delete a snapshot, be sure to use the OpenSearch API rather than navigating to the storage location and purging files. Incremental snapshots from a cluster often share a lot of the same data; when you use the API, OpenSearch only removes data that no other snapshot is using.
{: .tip }
## Register repository
Before you can take a snapshot, you have to "register" a snapshot repository. A snapshot repository is just a storage location: a shared file system, Amazon S3, Hadoop Distributed File System (HDFS), Azure Storage, etc.
### Shared file system
1. To use a shared file system as a snapshot repository, add it to `opensearch.yml`:
```yml
path.repo: ["/mnt/snapshots"]
```
On the RPM and Debian installs, you can then mount the file system. If you're using the Docker install, add the file system to each node in `docker-compose.yml` before starting the cluster:
```yml
volumes:
- /Users/jdoe/snapshots:/mnt/snapshots
```
1. Then register the repository using the REST API:
```json
PUT _snapshot/my-fs-repository
{
"type": "fs",
"settings": {
"location": "/mnt/snapshots"
}
}
```
If the request is successful, the response from OpenSearch is minimal:
```json
{
"acknowledged": true
}
```
You probably only need to specify `location`, but the following table summarizes the options:
Setting | Description
:--- | :---
`location` | The shared file system for snapshots. Required.
`chunk_size` | Breaks large files into chunks during snapshot operations (e.g. `64mb`, `1gb`), which is important for cloud storage providers and far less important for shared file systems. Default is `null` (unlimited). Optional.
`compress` | Whether to compress metadata files. This setting does not affect data files, which might already be compressed, depending on your index settings. Default is `false`. Optional.
`max_restore_bytes_per_sec` | The maximum rate at which snapshots restore. Default is 40 MB per second (`40m`). Optional.
`max_snapshot_bytes_per_sec` | The maximum rate at which snapshots take. Default is 40 MB per second (`40m`). Optional.
`readonly` | Whether the repository is read-only. Useful when migrating from one cluster (`"readonly": false` when registering) to another cluster (`"readonly": true` when registering). Optional.
### Amazon S3
1. To use an Amazon S3 bucket as a snapshot repository, install the `repository-s3` plugin on all nodes:
```bash
sudo ./bin/opensearch-plugin install repository-s3
```
If you're using the Docker installation, see [Customize the Docker image](../../install/docker/#customize-the-docker-image). Your `Dockerfile` should look something like this:
```
FROM opensearchproject/opensearch:{{site.opensearch_version}}
ENV AWS_ACCESS_KEY_ID <access-key>
ENV AWS_SECRET_ACCESS_KEY <secret-key>
# Optional
ENV AWS_SESSION_TOKEN <optional-session-token>
RUN /usr/share/opensearch/bin/opensearch-plugin install --batch repository-s3
RUN /usr/share/opensearch/bin/opensearch-keystore create
RUN echo $AWS_ACCESS_KEY_ID | /usr/share/opensearch/bin/opensearch-keystore add --stdin s3.client.default.access_key
RUN echo $AWS_SECRET_ACCESS_KEY | /usr/share/opensearch/bin/opensearch-keystore add --stdin s3.client.default.secret_key
# Optional
RUN echo $AWS_SESSION_TOKEN | /usr/share/opensearch/bin/opensearch-keystore add --stdin s3.client.default.session_token
```
After the Docker cluster starts, skip to step 7.
1. Add your AWS access and secret keys to the OpenSearch keystore:
```bash
sudo ./bin/opensearch-keystore add s3.client.default.access_key
sudo ./bin/opensearch-keystore add s3.client.default.secret_key
```
1. (Optional) If you're using temporary credentials, add your session token:
```bash
sudo ./bin/opensearch-keystore add s3.client.default.session_token
```
1. (Optional) If you connect to the internet through a proxy, add those credentials:
```bash
sudo ./bin/opensearch-keystore add s3.client.default.proxy.username
sudo ./bin/opensearch-keystore add s3.client.default.proxy.password
```
1. (Optional) Add other settings to `opensearch.yml`:
```yml
s3.client.default.disable_chunked_encoding: false # Disables chunked encoding for compatibility with some storage services, but you probably don't need to change this value.
s3.client.default.endpoint: s3.amazonaws.com # S3 has alternate endpoints, but you probably don't need to change this value.
s3.client.default.max_retries: 3 # number of retries if a request fails
s3.client.default.path_style_access: false # whether to use the deprecated path-style bucket URLs.
# You probably don't need to change this value, but for more information, see https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html#path-style-access.
s3.client.default.protocol: https # http or https
s3.client.default.proxy.host: my-proxy-host # the hostname for your proxy server
s3.client.default.proxy.port: 8080 # port for your proxy server
s3.client.default.read_timeout: 50s # the S3 connection timeout
s3.client.default.use_throttle_retries: true # whether the client should wait a progressively longer amount of time (exponential backoff) between each successive retry
```
1. If you changed `opensearch.yml`, you must restart each node in the cluster. Otherwise, you only need to reload secure cluster settings:
```
POST _nodes/reload_secure_settings
```
1. Create an S3 bucket if you don't already have one. To take snapshots, you need permissions to access the bucket. The following IAM policy is an example of those permissions:
```json
{
"Version": "2012-10-17",
"Statement": [{
"Action": [
"s3:*"
],
"Effect": "Allow",
"Resource": [
"arn:aws:s3:::your-bucket",
"arn:aws:s3:::your-bucket/*"
]
}]
}
```
1. Register the repository using the REST API:
```json
PUT _snapshot/my-s3-repository
{
"type": "s3",
"settings": {
"bucket": "my-s3-bucket",
"base_path": "my/snapshot/directory"
}
}
```
You probably don't need to specify anything but `bucket` and `base_path`, but the following table summarizes the options:
Setting | Description
:--- | :---
`base_path` | The path within the bucket where you want to store snapshots (e.g. `my/snapshot/directory`). Optional. If not specified, snapshots are stored in the bucket root.
`bucket` | Name of the S3 bucket. Required.
`buffer_size` | The threshold beyond which chunks (of `chunk_size`) should be broken into pieces (of `buffer_size`) and sent to S3 using a different API. Default is the smaller of two values: 100 MB or 5% of the Java heap. Valid values are between `5mb` and `5gb`. We don't recommend changing this option.
`canned_acl` | S3 has several [canned ACLs](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl) that the `repository-s3` plugin can add to objects as it creates them in S3. Default is `private`. Optional.
`chunk_size` | Breaks files into chunks during snapshot operations (e.g. `64mb`, `1gb`), which is important for cloud storage providers and far less important for shared file systems. Default is `1gb`. Optional.
`client` | When specifying client settings (e.g. `s3.client.default.access_key`), you can use a string other than `default` (e.g. `s3.client.backup-role.access_key`). If you used an alternate name, change this value to match. Default and recommended value is `default`. Optional.
`compress` | Whether to compress metadata files. This setting does not affect data files, which might already be compressed, depending on your index settings. Default is `false`. Optional.
`max_restore_bytes_per_sec` | The maximum rate at which snapshots restore. Default is 40 MB per second (`40m`). Optional.
`max_snapshot_bytes_per_sec` | The maximum rate at which snapshots take. Default is 40 MB per second (`40m`). Optional.
`readonly` | Whether the repository is read-only. Useful when migrating from one cluster (`"readonly": false` when registering) to another cluster (`"readonly": true` when registering). Optional.
`server_side_encryption` | Whether to encrypt snapshot files in the S3 bucket. This setting uses AES-256 with S3-managed keys. See [Protecting data using server-side encryption](https://docs.aws.amazon.com/AmazonS3/latest/dev/serv-side-encryption.html). Default is false. Optional.
`storage_class` | Specifies the [S3 storage class](https://docs.aws.amazon.com/AmazonS3/latest/dev/storage-class-intro.html) for the snapshots files. Default is `standard`. Do not use the `glacier` and `deep_archive` storage classes. Optional.
## Take snapshots
You specify two pieces of information when you create a snapshot:
- Name of your snapshot repository
- Name for the snapshot
The following snapshot includes all indices and the cluster state:
```json
PUT _snapshot/my-repository/1
```
You can also add a request body to include or exclude certain indices or specify other settings:
```json
PUT _snapshot/my-repository/2
{
"indices": "opensearch-dashboards*,my-index*,-my-index-2016",
"ignore_unavailable": true,
"include_global_state": false,
"partial": false
}
```
Setting | Description
:--- | :---
`indices` | The indices you want to include in the snapshot. You can use `,` to create a list of indices, `*` to specify an index pattern, and `-` to exclude certain indices. Don't put spaces between items. Default is all indices.
`ignore_unavailable` | If an index from the `indices` list doesn't exist, whether to ignore it rather than fail the snapshot. Default is false.
`include_global_state` | Whether to include cluster state in the snapshot. Default is true.
`partial` | Whether to allow partial snapshots. Default is false, which fails the entire snapshot if one or more shards fails to store.
If you request the snapshot immediately after taking it, you might see something like this:
```json
GET _snapshot/my-repository/2
{
"snapshots": [{
"snapshot": "2",
"version": "6.5.4",
"indices": [
"opensearch_dashboards_sample_data_ecommerce",
"my-index",
"opensearch_dashboards_sample_data_logs",
"opensearch_dashboards_sample_data_flights"
],
"include_global_state": true,
"state": "IN_PROGRESS",
...
}]
}
```
Note that the snapshot is still in progress. If you want to wait for the snapshot to finish before continuing, add the `wait_for_completion` parameter to your request. Snapshots can take a while to complete, so consider whether or not this option fits your use case:
```
PUT _snapshot/my-repository/3?wait_for_completion=true
```
Snapshots have the following states:
State | Description
:--- | :---
SUCCESS | The snapshot successfully stored all shards.
IN_PROGRESS | The snapshot is currently running.
PARTIAL | At least one shard failed to store successfully. Can only occur if you set `partial` to `true` when taking the snapshot.
FAILED | The snapshot encountered an error and stored no data.
INCOMPATIBLE | The snapshot is incompatible with the version of OpenSearch running on this cluster. See [Conflicts and compatibility](#conflicts-and-compatibility).
You can't take a snapshot if one is currently in progress. To check the status:
```
GET _snapshot/_status
```
## Restore snapshots
The first step in restoring a snapshot is retrieving existing snapshots. To see all snapshot repositories:
```
GET _snapshot/_all
```
To see all snapshots in a repository:
```
GET _snapshot/my-repository/_all
```
Then restore a snapshot:
```
POST _snapshot/my-repository/2/_restore
```
Just like when taking a snapshot, you can add a request body to include or exclude certain indices or specify some other settings:
```json
POST _snapshot/my-repository/2/_restore
{
"indices": "opensearch-dashboards*,my-index*",
"ignore_unavailable": true,
"include_global_state": false,
"include_aliases": false,
"partial": false,
"rename_pattern": "opensearch-dashboards(.+)",
"rename_replacement": "restored-opensearch-dashboards$1",
"index_settings": {
"index.blocks.read_only": false
},
"ignore_index_settings": [
"index.refresh_interval"
]
}
```
Setting | Description
:--- | :---
`indices` | The indices you want to restore. You can use `,` to create a list of indices, `*` to specify an index pattern, and `-` to exclude certain indices. Don't put spaces between items. Default is all indices.
`ignore_unavailable` | If an index from the `indices` list doesn't exist, whether to ignore it rather than fail the restore operation. Default is false.
`include_global_state` | Whether to restore the cluster state. Default is false.
`include_aliases` | Whether to restore aliases alongside their associated indices. Default is true.
`partial` | Whether to allow the restoration of partial snapshots. Default is false.
`rename_pattern` | If you want to rename indices as you restore them, use this option to specify a regular expression that matches all indices you want to restore. Use capture groups (`()`) to reuse portions of the index name.
`rename_replacement` | If you want to rename indices as you restore them, use this option to specify the replacement pattern. Use `$0` to include the entire matching index name, `$1` to include the content of the first capture group, etc.
`index_settings` | If you want to change index settings on restore, specify them here.
`ignore_index_settings` | Rather than explicitly specifying new settings with `index_settings`, you can ignore certain index settings in the snapshot and use the cluster defaults on restore.
### Conflicts and compatibility
One way to avoid naming conflicts when restoring indices is to use the `rename_pattern` and `rename_replacement` options. Then, if necessary, you can use the `_reindex` API to combine the two. The simpler way is to delete existing indices prior to restoring from a snapshot.
You can use the `_close` API to close existing indices prior to restoring from a snapshot, but the index in the snapshot has to have the same number of shards as the existing index.
We recommend ceasing write requests to a cluster before restoring from a snapshot, which helps avoid scenarios such as:
1. You delete an index, which also deletes its alias.
1. A write request to the now-deleted alias creates a new index with the same name as the alias.
1. The alias from the snapshot fails to restore due to a naming conflict with the new index.
Snapshots are only forward-compatible by one major version. If you have an old snapshot, you can sometimes restore it into an intermediate cluster, reindex all indices, take a new snapshot, and repeat until you arrive at your desired version, but you might find it easier to just manually index your data on the new cluster.
## Security plugin considerations
If you're using the security plugin, snapshots have some additional restrictions:
- To perform snapshot and restore operations, users must have the built-in `manage_snapshots` role.
- You can't restore snapshots that contain global state or the `.opensearch_security` index.
If a snapshot contains global state, you must exclude it when performing the restore. If your snapshot also contains the `.opensearch_security` index, either exclude it or list all the other indices you want to include:
```json
POST _snapshot/my-repository/3/_restore
{
"indices": "-.opensearch_security",
"include_global_state": false
}
```
The `.opensearch_security` index contains sensitive data, so we recommend excluding it when you take a snapshot. If you do need to restore the index from a snapshot, you must include an admin certificate in the request:
```bash
curl -k --cert ./kirk.pem --key ./kirk-key.pem -XPOST 'https://localhost:9200/_snapshot/my-repository/3/_restore?pretty'
```

View File

@ -0,0 +1,230 @@
---
layout: default
title: Tasks API
nav_order: 25
---
# Tasks API operation
A task is any operation you run in a cluster. For example, searching your data collection of books for a title or author name is a task. When you run OpenSearch, a task is automatically created to monitor your cluster's health and performance. For more information about all of the tasks currently executing in your cluster, you can use the `tasks` API operation.
The following request returns information about all of your tasks:
```
GET _tasks
```
By including a task ID, you can get information specific to a particular task. Note that a task ID consists of a node's identifying string and the task's numerical ID. For example, if your node's identifying string is `nodestring` and the task's numerical ID is `1234`, then your task ID is `nodestring:1234`. You can find this information by running the `tasks` operation:
```
GET _tasks/<task_id>
```
Note that if a task finishes running, it won't be returned as part of your request. For an example of a task that takes a little longer to finish, you can run the [`_reindex`](../reindex-data) API operation on a larger document, and then run `tasks`.
**Sample Response**
```json
{
"nodes": {
"Mgqdm0r9SEGClWxp_RbnaQ": {
"name": "opensearch-node1",
"transport_address": "172.18.0.3:9300",
"host": "172.18.0.3",
"ip": "172.18.0.3:9300",
"roles": [
"data",
"ingest",
"master",
"remote_cluster_client"
],
"tasks": {
"Mgqdm0r9SEGClWxp_RbnaQ:17416": {
"node": "Mgqdm0r9SEGClWxp_RbnaQ",
"id": 17416,
"type": "transport",
"action": "cluster:monitor/tasks/lists",
"start_time_in_millis": 1613599752458,
"running_time_in_nanos": 994000,
"cancellable": false,
"headers": {}
},
"Mgqdm0r9SEGClWxp_RbnaQ:17413": {
"node": "Mgqdm0r9SEGClWxp_RbnaQ",
"id": 17413,
"type": "transport",
"action": "indices:data/write/bulk",
"start_time_in_millis": 1613599752286,
"running_time_in_nanos": 172846500,
"cancellable": false,
"parent_task_id": "Mgqdm0r9SEGClWxp_RbnaQ:17366",
"headers": {}
},
"Mgqdm0r9SEGClWxp_RbnaQ:17366": {
"node": "Mgqdm0r9SEGClWxp_RbnaQ",
"id": 17366,
"type": "transport",
"action": "indices:data/write/reindex",
"start_time_in_millis": 1613599750929,
"running_time_in_nanos": 1529733100,
"cancellable": true,
"headers": {}
}
}
}
}
}
```
You can also use the following parameters with your query.
Parameter | Data type | Description |
:--- | :--- | :---
`nodes` | List | A comma-separated list of node IDs or names to limit the returned information. Use `_local` to return information from the node you're connecting to, specify the node name to get information from specific nodes, or keep the parameter empty to get information from all nodes.
`actions` | List | A comma-separated list of actions that should be returned. Keep empty to return all.
`detailed` | Boolean | Returns detailed task information. (Default: false)
`parent_task_id` | String | Returns tasks with a specified parent task ID (node_id:task_number). Keep empty or set to -1 to return all.
`wait_for_completion` | Boolean | Waits for the matching tasks to complete. (Default: false)
`group_by` | Enum | Groups tasks by parent/child relationships or nodes. (Default: nodes)
`timeout` | Time | An explicit operation timeout. (Default: 30 seconds)
`master_timeout` | Time | The time to wait for a connection to the primary node. (Default: 30 seconds)
For example, this request returns tasks currently running on a node named `opensearch-node1`:
**Sample Request**
```
GET /_tasks?nodes=opensearch-node1
```
**Sample Response**
```json
{
"nodes": {
"Mgqdm0r9SEGClWxp_RbnaQ": {
"name": "opensearch-node1",
"transport_address": "sample_address",
"host": "sample_host",
"ip": "sample_ip",
"roles": [
"data",
"ingest",
"master",
"remote_cluster_client"
],
"tasks": {
"Mgqdm0r9SEGClWxp_RbnaQ:24578": {
"node": "Mgqdm0r9SEGClWxp_RbnaQ",
"id": 24578,
"type": "transport",
"action": "cluster:monitor/tasks/lists",
"start_time_in_millis": 1611612517044,
"running_time_in_nanos": 638700,
"cancellable": false,
"headers": {}
},
"Mgqdm0r9SEGClWxp_RbnaQ:24579": {
"node": "Mgqdm0r9SEGClWxp_RbnaQ",
"id": 24579,
"type": "direct",
"action": "cluster:monitor/tasks/lists[n]",
"start_time_in_millis": 1611612517044,
"running_time_in_nanos": 222200,
"cancellable": false,
"parent_task_id": "Mgqdm0r9SEGClWxp_RbnaQ:24578",
"headers": {}
}
}
}
}
}
```
## Task canceling
After getting a list of tasks, you can cancel all cancelable tasks with the following request:
```
POST _tasks/_cancel
```
Note that not all tasks are cancelable. To see if a task is cancelable, refer to the `cancellable` field in the response to your `tasks` API request.
You can also cancel a task by including a specific task ID.
```
POST _tasks/<task_id>/_cancel
```
The `cancel` operation supports the same parameters as the `tasks` operation. The following example shows how to cancel all cancelable tasks on multiple nodes.
```
POST _tasks/_cancel?nodes=opensearch-node1,opensearch-node2
```
## Attaching headers to tasks
To associate requests with tasks for better tracking, you can provide a `X-Opaque-Id:<ID_number>` header as part of the HTTPS request reader of your `curl` command. The API will attach the specified header in the returned result.
Usage:
```bash
curl -i -H "X-Opaque-Id: 111111" "https://localhost:9200/_tasks" -u 'admin:admin' --insecure
```
The `_tasks` operation returns the following result.
```json
HTTP/1.1 200 OK
X-Opaque-Id: 111111
content-type: application/json; charset=UTF-8
content-length: 768
{
"nodes": {
"Mgqdm0r9SEGClWxp_RbnaQ": {
"name": "opensearch-node1",
"transport_address": "172.18.0.4:9300",
"host": "172.18.0.4",
"ip": "172.18.0.4:9300",
"roles": [
"data",
"ingest",
"master",
"remote_cluster_client"
],
"tasks": {
"Mgqdm0r9SEGClWxp_RbnaQ:30072": {
"node": "Mgqdm0r9SEGClWxp_RbnaQ",
"id": 30072,
"type": "direct",
"action": "cluster:monitor/tasks/lists[n]",
"start_time_in_millis": 1613166701725,
"running_time_in_nanos": 245400,
"cancellable": false,
"parent_task_id": "Mgqdm0r9SEGClWxp_RbnaQ:30071",
"headers": {
"X-Opaque-Id": "111111"
}
},
"Mgqdm0r9SEGClWxp_RbnaQ:30071": {
"node": "Mgqdm0r9SEGClWxp_RbnaQ",
"id": 30071,
"type": "transport",
"action": "cluster:monitor/tasks/lists",
"start_time_in_millis": 1613166701725,
"running_time_in_nanos": 658200,
"cancellable": false,
"headers": {
"X-Opaque-Id": "111111"
}
}
}
}
}
}
```
This operation supports the same parameters as the `tasks` operation. The following example shows how you can associate `X-Opaque-Id` with specific tasks:
```bash
curl -i -H "X-Opaque-Id: 123456" "https://localhost:9200/_tasks?nodes=opensearch-node1" -u 'admin:admin' --insecure
```

18
_opensearch_docs/units.md Normal file
View File

@ -0,0 +1,18 @@
---
layout: default
title: Supported units
nav_order: 90
---
# Supported units
OpenSearch supports the following units for all REST operations:
Unit | Description | Example
:--- | :--- | :---
Times | The supported units for time are `d` for days, `h` for hours, `m` for minutes, `s` for seconds, `ms` for milliseconds, `micros` for microseconds, and `nanos` for nanoseconds. | `5d` or `7h`
Bytes | The supported units for byte size are `b` for bytes, `kb` for kibibytes, `mb` for mebibytes, `gb` for gibibytes, `tb` for tebibytes, and `pb` for pebibytes. Despite the base-10 abbreviations, these units are base-2; `1kb` is 1,024 bytes, `1mb` is 1,048,576 bytes, etc. | `7kb` or `6gb`
Distances | The supported units for distance are `mi` for miles, `yd` for yards, `ft` for feet, `in` for inches, `km` for kilometers, `m` for meters, `cm` for centimeters, `mm` for millimeters, and `nmi` or `NM` for nautical miles. | `5mi` or `4ft`
Quantities without units | For large values that don't have a unit, use `k` for kilo, `m` for mega, `g` for giga, `t` for tera, and `p` for peta. | `5k` for 5,000
To convert output units to human-readable values, see [Common REST parameters](../common-parameters/).

1076
_opensearch_docs/ux.md Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,165 @@
---
layout: default
title: Anomaly detection
nav_order: 46
has_children: true
---
# Anomaly detection
An anomaly in OpenSearch is any unusual behavior change in your time-series data. Anomalies can provide valuable insights into your data. For example, for IT infrastructure data, an anomaly in the memory usage metric might help you uncover early signs of a system failure.
It can be challenging to discover anomalies using conventional methods such as creating visualizations and dashboards. You could configure an alert based on a static threshold, but this requires prior domain knowledge and isn't adaptive to data that exhibits organic growth or seasonal behavior.
Anomaly detection automatically detects anomalies in your OpenSearch data in near real-time using the Random Cut Forest (RCF) algorithm. RCF is an unsupervised machine learning algorithm that models a sketch of your incoming data stream to compute an `anomaly grade` and `confidence score` value for each incoming data point. These values are used to differentiate an anomaly from normal variations. For more information about how RCF works, see [Random Cut Forests](https://pdfs.semanticscholar.org/8bba/52e9797f2e2cc9a823dbd12514d02f29c8b9.pdf?_ga=2.56302955.1913766445.1574109076-1059151610.1574109076).
You can pair the anomaly detection plugin with the [alerting plugin](../alerting/) to notify you as soon as an anomaly is detected.
To use the anomaly detection plugin, your computer needs to have more than one CPU core.
{: .note }
## Get started with Anomaly Detection
To get started, choose **Anomaly Detection** in OpenSearch Dashboards.
To first test with sample streaming data, choose **Sample Detectors** and try out one of the preconfigured detectors.
### Step 1: Create a detector
A detector is an individual anomaly detection task. You can create multiple detectors, and all the detectors can run simultaneously, with each analyzing data from different sources.
1. Choose **Create Detector**.
1. Enter a name and brief description. Make sure the name is unique and descriptive enough to help you to identify the purpose of the detector.
1. For **Data source**, choose the index you want to use as the data source. You can optionally use index patterns to choose multiple indices.
1. Select the **Timestamp field** in your index.
1. (Optional) For **Data filter**, filter the index you chose as the data source. From the **Filter type** menu, choose **Visual filter**, and then design your filter query by selecting **Fields**, **Operator**, and **Value**, or choose **Custom Expression** and add your own JSON filter query.
1. For **Detector operation settings**, define the **Detector interval**, which is the time interval at which the detector collects data.
- The detector aggregates the data in this interval, then feeds the aggregated result into the anomaly detection model.
The shorter you set this interval, the fewer data points the detector aggregates.
The anomaly detection model uses a shingling process, a technique that uses consecutive data points to create a sample for the model. This process needs a certain number of aggregated data points from contiguous intervals.
- We recommend setting the detector interval based on your actual data. If it's too long it might delay the results, and if it's too short it might miss some data. It also won't have a sufficient number of consecutive data points for the shingle process.
1. (Optional) To add extra processing time for data collection, specify a **Window delay** value. This value tells the detector that the data is not ingested into OpenSearch in real time but with a certain delay.
Set the window delay to shift the detector interval to account for this delay.
- For example, say the detector interval is 10 minutes and data is ingested into your cluster with a general delay of 1 minute.
Assume the detector runs at 2:00. The detector attempts to get the last 10 minutes of data from 1:50 to 2:00, but because of the 1-minute delay, it only gets 9 minutes of data and misses the data from 1:59 to 2:00.
Setting the window delay to 1 minute shifts the interval window to 1:49 - 1:59, so the detector accounts for all 10 minutes of the detector interval time.
1. Choose **Create**.
After you create the detector, the next step is to add features to it.
### Step 2: Add features to your detector
A feature is the field in your index that you want to check for anomalies. A detector can discover anomalies across one or more features. You must choose an aggregation method for each feature: `average()`, `count()`, `sum()`, `min()`, or `max()`. The aggregation method determines what constitutes an anomaly.
For example, if you choose `min()`, the detector focuses on finding anomalies based on the minimum values of your feature. If you choose `average()`, the detector finds anomalies based on the average values of your feature.
A multi-feature model correlates anomalies across all its features. The [curse of dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality) makes it less likely for multi-feature models to identify smaller anomalies as compared to a single-feature model. Adding more features might negatively impact the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) of a model. A higher proportion of noise in your data might further amplify this negative impact. Selecting the optimal feature set is usually an iterative process. We recommend experimenting with a historical detector with different feature sets and checking the precision before moving on to real-time detectors. By default, the maximum number of features for a detector is 5. You can adjust this limit with the `plugins.anomaly_detection.max_anomaly_features` setting.
{: .note }
1. On the **Model configuration** page, enter the **Feature name**.
1. For **Find anomalies based on**, choose the method to find anomalies. For **Field Value** menu, choose the **field** and the **aggregation method**. Or choose **Custom expression**, and add your own JSON aggregation query.
#### (Optional) Set a category field for high cardinality
You can categorize anomalies based on a keyword or IP field type.
The category field categorizes or slices the source time series with a dimension like IP addresses, product IDs, country codes, and so on. This helps to see a granular view of anomalies within each entity of the category field to isolate and debug issues.
To set a category field, choose **Enable a category field** and select a field.
Only a certain number of unique entities are supported in the category field. Use the following equation to calculate the recommended total number of entities supported in a cluster:
```
(data nodes * heap size * anomaly detection maximum memory percentage) / (entity size of a detector)
```
This formula provides a good starting point, but make sure to test with a representative workload.
{: .note }
For example, for a cluster with 3 data nodes, each with 8G of JVM heap size, a maximum memory percentage of 10% (default), and the entity size of the detector as 1MB: the total number of unique entities supported is (8.096 * 10^9 * 0.1 / 1M ) * 3 = 2429.
#### Set a window size
Set the number of aggregation intervals from your data stream to consider in a detection window. It's best to choose this value based on your actual data to see which one leads to the best results for your use case.
Based on experiments performed on a wide variety of one-dimensional data streams, we recommend using a window size between 1 and 16. The default window size is 8. If you set the category field for high cardinality, the default window size is 1.
If you expect missing values in your data or if you want to base the anomalies on the current interval, choose 1. If your data is continuously ingested and you want to base the anomalies on multiple intervals, choose a larger window size.
#### Preview sample anomalies
Preview sample anomalies and adjust the feature settings if needed.
For sample previews, the anomaly detection plugin selects a small number of data samples---for example, one data point every 30 minutes---and uses interpolation to estimate the remaining data points to approximate the actual feature data. It loads this sample dataset into the detector. The detector uses this sample dataset to generate a sample preview of anomaly results.
Examine the sample preview and use it to fine-tune your feature configurations (for example, enable or disable features) to get more accurate results.
1. Choose **Save and start detector**.
1. Choose between automatically starting the detector (recommended) or manually starting the detector at a later time.
### Step 3: Observe the results
Choose the **Anomaly results** tab. You need to wait for some time to see the anomaly results. If the detector interval is 10 minutes, the detector might take more than an hour to start, as it's waiting for sufficient data to generate anomalies.
A shorter interval means the model passes the shingle process more quickly and starts to generate the anomaly results sooner.
Use the [profile detector](./api#profile-detector) operation to make sure you have sufficient data points.
If you see the detector pending in "initialization" for longer than a day, aggregate your existing data using the detector interval to check for any missing data points. If you find a lot of missing data points from the aggregated data, consider increasing the detector interval.
![Anomaly detection results](../images/ad.png)
Analize anomalies with the following visualizations:
- **Live anomalies** - displays live anomaly results for the last 60 intervals. For example, if the interval is 10, it shows results for the last 600 minutes. The chart refreshes every 30 seconds.
- **Anomaly history** - plots the anomaly grade with the corresponding measure of confidence.
- **Feature breakdown** - plots the features based on the aggregation method. You can vary the date-time range of the detector.
- **Anomaly occurrence** - shows the `Start time`, `End time`, `Data confidence`, and `Anomaly grade` for each detected anomaly.
`Anomaly grade` is a number between 0 and 1 that indicates how anomalous a data point is. An anomaly grade of 0 represents “not an anomaly,” and a non-zero value represents the relative severity of the anomaly.
`Data confidence` is an estimate of the probability that the reported anomaly grade matches the expected anomaly grade. Confidence increases as the model observes more data and learns the data behavior and trends. Note that confidence is distinct from model accuracy.
If you set the category field, you see an additional **Heat map** chart. The heat map correlates results for anomalous entities. This chart is empty until you select an anomalous entity. You also see the anomaly and feature line chart for the time period of the anomaly (`anomaly_grade` > 0).
Choose a filled rectangle to see a more detailed view of the anomaly.
{: .note }
### Step 4: Set up alerts
Choose **Set up alerts** and configure a monitor to notify you when anomalies are detected. For steps to create a monitor and set up notifications based on your anomaly detector, see [Monitors](../alerting/monitors/).
If you stop or delete a detector, make sure to delete any monitors associated with it.
### Step 5: Adjust the model
To see all the configuration settings for a detector, choose the **Detector configuration** tab.
1. To make any changes to the detector configuration, or fine tune the time interval to minimize any false positives, go to the **Detector configuration** section and choose **Edit**.
- You need to stop the detector to change its configuration. Confirm that you want to stop the detector and proceed.
1. To enable or disable features, in the **Features** section, choose **Edit** and adjust the feature settings as needed. After you make your changes, choose **Save and start detector**.
- Choose between automatically starting the detector (recommended) or manually starting the detector at a later time.
### Step 6: Analyze historical data
Analyzing historical data helps you get familiar with the anomaly detection plugin. You can also evaluate the performance of a detector with historical data to further fine-tune it.
To use a historical detector, you need to specify a date range that has data present in at least 1,000 detection intervals.
{: .note }
1. Choose **Historical detectors** and **Create historical detector**.
1. Enter the **Name** of the detector and a brief **Description**.
1. For **Data source**, choose the index to use as the data source. You can optionally use index patterns to choose multiple indices.
1. For **Time range**, select a time range for historical analysis.
1. For **Detector settings**, choose to use the settings of an existing detector. Or choose the **Timestamp field** in your index, add individual features to the detector, and set the detector interval.
1. (Optional) Choose to run the historical detector automatically after creating it.
1. Choose **Create**.
- You can stop the historical detector even before it completes.
### Step 7: Manage your detectors
To change or delete a detector, go to the **Detector details** page.
1. To make changes to your detector, choose the detector name.
1. Choose **Actions** and **Edit detector**.
- You need to stop the detector to change its configuration. Confirm that you want to stop the detector and proceed.
1. Make your changes and choose **Save changes**.
To delete your detector, choose **Actions** and **Delete detector**. In the pop-up box, type `delete` to confirm and choose **Delete**.

View File

@ -0,0 +1,86 @@
---
layout: default
title: Anomaly detection security
nav_order: 10
parent: Anomaly detection
has_children: false
---
# Anomaly detection security
You can use the security plugin with anomaly detection in OpenSearch to limit non-admin users to specific actions. For example, you might want some users to only be able to create, update, or delete detectors, while others to only view detectors.
All anomaly detection indices are protected as system indices. Only a super admin user or an admin user with a TLS certificate can access system indices. For more information, see [System indices](../../security/configuration/system-indices/).
Security for anomaly detection works the same as [security for alerting](../../alerting/security/).
## Basic permissions
As an admin user, you can use the security plugin to assign specific permissions to users based on which APIs they need access to. For a list of supported APIs, see [Anomaly detection API](../api/).
The security plugin has two built-in roles that cover most anomaly detection use cases: `anomaly_full_access` and `anomaly_read_access`. For descriptions of each, see [Predefined roles](../../security/access-control/users-roles/#predefined-roles).
If these roles don't meet your needs, mix and match individual anomaly detection [permissions](../../security/access-control/permissions/) to suit your use case. Each action corresponds to an operation in the REST API. For example, the `cluster:admin/opensearch/ad/detector/delete` permission lets you delete detectors.
## (Advanced) Limit access by backend role
Use backend roles to configure fine-grained access to individual detectors based on roles. For example, users of different departments in an organization can view detectors owned by their own department.
First, make sure your users have the appropriate [backend roles](../../security/access-control/). Backend roles usually come from an [LDAP server](../../security/configuration/ldap/) or [SAML provider](../../security/configuration/saml/), but if you use the internal user database, you can use the REST API to [add them manually](../../security/access-control/api/#create-user).
Next, enable the following setting:
```json
PUT _cluster/settings
{
"transient": {
"plugins.anomaly_detection.filter_by_backend_roles": "true"
}
}
```
Now when users view anomaly detection resources in OpenSearch Dashboards (or make REST API calls), they only see detectors created by users who share at least one backend role.
For example, consider two users: `alice` and `bob`.
`alice` has an analyst backend role:
```json
PUT _plugins/_security/api/internalusers/alice
{
"password": "alice",
"backend_roles": [
"analyst"
],
"attributes": {}
}
```
`bob` has a human-resources backend role:
```json
PUT _plugins/_security/api/internalusers/bob
{
"password": "bob",
"backend_roles": [
"human-resources"
],
"attributes": {}
}
```
Both `alice` and `bob` have full access to anomaly detection:
```json
PUT _plugins/_security/api/rolesmapping/anomaly_full_access
{
"backend_roles": [],
"hosts": [],
"users": [
"alice",
"bob"
]
}
```
Because they have different backend roles, `alice` and `bob` cannot view each other's detectors or their results.

View File

@ -0,0 +1,42 @@
---
layout: default
title: Settings
parent: Anomaly detection
nav_order: 4
---
# Settings
The anomaly detection plugin adds several settings to the standard OpenSearch cluster settings.
The settings are dynamic, so you can change the default behavior of the plugin without restarting your cluster.
You can mark settings as `persistent` or `transient`.
For example, to update the retention period of the result index:
```json
PUT _cluster/settings
{
"transient": {
"plugins.anomaly_detection.ad_result_history_retention_period": "5m"
}
}
```
Setting | Default | Description
:--- | :--- | :---
`plugins.anomaly_detection.enabled` | True | Whether the anomaly detection plugin is enabled or not. If disabled, all detectors immediately stop running.
`plugins.anomaly_detection.max_anomaly_detectors` | 1,000 | The maximum number of non-high cardinality detectors (no category field) users can create.
`plugins.anomaly_detection.max_multi_entity_anomaly_detectors` | 10 | The maximum number of high cardinality detectors (with category field) in a cluster.
`plugins.anomaly_detection.max_anomaly_features` | 5 | The maximum number of features for a detector.
`plugins.anomaly_detection.ad_result_history_rollover_period` | 12h | How often the rollover condition is checked. If `true`, the plugin rolls over the result index to a new index.
`plugins.anomaly_detection.ad_result_history_max_docs` | 250000000 | The maximum number of documents in one result index. The plugin only counts refreshed documents in the primary shards.
`plugins.anomaly_detection.ad_result_history_retention_period` | 30d | The maximum age of the result index. If its age exceeds the threshold, the plugin deletes the rolled over result index. If the cluster has only one result index, the plugin keeps the index even if it's older than its configured retention period.
`plugins.anomaly_detection.max_entities_per_query` | 1,000 | The maximum unique values per detection interval for high cardinality detectors. By default, if the category field has more than 1,000 unique values in a detector interval, the plugin selects the top 1,000 values and orders them by `doc_count`.
`plugins.anomaly_detection.max_entities_for_preview` | 30 | The maximum unique category field values displayed with the preview operation for high cardinality detectors. If the category field has more than 30 unique values, the plugin selects the top 30 values and orders them by `doc_count`.
`plugins.anomaly_detection.max_primary_shards` | 10 | The maximum number of primary shards an anomaly detection index can have.
`plugins.anomaly_detection.filter_by_backend_roles` | False | When you enable the security plugin and set this to `true`, the plugin filters results based on the user's backend role(s).
`plugins.anomaly_detection.max_cache_miss_handling_per_second` | 100 | High cardinality detectors use a cache to store active models. In the event of a cache miss, the cache gets the models from the model checkpoint index. Use this setting to limit the rate of fetching models. Because the thread pool for a GET operation has a queue of 1,000, we recommend setting this value below 1,000.
`plugins.anomaly_detection.max_batch_task_per_node` | 2 | Starting a historical detector triggers a batch task. This setting is the number of batch tasks that you can run per data node. You can tune this setting from 1 to 1000. If the data nodes can't support all batch tasks and you're not sure if the data nodes are capable of running more historical detectors, add more data nodes instead of changing this setting to a higher value.
`plugins.anomaly_detection.max_old_ad_task_docs_per_detector` | 10 | You can run the same historical detector many times. For each run, the anomaly detection plugin creates a new task. This setting is the number of previous tasks the plugin keeps. Set this value to at least 1 to track its last run. You can keep a maximum of 1,000 old tasks to avoid overwhelming the cluster.
`plugins.anomaly_detection.batch_task_piece_size` | 1000 | The date range for a historical task is split into smaller pieces and the anomaly detection plugin runs the task piece by piece. Each piece contains 1,000 detection intervals by default. For example, if detector interval is 1 minute and one piece is 1000 minutes, the feature data is queried every 1,000 minutes. You can change this setting from 1 to 10,000.
`plugins.anomaly_detection.batch_task_piece_interval_seconds` | 5 | Add a time interval between historical detector tasks. This interval prevents the task from consuming too much of the available resources and starving other operations like search and bulk index. You can change this setting from 1 to 600 seconds.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,64 @@
---
layout: default
title: Cron
nav_order: 20
parent: Alerting
has_children: false
---
# Cron expression reference
Monitors can run at a variety of fixed intervals (e.g. hourly, daily, etc.), but you can also define custom cron expressions for when they should run. Monitors use the Unix cron syntax and support five fields:
Field | Valid values
:--- | :---
Minute | 0-59
Hour | 0-23
Day of month | 1-31
Month | 1-12
Day of week | 0-7 (0 and 7 are both Sunday) or SUN, MON, TUE, WED, THU, FRI, SAT
For example, the following expression translates to "every Monday through Friday at 11:30 AM":
```
30 11 * * 1-5
```
## Features
Feature | Description
:--- | :---
`*` | Wildcard. Specifies all valid values.
`,` | List. Use to specify several values (e.g. `1,15,30`).
`-` | Range. Use to specify a range of values (e.g. `1-15`).
`/` | Step. Use after a wildcard or range to specify the "step" between values. For example, `0-11/2` is equivalent to `0,2,4,6,8,10`.
Note that you can specify the day using two fields: day of month and day of week. For most situations, we recommend that you use just one of these fields and leave the other as `*`.
If you use a non-wildcard value in both fields, the monitor runs when either field matches the time. For example, `15 2 1,15 * 1` causes the monitor to run at 2:15 AM on the 1st of the month, the 15th of the month, and every Monday.
## Sample expressions
Every other day at 1:45 PM:
```
45 13 1-31/2 * *
```
Every 10 minutes on Saturday and Sunday:
```
0/10 * * * 6-7
```
Every three hours on the first day of every other month:
```
0 0-23/3 1 1-12/2 *
```
## API
For an example of how to use a custom cron expression in an API call, see the [create monitor API operation](../api/#request-1).

View File

@ -0,0 +1,16 @@
---
layout: default
title: Alerting
nav_order: 34
has_children: true
---
# Alerting
OpenSearch Dashboards
{: .label .label-yellow :}
The alerting feature notifies you when data from one or more OpenSearch indices meets certain conditions. For example, you might want to notify a [Slack](https://slack.com/) channel if your application logs more than five HTTP 503 errors in one hour, or you might want to page a developer if no new documents have been indexed in the past 20 minutes.
To get started, choose **Alerting** in OpenSearch Dashboards.
![OpenSearch Dashboards side bar with link](../images/alerting.png)

View File

@ -0,0 +1,373 @@
---
layout: default
title: Monitors
nav_order: 1
parent: Alerting
has_children: false
---
# Monitors
#### Table of contents
- TOC
{:toc}
---
## Key terms
Term | Definition
:--- | :---
Monitor | A job that runs on a defined schedule and queries OpenSearch. The results of these queries are then used as input for one or more *triggers*.
Trigger | Conditions that, if met, generate *alerts*.
Alert | An event associated with a trigger. When an alert is created, the trigger performs *actions*, which can include sending a notification.
Action | The information that you want the monitor to send out after being triggered. Actions have a *destination*, a message subject, and a message body.
Destination | A reusable location for an action, such as Amazon Chime, Slack, or a webhook URL.
---
## Create destinations
1. Choose **Alerting**, **Destinations**, **Add destination**.
1. Specify a name for the destination so that you can identify it later.
1. For **Type**, choose Slack, Amazon Chime, custom webhook, or [email](#email-as-a-destination).
For Email type, refer to [Email as a destination](#email-as-a-destination) section below. For all other types, specify the webhook URL. For more information about webhooks, see the documentation for [Slack](https://api.slack.com/incoming-webhooks) and [Chime](https://docs.aws.amazon.com/chime/latest/ug/webhooks.html).
For custom webhooks, you must specify more information: parameters and headers. For example, if your endpoint requires basic authentication, you might need to add a header with a key of `Authorization` and a value of `Basic <Base64-encoded-credential-string>`. You might also need to change `Content-Type` to whatever your webhook requires. Popular values are `application/json`, `application/xml`, and `text/plain`.
This information is stored in plain text in the OpenSearch cluster. We will improve this design in the future, but for now, the encoded credentials (which are neither encrypted nor hashed) might be visible to other OpenSearch users.
### Email as a destination
To send or receive an alert notification as an email, choose **Email** as the destination type. Next, add at least one sender and recipient. We recommend adding email groups if you want to notify more than a few people of an alert. You can configure senders and recipients using **Manage senders** and **Manage email groups**.
#### Manage senders
Senders are email accounts from which the alerting plugin sends notifications.
To configure a sender email, do the following:
1. After you choose **Email** as the destination type, choose **Manage senders**.
1. Choose **Add sender**, **New sender** and enter a unique name.
1. Enter the email address, SMTP host (e.g. `smtp.gmail.com` for a Gmail account), and the port.
1. Choose an encryption method, or use the default value of **None**. However, most email providers require SSL or TLS, which requires a username and password in OpenSearch keystore. Refer to [Authenticate sender account](#authenticate-sender-account) to learn more.
1. Choose **Save** to save the configuration and create the sender. You can create a sender even before you add your credentials to the OpenSearch keystore. However, you must [authenticate each sender account](#authenticate-sender-account) before you use the destination to send your alert.
You can reuse senders across many different destinations, but each destination only supports one sender.
#### Manage email groups or recipients
Use email groups to create and manage reusable lists of email addresses. For example, one alert might email the DevOps team, whereas another might email the executive team and the engineering team.
You can enter individual email addresses or an email group in the **Recipients** field.
1. After you choose **Email** as the destination type, choose **Manage email groups**. Then choose **Add email group**, **New email group**.
1. Enter a unique name.
1. For recipient emails, enter any number of email addresses.
1. Choose **Save**.
#### Authenticate sender account
If your email provider requires SSL or TLS, you must authenticate each sender account before you can send an email. Enter these credentials in the OpenSearch keystore using the CLI. Run the following commands (in your OpenSearch directory) to enter your username and password. The `<sender_name>` is the name you entered for **Sender** earlier.
```bash
./bin/opensearch-keystore add opendistro.alerting.destination.email.<sender_name>.username
./bin/opensearch-keystore add opendistro.alerting.destination.email.<sender_name>.password
```
**Note**: Keystore settings are node-specific. You must run these commands on each node.
{: .note}
To change or update your credentials (after you've added them to the keystore on every node), call the reload API to automatically update those credentials without restarting OpenSearch:
```json
POST _nodes/reload_secure_settings
{
"secure_settings_password": "1234"
}
```
---
## Create monitors
1. Choose **Alerting**, **Monitors**, **Create monitor**.
1. Specify a name for the monitor.
The anomaly detection option is for pairing with the anomaly detection plugin. See [Anomaly Detection](../../ad/).
For anomaly detector, choose an appropriate schedule for the monitor based on the detector interval. Otherwise, the alerting monitor might miss reading the results.
For example, assume you set the monitor interval and the detector interval as 5 minutes, and you start the detector at 12:00. If an anomaly is detected at 12:05, it might be available at 12:06 because of the delay between writing the anomaly and it being available for queries. The monitor reads the anomaly results between 12:00 and 12:05, so it does not get the anomaly results available at 12:06.
To avoid this issue, make sure the alerting monitor is at least twice the detector interval.
When you create a monitor using OpenSearch Dashboards, the anomaly detector plugin generates a default monitor schedule that's twice the detector interval.
Whenever you update a detectors interval, make sure to update the associated monitor interval as well, as the anomaly detection plugin does not do this automatically.
1. Choose one or more indices. You can also use `*` as a wildcard to specify an index pattern.
If you use the security plugin, you can only choose indices that you have permission to access. For details, see [Alerting security](../security/).
1. Define the monitor in one of three ways: visually, using a query, or using an anomaly detector.
- Visual definition works well for monitors that you can define as "some value is above or below some threshold for some amount of time."
- Query definition gives you flexibility in terms of what you query for (using [the OpenSearch query DSL](../../opensearch/full-text)) and how you evaluate the results of that query (Painless scripting).
This example averages the `cpu_usage` field:
```json
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"avg_cpu": {
"avg": {
"field": "cpu_usage"
}
}
}
}
```
You can even filter query results using `{% raw %}{{period_start}}{% endraw %}` and `{% raw %}{{period_end}}{% endraw %}`:
```json
{
"size": 0,
"query": {
"bool": {
"filter": [{
"range": {
"timestamp": {
"from": "{% raw %}{{period_end}}{% endraw %}||-1h",
"to": "{% raw %}{{period_end}}{% endraw %}",
"include_lower": true,
"include_upper": true,
"format": "epoch_millis",
"boost": 1
}
}
}],
"adjust_pure_negative": true,
"boost": 1
}
},
"aggregations": {}
}
```
"Start" and "end" refer to the interval at which the monitor runs. See [Available variables](#available-variables).
1. To define a monitor visually, choose **Define using visual graph**. Then choose an aggregation (for example, `count()` or `average()`), a set of documents, and a timeframe. Visual definition works well for most monitors.
To use a query, choose **Define using extraction query**, add your query (using [the OpenSearch query DSL](../../opensearch/full-text/)), and test it using the **Run** button.
The monitor makes this query to OpenSearch as often as the schedule dictates; check the **Query Performance** section and make sure you're comfortable with the performance implications.
To use an anomaly detector, choose **Define using Anomaly detector** and select your **Detector**.
1. Choose a frequency and timezone for your monitor. Note that you can only pick a timezone if you choose Daily, Weekly, Monthly, or [custom cron expression](../cron/) for frequency.
1. Choose **Create**.
---
## Create triggers
The next step in creating a monitor is to create a trigger. These steps differ depending on whether you chose **Define using visual graph** or **Define using extraction query** or **Define using Anomaly detector** when you created the monitor.
Either way, you begin by specifying a name and severity level for the trigger. Severity levels help you manage alerts. A trigger with a high severity level (e.g. 1) might page a specific individual, whereas a trigger with a low severity level might message a chat room.
### Visual graph
For **Trigger condition**, specify a threshold for the aggregation and timeframe you chose earlier, such as "is below 1,000" or "is exactly 10."
The line moves up and down as you increase and decrease the threshold. Once this line is crossed, the trigger evaluates to true.
### Extraction query
For **Trigger condition**, specify a Painless script that returns true or false. Painless is the default OpenSearch scripting language and has a syntax similar to Groovy.
Trigger condition scripts revolve around the `ctx.results[0]` variable, which corresponds to the extraction query response. For example, your script might reference `ctx.results[0].hits.total.value` or `ctx.results[0].hits.hits[i]._source.error_code`.
A return value of true means the trigger condition has been met, and the trigger should execute its actions. Test your script using the **Run** button.
The **Info** link next to **Trigger condition** contains a useful summary of the variables and results available to your query.
{: .tip }
### Anomaly detector
For **Trigger type**, choose **Anomaly detector grade and confidence**.
Specify the **Anomaly grade condition** for the aggregation and timeframe you chose earlier, "IS ABOVE 0.7" or "IS EXACTLY 0.5." The *anomaly grade* is a number between 0 and 1 that indicates the level of severity of how anomalous a data point is.
Specify the **Anomaly confidence condition** for the aggregation and timeframe you chose earlier, "IS ABOVE 0.7" or "IS EXACTLY 0.5." The *anomaly confidence* is an estimate of the probability that the reported anomaly grade matches the expected anomaly grade.
The line moves up and down as you increase and decrease the threshold. Once this line is crossed, the trigger evaluates to true.
#### Sample scripts
{::comment}
These scripts are Painless, not Groovy, but calling them Groovy in Jekyll gets us syntax highlighting in the generated HTML.
{:/comment}
```groovy
// Evaluates to true if the query returned any documents
ctx.results[0].hits.total.value > 0
```
```groovy
// Returns true if the avg_cpu aggregation exceeds 90
if (ctx.results[0].aggregations.avg_cpu.value > 90) {
return true;
}
```
```groovy
// Performs some crude custom scoring and returns true if that score exceeds a certain value
int score = 0;
for (int i = 0; i < ctx.results[0].hits.hits.length; i++) {
// Weighs 500 errors 10 times as heavily as 503 errors
if (ctx.results[0].hits.hits[i]._source.http_status_code == "500") {
score += 10;
} else if (ctx.results[0].hits.hits[i]._source.http_status_code == "503") {
score += 1;
}
}
if (score > 99) {
return true;
} else {
return false;
}
```
Below are some variables you can include in your message using Mustache templates to see more information about your monitors.
### Available variables
#### Monitor variables
Variable | Data Type | Description
:--- | :--- | :---
`ctx.monitor` | JSON | Includes `ctx.monitor.name`, `ctx.monitor.type`, `ctx.monitor.enabled`, `ctx.monitor.enabled_time`, `ctx.monitor.schedule`, `ctx.monitor.inputs`, `triggers` and `ctx.monitor.last_update_time`.
`ctx.monitor.user` | JSON | Includes information about the user who created the monitor. Includes `ctx.monitor.user.backend_roles` and `ctx.monitor.user.roles`, which are arrays that contain the backend roles and roles assigned to the user. See [alerting security](../security/) for more information.
`ctx.monitor.enabled` | Boolean | Whether the monitor is enabled.
`ctx.monitor.enabled_time` | Milliseconds | Unix epoch time of when the monitor was last enabled.
`ctx.monitor.schedule` | JSON | Contains a schedule of how often or when the monitor should run.
`ctx.monitor.schedule.period.interval` | Integer | The interval at which the monitor runs.
`ctx.monitor.schedule.period.unit` | String | The interval's unit of time.
`ctx.monitor.inputs` | Array | An array that contains the indices and definition used to create the monitor.
`ctx.monitor.inputs.search.indices` | Array | An array that contains the indices the monitor observes.
`ctx.monitor.inputs.search.query` | N/A | The definition used to define the monitor.
#### Trigger variables
Variable | Data Type | Description
:--- | :--- | : ---
`ctx.trigger.id` | String | The trigger's ID.
`ctx.trigger.name` | String | The trigger's name.
`ctx.trigger.severity` | String | The trigger's severity.
`ctx.trigger.condition`| JSON | Contains the Painless script used when creating the monitor.
`ctx.trigger.condition.script.source` | String | The language used to define the script. Must be painless.
`ctx.trigger.condition.script.lang` | String | The script used to define the trigger.
`ctx.trigger.actions`| Array | An array with one element that contains information about the action the monitor needs to trigger.
#### Action variables
Variable | Data Type | Description
:--- | :--- | : ---
`ctx.trigger.actions.id` | String | The action's ID.
`ctx.trigger.actions.name` | String | The action's name.
`ctx.trigger.actions.destination_id`| String | The alert destination's ID.
`ctx.trigger.actions.message_template.source` | String | The message to send in the alert.
`ctx.trigger.actions.message_template.lang` | String | The scripting language used to define the message. Must be Mustache.
`ctx.trigger.actions.throttle_enabled` | Boolean | Whether throttling is enabled for this trigger. See [adding actions](#add-actions/) for more information about throttling.
`ctx.trigger.actions.subject_template.source` | String | The message's subject in the alert.
`ctx.trigger.actions.subject_template.lang` | String | The scripting language used to define the subject. Must be mustache.
#### Other variables
Variable | Data Type | Description
:--- | :--- : :---
`ctx.results` | Array | An array with one element (i.e. `ctx.results[0]`). Contains the query results. This variable is empty if the trigger was unable to retrieve results. See `ctx.error`.
`ctx.last_update_time` | Milliseconds | Unix epoch time of when the monitor was last updated.
`ctx.periodStart` | String | Unix timestamp for the beginning of the period during which the alert triggered. For example, if a monitor runs every ten minutes, a period might begin at 10:40 and end at 10:50.
`ctx.periodEnd` | String | The end of the period during which the alert triggered.
`ctx.error` | String | The error message if the trigger was unable to retrieve results or unable to evaluate the trigger, typically due to a compile error or null pointer exception. Null otherwise.
`ctx.alert` | JSON | The current, active alert (if it exists). Includes `ctx.alert.id`, `ctx.alert.version`, and `ctx.alert.isAcknowledged`. Null if no alert is active.
---
## Add actions
The final step in creating a monitor is to add one or more actions. Actions send notifications when trigger conditions are met and support [Slack](https://slack.com/), [Amazon Chime](https://aws.amazon.com/chime/), and webhooks.
If you don't want to receive notifications for alerts, you don't have to add actions to your triggers. Instead, you can periodically check OpenSearch Dashboards.
{: .tip }
1. Specify a name for the action.
1. Choose a destination.
1. Add a subject and body for the message.
You can add variables to your messages using [Mustache templates](https://mustache.github.io/mustache.5.html). You have access to `ctx.action.name`, the name of the current action, as well as all [trigger variables](#available-variables).
If your destination is a custom webhook that expects a particular data format, you might need to include JSON (or even XML) directly in the message body:
```json
{% raw %}{ "text": "Monitor {{ctx.monitor.name}} just entered alert status. Please investigate the issue. - Trigger: {{ctx.trigger.name}} - Severity: {{ctx.trigger.severity}} - Period start: {{ctx.periodStart}} - Period end: {{ctx.periodEnd}}" }{% endraw %}
```
In this case, the message content must conform to the `Content-Type` header in the [custom webhook](#create-destinations).
1. (Optional) Use action throttling to limit the number of notifications you receive within a given span of time.
For example, if a monitor checks a trigger condition every minute, you could receive one notification per minute. If you set action throttling to 60 minutes, you receive no more than one notification per hour, even if the trigger condition is met dozens of times in that hour.
1. Choose **Create**.
After an action sends a message, the content of that message has left the purview of the security plugin. Securing access to the message (e.g. access to the Slack channel) is your responsibility.
#### Sample message
```mustache
{% raw %}Monitor {{ctx.monitor.name}} just entered an alert state. Please investigate the issue.
- Trigger: {{ctx.trigger.name}}
- Severity: {{ctx.trigger.severity}}
- Period start: {{ctx.periodStart}}
- Period end: {{ctx.periodEnd}}{% endraw %}
```
If you want to use the `ctx.results` variable in a message, use `{% raw %}{{ctx.results.0}}{% endraw %}` rather than `{% raw %}{{ctx.results[0]}}{% endraw %}`. This difference is due to how Mustache handles bracket notation.
{: .note }
---
## Work with alerts
Alerts persist until you resolve the root cause and have the following states:
State | Description
:--- | :---
Active | The alert is ongoing and unacknowledged. Alerts remain in this state until you acknowledge them, delete the trigger associated with the alert, or delete the monitor entirely.
Acknowledged | Someone has acknowledged the alert, but not fixed the root cause.
Completed | The alert is no longer ongoing. Alerts enter this state after the corresponding trigger evaluates to false.
Error | An error occurred while executing the trigger---usually the result of a a bad trigger or destination.
Deleted | Someone deleted the monitor or trigger associated with this alert while the alert was ongoing.

View File

@ -0,0 +1,79 @@
---
layout: default
title: Alerting Security
nav_order: 10
parent: Alerting
has_children: false
---
# Alerting security
If you use the security plugin alongside alerting, you might want to limit certain users to certain actions. For example, you might want some users to only be able to view and acknowledge alerts, while others can modify monitors and destinations.
## Basic permissions
The security plugin has three built-in roles that cover most alerting use cases: `alerting_read_access`, `alerting_ack_alerts`, and `alerting_full_access`. For descriptions of each, see [Predefined roles](../../security/access-control/users-roles/#predefined-roles).
If these roles don't meet your needs, mix and match individual alerting [permissions](../../security/access-control/permissions/) to suit your use case. Each action corresponds to an operation in the REST API. For example, the `cluster:admin/opensearch/alerting/destination/delete` permission lets you delete destinations.
## How monitors access data
Monitors run with the permissions of the user who created or last modified them. For example, consider the user `jdoe`, who works at a chain of retail stores. `jdoe` has two roles. Together, these two roles allow read access to three indices: `store1-returns`, `store2-returns`, and `store3-returns`.
`jdoe` creates a monitor that sends an email to management whenever the number of returns across all three indices exceeds 40 per hour.
Later, the user `psantos` wants to edit the monitor to run every two hours, but `psantos` only has access to `store1-returns`. To make the change, `psantos` has two options:
- Update the monitor so that it only checks `store1-returns`.
- Ask an administrator for read access to the other two indices.
After making the change, the monitor now runs with the same permissions as `psantos`, including any [document-level security](../../security/access-control/document-level-security/) queries, [excluded fields](../../security/access-control/field-level-security/), and [masked fields](../../security/access-control/field-masking/). If you use an extraction query to define your monitor, use the **Run** button to ensure that the response includes the fields you need.
## (Advanced) Limit access by backend role
Out of the box, the alerting plugin has no concept of ownership. For example, if you have the `cluster:admin/opensearch/alerting/monitor/write` permission, you can edit *all* monitors, regardless of whether you created them. If a small number of trusted users manage your monitors and destinations, this lack of ownership generally isn't a problem. A larger organization might need to segment access by backend role.
First, make sure that your users have the appropriate [backend roles](../../security/access-control/). Backend roles usually come from an [LDAP server](../../security/configuration/ldap/) or [SAML provider](../../security/configuration/saml/). However, if you use the internal user database, you can use the REST API to [add them manually](../../security/access-control/api/#create-user).
Next, enable the following setting:
```json
PUT _cluster/settings
{
"transient": {
"opendistro.alerting.filter_by_backend_roles": "true"
}
}
```
Now when users view alerting resources in OpenSearch Dashboards (or make REST API calls), they only see monitors and destinations that are created by users who share *at least one* backend role. For example, consider three users who all have full access to alerting: `jdoe`, `jroe`, and `psantos`.
`jdoe` and `jroe` are on the same team at work and both have the `analyst` backend role. `psantos` has the `human-resources` backend role.
If `jdoe` creates a monitor, `jroe` can see and modify it, but `psantos` can't. If that monitor generates an alert, the situation is the same: `jroe` can see and acknowledge it, but `psantos` can't. If `psantos` creates a destination, `jdoe` and `jroe` can't see or modify it.
<!-- ## (Advanced) Limit access by individual
If you only want users to be able to see and modify their own monitors and destinations, duplicate the `alerting_full_access` role and add the following [DLS query](../../security/access-control/document-level-security/) to it:
```json
{
"bool": {
"should": [{
"match": {
"monitor.created_by": "${user.name}"
}
}, {
"match": {
"destination.created_by": "${user.name}"
}
}]
}
}
```
Then, use this new role for all alerting users. -->

View File

@ -0,0 +1,59 @@
---
layout: default
title: Management
parent: Alerting
nav_order: 5
---
# Management
## Alerting indices
The alerting feature creates several indices and one alias. The security plugin demo script configures them as [system indices](../../security/configuration/system-indices/) for an extra layer of protection. Don't delete these indices or modify their contents without using the alerting APIs.
Index | Purpose
:--- | :---
`.opendistro-alerting-alerts` | Stores ongoing alerts.
`.opendistro-alerting-alert-history-<date>` | Stores a history of completed alerts.
`.opendistro-alerting-config` | Stores monitors, triggers, and destinations. [Take a snapshot](../../opensearch/snapshot-restore) of this index to back up your alerting configuration.
`.opendistro-alerting-alert-history-write` (alias) | Provides a consistent URI for the `.opendistro-alerting-alert-history-<date>` index.
All alerting indices are hidden by default. For a summary, make the following request:
```
GET _cat/indices?expand_wildcards=open,hidden
```
## Alerting settings
We don't recommend changing these settings; the defaults should work well for most use cases.
All settings are available using the OpenSearch `_cluster/settings` API. None require a restart, and all can be marked `persistent` or `transient`.
Setting | Default | Description
:--- | :--- | :---
`plugins.scheduled_jobs.enabled` | true | Whether the alerting plugin is enabled or not. If disabled, all monitors immediately stop running.
`plugins.alerting.index_timeout` | 60s | The timeout for creating monitors and destinations using the REST APIs.
`plugins.alerting.request_timeout` | 10s | The timeout for miscellaneous requests from the plugin.
`plugins.alerting.action_throttle_max_value` | 24h | The maximum amount of time you can set for action throttling. By default, this value displays as 1440 minutes in OpenSearch Dashboards.
`plugins.alerting.input_timeout` | 30s | How long the monitor can take to issue the search request.
`plugins.alerting.bulk_timeout` | 120s | How long the monitor can write alerts to the alert index.
`plugins.alerting.alert_backoff_count` | 3 | The number of retries for writing alerts before the operation fails.
`plugins.alerting.alert_backoff_millis` | 50ms | The amount of time to wait between retries---increases exponentially after each failed retry.
`plugins.alerting.alert_history_rollover_period` | 12h | How frequently to check whether the `.opendistro-alerting-alert-history-write` alias should roll over to a new history index and whether the Alerting plugin should delete any history indices.
`plugins.alerting.move_alerts_backoff_millis` | 250 | The amount of time to wait between retries---increases exponentially after each failed retry.
`plugins.alerting.move_alerts_backoff_count` | 3 | The number of retries for moving alerts to a deleted state after their monitor or trigger has been deleted.
`plugins.alerting.monitor.max_monitors` | 1000 | The maximum number of monitors users can create.
`plugins.alerting.alert_history_max_age` | 30d | The oldest document to store in the `.opendistro-alert-history-<date>` index before creating a new index. If the number of alerts in this time period does not exceed `alert_history_max_docs`, alerting creates one history index per period (e.g. one index every 30 days).
`plugins.alerting.alert_history_max_docs` | 1000 | The maximum number of alerts to store in the `.opendistro-alert-history-<date>` index before creating a new index.
`plugins.alerting.alert_history_enabled` | true | Whether to create `.opendistro-alerting-alert-history-<date>` indices.
`plugins.alerting.alert_history_retention_period` | 60d | The amount of time to keep history indices before automatically deleting them.
`plugins.alerting.destination.allow_list` | ["chime", "slack", "custom_webhook", "email", "test_action"] | The list of allowed destinations. If you don't want to allow users to a certain type of destination, you can remove it from this list, but we recommend leaving this setting as-is.
`plugins.alerting.filter_by_backend_roles` | "false" | Restricts access to monitors by backend role. See [Alerting security](../security/).
`plugins.scheduled_jobs.sweeper.period` | 5m | The alerting feature uses its "job sweeper" component to periodically check for new or updated jobs. This setting is the rate at which the sweeper checks to see if any jobs (monitors) have changed and need to be rescheduled.
`plugins.scheduled_jobs.sweeper.page_size` | 100 | The page size for the sweeper. You shouldn't need to change this value.
`plugins.scheduled_jobs.sweeper.backoff_millis` | 50ms | The amount of time the sweeper waits between retries---increases exponentially after each failed retry.
`plugins.scheduled_jobs.sweeper.retry_count` | 3 | The total number of times the sweeper should retry before throwing an error.
`plugins.scheduled_jobs.request_timeout` | 10s | The timeout for the request that sweeps shards for jobs.

View File

@ -0,0 +1,251 @@
---
layout: default
title: Asynchronous search
nav_order: 51
has_children: true
---
# Asynchronous search
Searching large volumes of data can take a long time, especially if you're searching across warm nodes or multiple remote clusters.
Asynchronous search in OpenSearch lets you send search requests that run in the background. You can monitor the progress of these searches and get back partial results as they become available. After the search finishes, you can save the results to examine at a later time.
## REST API
To perform an asynchronous search, send requests to `_opensearch/_asynchronous_search`, with your query in the request body:
```json
POST _opensearch/_asynchronous_search
```
You can specify the following options.
Options | Description | Default value | Required
:--- | :--- |:--- |:--- |
`wait_for_completion_timeout` | The amount of time that you plan to wait for the results. You can see whatever results you get within this time just like in a normal search. You can poll the remaining results based on an ID. The maximum value is 300 seconds. | 1 second | No
`keep_on_completion` | Whether you want to save the results in the cluster after the search is complete. You can examine the stored results at a later time. | `false` | No
`keep_alive` | The amount of time that the result is saved in the cluster. For example, `2d` means that the results are stored in the cluster for 48 hours. The saved search results are deleted after this period or if the search is canceled. Note that this includes the query execution time. If the query overruns this time, the process cancels this query automatically. | 12 hours | No
#### Sample request
```json
POST _opensearch/_asynchronous_search/?pretty&size=10&wait_for_completion_timeout=1ms&keep_on_completion=true&request_cache=false
{
"aggs": {
"city": {
"terms": {
"field": "city",
"size": 10
}
}
}
}
```
#### Sample response
```json
{
"*id*": "FklfVlU4eFdIUTh1Q1hyM3ZnT19fUVEUd29KLWZYUUI3TzRpdU5wMjRYOHgAAAAAAAAABg==",
"state": "RUNNING",
"start_time_in_millis": 1599833301297,
"expiration_time_in_millis": 1600265301297,
"response": {
"took": 15,
"timed_out": false,
"terminated_early": false,
"num_reduce_phases": 4,
"_shards": {
"total": 21,
"successful": 4,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 807,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"city": {
"doc_count_error_upper_bound": 16,
"sum_other_doc_count": 403,
"buckets": [
{
"key": "downsville",
"doc_count": 1
},
....
....
....
{
"key": "blairstown",
"doc_count": 1
}
]
}
}
}
}
```
#### Response parameters
Options | Description
:--- | :---
`id` | The ID of an asynchronous search. Use this ID to monitor the progress of the search, get its partial results, and/or delete the results. If the asynchronous search finishes within the timeout period, the response doesn't include the ID because the results aren't stored in the cluster.
`state` | Specifies whether the search is still running or if it has finished, and if the results persist in the cluster. The possible states are `RUNNING`, `COMPLETED`, and `PERSISTED`.
`start_time_in_millis` | The start time in milliseconds.
`expiration_time_in_millis` | The expiration time in milliseconds.
`took` | The total time that the search is running.
`response` | The actual search response.
`num_reduce_phases` | The number of times that the coordinating node aggregates results from batches of shard responses (5 by default). If this number increases compared to the last retrieved results, you can expect additional results to be included in the search response.
`total` | The total number of shards that run the search.
`successful` | The number of shard responses that the coordinating node received successfully.
`aggregations` | The partial aggregation results that have been completed by the shards so far.
## Get partial results
After you submit an asynchronous search request, you can request partial responses with the ID that you see in the asynchronous search response.
```json
GET _opensearch/_asynchronous_search/<ID>?pretty
```
#### Sample response
```json
{
"id": "Fk9lQk5aWHJIUUltR2xGWnpVcWtFdVEURUN1SWZYUUJBVkFVMEJCTUlZUUoAAAAAAAAAAg==",
"state": "STORE_RESIDENT",
"start_time_in_millis": 1599833907465,
"expiration_time_in_millis": 1600265907465,
"response": {
"took": 83,
"timed_out": false,
"_shards": {
"total": 20,
"successful": 20,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1000,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "bank",
"_type": "_doc",
"_id": "1",
"_score": 1,
"_source": {
"email": "amberduke@abc.com",
"city": "Brogan",
"state": "IL"
}
},
{....}
]
},
"aggregations": {
"city": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 997,
"buckets": [
{
"key": "belvoir",
"doc_count": 2
},
{
"key": "aberdeen",
"doc_count": 1
},
{
"key": "abiquiu",
"doc_count": 1
}
]
}
}
}
}
```
After the response is successfully persisted, you get back the `STORE_RESIDENT` state in the response.
You can poll the ID with the `wait_for_completion_timeout` parameter to wait for the results received for the time that you specify.
For asynchronous searches with `keep_on_completion` as `true` and a sufficiently long `keep_alive` time, you can keep polling the IDs until the search finishes. If you dont want to periodically poll each ID, you can retain the results in your cluster with the `keep_alive` parameter and come back to it at a later time.
## Delete searches and results
You can use the DELETE API operation to delete any ongoing asynchronous search by its ID. If the search is still running, its canceled. If the search is complete, the saved search results are deleted.
```json
DELETE _opensearch/_asynchronous_search/<ID>?pretty
```
#### Sample response
```json
{
"acknowledged": "true"
}
```
## Monitor stats
You can use the stats API operation to monitor asynchronous searches that are running, completed, and/or persisted.
```json
GET _opensearch/_asynchronous_search/stats
```
#### Sample response
```json
{
"_nodes": {
"total": 8,
"successful": 8,
"failed": 0
},
"cluster_name": "264071961897:asynchronous-search",
"nodes": {
"JKEFl6pdRC-xNkKQauy7Yg": {
"asynchronous_search_stats": {
"submitted": 18236,
"initialized": 112,
"search_failed": 56,
"search_completed": 56,
"rejected": 18124,
"persist_failed": 0,
"cancelled": 1,
"running_current": 399,
"persisted": 100
}
}
}
}
```
#### Response parameters
Options | Description
:--- | :---
`submitted` | The number of asynchronous search requests that were submitted.
`initialized` | The number of asynchronous search requests that were initialized.
`rejected` | The number of asynchronous search requests that were rejected.
`search_completed` | The number of asynchronous search requests that completed with a successful response.
`search_failed` | The number of asynchronous search requests that completed with a failed response.
`persisted` | The number of asynchronous search requests whose final result successfully persisted in the cluster.
`persist_failed` | The number of asynchronous search requests whose final result failed to persist in the cluster.
`running_current` | The number of asynchronous search requests that are running on a given coordinator node.
`cancelled` | The number of asynchronous search requests that were canceled while the search was running.

View File

@ -0,0 +1,76 @@
---
layout: default
title: Asynchronous search security
nav_order: 2
parent: Asynchronous search
has_children: false
---
# Asynchronous search security
You can use the security plugin with asynchronous searches to limit non-admin users to specific actions. For example, you might want some users to only be able to submit or delete asynchronous searches, while you might want others to only view the results.
All asynchronous search indices are protected as system indices. Only a super admin user or an admin user with a Transport Layer Security (TLS) certificate can access system indices. For more information, see [System indices](../../security/configuration/system-indices/).
## Basic permissions
As an admin user, you can use the security plugin to assign specific permissions to users based on which API operations they need access to. For a list of supported APIs operations, see [Asynchronous search](../).
The security plugin has two built-in roles that cover most asynchronous search use cases: `asynchronous_search_full_access` and `asynchronous_search_read_access`. For descriptions of each, see [Predefined roles](../../security/access-control/users-roles/#predefined-roles).
If these roles dont meet your needs, mix and match individual asynchronous search permissions to suit your use case. Each action corresponds to an operation in the REST API. For example, the `cluster:admin/opensearch/asynchronous_search/delete` permission lets you delete a previously submitted asynchronous search.
## (Advanced) Limit access by backend role
Use backend roles to configure fine-grained access to asynchronous searches based on roles. For example, users of different departments in an organization can view asynchronous searches owned by their own department.
First, make sure your users have the appropriate [backend roles](../../security/access-control/). Backend roles usually come from an [LDAP server](../../security/configuration/ldap/) or [SAML provider](../../security/configuration/saml/). However, if you use the internal user database, you can use the REST API to [add them manually](../../security/access-control/api/#create-user).
Now when users view asynchronous search resources in OpenSearch Dashboards (or make REST API calls), they only see asynchronous searches submitted by users who have a subset of the backend role.
For example, consider two users: `judy` and `elon`.
`judy` has an IT backend role:
```json
PUT _opensearch/_security/api/internalusers/judy
{
"password": "judy",
"backend_roles": [
"IT"
],
"attributes": {}
}
```
`elon` has an admin backend role:
```json
PUT _opensearch/_security/api/internalusers/elon
{
"password": "elon",
"backend_roles": [
"admin"
],
"attributes": {}
}
```
Both `judy` and `elon` have full access to asynchronous search:
```json
PUT _opensearch/_security/api/rolesmapping/async_full_access
{
"backend_roles": [],
"hosts": [],
"users": [
"judy",
"elon"
]
}
```
Because they have different backend roles, an asynchronous search submitted by `judy` will not be visible to `elon` and vice versa.
`judy` needs to have at least the superset of all roles that `elon` has to see `elon`'s asynchronous searches.
For example, if `judy` has five backend roles and `elon` has one of these roles, then `judy` can see asynchronous searches submitted by `elon`, but `elon` cant see the asynchronous searches submitted by `judy`. This means that `judy` can perform GET and DELETE operations on asynchronous searches submitted by `elon`, but not the reverse.

View File

@ -0,0 +1,29 @@
---
layout: default
title: Settings
parent: Asynchronous search
nav_order: 4
---
# Settings
The asynchronous search plugin adds several settings to the standard OpenSearch cluster settings. They are dynamic, so you can change the default behavior of the plugin without restarting your cluster. You can mark the settings as `persistent` or `transient`.
For example, to update the retention period of the result index:
```json
PUT _cluster/settings
{
"transient": {
"opensearch.asynchronous_search.max_wait_for_completion_timeout": "5m"
}
}
```
Setting | Default | Description
:--- | :--- | :---
`opensearch.asynchronous_search.max_search_running_time` | 12 hours | The maximum running time for the search beyond which the search is terminated.
`opensearch.asynchronous_search.node_concurrent_running_searches` | 20 | The concurrent searches running per coordinator node.
`opensearch.asynchronous_search.max_keep_alive` | 5 days | The maximum amount of time that search results can be stored in the cluster.
`opensearch.asynchronous_search.max_wait_for_completion_timeout` | 1 minute | The maximum value for the `wait_for_completion_timeout` parameter.
`opensearch.asynchronous_search.persist_search_failures` | false | Persist asynchronous search results that end with a search failure in the system index.

View File

@ -0,0 +1,100 @@
---
layout: default
title: OpenSearch CLI
nav_order: 52
has_children: false
---
# OpenSearch CLI
The OpenSearch CLI command line interface (opensearch-cli) lets you manage your OpenSearch cluster from the command line and automate tasks.
Currently, opensearch-cli supports the [Anomaly Detection](../ad/) and [k-NN](../knn/) plugins, along with arbitrary REST API paths. Among other things, you can use opensearch-cli to create and delete detectors, start and stop them, and check k-NN statistics.
Profiles let you easily access different clusters or sign requests with different credentials. opensearch-cli supports unauthenticated requests, HTTP basic signing, and IAM signing for Amazon Web Services.
This example moves a detector (`ecommerce-count-quantity`) from a staging cluster to a production cluster:
```bash
opensearch-cli ad get ecommerce-count-quantity --profile staging > ecommerce-count-quantity.json
opensearch-cli ad create ecommerce-count-quantity.json --profile production
opensearch-cli ad start ecommerce-count-quantity.json --profile production
opensearch-cli ad stop ecommerce-count-quantity --profile staging
opensearch-cli ad delete ecommerce-count-quantity --profile staging
```
## Install
1. [Download](https://opensearch.org/downloads.html){:target='\_blank'} and extract the appropriate installation package for your computer.
1. Make the `opensearch-cli` file executable:
```bash
chmod +x ./opensearch-cli
```
1. Add the command to your path:
```bash
export PATH=$PATH:$(pwd)
```
1. Confirm the CLI is working properly:
```bash
opensearch-cli --version
```
## Profiles
Profiles let you easily switch between different clusters and user credentials. To get started, run `opensearch-cli profile create` with the `--auth-type`, `--endpoint`, and `--name` options:
```bash
opensearch-cli profile create --auth-type basic --endpoint https://localhost:9200 --name docker-local
```
Alternatively, save a configuration file to `~/.opensearch-cli/config.yaml`:
```yaml
profiles:
- name: docker-local
endpoint: https://localhost:9200
user: admin
password: foobar
- name: aws
endpoint: https://some-cluster.us-east-1.es.amazonaws.com
aws_iam:
profile: ""
service: es
```
## Usage
opensearch-cli commands use the following syntax:
```bash
opensearch-cli <command> <subcommand> <flags>
```
For example, the following command retrieves information about a detector:
```bash
opensearch-cli ad get my-detector --profile docker-local
```
For a request to the OpenSearch CAT API, try the following command:
```bash
opensearch-cli curl get --path _cat/plugins --profile aws
```
Use the `-h` or `--help` flag to see all supported commands, subcommands, or usage for a specific command:
```bash
opensearch-cli -h
opensearch-cli ad -h
opensearch-cli ad get -h
```

View File

@ -0,0 +1,457 @@
---
layout: default
title: Index Rollups
nav_order: 35
parent: Index management
has_children: true
redirect_from: /docs/ism/index-rollups/
has_toc: false
---
# Index Rollups
Time series data increases storage costs, strains cluster health, and slows down aggregations over time. Index rollup lets you periodically reduce data granularity by rolling up old data into summarized indices.
You pick the fields that interest you and use index rollup to create a new index with only those fields aggregated into coarser time buckets. You can store months or years of historical data at a fraction of the cost with the same query performance.
For example, say you collect CPU consumption data every five seconds and store it on a hot node. Instead of moving older data to a read-only warm node, you can roll up or compress this data with only the average CPU consumption per day or with a 10% decrease in its interval every week.
You can use index rollup in three ways:
1. Use the index rollup API for an on-demand index rollup job that operates on an index that's not being actively ingested such as a rolled-over index. For example, you can perform an index rollup operation to reduce data collected at a five minute interval to a weekly average for trend analysis.
2. Use the OpenSearch Dashboards UI to create an index rollup job that runs on a defined schedule. You can also set it up to roll up your indices as its being actively ingested. For example, you can continuously roll up Logstash indices from a five second interval to a one hour interval.
3. Specify the index rollup job as an ISM action for complete index management. This allows you to roll up an index after a certain event such as a rollover, index age reaching a certain point, index becoming read-only, and so on. You can also have rollover and index rollup jobs running in sequence, where the rollover first moves the current index to a warm node and then the index rollup job creates a new index with the minimized data on the hot node.
## Create an Index Rollup Job
To get started, choose **Index Management** in OpenSearch Dashboards.
Select **Rollup Jobs** and choose **Create rollup job**.
### Step 1: Set up indices
1. In the **Job name and description** section, specify a unique name and an optional description for the index rollup job.
2. In the **Indices** section, select the source and target index. The source index is the one that you want to roll up. The source index remains as is, the index rollup job creates a new index referred to as a target index. The target index is where the index rollup results are saved. For target index, you can either type in a name for a new index or you select an existing index.
5. Choose **Next**
After you create an index rollup job, you can't change your index selections.
### Step 2: Define aggregations and metrics
Select the attributes with the aggregations (terms and histograms) and metrics (avg, sum, max, min, and value count) that you want to roll up. Make sure you dont add a lot of highly granular attributes, because you wont save much space.
For example, consider a dataset of cities and demographics within those cities. You can aggregate based on cities and specify demographics within a city as metrics.
The order in which you select attributes is critical. A city followed by a demographic is different from a demographic followed by a city.
1. In the **Time aggregation** section, select a timestamp field. Choose between a **Fixed** or **Calendar** interval type and specify the interval and timezone. The index rollup job uses this information to create a date histogram for the timestamp field.
2. (Optional) Add additional aggregations for each field. You can choose terms aggregation for all field types and histogram aggregation only for numeric fields.
3. (Optional) Add additional metrics for each field. You can choose between **All**, **Min**, **Max**, **Sum**, **Avg**, or **Value Count**.
4. Choose **Next**.
### Step 3: Specify schedule
Specify a schedule to roll up your indices as its being ingested. The index rollup job is enabled by default.
1. Specify if the data is continuous or not.
3. For roll up execution frequency, select **Define by fixed interval** and specify the **Rollup interval** and the time unit or **Define by cron expression** and add in a cron expression to select the interval. To learn how to define a cron expression, see [Alerting](../alerting/cron/).
4. Specify the number of pages per execution process. A larger number means faster execution and more cost for memory.
5. (Optional) Add a delay to the roll up executions. This is the amount of time the job waits for data ingestion to accommodate any processing time. For example, if you set this value to 10 minutes, an index rollup that executes at 2 PM to roll up 1 PM to 2 PM of data starts at 2:10 PM.
6. Choose **Next**.
### Step 4: Review and create
Review your configuration and select **Create**.
### Step 5: Search the target index
You can use the standard `_search` API to search the target index. Make sure that the query matches the constraints of the target index. For example, if you dont set up terms aggregations on a field, you dont receive results for terms aggregations. If you dont set up the maximum aggregations, you dont receive results for maximum aggregations.
You cant access the internal structure of the data in the target index because the plugin automatically rewrites the query in the background to suit the target index. This is to make sure you can use the same query for the source and target index.
To query the target index, set `size` to 0:
```json
GET target_index/_search
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"avg_cpu": {
"avg": {
"field": "cpu_usage"
}
}
}
}
```
Consider a scenario where you collect rolled up data from 1 PM to 9 PM in hourly intervals and live data from 7 PM to 11 PM in minutely intervals. If you execute an aggregation over these in the same query, for 7 PM to 9 PM, you see an overlap of both rolled up data and live data because they get counted twice in the aggregations.
## Sample Walkthrough
This walkthrough uses the OpenSearch Dashboards sample e-commerce data. To add that sample data, log in to OpenSearch Dashboards, choose **Home** and **Try our sample data**. For **Sample eCommerce orders**, choose **Add data**.
Then run a search:
```json
GET opensearch_dashboards_sample_data_ecommerce/_search
```
#### Sample response
```json
{
"took": 23,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4675,
"relation": "eq"
},
"max_score": 1,
"hits": [
{
"_index": "opensearch_dashboards_sample_data_ecommerce",
"_type": "_doc",
"_id": "jlMlwXcBQVLeQPrkC_kQ",
"_score": 1,
"_source": {
"category": [
"Women's Clothing",
"Women's Accessories"
],
"currency": "EUR",
"customer_first_name": "Selena",
"customer_full_name": "Selena Mullins",
"customer_gender": "FEMALE",
"customer_id": 42,
"customer_last_name": "Mullins",
"customer_phone": "",
"day_of_week": "Saturday",
"day_of_week_i": 5,
"email": "selena@mullins-family.zzz",
"manufacturer": [
"Tigress Enterprises"
],
"order_date": "2021-02-27T03:56:10+00:00",
"order_id": 581553,
"products": [
{
"base_price": 24.99,
"discount_percentage": 0,
"quantity": 1,
"manufacturer": "Tigress Enterprises",
"tax_amount": 0,
"product_id": 19240,
"category": "Women's Clothing",
"sku": "ZO0064500645",
"taxless_price": 24.99,
"unit_discount_amount": 0,
"min_price": 12.99,
"_id": "sold_product_581553_19240",
"discount_amount": 0,
"created_on": "2016-12-24T03:56:10+00:00",
"product_name": "Blouse - port royal",
"price": 24.99,
"taxful_price": 24.99,
"base_unit_price": 24.99
},
{
"base_price": 10.99,
"discount_percentage": 0,
"quantity": 1,
"manufacturer": "Tigress Enterprises",
"tax_amount": 0,
"product_id": 17221,
"category": "Women's Accessories",
"sku": "ZO0085200852",
"taxless_price": 10.99,
"unit_discount_amount": 0,
"min_price": 5.06,
"_id": "sold_product_581553_17221",
"discount_amount": 0,
"created_on": "2016-12-24T03:56:10+00:00",
"product_name": "Snood - rose",
"price": 10.99,
"taxful_price": 10.99,
"base_unit_price": 10.99
}
],
"sku": [
"ZO0064500645",
"ZO0085200852"
],
"taxful_total_price": 35.98,
"taxless_total_price": 35.98,
"total_quantity": 2,
"total_unique_products": 2,
"type": "order",
"user": "selena",
"geoip": {
"country_iso_code": "MA",
"location": {
"lon": -8,
"lat": 31.6
},
"region_name": "Marrakech-Tensift-Al Haouz",
"continent_name": "Africa",
"city_name": "Marrakesh"
},
"event": {
"dataset": "sample_ecommerce"
}
}
}
]
}
}
...
```
Create an index rollup job.
This example picks the `order_date`, `customer_gender`, `geoip.city_name`, `geoip.region_name`, and `day_of_week` fields and rolls them into an `example_rollup` target index:
```json
PUT _plugins/_rollup/jobs/example
{
"rollup": {
"enabled": true,
"schedule": {
"interval": {
"period": 1,
"unit": "Minutes",
"start_time": 1602100553
}
},
"last_updated_time": 1602100553,
"description": "An example policy that rolls up the sample ecommerce data",
"source_index": "opensearch_dashboards_sample_data_ecommerce",
"target_index": "example_rollup",
"page_size": 1000,
"delay": 0,
"continuous": false,
"dimensions": [
{
"date_histogram": {
"source_field": "order_date",
"fixed_interval": "60m",
"timezone": "America/Los_Angeles"
}
},
{
"terms": {
"source_field": "customer_gender"
}
},
{
"terms": {
"source_field": "geoip.city_name"
}
},
{
"terms": {
"source_field": "geoip.region_name"
}
},
{
"terms": {
"source_field": "day_of_week"
}
}
],
"metrics": [
{
"source_field": "taxless_total_price",
"metrics": [
{
"avg": {}
},
{
"sum": {}
},
{
"max": {}
},
{
"min": {}
},
{
"value_count": {}
}
]
},
{
"source_field": "total_quantity",
"metrics": [
{
"avg": {}
},
{
"max": {}
}
]
}
]
}
}
```
You can query the `example_rollup` index for the terms aggregations on the fields set up in the rollup job.
You get back the same response that you would on the original `opensearch_dashboards_sample_data_ecommerce` source index.
```json
POST example_rollup/_search
{
"size": 0,
"query": {
"bool": {
"must": {"term": { "geoip.region_name": "California" } }
}
},
"aggregations": {
"daily_numbers": {
"terms": {
"field": "day_of_week"
},
"aggs": {
"per_city": {
"terms": {
"field": "geoip.city_name"
},
"aggregations": {
"average quantity": {
"avg": {
"field": "total_quantity"
}
}
}
},
"total_revenue": {
"sum": {
"field": "taxless_total_price"
}
}
}
}
}
}
```
#### Sample Response
```json
{
"took": 476,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 281,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"daily_numbers": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Friday",
"doc_count": 53,
"total_revenue": {
"value": 4858.84375
},
"per_city": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Los Angeles",
"doc_count": 53,
"average quantity": {
"value": 2.305084745762712
}
}
]
}
},
{
"key": "Saturday",
"doc_count": 43,
"total_revenue": {
"value": 3547.203125
},
"per_city": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Los Angeles",
"doc_count": 43,
"average quantity": {
"value": 2.260869565217391
}
}
]
}
},
{
"key": "Tuesday",
"doc_count": 42,
"total_revenue": {
"value": 3983.28125
},
"per_city": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Los Angeles",
"doc_count": 42,
"average quantity": {
"value": 2.2888888888888888
}
}
]
}
},
{
"key": "Sunday",
"doc_count": 40,
"total_revenue": {
"value": 3308.1640625
},
"per_city": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "Los Angeles",
"doc_count": 40,
"average quantity": {
"value": 2.090909090909091
}
}
]
}
}
...
]
}
}
}
```

View File

@ -0,0 +1,235 @@
---
layout: default
title: Index Rollups API
parent: Index Rollups
grand_parent: Index management
redirect_from: /docs/ism/rollup-api/
nav_order: 9
---
# Index Rollups API
Use the index rollup operations to programmatically work with index rollup jobs.
---
#### Table of contents
- TOC
{:toc}
---
## Create or update an index rollup job
Creates or updates an index rollup job.
You must provide the `seq_no` and `primary_term` parameters.
#### Request
```json
PUT _plugins/_rollup/jobs/<rollup_id> // Create
PUT _plugins/_rollup/jobs/<rollup_id>?if_seq_no=1&if_primary_term=1 // Update
{
"rollup": {
"source_index": "nyc-taxi-data",
"target_index": "rollup-nyc-taxi-data",
"schedule": {
"interval": {
"period": 1,
"unit": "Days"
}
},
"description": "Example rollup job",
"enabled": true,
"page_size": 200,
"delay": 0,
"roles": [
"rollup_all",
"nyc_taxi_all",
"example_rollup_index_all"
],
"continuous": false,
"dimensions": {
"date_histogram": {
"source_field": "tpep_pickup_datetime",
"fixed_interval": "1h",
"timezone": "America/Los_Angeles"
},
"terms": {
"source_field": "PULocationID"
},
"metrics": [
{
"source_field": "passenger_count",
"metrics": [
{
"avg": {}
},
{
"sum": {}
},
{
"max": {}
},
{
"min": {}
},
{
"value_count": {}
}
]
}
]
}
}
}
```
You can specify the following options.
Options | Description | Type | Required
:--- | :--- |:--- |:--- |
`source_index` | The name of the detector. | `string` | Yes
`target_index` | Specify the target index that the rolled up data is ingested into. You could either create a new target index or use an existing index. The target index cannot be a combination of raw and rolled up data. | `string` | Yes
`schedule` | Schedule of the index rollup job which can be an interval or a cron expression. | `object` | Yes
`schedule.interval` | Specify the frequency of execution of the rollup job. | `object` | No
`schedule.interval.start_time` | Start time of the interval. | `timestamp` | Yes
`schedule.interval.period` | Define the interval period. | `string` | Yes
`schedule.interval.unit` | Specify the time unit of the interval. | `string` | Yes
`schedule.interval.cron` | Optionally, specify a cron expression to define therollup frequency. | `list` | No
`schedule.interval.cron.expression` | Specify a Unix cron expression. | `string` | Yes
`schedule.interval.cron.timezone` | Specify timezones as defined by the IANA Time Zone Database. Defaults to UTC. | `string` | No
`description` | Optionally, describe the rollup job. | `string` | No
`enabled` | When true, the index rollup job is scheduled. Default is true. | `boolean` | Yes
`continuous` | Specify whether or not the index rollup job continuously rolls up data forever or just executes over the current data set once and stops. Default is false. | `boolean` | Yes
`error_notification` | Set up a Mustache message template sent for error notifications. For example, if an index rollup job fails, the system sends a message to a Slack channel. | `object` | No
`page_size` | Specify the number of buckets to paginate through at a time while rolling up. | `number` | Yes
`delay` | Specify time value to delay execution of the index rollup job. | `time_unit` | No
`dimensions` | Specify aggregations to create dimensions for the roll up time window. | `object` | Yes
`dimensions.date_histogram` | Specify either fixed_interval or calendar_interval, but not both. Either one limits what you can query in the target index. | `object` | No
`dimensions.date_histogram.fixed_interval` | Specify the fixed interval for aggregations in milliseconds, seconds, minutes, hours, or days. | `string` | No
`dimensions.date_histogram.calendar_interval` | Specify the calendar interval for aggregations in minutes, hours, days, weeks, months, quarters, or years. | `string` | No
`dimensions.date_histogram.field` | Specify the date field used in date histogram aggregation. | `string` | No
`dimensions.date_histogram.timezone` | Specify the timezones as defined by the IANA Time Zone Database. The default is UTC. | `string` | No
`dimensions.terms` | Specify the term aggregations that you want to roll up. | `object` | No
`dimensions.terms.fields` | Specify terms aggregation for compatible fields. | `object` | No
`dimensions.histogram` | Specify the histogram aggregations that you want to roll up. | `object` | No
`dimensions.histogram.field` | Add a field for histogram aggregations. | `string` | Yes
`dimensions.histogram.interval` | Specify the histogram aggregation interval for the field. | `long` | Yes
`dimensions.metrics` | Specify a list of objects that represent the fields and metrics that you want to calculate. | `nested object` | No
`dimensions.metrics.field` | Specify the field that you want to perform metric aggregations on. | `string` | No
`dimensions.metrics.field.metrics` | Specify the metric aggregations you want to calculate for the field. | `multiple strings` | No
#### Sample response
```json
{
"_id": "rollup_id",
"_seqNo": 1,
"_primaryTerm": 1,
"rollup": { ... }
}
```
## Get an index rollup job
Returns all information about an index rollup job based on the `rollup_id`.
#### Request
```json
GET _plugins/_rollup/jobs/<rollup_id>
```
#### Sample response
```json
{
"_id": "my_rollup",
"_seqNo": 1,
"_primaryTerm": 1,
"rollup": { ... }
}
```
---
## Delete an index rollup job
Deletes an index rollup job based on the `rollup_id`.
#### Request
```json
DELETE _plugins/_rollup/jobs/<rollup_id>
```
#### Sample response
```json
200 OK
```
---
## Start or stop an index rollup job
Start or stop an index rollup job.
#### Request
```json
POST _plugins/_rollup/jobs/<rollup_id>/_start
POST _plugins/_rollup/jobs/<rollup_id>/_stop
```
#### Sample response
```json
200 OK
```
---
## Explain an index rollup job
Returns detailed metadata information about the index rollup job and its current progress.
#### Request
```json
GET _plugins/_rollup/jobs/<rollup_id>/_explain
```
#### Sample response
```json
{
"example_rollup": {
"rollup_id": "example_rollup",
"last_updated_time": 1602014281,
"continuous": {
"next_window_start_time": 1602055591,
"next_window_end_time": 1602075591
},
"status": "running",
"failure_reason": null,
"stats": {
"pages_processed": 342,
"documents_processed": 489359,
"rollups_indexed": 3420,
"index_time_in_ms": 30495,
"search_time_in_ms": 584922
}
}
}
```

View File

@ -0,0 +1,12 @@
---
layout: default
title: Index management
nav_order: 30
has_children: true
---
# Index Management
OpenSearch Dashboards
{: .label .label-yellow :}
The Index Management (IM) plugin lets you automate recurring index management activities and reduce storage costs.

View File

@ -0,0 +1,494 @@
---
layout: default
title: ISM API
parent: Index State Management
grand_parent: Index management
redirect_from: /docs/ism/api/
nav_order: 5
---
# ISM API
Use the index state management operations to programmatically work with policies and managed indices.
---
#### Table of contents
- TOC
{:toc}
---
## Create policy
Creates a policy.
#### Request
```json
PUT _plugins/_ism/policies/policy_1
{
"policy": {
"description": "ingesting logs",
"default_state": "ingest",
"states": [
{
"name": "ingest",
"actions": [
{
"rollover": {
"min_doc_count": 5
}
}
],
"transitions": [
{
"state_name": "search"
}
]
},
{
"name": "search",
"actions": [],
"transitions": [
{
"state_name": "delete",
"conditions": {
"min_index_age": "5m"
}
}
]
},
{
"name": "delete",
"actions": [
{
"delete": {}
}
],
"transitions": []
}
]
}
}
```
#### Sample response
```json
{
"_id": "policy_1",
"_version": 1,
"_primary_term": 1,
"_seq_no": 7,
"policy": {
"policy": {
"policy_id": "policy_1",
"description": "ingesting logs",
"last_updated_time": 1577990761311,
"schema_version": 1,
"error_notification": null,
"default_state": "ingest",
"states": [
{
"name": "ingest",
"actions": [
{
"rollover": {
"min_doc_count": 5
}
}
],
"transitions": [
{
"state_name": "search"
}
]
},
{
"name": "search",
"actions": [],
"transitions": [
{
"state_name": "delete",
"conditions": {
"min_index_age": "5m"
}
}
]
},
{
"name": "delete",
"actions": [
{
"delete": {}
}
],
"transitions": []
}
]
}
}
}
```
---
## Add policy
Adds a policy to an index. This operation does not change the policy if the index already has one.
#### Request
```json
POST _plugins/_ism/add/index_1
{
"policy_id": "policy_1"
}
```
#### Sample response
```json
{
"updated_indices": 1,
"failures": false,
"failed_indices": []
}
```
---
## Update policy
Updates a policy. Use the `seq_no` and `primary_term` parameters to update an existing policy. If these numbers don't match the existing policy or the policy doesn't exist, ISM throws an error.
#### Request
```json
PUT _plugins/_ism/policies/policy_1?if_seq_no=7&if_primary_term=1
{
"policy": {
"description": "ingesting logs",
"default_state": "ingest",
"states": [
{
"name": "ingest",
"actions": [
{
"rollover": {
"min_doc_count": 5
}
}
],
"transitions": [
{
"state_name": "search"
}
]
},
{
"name": "search",
"actions": [],
"transitions": [
{
"state_name": "delete",
"conditions": {
"min_index_age": "5m"
}
}
]
},
{
"name": "delete",
"actions": [
{
"delete": {}
}
],
"transitions": []
}
]
}
}
```
#### Sample response
```json
{
"_id": "policy_1",
"_version": 2,
"_primary_term": 1,
"_seq_no": 10,
"policy": {
"policy": {
"policy_id": "policy_1",
"description": "ingesting logs",
"last_updated_time": 1577990934044,
"schema_version": 1,
"error_notification": null,
"default_state": "ingest",
"states": [
{
"name": "ingest",
"actions": [
{
"rollover": {
"min_doc_count": 5
}
}
],
"transitions": [
{
"state_name": "search"
}
]
},
{
"name": "search",
"actions": [],
"transitions": [
{
"state_name": "delete",
"conditions": {
"min_index_age": "5m"
}
}
]
},
{
"name": "delete",
"actions": [
{
"delete": {}
}
],
"transitions": []
}
]
}
}
}
```
---
## Get policy
Gets the policy by `policy_id`.
#### Request
```json
GET _plugins/_ism/policies/policy_1
```
#### Sample response
```json
{
"_id": "policy_1",
"_version": 2,
"_seq_no": 10,
"_primary_term": 1,
"policy": {
"policy_id": "policy_1",
"description": "ingesting logs",
"last_updated_time": 1577990934044,
"schema_version": 1,
"error_notification": null,
"default_state": "ingest",
"states": [
{
"name": "ingest",
"actions": [
{
"rollover": {
"min_doc_count": 5
}
}
],
"transitions": [
{
"state_name": "search"
}
]
},
{
"name": "search",
"actions": [],
"transitions": [
{
"state_name": "delete",
"conditions": {
"min_index_age": "5m"
}
}
]
},
{
"name": "delete",
"actions": [
{
"delete": {}
}
],
"transitions": []
}
]
}
}
```
---
## Remove policy from index
Removes any ISM policy from the index.
#### Request
```json
POST _plugins/_ism/remove/index_1
```
#### Sample response
```json
{
"updated_indices": 1,
"failures": false,
"failed_indices": []
}
```
---
## Update managed index policy
Updates the managed index policy to a new policy (or to a new version of the policy). You can use an index pattern to update multiple indices at once. When updating multiple indices, you might want to include a state filter to only affect certain managed indices. The change policy filters out all the existing managed indices and only applies the change to the ones in the state that you specify. You can also explicitly specify the state that the managed index transitions to after the change policy takes effect.
A policy change is an asynchronous background process. The changes are queued and are not executed immediately by the background process. This delay in execution protects the currently running managed indices from being put into a broken state. If the policy you are changing to has only some small configuration changes, then the change takes place immediately. For example, if the policy changes the `min_index_age` parameter in a rollover condition from `1000d` to `100d`, this change takes place immediately in its next execution. If the change modifies the state, actions, or the order of actions of the current state the index is in, then the change happens at the end of its current state before transitioning to a new state.
In this example, the policy applied on the `index_1` index is changed to `policy_1`, which could either be a completely new policy or an updated version of its existing policy. The process only applies the change if the index is currently in the `searches` state. After this change in policy takes place, `index_1` transitions to the `delete` state.
#### Request
```json
POST _plugins/_ism/change_policy/index_1
{
"policy_id": "policy_1",
"state": "delete",
"include": [
{
"state": "searches"
}
]
}
```
#### Sample response
```json
{
"updated_indices": 0,
"failures": false,
"failed_indices": []
}
```
---
## Retry failed index
Retries the failed action for an index. For the retry call to succeed, ISM must manage the index, and the index must be in a failed state. You can use index patterns (`*`) to retry multiple failed indices.
#### Request
```json
POST _plugins/_ism/retry/index_1
{
"state": "delete"
}
```
#### Sample response
```json
{
"updated_indices": 0,
"failures": false,
"failed_indices": []
}
```
---
## Explain index
Gets the current state of the index. You can use index patterns to get the status of multiple indices.
#### Request
```json
GET _plugins/_ism/explain/index_1
```
#### Sample response
```json
{
"index_1": {
"index.opendistro.index_state_management.policy_id": "policy_1"
}
}
```
The `opendistro.index_state_management.policy_id` setting is deprecated starting from version 1.13.0.
We retain this field in the response API for consistency.
---
## Delete policy
Deletes the policy by `policy_id`.
#### Request
```json
DELETE _plugins/_ism/policies/policy_1
```
#### Sample response
```json
{
"_index": ".opendistro-ism-config",
"_type": "_doc",
"_id": "policy_1",
"_version": 3,
"result": "deleted",
"forced_refresh": true,
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"_seq_no": 15,
"_primary_term": 1
}
```

View File

@ -0,0 +1,103 @@
---
layout: default
title: Index State Management
nav_order: 3
parent: Index management
has_children: true
redirect_from: /docs/ism/
has_toc: false
---
# Index State Management
OpenSearch Dashboards
{: .label .label-yellow :}
If you analyze time-series data, you likely prioritize new data over old data. You might periodically perform certain operations on older indices, such as reducing replica count or deleting them.
Index State Management (ISM) is a plugin that lets you automate these periodic, administrative operations by triggering them based on changes in the index age, index size, or number of documents. Using the ISM plugin, you can define *policies* that automatically handle index rollovers or deletions to fit your use case.
For example, you can define a policy that moves your index into a `read_only` state after 30 days and then deletes it after a set period of 90 days. You can also set up the policy to send you a notification message when the index is deleted.
You might want to perform an index rollover after a certain amount of time or run a `force_merge` operation on an index during off-peak hours to improve search performance during peak hours.
To use the ISM plugin, your user role needs to be mapped to the `all_access` role that gives you full access to the cluster. To learn more, see [Users and roles](../security/access-control/users-roles/).
{: .note }
## Get started with ISM
To get started, choose **Index Management** in OpenSearch Dashboards.
### Step 1: Set up policies
A policy is a set of rules that describes how an index should be managed. For information about creating a policy, see [Policies](policies/).
1. Choose the **Index Policies** tab.
2. Choose **Create policy**.
3. In the **Name policy** section, enter a policy ID.
4. In the **Define policy** section, enter your policy.
5. Choose **Create**.
After you create a policy, your next step is to attach this policy to an index or indices.
You can set up an `ism_template` in the policy so when you create an index that matches the ISM template pattern, the index will have this policy attached to it:
```json
PUT _plugins/_ism/policies/policy_id
{
"policy": {
"description": "Example policy.",
"default_state": "...",
"states": [...],
"ism_template": {
"index_patterns": ["index_name-*"],
"priority": 100
}
}
}
```
For an example ISM template policy, see [Sample policy with ISM template](policies/#sample-policy-with-ism-template).
Older versions of the plugin include the `policy_id` in an index template, so when an index is created that matches the index template pattern, the index will have the policy attached to it:
```json
PUT _index_template/<template_name>
{
"index_patterns": [
"index_name-*"
],
"template": {
"settings": {
"opendistro.index_state_management.policy_id": "policy_id"
}
}
}
```
The `opendistro.index_state_management.policy_id` setting is deprecated. You can continue to automatically manage newly created indices with the ISM template field.
{: .note }
### Step 2: Attach policies to indices
1. Choose **Indices**.
2. Choose the index or indices that you want to attach your policy to.
3. Choose **Apply policy**.
4. From the **Policy ID** menu, choose the policy that you created.
You can see a preview of your policy.
5. If your policy includes a rollover operation, specify a rollover alias.
Make sure that the alias that you enter already exists. For more information about the rollover operation, see [rollover](policies/#rollover).
6. Choose **Apply**.
After you attach a policy to an index, ISM creates a job that runs every 5 minutes by default to perform policy actions, check conditions, and transition the index into different states. To change the default time interval for this job, see [Settings](settings/).
If you want to use an OpenSearch operation to create an index with a policy already attached to it, see [create index](api/#create-index).
### Step 3: Manage indices
1. Choose **Managed Indices**.
2. To change your policy, see [Change Policy](managedindices/#change-policy).
3. To attach a rollover alias to your index, select your policy and choose **Add rollover alias**.
Make sure that the alias that you enter already exists. For more information about the rollover operation, see [rollover](policies/#rollover).
4. To remove a policy, choose your policy, and then choose **Remove policy**.
5. To retry a policy, choose your policy, and then choose **Retry policy**.
For information about managing your policies, see [Managed Indices](managedindices/).

View File

@ -0,0 +1,75 @@
---
layout: default
title: Managed Indices
nav_order: 3
parent: Index State Management
grand_parent: Index management
redirect_from: /docs/ism/managedindices/
has_children: false
---
# Managed indices
You can change or update a policy using the managed index operations.
This table lists the fields of managed index operations.
Parameter | Description | Type | Required | Read Only
:--- | :--- |:--- |:--- |
`name` | The name of the managed index policy. | `string` | Yes | No
`index` | The name of the managed index that this policy is managing. | `string` | Yes | No
`index_uuid` | The uuid of the index. | `string` | Yes | No
`enabled` | When `true`, the managed index is scheduled and run by the scheduler. | `boolean` | Yes | No
`enabled_time` | The time the managed index was last enabled. If the managed index process is disabled, then this is null. | `timestamp` | Yes | Yes
`last_updated_time` | The time the managed index was last updated. | `timestamp` | Yes | Yes
`schedule` | The schedule of the managed index job. | `object` | Yes | No
`policy_id` | The name of the policy used by this managed index. | `string` | Yes | No
`policy_seq_no` | The sequence number of the policy used by this managed index. | `number` | Yes | No
`policy_primary_term` | The primary term of the policy used by this managed index. | `number` | Yes | No
`policy_version` | The version of the policy used by this managed index. | `number` | Yes | Yes
`policy` | The cached JSON of the policy for the `policy_version` that's used during runs. If the policy is null, it means that this is the first execution of the job and the latest policy document is read in/saved. | `object` | No | No
`change_policy` | The information regarding what policy and state to change to. | `object` | No | No
`policy_name` | The name of the policy to update to. To update to the latest version, set this to be the same as the current `policy_name`. | `string` | No | Yes
`state` | The state of the managed index after it finishes updating. If no state is specified, it's assumed that the policy structure did not change. | `string` | No | Yes
The following example shows a managed index policy:
```json
{
"managed_index": {
"name": "my_index",
"index": "my_index",
"index_uuid": "sOKSOfkdsoSKeofjIS",
"enabled": true,
"enabled_time": 1553112384,
"last_updated_time": 1553112384,
"schedule": {
"interval": {
"period": 1,
"unit": "MINUTES",
"start_time": 1553112384
}
},
"policy_id": "log_rotation",
"policy_version": 1,
"policy": {...},
"change_policy": null
}
}
```
## Change policy
You can change any managed index policy, but ISM has a few constraints in place to make sure that policy changes don't break indices.
If an index is stuck in its current state, never proceeding, and you want to update its policy immediately, make sure that the new policy includes the same state---same name, same actions, same order---as the old policy. In this case, even if the policy is in the middle of executing an action, ISM applies the new policy.
If you update the policy without including an identical state, ISM updates the policy only after all actions in the current state finish executing. Alternately, you can choose a specific state in your old policy after which you want the new policy to take effect.
To change a policy using OpenSearch Dashboards, do the following:
- Under **Managed indices**, choose the indices that you want to attach the new policy to.
- To attach the new policy to indices in specific states, choose **Choose state filters**, and then choose those states.
- Under **Choose New Policy**, choose the new policy.
- To start the new policy for indices in the current state, choose **Keep indices in their current state after the policy takes effect**.
- To start the new policy in a specific state, choose **Start from a chosen state after changing policies**, and then choose the default start state in your new policy.

View File

@ -0,0 +1,666 @@
---
layout: default
title: Policies
nav_order: 1
parent: Index State Management
grand_parent: Index management
redirect_from: /docs/ism/policies/
has_children: false
---
# Policies
Policies are JSON documents that define the following:
- The *states* that an index can be in, including the default state for new indices. For example, you might name your states "hot," "warm," "delete," and so on. For more information, see [States](#states).
- Any *actions* that you want the plugin to take when an index enters a state, such as performing a rollover. For more information, see [Actions](#actions).
- The conditions that must be met for an index to move into a new state, known as *transitions*. For example, if an index is more than eight weeks old, you might want to move it to the "delete" state. For more information, see [Transitions](#transitions).
In other words, a policy defines the *states* that an index can be in, the *actions* to perform when in a state, and the conditions that must be met to *transition* between states.
You have complete flexibility in the way you can design your policies. You can create any state, transition to any other state, and specify any number of actions in each state.
This table lists the relevant fields of a policy.
Field | Description | Type | Required | Read Only
:--- | :--- |:--- |:--- |
`policy_id` | The name of the policy. | `string` | Yes | Yes
`description` | A human-readable description of the policy. | `string` | Yes | No
`ism_template` | Specify an ISM template pattern that matches the index to apply the policy. | `nested list of objects` | No | No
`last_updated_time` | The time the policy was last updated. | `timestamp` | Yes | Yes
`error_notification` | The destination and message template for error notifications. The destination could be Amazon Chime, Slack, or a webhook URL. | `object` | No | No
`default_state` | The default starting state for each index that uses this policy. | `string` | Yes | No
`states` | The states that you define in the policy. | `nested list of objects` | Yes | No
---
#### Table of contents
1. TOC
{:toc}
---
## States
A state is the description of the status that the managed index is currently in. A managed index can be in only one state at a time. Each state has associated actions that are executed sequentially on entering a state and transitions that are checked after all the actions have been completed.
This table lists the parameters that you can define for a state.
Field | Description | Type | Required
:--- | :--- |:--- |:--- |
`name` | The name of the state. | `string` | Yes
`actions` | The actions to execute after entering a state. For more information, see [Actions](#actions). | `nested list of objects` | Yes
`transitions` | The next states and the conditions required to transition to those states. If no transitions exist, the policy assumes that it's complete and can now stop managing the index. For more information, see [Transitions](#transitions). | `nested list of objects` | Yes
---
## Actions
Actions are the steps that the policy sequentially executes on entering a specific state.
They are executed in the order in which they are defined.
This table lists the parameters that you can define for an action.
Parameter | Description | Type | Required | Default
:--- | :--- |:--- |:--- |
`timeout` | The timeout period for the action. Accepts time units for minutes, hours, and days. | `time unit` | No | -
`retry` | The retry configuration for the action. | `object` | No | Specific to action
The `retry` operation has the following parameters:
Parameter | Description | Type | Required | Default
:--- | :--- |:--- |:--- |
`count` | The number of retry counts. | `number` | Yes | -
`backoff` | The backoff policy type to use when retrying. | `string` | No | Exponential
`delay` | The time to wait between retries. Accepts time units for minutes, hours, and days. | `time unit` | No | 1 minute
The following example action has a timeout period of one hour. The policy retries this action three times with an exponential backoff policy, with a delay of 10 minutes between each retry:
```json
"actions": {
"timeout": "1h",
"retry": {
"count": 3,
"backoff": "exponential",
"delay": "10m"
}
}
```
For a list of available unit types, see [Supported units](../../../opensearch/units/).
## ISM supported operations
ISM supports the following operations:
- [force_merge](#forcemerge)
- [read_only](#read_only)
- [read_write](#read_write)
- [replica_count](#replica_count)
- [close](#close)
- [open](#open)
- [delete](#delete)
- [rollover](#rollover)
- [notification](#notification)
- [snapshot](#snapshot)
- [index_priority](#index_priority)
- [allocation](#allocation)
### force_merge
Reduces the number of Lucene segments by merging the segments of individual shards. This operation attempts to set the index to a `read-only` state before starting the merging process.
Parameter | Description | Type | Required
:--- | :--- |:--- |:--- |
`max_num_segments` | The number of segments to reduce the shard to. | `number` | Yes
```json
{
"force_merge": {
"max_num_segments": 1
}
}
```
### read_only
Sets a managed index to be read only.
```json
{
"read_only": {}
}
```
### read_write
Sets a managed index to be writeable.
```json
{
"read_write": {}
}
```
### replica_count
Sets the number of replicas to assign to an index.
Parameter | Description | Type | Required
:--- | :--- |:--- |:--- |
`number_of_replicas` | Defines the number of replicas to assign to an index. | `number` | Yes
```json
{
"replica_count": {
"number_of_replicas": 2
}
}
```
For information about setting replicas, see [Primary and replica shards](../../../opensearch/#primary-and-replica-shards).
### close
Closes the managed index.
```json
{
"close": {}
}
```
Closed indices remain on disk, but consume no CPU or memory. You can't read from, write to, or search closed indices.
Closing an index is a good option if you need to retain data for longer than you need to actively search it and have sufficient disk space on your data nodes. If you need to search the data again, reopening a closed index is simpler than restoring an index from a snapshot.
### open
Opens a managed index.
```json
{
"open": {}
}
```
### delete
Deletes a managed index.
```json
{
"delete": {}
}
```
### rollover
Rolls an alias over to a new index when the managed index meets one of the rollover conditions.
The index format must match the pattern: `^.*-\d+$`. For example, `(logs-000001)`.
Set `index.plugins.index_state_management.rollover_alias` as the alias to rollover.
Parameter | Description | Type | Example | Required
:--- | :--- |:--- |:--- |
`min_size` | The minimum size of the total primary shard storage (not counting replicas) required to roll over the index. For example, if you set `min_size` to 100 GiB and your index has 5 primary shards and 5 replica shards of 20 GiB each, the total size of the primaries is 100 GiB, so the rollover occurs. ISM doesn't check indices continually, so it doesn't roll over indices at exactly 100 GiB. Instead, if an index is continuously growing, ISM might check it at 99 GiB, not perform the rollover, check again when the shards reach 105 GiB, and then perform the operation. | `string` | `20gb` or `5mb` | No
`min_doc_count` | The minimum number of documents required to roll over the index. | `number` | `2000000` | No
`min_index_age` | The minimum age required to roll over the index. Index age is the time between its creation and the present. | `string` | `5d` or `7h` | No
```json
{
"rollover": {
"min_size": "50gb"
}
}
```
```json
{
"rollover": {
"min_doc_count": 100000000
}
}
```
```json
{
"rollover": {
"min_index_age": "30d"
}
}
```
### notification
Sends you a notification.
Parameter | Description | Type | Required
:--- | :--- |:--- |:--- |
`destination` | The destination URL. | `Slack, Amazon Chime, or webhook URL` | Yes
`message_template` | The text of the message. You can add variables to your messages using [Mustache templates](https://mustache.github.io/mustache.5.html). | `object` | Yes
The destination system **must** return a response otherwise the notification operation throws an error.
#### Example 1: Chime notification
```json
{
"notification": {
"destination": {
"chime": {
"url": "<url>"
}
},
"message_template": {
"source": "the index is {% raw %}{{ctx.index}}{% endraw %}"
}
}
}
```
#### Example 2: Custom webhook notification
```json
{
"notification": {
"destination": {
"custom_webhook": {
"url": "https://<your_webhook>"
}
},
"message_template": {
"source": "the index is {% raw %}{{ctx.index}}{% endraw %}"
}
}
}
```
#### Example 3: Slack notification
```json
{
"notification": {
"destination": {
"slack": {
"url": "https://hooks.slack.com/services/xxx/xxxxxx"
}
},
"message_template": {
"source": "the index is {% raw %}{{ctx.index}}{% endraw %}"
}
}
}
```
You can use `ctx` variables in your message to represent a number of policy parameters based on the past executions of your policy. For example, if your policy has a rollover action, you can use `{% raw %}{{ctx.action.name}}{% endraw %}` in your message to represent the name of the rollover.
The following `ctx` variable options are available for every policy:
#### Guaranteed variables
Parameter | Description | Type
:--- | :--- |:--- |:--- |
`index` | The name of the index. | `string`
`index_uuid` | The uuid of the index. | `string`
`policy_id` | The name of the policy. | `string`
### snapshot
Backup your clusters indices and state. For more information about snapshots, see [Take and restore snapshots](../../../opensearch/snapshot-restore/).
The `snapshot` operation has the following parameters:
Parameter | Description | Type | Required | Default
:--- | :--- |:--- |:--- |
`repository` | The repository name that you register through the native snapshot API operations. | `string` | Yes | -
`snapshot` | The name of the snapshot. | `string` | Yes | -
```json
{
"snapshot": {
"repository": "my_backup",
"snapshot": "my_snapshot"
}
}
```
### index_priority
Set the priority for the index in a specific state. Unallocated shards of indices are recovered in the order of their priority, whenever possible. The indices with higher priority values are recovered first followed by the indices with lower priority values.
The `index_priority` operation has the following parameter:
Parameter | Description | Type | Required | Default
:--- | :--- |:--- |:--- |:---
`priority` | The priority for the index as soon as it enters a state. | `number` | Yes | 1
```json
"actions": [
{
"index_priority": {
"priority": 50
}
}
]
```
### allocation
Allocate the index to a node with a specific attribute.
For example, setting `require` to `warm` moves your data only to "warm" nodes.
The `allocation` operation has the following parameters:
Parameter | Description | Type | Required
:--- | :--- |:--- |:---
`require` | Allocate the index to a node with a specified attribute. | `string` | Yes
`include` | Allocate the index to a node with any of the specified attributes. | `string` | Yes
`exclude` | Dont allocate the index to a node with any of the specified attributes. | `string` | Yes
`wait_for` | Wait for the policy to execute before allocating the index to a node with a specified attribute. | `string` | Yes
```json
"actions": [
{
"allocation": {
"require": { "box_type": "warm" }
}
}
]
```
---
## Transitions
Transitions define the conditions that need to be met for a state to change. After all actions in the current state are completed, the policy starts checking the conditions for transitions.
Transitions are evaluated in the order in which they are defined. For example, if the conditions for the first transition are met, then this transition takes place and the rest of the transitions are dismissed.
If you don't specify any conditions in a transition and leave it empty, then it's assumed to be the equivalent of always true. This means that the policy transitions the index to this state the moment it checks.
This table lists the parameters you can define for transitions.
Parameter | Description | Type | Required
:--- | :--- |:--- |:--- |
`state_name` | The name of the state to transition to if the conditions are met. | `string` | Yes
`conditions` | List the conditions for the transition. | `list` | Yes
The `conditions` object has the following parameters:
Parameter | Description | Type | Required
:--- | :--- |:--- |:--- |
`min_index_age` | The minimum age of the index required to transition. | `string` | No
`min_doc_count` | The minimum document count of the index required to transition. | `number` | No
`min_size` | The minimum size of the index required to transition. | `string` | No
`cron` | The `cron` job that triggers the transition if no other transition happens first. | `object` | No
`cron.cron.expression` | The `cron` expression that triggers the transition. | `string` | Yes
`cron.cron.timezone` | The timezone that triggers the transition. | `string` | Yes
The following example transitions the index to a `cold` state after a period of 30 days:
```json
"transitions": [
{
"state_name": "cold",
"conditions": {
"min_index_age": "30d"
}
}
]
```
ISM checks the conditions on every execution of the policy based on the set interval.
This example uses the `cron` condition to transition indices every Saturday at 5:00 PT:
```json
"transitions": [
{
"state_name": "cold",
"conditions": {
"cron": {
"cron": {
"expression": "* 17 * * SAT",
"timezone": "America/Los_Angeles"
}
}
}
}
]
```
Note that this condition does not execute at exactly 5:00 PM; the job still executes based off the `job_interval` setting. Due to this variance in start time and the amount of time that it can take for actions to complete prior to checking transition conditions, we recommend against overly narrow cron expressions. For example, don't use `15 17 * * SAT` (5:15 PM on Saturday).
A window of an hour, which this example uses, is generally sufficient, but you might increase it to 2--3 hours to avoid missing the window and having to wait a week for the transition to occur. Alternately, you could use a broader expression such as `* * * * SAT,SUN` to have the transition occur at any time during the weekend.
For information on writing cron expressions, see [Cron expression reference](../../../alerting/cron/).
---
## Error notifications
The `error_notification` operation sends you a notification if your managed index fails.
It notifies a single destination with a custom message.
Set up error notifications at the policy level:
```json
{
"policy": {
"description": "hot warm delete workflow",
"default_state": "hot",
"schema_version": 1,
"error_notification": { },
"states": [ ]
}
}
```
Parameter | Description | Type | Required
:--- | :--- |:--- |:--- |
`destination` | The destination URL. | `Slack, Amazon Chime, or webhook URL` | Yes
`message_template` | The text of the message. You can add variables to your messages using [Mustache templates](https://mustache.github.io/mustache.5.html). | `object` | Yes
The destination system **must** return a response otherwise the `error_notification` operation throws an error.
#### Example 1: Chime notification
```json
{
"error_notification": {
"destination": {
"chime": {
"url": "<url>"
}
},
"message_template": {
"source": "The index {% raw %}{{ctx.index}}{% endraw %} failed during policy execution."
}
}
}
```
#### Example 2: Custom webhook notification
```json
{
"error_notification": {
"destination": {
"custom_webhook": {
"url": "https://<your_webhook>"
}
},
"message_template": {
"source": "The index {% raw %}{{ctx.index}}{% endraw %} failed during policy execution."
}
}
}
```
#### Example 3: Slack notification
```json
{
"error_notification": {
"destination": {
"slack": {
"url": "https://hooks.slack.com/services/xxx/xxxxxx"
}
},
"message_template": {
"source": "The index {% raw %}{{ctx.index}}{% endraw %} failed during policy execution."
}
}
}
```
You can use the same options for `ctx` variables as the [notification](#notification) operation.
## Sample policy with ISM template
The following sample template policy is for a rollover use case.
1. Create a policy with an `ism_template` field:
```json
PUT _plugins/_ism/policies/rollover_policy
{
"policy": {
"description": "Example rollover policy.",
"default_state": "rollover",
"states": [
{
"name": "rollover",
"actions": [
{
"rollover": {
"min_doc_count": 1
}
}
],
"transitions": []
}
],
"ism_template": {
"index_patterns": ["log*"],
"priority": 100
}
}
}
```
You need to specify the `index_patterns` field. If you don't specify a value for `priority`, it defaults to 0.
2. Set up a template with the `rollover_alias` as `log` :
```json
PUT _index_template/ism_rollover
{
"index_patterns": ["log*"],
"settings": {
"plugins.index_state_management.rollover_alias": "log"
}
}
```
3. Create an index with the `log` alias:
```json
PUT log-000001
{
"aliases": {
"log": {
"is_write_index": true
}
}
}
```
4. Index a document to trigger the rollover condition:
```json
POST log/_doc
{
"message": "dummy"
}
```
## Example policy
The following example policy implements a `hot`, `warm`, and `delete` workflow. You can use this policy as a template to prioritize resources to your indices based on their levels of activity.
In this case, an index is initially in a `hot` state. After a day, it changes to a `warm` state, where the number of replicas increases to 5 to improve the read performance.
After 30 days, the policy moves this index into a `delete` state. The service sends a notification to a Chime room that the index is being deleted, and then permanently deletes it.
```json
{
"policy": {
"description": "hot warm delete workflow",
"default_state": "hot",
"schema_version": 1,
"states": [
{
"name": "hot",
"actions": [
{
"rollover": {
"min_index_age": "1d"
}
}
],
"transitions": [
{
"state_name": "warm"
}
]
},
{
"name": "warm",
"actions": [
{
"replica_count": {
"number_of_replicas": 5
}
}
],
"transitions": [
{
"state_name": "delete",
"conditions": {
"min_index_age": "30d"
}
}
]
},
{
"name": "delete",
"actions": [
{
"notification": {
"destination": {
"chime": {
"url": "<URL>"
}
},
"message_template": {
"source": "The index {% raw %}{{ctx.index}}{% endraw %} is being deleted"
}
}
},
{
"delete": {}
}
]
}
]
}
}
```
This diagram shows the `states`, `transitions`, and `actions` of the above policy as a finite-state machine. For more information about finite-state machines, see [Wikipedia](https://en.wikipedia.org/wiki/Finite-state_machine).
![Policy State Machine](../../images/ism.png)

View File

@ -0,0 +1,50 @@
---
layout: default
title: Settings
parent: Index State Management
grand_parent: Index management
redirect_from: /docs/ism/settings/
nav_order: 4
---
# ISM Settings
We don't recommend changing these settings; the defaults should work well for most use cases.
Index State Management (ISM) stores its configuration in the `.opendistro-ism-config` index. Don't modify this index without using the [ISM API operations](../api/).
All settings are available using the OpenSearch `_cluster/settings` operation. None require a restart, and all can be marked `persistent` or `transient`.
Setting | Default | Description
:--- | :--- | :---
`plugins.index_state_management.enabled` | True | Specifies whether ISM is enabled or not.
`plugins.index_state_management.job_interval` | 5 minutes | The interval at which the managed index jobs are run.
`plugins.index_state_management.coordinator.sweep_period` | 10 minutes | How often the routine background sweep is run.
`plugins.index_state_management.coordinator.backoff_millis` | 50 milliseconds | The backoff time between retries for failures in the `ManagedIndexCoordinator` (such as when we update managed indices).
`plugins.index_state_management.coordinator.backoff_count` | 2 | The count of retries for failures in the `ManagedIndexCoordinator`.
`plugins.index_state_management.history.enabled` | True | Specifies whether audit history is enabled or not. The logs from ISM are automatically indexed to a logs document.
`plugins.index_state_management.history.max_docs` | 2,500,000 | The maximum number of documents before rolling over the audit history index.
`plugins.index_state_management.history.max_age` | 24 hours | The maximum age before rolling over the audit history index.
`plugins.index_state_management.history.rollover_check_period` | 8 hours | The time between rollover checks for the audit history index.
`plugins.index_state_management.history.rollover_retention_period` | 30 days | How long audit history indices are kept.
`plugins.index_state_management.allow_list` | All actions | List of actions that you can use.
## Audit history indices
If you don't want to disable ISM audit history or shorten the retention period, you can create an [index template](../../../opensearch/index-templates/) to reduce the shard count of the history indices:
```json
PUT _index_template/ism_history_indices
{
"index_patterns": [
".opendistro-ism-managed-index-history-*"
],
"template": {
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
}
}
}
```

View File

@ -0,0 +1,40 @@
---
layout: default
title: Refresh Search Analyzer
nav_order: 40
parent: Index management
has_children: false
redirect_from: /docs/ism/refresh-analyzer/
has_toc: false
---
# Refresh search analyzer
With ISM installed, you can refresh search analyzers in real time with the following API:
```json
POST /_plugins/_refresh_search_analyzers/<index or alias or wildcard>
```
For example, if you change the synonym list in your analyzer, the change takes effect without you needing to close and reopen the index.
To work, the token filter must have an `updateable` flag of `true`:
```json
{
"analyzer": {
"my_synonyms": {
"tokenizer": "whitespace",
"filter": [
"synonym"
]
}
},
"filter": {
"synonym": {
"type": "synonym_graph",
"synonyms_path": "synonyms.txt",
"updateable": true
}
}
}
```

View File

@ -0,0 +1,153 @@
---
layout: default
title: API
nav_order: 4
parent: k-NN
has_children: false
---
# k-NN plugin API
The k-NN plugin adds two API operations to help you better manage the plugin's functionality.
## Stats
The k-NN `stats` API provides information about the current status of the k-NN plugin. The plugin keeps track of both cluster-level and node-level statistics. Cluster-level statistics have a single value for the entire cluster. Node-level statistics have a single value for each node in the cluster. You can filter the query by `nodeId` and `statName`:
```
GET /_plugins/_knn/nodeId1,nodeId2/stats/statName1,statName2
```
Statistic | Description
:--- | :---
`circuit_breaker_triggered` | Indicates whether the circuit breaker is triggered. This statistic is only relevant to approximate k-NN search.
`total_load_time` | The time in nanoseconds that k-NN has taken to load graphs into the cache. This statistic is only relevant to approximate k-NN search.
`eviction_count` | The number of graphs that have been evicted from the cache due to memory constraints or idle time. This statistic is only relevant to approximate k-NN search. <br /> **Note**: Explicit evictions that occur because of index deletion aren't counted.
`hit_count` | The number of cache hits. A cache hit occurs when a user queries a graph that's already loaded into memory. This statistic is only relevant to approximate k-NN search.
`miss_count` | The number of cache misses. A cache miss occurs when a user queries a graph that isn't loaded into memory yet. This statistic is only relevant to approximate k-NN search.
`graph_memory_usage` | Current cache size (total size of all graphs in memory) in kilobytes. This statistic is only relevant to approximate k-NN search.
`graph_memory_usage_percentage` | The current weight of the cache as a percentage of the maximum cache capacity.
`graph_index_requests` | The number of requests to add the `knn_vector` field of a document into a graph.
`graph_index_errors` | The number of requests to add the `knn_vector` field of a document into a graph that have produced an error.
`graph_query_requests` | The number of graph queries that have been made.
`graph_query_errors` | The number of graph queries that have produced an error.
`knn_query_requests` | The number of k-NN query requests received.
`cache_capacity_reached` | Whether `knn.memory.circuit_breaker.limit` has been reached. This statistic is only relevant to approximate k-NN search.
`load_success_count` | The number of times k-NN successfully loaded a graph into the cache. This statistic is only relevant to approximate k-NN search.
`load_exception_count` | The number of times an exception occurred when trying to load a graph into the cache. This statistic is only relevant to approximate k-NN search.
`indices_in_cache` | For each index that has graphs in the cache, this statistic provides the number of graphs that index has and the total `graph_memory_usage` that index is using, in kilobytes.
`script_compilations` | The number of times the k-NN script has been compiled. This value should usually be 1 or 0, but if the cache containing the compiled scripts is filled, the k-NN script might be recompiled. This statistic is only relevant to k-NN score script search.
`script_compilation_errors` | The number of errors during script compilation. This statistic is only relevant to k-NN score script search.
`script_query_requests` | The total number of script queries. This statistic is only relevant to k-NN score script search.
`script_query_errors` | The number of errors during script queries. This statistic is only relevant to k-NN score script search.
### Usage
```json
GET /_plugins/_knn/stats?pretty
{
"_nodes" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"cluster_name" : "_run",
"circuit_breaker_triggered" : false,
"nodes" : {
"HYMrXXsBSamUkcAjhjeN0w" : {
"eviction_count" : 0,
"miss_count" : 1,
"graph_memory_usage" : 1,
"graph_memory_usage_percentage" : 3.68,
"graph_index_requests" : 7,
"graph_index_errors" : 1,
"knn_query_requests" : 4,
"graph_query_requests" : 30,
"graph_query_errors" : 15,
"indices_in_cache" : {
"myindex" : {
"graph_memory_usage" : 2,
"graph_memory_usage_percentage" : 3.68,
"graph_count" : 2
}
},
"cache_capacity_reached" : false,
"load_exception_count" : 0,
"hit_count" : 0,
"load_success_count" : 1,
"total_load_time" : 2878745,
"script_compilations" : 1,
"script_compilation_errors" : 0,
"script_query_requests" : 534,
"script_query_errors" : 0
}
}
}
```
```json
GET /_plugins/_knn/HYMrXXsBSamUkcAjhjeN0w/stats/circuit_breaker_triggered,graph_memory_usage?pretty
{
"_nodes" : {
"total" : 1,
"successful" : 1,
"failed" : 0
},
"cluster_name" : "_run",
"circuit_breaker_triggered" : false,
"nodes" : {
"HYMrXXsBSamUkcAjhjeN0w" : {
"graph_memory_usage" : 1
}
}
}
```
## Warmup operation
The Hierarchical Navigable Small World (HNSW) graphs used to perform an approximate k-Nearest Neighbor (k-NN) search are stored as `.hnsw` files with other Apache Lucene segment files. In order for you to perform a search on these graphs using the k-NN plugin, the plugin needs to load these files into native memory.
If the plugin hasn't loaded the graphs into native memory, it loads them when it receives a search request. The loading time can cause high latency during initial queries. To avoid this situation, users often run random queries during a warmup period. After this warmup period, the graphs are loaded into native memory and their production workloads can begin. This loading process is indirect and requires extra effort.
As an alternative, you can avoid this latency issue by running the k-NN plugin warmup API operation on whatever indices you're interested in searching. This operation loads all the graphs for all of the shards (primaries and replicas) of all the indices specified in the request into native memory.
After the process finishes, you can start searching against the indices with no initial latency penalties. The warmup API operation is idempotent, so if a segment's graphs are already loaded into memory, this operation has no impact on those graphs. It only loads graphs that aren't currently in memory.
### Usage
This request performs a warmup on three indices:
```json
GET /_plugins/_knn/warmup/index1,index2,index3?pretty
{
"_shards" : {
"total" : 6,
"successful" : 6,
"failed" : 0
}
}
```
`total` indicates how many shards the k-NN plugin attempted to warm up. The response also includes the number of shards the plugin succeeded and failed to warm up.
The call doesn't return results until the warmup operation finishes or the request times out. If the request times out, the operation still continues on the cluster. To monitor the warmup operation, use the OpenSearch `_tasks` API:
```json
GET /_tasks
```
After the operation has finished, use the [k-NN `_stats` API operation](#Stats) to see what the k-NN plugin loaded into the graph.
### Best practices
For the warmup operation to function properly, follow these best practices:
* Don't run merge operations on indices that you want to warm up. During merge, the k-NN plugin creates new segments, and old segments are sometimes deleted. For example, you could encounter a situation in which the warmup API operation loads graphs A and B into native memory, but segment C is created from segments A and B being merged. The graphs for A and B would no longer be in memory, and graph C would also not be in memory. In this case, the initial penalty for loading graph C is still present.
* Confirm that all graphs you want to warm up can fit into native memory. For more information about the native memory limit, see the [knn.memory.circuit_breaker.limit statistic](../settings/#cluster-settings). High graph memory usage causes cache thrashing, which can lead to operations constantly failing and attempting to run again.
* Don't index any documents that you want to load into the cache. Writing new information to segments prevents the warmup API operation from loading the graphs until they're searchable. This means that you would have to run the warmup operation again after indexing finishes.

View File

@ -0,0 +1,162 @@
---
layout: default
title: Approximate search
nav_order: 1
parent: k-NN
has_children: false
has_math: true
---
# Approximate k-NN search
The approximate k-NN method uses [nmslib's](https://github.com/nmslib/nmslib/) implementation of the Hierarchical Navigable Small World (HNSW) algorithm to power k-NN search. In this case, approximate means that for a given search, the neighbors returned are an estimate of the true k-nearest neighbors. Of the three methods, this method offers the best search scalability for large data sets. Generally speaking, once the data set gets into the hundreds of thousands of vectors, this approach is preferred.
The k-NN plugin builds an HNSW graph of the vectors for each "knn-vector field"/ "Lucene segment" pair during indexing that can be used to efficiently find the k-nearest neighbors to a query vector during search. To learn more about Lucene segments, please refer to [Apache Lucene's documentation](https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/codecs/lucene87/package-summary.html#package.description). These graphs are loaded into native memory during search and managed by a cache. To learn more about pre-loading graphs into memory, refer to the [warmup API](../api#warmup). Additionally, you can see what graphs are already loaded in memory, which you can learn more about in the [stats API section](../api#stats).
Because the graphs are constructed during indexing, it is not possible to apply a filter on an index and then use this search method. All filters are applied on the results produced by the approximate nearest neighbor search.
## Get started with approximate k-NN
To use the k-NN plugin's approximate search functionality, you must first create a k-NN index with setting `index.knn` to `true`. This setting tells the plugin to create HNSW graphs for the index.
Additionally, if you're using the approximate k-nearest neighbor method, specify `knn.space_type` to the space you're interested in. You can't change this setting after it's set. To see what spaces we support, see [spaces](#spaces). By default, `index.knn.space_type` is `l2`. For more information about index settings, such as algorithm parameters you can tweak to tune performance, see [Index settings](../settings#index-settings).
Next, you must add one or more fields of the `knn_vector` data type. This example creates an index with two `knn_vector` fields and uses cosine similarity:
```json
PUT my-knn-index-1
{
"settings": {
"index": {
"knn": true,
"knn.space_type": "cosinesimil"
}
},
"mappings": {
"properties": {
"my_vector1": {
"type": "knn_vector",
"dimension": 2
},
"my_vector2": {
"type": "knn_vector",
"dimension": 4
}
}
}
}
```
The `knn_vector` data type supports a vector of floats that can have a dimension of up to 10,000, as set by the dimension mapping parameter.
In OpenSearch, codecs handle the storage and retrieval of indices. The k-NN plugin uses a custom codec to write vector data to graphs so that the underlying k-NN search library can read it.
{: .tip }
After you create the index, you can add some data to it:
```json
POST _bulk
{ "index": { "_index": "my-knn-index-1", "_id": "1" } }
{ "my_vector1": [1.5, 2.5], "price": 12.2 }
{ "index": { "_index": "my-knn-index-1", "_id": "2" } }
{ "my_vector1": [2.5, 3.5], "price": 7.1 }
{ "index": { "_index": "my-knn-index-1", "_id": "3" } }
{ "my_vector1": [3.5, 4.5], "price": 12.9 }
{ "index": { "_index": "my-knn-index-1", "_id": "4" } }
{ "my_vector1": [5.5, 6.5], "price": 1.2 }
{ "index": { "_index": "my-knn-index-1", "_id": "5" } }
{ "my_vector1": [4.5, 5.5], "price": 3.7 }
{ "index": { "_index": "my-knn-index-1", "_id": "6" } }
{ "my_vector2": [1.5, 5.5, 4.5, 6.4], "price": 10.3 }
{ "index": { "_index": "my-knn-index-1", "_id": "7" } }
{ "my_vector2": [2.5, 3.5, 5.6, 6.7], "price": 5.5 }
{ "index": { "_index": "my-knn-index-1", "_id": "8" } }
{ "my_vector2": [4.5, 5.5, 6.7, 3.7], "price": 4.4 }
{ "index": { "_index": "my-knn-index-1", "_id": "9" } }
{ "my_vector2": [1.5, 5.5, 4.5, 6.4], "price": 8.9 }
```
Then you can execute an approximate nearest neighbor search on the data using the `knn` query type:
```json
GET my-knn-index-1/_search
{
"size": 2,
"query": {
"knn": {
"my_vector2": {
"vector": [2, 3, 5, 6],
"k": 2
}
}
}
}
```
`k` is the number of neighbors the search of each graph will return. You must also include the `size` option, which indicates how many results the query actually returns. The plugin returns `k` amount of results for each shard (and each segment) and `size` amount of results for the entire query. The plugin supports a maximum `k` value of 10,000.
### Using approximate k-NN with filters
If you use the `knn` query alongside filters or other clauses (e.g. `bool`, `must`, `match`), you might receive fewer than `k` results. In this example, `post_filter` reduces the number of results from 2 to 1:
```json
GET my-knn-index-1/_search
{
"size": 2,
"query": {
"knn": {
"my_vector2": {
"vector": [2, 3, 5, 6],
"k": 2
}
}
},
"post_filter": {
"range": {
"price": {
"gte": 5,
"lte": 10
}
}
}
}
```
## Spaces
A space corresponds to the function used to measure the distance between two points in order to determine the k-nearest neighbors. From the k-NN perspective, a lower score equates to a closer and better result. This is the opposite of how OpenSearch scores results, where a greater score equates to a better result. To convert distances to OpenSearch scores, we take 1 / (1 + distance). Currently, the k-NN plugin supports the following spaces:
<table>
<thead style="text-align: left">
<tr>
<th>spaceType</th>
<th>Distance Function</th>
<th>OpenSearch Score</th>
</tr>
</thead>
<tr>
<td>l2</td>
<td>\[ Distance(X, Y) = \sum_{i=1}^n (X_i - Y_i)^2 \]</td>
<td>1 / (1 + Distance Function)</td>
</tr>
<tr>
<td>l1</td>
<td>\[ Distance(X, Y) = \sum_{i=1}^n (X_i - Y_i) \]</td>
<td>1 / (1 + Distance Function)</td>
</tr>
<tr>
<td>cosinesimil</td>
<td>\[ 1 - {A &middot; B \over \|A\| &middot; \|B\|} = 1 -
{\sum_{i=1}^n (A_i &middot; B_i) \over \sqrt{\sum_{i=1}^n A_i^2} &middot; \sqrt{\sum_{i=1}^n B_i^2}}\]
where \(\|A\|\) and \(\|B\|\) represent normalized vectors.</td>
<td>1 / (1 + Distance Function)</td>
</tr>
<tr>
<td>hammingbit</td>
<td style="text-align:center">Distance = countSetBits(X \(\oplus\) Y)</td>
<td>1 / (1 + Distance Function)</td>
</tr>
</table>
The cosine similarity formula does not include the `1 -` prefix. However, because nmslib equates smaller scores with closer results, they return `1 - cosineSimilarity` for their cosine similarity space---that's why `1 -` is included in the distance function.
{: .note }

View File

@ -0,0 +1,42 @@
---
layout: default
title: k-NN
nav_order: 50
has_children: true
has_toc: false
---
# k-NN
Short for *k-nearest neighbors*, the k-NN plugin enables users to search for the k-nearest neighbors to a query point across an index of vectors. To determine the neighbors, you can specify the space (the distance function) you want to use to measure the distance between points.
Use cases include recommendations (for example, an "other songs you might like" feature in a music application), image recognition, and fraud detection. For more background information on k-NN search, see [Wikipedia](https://en.wikipedia.org/wiki/Nearest_neighbor_search).
This plugin supports three different methods for obtaining the k-nearest neighbors from an index of vectors:
1. **Approximate k-NN**
The first method takes an approximate nearest neighbor approach---it uses the HNSW algorithm to return the approximate k-nearest neighbors to a query vector. This algorithm sacrifices indexing speed and search accuracy in return for lower latency and more scalable search. To learn more about the algorithm, please refer to [nmslib's documentation](https://github.com/nmslib/nmslib/) or [the paper introducing the algorithm](https://arxiv.org/abs/1603.09320).
Approximate k-NN is the best choice for searches over large indices (i.e. hundreds of thousands of vectors or more) that require low latency. You should not use approximate k-NN if you want to apply a filter on the index before the k-NN search, which greatly reduces the number of vectors to be searched. In this case, you should use either the script scoring method or painless extensions.
For more details about this method, see [Approximate k-NN search](approximate-knn).
2. **Script Score k-NN**
The second method extends OpenSearch's script scoring functionality to execute a brute force, exact k-NN search over "knn_vector" fields or fields that can represent binary objects. With this approach, you can run k-NN search on a subset of vectors in your index (sometimes referred to as a pre-filter search).
Use this approach for searches over smaller bodies of documents or when a pre-filter is needed. Using this approach on large indices may lead to high latencies.
For more details about this method, see [Exact k-NN with scoring script](knn-score-script).
3. **Painless extensions**
The third method adds the distance functions as painless extensions that you can use in more complex combinations. Similar to the k-NN Script Score, you can use this method to perform a brute force, exact k-NN search across an index, which also supports pre-filtering.
This approach has slightly slower query performance compared to the k-NN Script Score. If your use case requires more customization over the final score, you should use this approach over Script Score k-NN.
For more details about this method, see [Painless scripting functions](painless-functions).
Overall, for larger data sets, you should generally choose the approximate nearest neighbor method because it scales significantly better. For smaller data sets, where you may want to apply a filter, you should choose the custom scoring approach. If you have a more complex use case where you need to use a distance function as part of their scoring method, you should use the painless scripting approach.

View File

@ -0,0 +1,12 @@
---
layout: default
title: JNI library
nav_order: 5
parent: k-NN
has_children: false
---
# JNI library
To integrate [nmslib's](https://github.com/nmslib/nmslib/) approximate k-NN functionality (implemented in C++) into the k-NN plugin (implemented in Java), we created a Java Native Interface library, which lets the k-NN plugin leverage nmslib's functionality. To see how we build the JNI library binary and learn how to get the most of it in your production environment, see [JNI Library Artifacts](https://github.com/opensearch-project/k-NN#jni-library-artifacts).
For more information about JNI, see [Java Native Interface](https://en.wikipedia.org/wiki/Java_Native_Interface) on Wikipedia.

View File

@ -0,0 +1,316 @@
---
layout: default
title: Exact k-NN with scoring script
nav_order: 2
parent: k-NN
has_children: false
has_math: true
---
# Exact k-NN with scoring script
The k-NN plugin implements the OpenSearch score script plugin that you can use to find the exact k-nearest neighbors to a given query point. Using the k-NN score script, you can apply a filter on an index before executing the nearest neighbor search. This is useful for dynamic search cases where the index body may vary based on other conditions.
Because the score script approach executes a brute force search, it doesn't scale as well as the [approximate approach](../approximate-knn). In some cases, it might be better to think about refactoring your workflow or index structure to use the approximate approach instead of the score script approach.
## Getting started with the score script for vectors
Similar to approximate nearest neighbor search, in order to use the score script on a body of vectors, you must first create an index with one or more `knn_vector` fields.
If you intend to just use the score script approach (and not the approximate approach) you can set `index.knn` to `false` and not set `index.knn.space_type`. You can choose the space type during search. See [spaces](#spaces) for the spaces the k-NN score script suppports.
This example creates an index with two `knn_vector` fields:
```json
PUT my-knn-index-1
{
"mappings": {
"properties": {
"my_vector1": {
"type": "knn_vector",
"dimension": 2
},
"my_vector2": {
"type": "knn_vector",
"dimension": 4
}
}
}
}
```
If you *only* want to use the score script, you can omit `"index.knn": true`. The benefit of this approach is faster indexing speed and lower memory usage, but you lose the ability to perform standard k-NN queries on the index.
{: .tip}
After you create the index, you can add some data to it:
```json
POST _bulk
{ "index": { "_index": "my-knn-index-1", "_id": "1" } }
{ "my_vector1": [1.5, 2.5], "price": 12.2 }
{ "index": { "_index": "my-knn-index-1", "_id": "2" } }
{ "my_vector1": [2.5, 3.5], "price": 7.1 }
{ "index": { "_index": "my-knn-index-1", "_id": "3" } }
{ "my_vector1": [3.5, 4.5], "price": 12.9 }
{ "index": { "_index": "my-knn-index-1", "_id": "4" } }
{ "my_vector1": [5.5, 6.5], "price": 1.2 }
{ "index": { "_index": "my-knn-index-1", "_id": "5" } }
{ "my_vector1": [4.5, 5.5], "price": 3.7 }
{ "index": { "_index": "my-knn-index-1", "_id": "6" } }
{ "my_vector2": [1.5, 5.5, 4.5, 6.4], "price": 10.3 }
{ "index": { "_index": "my-knn-index-1", "_id": "7" } }
{ "my_vector2": [2.5, 3.5, 5.6, 6.7], "price": 5.5 }
{ "index": { "_index": "my-knn-index-1", "_id": "8" } }
{ "my_vector2": [4.5, 5.5, 6.7, 3.7], "price": 4.4 }
{ "index": { "_index": "my-knn-index-1", "_id": "9" } }
{ "my_vector2": [1.5, 5.5, 4.5, 6.4], "price": 8.9 }
```
Finally, you can execute an exact nearest neighbor search on the data using the `knn` script:
```json
GET my-knn-index-1/_search
{
"size": 4,
"query": {
"script_score": {
"query": {
"match_all": {}
},
"script": {
"source": "knn_score",
"lang": "knn",
"params": {
"field": "my_vector2",
"query_value": [2.0, 3.0, 5.0, 6.0],
"space_type": "cosinesimil"
}
}
}
}
}
```
All parameters are required.
- `lang` is the script type. This value is usually `painless`, but here you must specify `knn`.
- `source` is the name of the script, `knn_score`.
This script is part of the k-NN plugin and isn't available at the standard `_scripts` path. A GET request to `_cluster/state/metadata` doesn't return it, either.
- `field` is the field that contains your vector data.
- `query_value` is the point you want to find the nearest neighbors for. For the Euclidean and cosine similarity spaces, the value must be an array of floats that matches the dimension set in the field's mapping. For Hamming bit distance, this value can be either of type signed long or a base64-encoded string (for the long and binary field types, respectively).
- `space_type` corresponds to the distance function. See the [spaces section](#spaces).
The [post filter example in the approximate approach](../approximate-knn/#using-approximate-k-nn-with-filters) shows a search that returns fewer than `k` results. If you want to avoid this situation, the score script method lets you essentially invert the order of events. In other words, you can filter down the set of documents over which to execute the k-nearest neighbor search.
This example shows a pre-filter approach to k-NN search with the score script approach. First, create the index:
```json
PUT my-knn-index-2
{
"mappings": {
"properties": {
"my_vector": {
"type": "knn_vector",
"dimension": 2
},
"color": {
"type": "keyword"
}
}
}
}
```
Then add some documents:
```json
POST _bulk
{ "index": { "_index": "my-knn-index-2", "_id": "1" } }
{ "my_vector": [1, 1], "color" : "RED" }
{ "index": { "_index": "my-knn-index-2", "_id": "2" } }
{ "my_vector": [2, 2], "color" : "RED" }
{ "index": { "_index": "my-knn-index-2", "_id": "3" } }
{ "my_vector": [3, 3], "color" : "RED" }
{ "index": { "_index": "my-knn-index-2", "_id": "4" } }
{ "my_vector": [10, 10], "color" : "BLUE" }
{ "index": { "_index": "my-knn-index-2", "_id": "5" } }
{ "my_vector": [20, 20], "color" : "BLUE" }
{ "index": { "_index": "my-knn-index-2", "_id": "6" } }
{ "my_vector": [30, 30], "color" : "BLUE" }
```
Finally, use the `script_score` query to pre-filter your documents before identifying nearest neighbors:
```json
GET my-knn-index-2/_search
{
"size": 2,
"query": {
"script_score": {
"query": {
"bool": {
"filter": {
"term": {
"color": "BLUE"
}
}
}
},
"script": {
"lang": "knn",
"source": "knn_score",
"params": {
"field": "my_vector",
"query_value": [9.9, 9.9],
"space_type": "l2"
}
}
}
}
}
```
## Getting started with the score script for binary data
The k-NN score script also allows you to run k-NN search on your binary data with the Hamming distance space.
In order to use Hamming distance, the field of interest must have either a `binary` or `long` field type. If you're using `binary` type, the data must be a base64-encoded string.
This example shows how to use the Hamming distance space with a `binary` field type:
```json
PUT my-index
{
"mappings": {
"properties": {
"my_binary": {
"type": "binary",
"doc_values": true
},
"color": {
"type": "keyword"
}
}
}
}
```
Then add some documents:
```json
POST _bulk
{ "index": { "_index": "my-index", "_id": "1" } }
{ "my_binary": "SGVsbG8gV29ybGQh", "color" : "RED" }
{ "index": { "_index": "my-index", "_id": "2" } }
{ "my_binary": "ay1OTiBjdXN0b20gc2NvcmluZyE=", "color" : "RED" }
{ "index": { "_index": "my-index", "_id": "3" } }
{ "my_binary": "V2VsY29tZSB0byBrLU5O", "color" : "RED" }
{ "index": { "_index": "my-index", "_id": "4" } }
{ "my_binary": "SSBob3BlIHRoaXMgaXMgaGVscGZ1bA==", "color" : "BLUE" }
{ "index": { "_index": "my-index", "_id": "5" } }
{ "my_binary": "QSBjb3VwbGUgbW9yZSBkb2NzLi4u", "color" : "BLUE" }
{ "index": { "_index": "my-index", "_id": "6" } }
{ "my_binary": "TGFzdCBvbmUh", "color" : "BLUE" }
```
Finally, use the `script_score` query to pre-filter your documents before identifying nearest neighbors:
```json
GET my-index/_search
{
"size": 2,
"query": {
"script_score": {
"query": {
"bool": {
"filter": {
"term": {
"color": "BLUE"
}
}
}
},
"script": {
"lang": "knn",
"source": "knn_score",
"params": {
"field": "my_binary",
"query_value": "U29tZXRoaW5nIEltIGxvb2tpbmcgZm9y",
"space_type": "hammingbit"
}
}
}
}
}
```
Similarly, you can encode your data with the `long` field and run a search:
```json
GET my-long-index/_search
{
"size": 2,
"query": {
"script_score": {
"query": {
"bool": {
"filter": {
"term": {
"color": "BLUE"
}
}
}
},
"script": {
"lang": "knn",
"source": "knn_score",
"params": {
"field": "my_long",
"query_value": 23,
"space_type": "hammingbit"
}
}
}
}
}
```
## Spaces
A space corresponds to the function used to measure the distance between two points in order to determine the k-nearest neighbors. From the k-NN perspective, a lower score equates to a closer and better result. This is the opposite of how OpenSearch scores results, where a greater score equates to a better result. The following table illustrates how OpenSearch converts spaces to scores:
<table>
<thead style="text-align: left">
<tr>
<th>spaceType</th>
<th>Distance Function</th>
<th>OpenSearch Score</th>
</tr>
</thead>
<tr>
<td>l2</td>
<td>\[ Distance(X, Y) = \sum_{i=1}^n (X_i - Y_i)^2 \]</td>
<td>1 / (1 + Distance Function)</td>
</tr>
<tr>
<td>l1</td>
<td>\[ Distance(X, Y) = \sum_{i=1}^n (X_i - Y_i) \]</td>
<td>1 / (1 + Distance Function)</td>
</tr>
<tr>
<td>cosinesimil</td>
<td>\[ {A &middot; B \over \|A\| &middot; \|B\|} =
{\sum_{i=1}^n (A_i &middot; B_i) \over \sqrt{\sum_{i=1}^n A_i^2} &middot; \sqrt{\sum_{i=1}^n B_i^2}}\]
where \(\|A\|\) and \(\|B\|\) represent normalized vectors.</td>
<td>1 + Distance Function</td>
</tr>
<tr>
<td>hammingbit</td>
<td style="text-align:center">Distance = countSetBits(X \(\oplus\) Y)</td>
<td> 1 / (1 + Distance Function)</td>
</tr>
</table>
Cosine similarity returns a number between -1 and 1, and because OpenSearch relevance scores can't be below 0, the k-NN plugin adds 1 to get the final score.

View File

@ -0,0 +1,63 @@
---
layout: default
title: k-NN Painless extensions
nav_order: 3
parent: k-NN
has_children: false
has_math: true
---
# k-NN Painless Scripting extensions
With the k-NN plugin's Painless Scripting extensions, you can use k-NN distance functions directly in your Painless scripts to perform operations on `knn_vector` fields. Painless has a strict list of allowed functions and classes per context to ensure its scripts are secure. The k-NN plugin adds Painless Scripting extensions to a few of the distance functions used in [k-NN score script](../knn-score-script), so you can use them to customize your k-NN workload.
## Get started with k-NN's Painless Scripting functions
To use k-NN's Painless Scripting functions, first create an index with `knn_vector` fields like in [k-NN score script](../knn-score-script#Getting-started-with-the-score-script). Once the index is created and you ingest some data, you can use the painless extensions:
```json
GET my-knn-index-2/_search
{
"size": 2,
"query": {
"script_score": {
"query": {
"bool": {
"filter": {
"term": {
"color": "BLUE"
}
}
}
},
"script": {
"source": "1.0 + cosineSimilarity(params.query_value, doc[params.field])",
"params": {
"field": "my_vector",
"query_value": [9.9, 9.9]
}
}
}
}
}
```
`field` needs to map to a `knn_vector` field, and `query_value` needs to be a floating point array with the same dimension as `field`.
## Function types
The following table describes the available painless functions the k-NN plugin provides:
Function name | Function signature | Description
:--- | :---
l2Squared | `float l2Squared (float[] queryVector, doc['vector field'])` | This function calculates the square of the L2 distance (Euclidean distance) between a given query vector and document vectors. The shorter the distance, the more relevant the document is, so this example inverts the return value of the l2Squared function. If the document vector matches the query vector, the result is 0, so this example also adds 1 to the distance to avoid divide by zero errors.
l1Norm | `float l1Norm (float[] queryVector, doc['vector field'])` | This function calculates the square of the L2 distance (Euclidean distance) between a given query vector and document vectors. The shorter the distance, the more relevant the document is, so this example inverts the return value of the l2Squared function. If the document vector matches the query vector, the result is 0, so this example also adds 1 to the distance to avoid divide by zero errors.
cosineSimilarity | `float cosineSimilarity (float[] queryVector, doc['vector field'])` | Cosine similarity is an inner product of the query vector and document vector normalized to both have a length of 1. If the magnitude of the query vector doesn't change throughout the query, you can pass the magnitude of the query vector to improve performance, instead of calculating the magnitude every time for every filtered document:<br /> `float cosineSimilarity (float[] queryVector, doc['vector field'], float normQueryVector)` <br />In general, the range of cosine similarity is [-1, 1]. However, in the case of information retrieval, the cosine similarity of two documents ranges from 0 to 1 because the tf-idf statistic can't be negative. Therefore, the k-NN plugin adds 1.0 in order to always yield a positive cosine similarity score.
## Constraints
1. If a documents `knn_vector` field has different dimensions than the query, the function throws an `IllegalArgumentException`.
2. If a vector field doesn't have a value, the function throws an <code>IllegalStateException</code>.
You can avoid this situation by first checking if a document has a value in its field:
```
"source": "doc[params.field].size() == 0 ? 0 : 1 / (1 + l2Squared(params.query_value, doc[params.field]))",
```
Because scores can only be positive, this script ranks documents with vector fields higher than those without.

View File

@ -0,0 +1,111 @@
---
layout: default
title: Performance tuning
parent: k-NN
nav_order: 7
---
# Performance tuning
This topic provides performance tuning recommendations to improve indexing and search performance for approximate k-NN. From a high level, k-NN works according to these principles:
* Graphs are created per knn_vector field / (Lucene) segment pair.
* Queries execute on segments sequentially inside the shard (same as any other OpenSearch query).
* Each graph in the segment returns <=k neighbors.
* The coordinator node picks up final size number of neighbors from the neighbors returned by each shard.
This topic also provides recommendations for comparing approximate k-NN to exact k-NN with score script.
## Indexing performance tuning
Take the following steps to improve indexing performance, especially when you plan to index a large number of vectors at once:
* **Disable the refresh interval**
Either disable the refresh interval (default = 1 sec), or set a long duration for the refresh interval to avoid creating multiple small segments:
```json
PUT /<index_name>/_settings
{
"index" : {
"refresh_interval" : "-1"
}
}
```
**Note**: Make sure to reenable `refresh_interval` after indexing finishes.
* **Disable replicas (no OpenSearch replica shard)**
Set replicas to `0` to prevent duplicate construction of graphs in both primary and replica shards. When you enable replicas after indexing finishes, the serialized graphs are directly copied. If you have no replicas, losing nodes might cause data loss, so it's important that the data lives elsewhere so this initial load can be retried in case of an issue.
* **Increase the number of indexing threads**
If the hardware you choose has multiple cores, you can allow multiple threads in graph construction by speeding up the indexing process. Determine the number of threads to allot with the [knn.algo_param.index_thread_qty](../settings/#Cluster-settings) setting.
Keep an eye on CPU utilization and choose the correct number of threads. Because graph construction is costly, having multiple threads can cause additional CPU load.
## Search performance tuning
Take the following steps to improve search performance:
* **Reduce segment count**
To improve search performance, you must keep the number of segments under control. Lucene's IndexSearcher searches over all of the segments in a shard to find the 'size' best results. However, because the complexity of search for the HNSW algorithm is logarithmic with respect to the number of vectors, searching over five graphs with 100 vectors each and then taking the top 'size' results from 5*k results will take longer than searching over one graph with 500 vectors and then taking the top size results from k results.
Ideally, having one segment per shard provides the optimal performance with respect to search latency. You can configure an index to have multiple shards to avoid giant shards and achieve more parallelism.
You can control the number of segments by choosing a larger refresh interval, or during indexing by asking OpenSearch to slow down segment creation by disabling the refresh interval.
* **Warm up the index**
Graphs are constructed during indexing, but they're loaded into memory during the first search. In Lucene, each segment is searched sequentially (so, for k-NN, each segment returns up to k nearest neighbors of the query point), and the top 'size' number of results based on the score are returned from all the results returned by segements at a shard level (higher score = better result).
Once a graph is loaded (graphs are loaded outside OpenSearch JVM), OpenSearch caches them in memory. Initial queries are expensive and take a few seconds, while subsequent queries are faster and take milliseconds (assuming the k-NN circuit breaker isn't hit).
To avoid this latency penalty during your first queries, you can use the warmup API operation on the indices you want to search:
```json
GET /_plugins/_knn/warmup/index1,index2,index3?pretty
{
"_shards" : {
"total" : 6,
"successful" : 6,
"failed" : 0
}
}
```
The warmup API operation loads all graphs for all shards (primary and replica) for the specified indices into the cache, so there's no penalty to load graphs during initial searches.
**Note**: This API operation only loads the segments of the indices it ***sees*** into the cache. If a merge or refresh operation finishes after the API runs, or if you add new documents, you need to rerun the API to load those graphs into memory.
* **Avoid reading stored fields**
If your use case is simply to read the IDs and scores of the nearest neighbors, you can disable reading stored fields, which saves time retrieving the vectors from stored fields.
## Improving recall
Recall depends on multiple factors like number of vectors, number of dimensions, segments, and so on. Searching over a large number of small segments and aggregating the results leads to better recall than searching over a small number of large segments and aggregating results. The larger the graph, the more chances of losing recall if you're using smaller algorithm parameters. Choosing larger values for algorithm parameters should help solve this issue but sacrifices search latency and indexing time. That being said, it's important to understand your system's requirements for latency and accuracy, and then choose the number of segments you want your index to have based on experimentation.
To configure recall, adjust the algorithm parameters of the HNSW algorithm exposed through index settings. Algorithm parameters that control recall are `m`, `ef_construction`, and `ef_search`. For more information about how algorithm parameters influence indexing and search recall, see [HNSW algorithm parameters](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md). Increasing these values can help recall and lead to better search results, but at the cost of higher memory utilization and increased indexing time.
The default recall values work on a broader set of use cases, but make sure to run your own experiments on your data sets and choose the appropriate values. For index-level settings, see [Index settings](../settings#index-settings).
## Estimating memory usage
In a typical OpenSearch cluster, a certain portion of RAM is set aside for the JVM heap. The k-NN plugin allocates graphs to a portion of the remaining RAM. This portion's size is determined by the `circuit_breaker_limit` cluster setting. By default, the limit is set at 50%.
The memory required for graphs is estimated to be `1.1 * (4 * dimension + 8 * M)` bytes/vector.
As an example, assume you have a million vectors with a dimension of 256 and M of 16. The memory requirement can be estimated as follows:
```
1.1 * (4 *256 + 8 * 16) * 1,000,000 ~= 1.26 GB
```
**Note**: Remember that having a replica doubles the total number of vectors.
## Approximate nearest neighbor versus score script
The standard k-NN query and custom scoring option perform differently. Test with a representative set of documents to see if the search results and latencies match your expectations.
Custom scoring works best if the initial filter reduces the number of documents to no more than 20,000. Increasing shard count can improve latency, but be sure to keep shard size within the [recommended guidelines](../../opensearch/#primary-and-replica-shards).

View File

@ -0,0 +1,36 @@
---
layout: default
title: Settings
parent: k-NN
nav_order: 6
---
# k-NN settings
The k-NN plugin adds several new index and cluster settings.
## Index settings
The default values work well for most use cases, but you can change these settings when you create the index.
Setting | Default | Description
:--- | :--- | :---
`index.knn.algo_param.ef_search` | 512 | The size of the dynamic list used during k-NN searches. Higher values lead to more accurate but slower searches.
`index.knn.algo_param.ef_construction` | 512 | The size of the dynamic list used during k-NN graph creation. Higher values lead to a more accurate graph, but slower indexing speed.
`index.knn.algo_param.m` | 16 | The number of bidirectional links that the plugin creates for each new element. Increasing and decreasing this value can have a large impact on memory consumption. Keep this value between 2-100.
`index.knn.space_type` | "l2" | The vector space used to calculate the distance between vectors. Currently, the k-NN plugin supports the `l2` space (Euclidean distance) and `cosinesimil` space (cosine similarity). For more information on these spaces, see the [nmslib documentation](https://github.com/nmslib/nmslib/blob/master/manual/spaces.md).
## Cluster settings
Setting | Default | Description
:--- | :--- | :---
`knn.algo_param.index_thread_qty` | 1 | The number of threads used for graph creation. Keeping this value low reduces the CPU impact of the k-NN plugin, but also reduces indexing performance.
`knn.cache.item.expiry.enabled` | false | Whether to remove graphs that have not been accessed for a certain duration from memory.
`knn.cache.item.expiry.minutes` | 3h | If enabled, the idle time before removing a graph from memory.
`knn.circuit_breaker.unset.percentage` | 75.0 | The native memory usage threshold for the circuit breaker. Memory usage must be below this percentage of `knn.memory.circuit_breaker.limit` for `knn.circuit_breaker.triggered` to remain false.
`knn.circuit_breaker.triggered` | false | True when memory usage exceeds the `knn.circuit_breaker.unset.percentage` value.
`knn.memory.circuit_breaker.limit` | 50% | The native memory limit for graphs. At the default value, if a machine has 100 GB of memory and the JVM uses 32 GB, the k-NN plugin uses 50% of the remaining 68 GB (34 GB). If memory usage exceeds this value, k-NN removes the least recently used graphs.
`knn.memory.circuit_breaker.enabled` | true | Whether to enable the k-NN memory circuit breaker.
`knn.plugin.enabled`| true | Enables or disables the k-NN plugin.

View File

@ -0,0 +1,196 @@
---
layout: default
title: API
parent: Performance Analyzer
nav_order: 1
---
# Performance Analyzer API
Performance Analyzer uses a single HTTP method and URI for most requests:
```
GET <endpoint>:9600/_opensearch/_performanceanalyzer/metrics
```
Note the use of port 9600. Provide parameters for metrics, aggregations, dimensions, and nodes (optional):
```
?metrics=<metrics>&agg=<aggregations>&dim=<dimensions>&nodes=all"
```
For a full list of metrics, see [Metrics reference](../reference/). Performance Analyzer updates its data every five seconds. If you create a custom client, we recommend using that same interval for calls to the API.
#### Sample request
```
GET localhost:9600/_opensearch/_performanceanalyzer/metrics?metrics=Latency,CPU_Utilization&agg=avg,max&dim=ShardID&nodes=all
```
#### Sample response
```json
{
"keHlhQbbTpm1BYicficEQg": {
"timestamp": 1554940530000,
"data": {
"fields": [{
"name": "ShardID",
"type": "VARCHAR"
},
{
"name": "Latency",
"type": "DOUBLE"
},
{
"name": "CPU_Utilization",
"type": "DOUBLE"
}
],
"records": [
[
null,
null,
0.012552206029147535
],
[
"1",
4.8,
0.0009780939762972104
]
]
}
},
"bHdpbMJZTs-TKtZro2SmYA": {
"timestamp": 1554940530000,
"data": {
"fields": [{
"name": "ShardID",
"type": "VARCHAR"
},
{
"name": "Latency",
"type": "DOUBLE"
},
{
"name": "CPU_Utilization",
"type": "DOUBLE"
}
],
"records": [
[
null,
18.2,
0.011966493817311527
],
[
"1",
14.8,
0.0007670829370071493
]
]
}
}
}
```
In this case, each top-level object represents a node. The API returns names and data types for the metrics and dimensions that you specified, along with values from five seconds ago and current values (if different). Null values represent inactivity during that time period.
Performance Analyzer has one additional URI that returns the unit for each metric.
#### Sample request
```
GET localhost:9600/_opensearch/_performanceanalyzer/metrics/units
```
#### Sample response
```json
{
"Disk_Utilization": "%",
"Cache_Request_Hit": "count",
"TermVectors_Memory": "B",
"Segments_Memory": "B",
"HTTP_RequestDocs": "count",
"Net_TCP_Lost": "segments/flow",
"Refresh_Time": "ms",
"GC_Collection_Event": "count",
"Merge_Time": "ms",
"Sched_CtxRate": "count/s",
"Cache_Request_Size": "B",
"ThreadPool_QueueSize": "count",
"Sched_Runtime": "s/ctxswitch",
"Disk_ServiceRate": "MB/s",
"Heap_AllocRate": "B/s",
"Heap_Max": "B",
"Sched_Waittime": "s/ctxswitch",
"ShardBulkDocs": "count",
"Thread_Blocked_Time": "s/event",
"VersionMap_Memory": "B",
"Master_Task_Queue_Time": "ms",
"Merge_CurrentEvent": "count",
"Indexing_Buffer": "B",
"Bitset_Memory": "B",
"Norms_Memory": "B",
"Net_PacketDropRate4": "packets/s",
"Heap_Committed": "B",
"Net_PacketDropRate6": "packets/s",
"Thread_Blocked_Event": "count",
"GC_Collection_Time": "ms",
"Cache_Query_Miss": "count",
"IO_TotThroughput": "B/s",
"Latency": "ms",
"Net_PacketRate6": "packets/s",
"Cache_Query_Hit": "count",
"IO_ReadSyscallRate": "count/s",
"Net_PacketRate4": "packets/s",
"Cache_Request_Miss": "count",
"CB_ConfiguredSize": "B",
"CB_TrippedEvents": "count",
"ThreadPool_RejectedReqs": "count",
"Disk_WaitTime": "ms",
"Net_TCP_TxQ": "segments/flow",
"Master_Task_Run_Time": "ms",
"IO_WriteSyscallRate": "count/s",
"IO_WriteThroughput": "B/s",
"Flush_Event": "count",
"Net_TCP_RxQ": "segments/flow",
"Refresh_Event": "count",
"Points_Memory": "B",
"Flush_Time": "ms",
"Heap_Init": "B",
"CPU_Utilization": "cores",
"HTTP_TotalRequests": "count",
"ThreadPool_ActiveThreads": "count",
"Cache_Query_Size": "B",
"Paging_MinfltRate": "count/s",
"Merge_Event": "count",
"Net_TCP_SendCWND": "B/flow",
"Cache_Request_Eviction": "count",
"Segments_Total": "count",
"Terms_Memory": "B",
"DocValues_Memory": "B",
"Heap_Used": "B",
"Cache_FieldData_Eviction": "count",
"IO_TotalSyscallRate": "count/s",
"CB_EstimatedSize": "B",
"Net_Throughput": "B/s",
"Paging_RSS": "pages",
"Indexing_ThrottleTime": "ms",
"StoredFields_Memory": "B",
"IndexWriter_Memory": "B",
"Master_PendingQueueSize": "count",
"Net_TCP_SSThresh": "B/flow",
"Cache_FieldData_Size": "B",
"Paging_MajfltRate": "count/s",
"ThreadPool_TotalThreads": "count",
"IO_ReadThroughput": "B/s",
"ShardEvents": "count",
"Net_TCP_NumFlows": "count"
}
```

View File

@ -0,0 +1,162 @@
---
layout: default
title: Create Dashboards
parent: Performance Analyzer
nav_order: 2
---
# PerfTop dashboards
Dashboards are defined in JSON and composed of three main elements: tables, line graphs, and bar graphs. You define a grid of rows and columns and then place elements within that grid, with each element spanning as many rows and columns as you specify.
The best way to get started with building custom dashboards is to duplicate and modify one of the existing JSON files in the `dashboards` directory.
{: .tip }
---
#### Table of contents
1. TOC
{:toc}
---
## Summary of elements
- Tables show metrics per dimension. For example, if your metric is `CPU_Utilization` and your dimension `ShardID`, a PerfTop table shows a row for each shard on each node.
- Bar graphs are aggregated for the cluster, unless you add `nodeName` to the dashboard. See the [options for all elements](#all-elements).
- Line graphs are aggregated for each node. Each line represents a node.
## Position elements
PerfTop positions elements within a grid. For example, consider this 12 * 12 grid.
![Dashboard grid](../../images/perftop-grid.png)
The upper-left of the grid represents row 0, column 0, so the starting positions for the three boxes are:
- Orange: row 0, column 0
- Purple: row 2, column 2
- Green: row 1, column 6
These boxes span a number of rows and columns. In this case:
- Orange: 2 rows, 4 columns
- Purple: 1 row, 4 columns
- Green: 3 rows, 2 columns
In JSON form, we have the following:
```json
{
"gridOptions": {
"rows": 12,
"cols": 12
},
"graphs": {
"tables": [{
"options": {
"gridPosition": {
"row": 0,
"col": 0,
"rowSpan": 2,
"colSpan": 4
}
}
},
{
"options": {
"gridPosition": {
"row": 2,
"col": 2,
"rowSpan": 1,
"colSpan": 4
}
}
},
{
"options": {
"gridPosition": {
"row": 1,
"col": 6,
"rowSpan": 3,
"colSpan": 2
}
}
}
]
}
}
```
At this point, however, all the JSON does is define the size and position of three tables. To fill elements with data, you specify a query.
## Add queries
Queries use the same elements as the [REST API](../api/), just in JSON form:
```json
{
"queryParams": {
"metrics": "estimated,limitConfigured",
"aggregates": "avg,avg",
"dimensions": "type",
"sortBy": "estimated"
}
}
```
For details on available metrics, see [Metrics reference](../reference/).
## Add options
Options include labels, colors, and a refresh interval. Different elements types have different options.
Dashboards support the 16 ANSI colors: black, red, green, yellow, blue, magenta, cyan, and white. For the "bright" variants of these colors, use the numbers 8--15. If your terminal supports 256 colors, you can also use hex codes (e.g. `#6D40ED`).
{: .note }
### All elements
Option | Type | Description
:--- | :--- | :---
`label` | String or integer | The text in the upper-left corner of the box.
`labelColor` | String or integer | The color of the label.
`refreshInterval` | Integer | The number of milliseconds between calls to the Performance Analyzer API for new data. Minimum value is 5000.
`dimensionFilters` | String array | The dimension value to diplay for the graph. For example, if you query for `metric=Net_Throughput&agg=sum&dim=Direction` and the possible dimension values are `in` and `out`, you can define `dimensionFilters: ["in"]` to only display the metric data for `in` dimension
`nodeName` | String | If non-null, lets you restrict elements to individual nodes. You can specify the node name directly in the dashboard file, but the better approach is to use `"nodeName": "#nodeName"` in the dashboard and include the `--nodename <node_name>` argument when starting PerfTop.
### Tables
Option | Type | Description
:--- | :--- | :---
`bg` | String or integer | The background color.
`fg` | String or integer | The text color.
`selectedFg` | String or integer | The text color for focused text.
`selectedBg` | String or integer | The background color for focused text.
`columnSpacing` | Integer | The amount of space (measured in characters) between columns.
`keys` | Boolean | Has no impact at this time.
### Bars
Option | Type | Description
:--- | :--- | :---
`barWidth` | Integer | The width of each bar (measured in characters) in the graph.
`xOffset` | Integer | The amount of space (measured in characters) between the y-axis and the first bar in the graph.
`maxHeight` | Integer | The maximum height of each bar (measured in characters) in the graph.
### Lines
Option | Type | Description
:--- | :--- | :---
`showNthLabel` | Integer | Which of the `xAxis` labels to show. For example, `"showNthLabel": 2` shows every other label.
`showLegend` | Boolean | Whether or not to display a legend for the line graph.
`legend.width` | Integer | The width of the legend (measured in characters) in the graph.
`xAxis` | String array | Array of labels for the x-axis. For example, `["0:00", "0:10", "0:20", "0:30", "0:40", "0:50"]`.
`colors` | String array | Array of line colors to choose from. For example, `["magenta", "cyan"]`. If you don't provide this value, PerfTop chooses random colors for each line.

View File

@ -0,0 +1,101 @@
---
layout: default
title: Performance Analyzer
nav_order: 58
has_children: true
---
# Performance Analyzer
Performance Analyzer is an agent and REST API that allows you to query numerous performance metrics for your cluster, including aggregations of those metrics, independent of the Java Virtual Machine (JVM). PerfTop is the default command line interface (CLI) for displaying those metrics.
To download PerfTop, see [Download](https://opensearch.org/downloads.html) on the OpenSearch website.
You can also install it using [npm](https://www.npmjs.com/):
```bash
npm install -g @aws/opensearch-perftop
```
![PerfTop screenshot](../images/perftop.png)
## Get started with PerfTop
The basic syntax is:
```bash
./perf-top-<operating_system> --dashboard <dashboard>.json --endpoint <endpoint>
```
If you're using npm, the syntax is similar:
```bash
perf-top --dashboard <dashboard> --endpoint <endpoint>
```
If you're running PerfTop from a node (i.e. locally), specify port 9600:
```bash
./perf-top-linux --dashboard dashboards/<dashboard>.json --endpoint localhost:9600
```
Otherwise, just specify the OpenSearch endpoint:
```bash
./perf-top-macos --dashboard dashboards/<dashboard>.json --endpoint my-cluster.my-domain.com
```
PerfTop has four pre-built dashboards in the `dashboards` directory, but you can also [create your own](dashboards/).
You can also load the pre-built dashboards (ClusterOverview, ClusterNetworkMemoryAnalysis, ClusterThreadAnalysis, or NodeAnalysis) without the JSON files, such as `--dashboard ClusterThreadAnalysis`.
PerfTop has no interactivity. Start the application, monitor the dashboard, and press esc, q, or Ctrl + C to quit.
{: .note }
### Other options
- For NodeAnalysis and similar custom dashboards, you can add the `--nodename <node_name>` argument if you want your dashboard to display metrics for only a single node.
- For troubleshooting, add the `--logfile <log-file>.txt` argument.
## Performance Analyzer configuration
### Storage
Performance Analyzer uses `/dev/shm` for temporary storage. During heavy workloads on a cluster, Performance Analyzer can use up to 1 GB of space.
Docker, however, has a default `/dev/shm` size of 64 MB. To change this value, you can use the `docker run --shm-size 1gb` flag or [a similar setting in Docker Compose](https://docs.docker.com/compose/compose-file/#shm_size).
If you're not using Docker, check the size of `/dev/shm` using `df -h`. The default value is probably plenty, but if you need to change its size, add the following line to `/etc/fstab`:
```bash
tmpfs /dev/shm tmpfs defaults,noexec,nosuid,size=1G 0 0
```
Then remount the file system:
```bash
mount -o remount /dev/shm
```
### Security
Performance Analyzer supports encryption in transit for requests. It currently does *not* support client or server authentication for requests. To enable encryption in transit, edit `performance-analyzer.properties` in your `$ES_HOME` directory:
```bash
vi $ES_HOME/plugins/opensearch_performance_analyzer/pa_config/performance-analyzer.properties
```
Change the following lines to configure encryption in transit. Note that `certificate-file-path` must be a certificate for the server, not a root CA:
```
https-enabled = true
#Setup the correct path for certificates
certificate-file-path = specify_path
private-key-file-path = specify_path
```

View File

@ -0,0 +1,63 @@
---
layout: default
title: API
parent: Root Cause Analysis
grand_parent: Performance Analyzer
nav_order: 1
---
# RCA API
## Sample request
```
# Request all available RCAs
GET localhost:9600/_opensearch/_performanceanalyzer/rca
# Request a specific RCA
GET localhost:9600/_opensearch/_performanceanalyzer/rca?name=HighHeapUsageClusterRca
```
## Sample response
```json
{
"HighHeapUsageClusterRca": [{
"rca_name": "HighHeapUsageClusterRca",
"state": "unhealthy",
"timestamp": 1587426650942,
"HotClusterSummary": [{
"number_of_nodes": 2,
"number_of_unhealthy_nodes": 1,
"HotNodeSummary": [{
"host_address": "192.168.144.2",
"node_id": "JtlEoRowSI6iNpzpjlbp_Q",
"HotResourceSummary": [{
"resource_type": "old gen",
"threshold": 0.65,
"value": 0.81827232588145373,
"avg": NaN,
"max": NaN,
"min": NaN,
"unit_type": "heap usage in percentage",
"time_period_seconds": 600,
"TopConsumerSummary": [{
"name": "CACHE_FIELDDATA_SIZE",
"value": 590702564
},
{
"name": "CACHE_REQUEST_SIZE",
"value": 28375
},
{
"name": "CACHE_QUERY_SIZE",
"value": 12687
}
],
}]
}]
}]
}]
}
```

View File

@ -0,0 +1,17 @@
---
layout: default
title: Root Cause Analysis
nav_order: 50
parent: Performance Analyzer
has_children: true
---
# Root Cause Analysis
The OpenSearch Performance Analyzer plugin (PA) captures OpenSearch and JVM activity, plus their lower-level resource usage (e.g. disk, network, CPU, and memory). Based on this instrumentation, Performance Analyzer computes and exposes diagnostic metrics so that administrators can measure and understand the bottlenecks in their OpenSearch clusters.
The Root Cause Analysis framework (RCA) uses the information from PA to alert administrators about the root cause of performance and availability issues that their clusters might be experiencing.
In broad strokes, the framework helps you access data streams from OpenSearch nodes running Performance Analyzer. You write snippets of Java to choose the streams that matter to you and evaluate the streams' PA metrics against certain thresholds. As RCA runs, you can access the state of each analysis using the REST API.
To learn more about Root Cause Analysis, see [its repository on GitHub](https://github.com/opensearch-project/performance-analyzer-rca).

View File

@ -0,0 +1,11 @@
---
layout: default
title: RCA Reference
parent: Root Cause Analysis
grand_parent: Performance Analyzer
nav_order: 3
---
# RCA reference
You can find a reference of available RCAs and their purposes on [Github](https://github.com/opensearch-project/performance-analyzer-rca/tree/master/docs).

View File

@ -0,0 +1,560 @@
---
layout: default
title: Metrics Reference
parent: Performance Analyzer
nav_order: 3
---
# Metrics reference
This page contains all Performance Analyzer metrics. All metrics support the `avg`, `sum`, `min`, and `max` aggregations, although certain metrics measure only one thing, making the choice of aggregation irrelevant.
For information on dimensions, see the [dimensions reference](#dimensions-reference).
This list is extensive. We recommend using Ctrl/Cmd + F to find what you're looking for.
{: .tip }
<table>
<thead style="text-align: left">
<tr>
<th>Metric</th>
<th>Dimensions</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>CPU_Utilization
</td>
<td rowspan="18">ShardID, IndexName, Operation, ShardRole
</td>
<td>CPU usage ratio. CPU time (in milliseconds) used by the associated thread(s) in the past five seconds, divided by 5000 milliseconds.
</td>
</tr>
<tr>
<td>Paging_MajfltRate
</td>
<td>The number of major faults per second in the past five seconds. A major fault requires the process to load a memory page from disk.
</td>
</tr>
<tr>
<td>Paging_MinfltRate
</td>
<td>The number of minor faults per second in the past five seconds. A minor fault does not requires the process to load a memory page from disk.
</td>
</tr>
<tr>
<td>Paging_RSS
</td>
<td>The number of pages the process has in real memory---the pages that count towards text, data, or stack space. This number does not include pages that have not been demand-loaded in or swapped out.
</td>
</tr>
<tr>
<td>Sched_Runtime
</td>
<td>Time (seconds) spent executing on the CPU per context switch.
</td>
</tr>
<tr>
<td>Sched_Waittime
</td>
<td>Time (seconds) spent waiting on a run queue per context switch.
</td>
</tr>
<tr>
<td>Sched_CtxRate
</td>
<td>Number of times run on the CPU per second in the past five seconds.
</td>
</tr>
<tr>
<td>Heap_AllocRate
</td>
<td>An approximation of the heap memory allocated, in bytes, per second in the past five seconds
</td>
</tr>
<tr>
<td>IO_ReadThroughput
</td>
<td>Number of bytes read per second in the last five seconds.
</td>
</tr>
<tr>
<td>IO_WriteThroughput
</td>
<td>Number of bytes written per second in the last five seconds.
</td>
</tr>
<tr>
<td>IO_TotThroughput
</td>
<td>Number of bytes read or written per second in the last five seconds.
</td>
</tr>
<tr>
<td>IO_ReadSyscallRate
</td>
<td>Read system calls per second in the last five seconds.
</td>
</tr>
<tr>
<td>IO_WriteSyscallRate
</td>
<td>Write system calls per second in the last five seconds.
</td>
</tr>
<tr>
<td>IO_TotalSyscallRate
</td>
<td>Read and write system calls per second in the last five seconds.
</td>
</tr>
<tr>
<td>Thread_Blocked_Time
</td>
<td>Average time (seconds) that the associated thread(s) blocked to enter or reenter a monitor.
</td>
</tr>
<tr>
<td>Thread_Blocked_Event
</td>
<td>The total number of times that the associated thread(s) blocked to enter or reenter a monitor (i.e. the number of times a thread has been in the blocked state).
</td>
</tr>
<tr>
<td>ShardEvents
</td>
<td>The total number of events executed on a shard in the past five seconds.
</td>
</tr>
<tr>
<td>ShardBulkDocs
</td>
<td>The total number of documents indexed in the past five seconds.
</td>
</tr>
<tr>
<td>Indexing_ThrottleTime
</td>
<td rowspan="30">ShardID, IndexName
</td>
<td>Time (milliseconds) that the index has been under merge throttling control in the past five seconds.
</td>
</tr>
<tr>
<td>Cache_Query_Hit
</td>
<td>The number of successful lookups in the query cache in the past five seconds.
</td>
</tr>
<tr>
<td>Cache_Query_Miss
</td>
<td>The number of lookups in the query cache that failed to retrieve a `DocIdSet` in the past five seconds. `DocIdSet` is a set of document IDs in Lucene.
</td>
</tr>
<tr>
<td>Cache_Query_Size
</td>
<td>Query cache memory size in bytes.
</td>
</tr>
<tr>
<td>Cache_FieldData_Eviction
</td>
<td>The number of times OpenSearch has evicted data from the fielddata heap space (occurs when the heap space is full) in the past five seconds.
</td>
</tr>
<tr>
<td>Cache_FieldData_Size
</td>
<td>Fielddata memory size in bytes.
</td>
</tr>
<tr>
<td>Cache_Request_Hit
</td>
<td>The number of successful lookups in the shard request cache in the past five seconds.
</td>
</tr>
<tr>
<td>Cache_Request_Miss
</td>
<td>The number of lookups in the request cache that failed to retrieve the results of search requests in the past five seconds.
</td>
</tr>
<tr>
<td>Cache_Request_Eviction
</td>
<td>The number of times OpenSearch evicts data from shard request cache (occurs when the request cache is full) in the past five seconds.
</td>
</tr>
<tr>
<td>Cache_Request_Size
</td>
<td>Shard request cache memory size in bytes.
</td>
</tr>
<tr>
<td>Refresh_Event
</td>
<td>The total number of refreshes executed in the past five seconds.
</td>
</tr>
<tr>
<td>Refresh_Time
</td>
<td>The total time (milliseconds) spent executing refreshes in the past five seconds
</td>
</tr>
<tr>
<td>Flush_Event
</td>
<td>The total number of flushes executed in the past five seconds.
</td>
</tr>
<tr>
<td>Flush_Time
</td>
<td>The total time (milliseconds) spent executing flushes in the past five seconds.
</td>
</tr>
<tr>
<td>Merge_Event
</td>
<td>The total number of merges executed in the past five seconds.
</td>
</tr>
<tr>
<td>Merge_Time
</td>
<td>The total time (milliseconds) spent executing merges in the past five seconds.
</td>
</tr>
<tr>
<td>Merge_CurrentEvent
</td>
<td>The current number of merges executing.
</td>
</tr>
<tr>
<td>Indexing_Buffer
</td>
<td>Index buffer memory size in bytes.
</td>
</tr>
<tr>
<td>Segments_Total
</td>
<td>The number of segments.
</td>
</tr>
<tr>
<td>Segments_Memory
</td>
<td>Estimated memory usage of segments in bytes.
</td>
</tr>
<tr>
<td>Terms_Memory
</td>
<td>Estimated memory usage of terms dictionaries in bytes.
</td>
</tr>
<tr>
<td>StoredFields_Memory
</td>
<td>Estimated memory usage of stored fields in bytes.
</td>
</tr>
<tr>
<td>TermVectors_Memory
</td>
<td>Estimated memory usage of term vectors in bytes.
</td>
</tr>
<tr>
<td>Norms_Memory
</td>
<td>Estimated memory usage of norms (normalization factors) in bytes.
</td>
</tr>
<tr>
<td>Points_Memory
</td>
<td>Estimated memory usage of points in bytes.
</td>
</tr>
<tr>
<td>DocValues_Memory
</td>
<td>Estimated memory usage of doc values in bytes.
</td>
</tr>
<tr>
<td>IndexWriter_Memory
</td>
<td>Estimated memory usage by the index writer in bytes.
</td>
</tr>
<tr>
<td>Bitset_Memory
</td>
<td>Estimated memory usage for the cached bit sets in bytes.
</td>
</tr>
<tr>
<td>VersionMap_Memory
</td>
<td>Estimated memory usage of the version map in bytes.
</td>
</tr>
<tr>
<td>Shard_Size_In_Bytes
</td>
<td>Estimated disk usage of the shard in bytes.
</td>
</tr>
<tr>
<td>Latency
</td>
<td>Operation, Exception, Indices, HTTPRespCode, ShardID, IndexName, ShardRole
</td>
<td>Latency (milliseconds) of a request.
</td>
</tr>
<tr>
<td>GC_Collection_Event
</td>
<td rowspan="6">MemType
</td>
<td>The number of garbage collections that have occurred in the past five seconds.
</td>
</tr>
<tr>
<td>GC_Collection_Time
</td>
<td>The approximate accumulated time (milliseconds) of all garbage collections that have occurred in the past five seconds.
</td>
</tr>
<tr>
<td>Heap_Committed
</td>
<td>The amount of memory (bytes) that is committed for the JVM to use.
</td>
</tr>
<tr>
<td>Heap_Init
</td>
<td>The amount of memory (bytes) that the JVM initially requests from the operating system for memory management.
</td>
</tr>
<tr>
<td>Heap_Max
</td>
<td>The maximum amount of memory (bytes) that can be used for memory management.
</td>
</tr>
<tr>
<td>Heap_Used
</td>
<td>The amount of used memory in bytes.
</td>
</tr>
<tr>
<td>Disk_Utilization
</td>
<td rowspan="3">DiskName
</td>
<td>Disk utilization rate: percentage of disk time spent reading and writing by the OpenSearch process in the past five seconds.
</td>
</tr>
<tr>
<td>Disk_WaitTime
</td>
<td>Average duration (milliseconds) of read and write operations in the past five seconds.
</td>
</tr>
<tr>
<td>Disk_ServiceRate
</td>
<td>Service rate: MB read or written per second in the past five seconds. This metric assumes that each disk sector stores 512 bytes.
</td>
</tr>
<tr>
<td>Net_TCP_NumFlows
</td>
<td rowspan="6">DestAddr
</td>
<td>Number of samples collected. Performance Analyzer collects one sample every five seconds.
</td>
</tr>
<tr>
<td>Net_TCP_TxQ
</td>
<td>Average number of TCP packets in the send buffer.
</td>
</tr>
<tr>
<td>Net_TCP_RxQ
</td>
<td>Average number of TCP packets in the receive buffer.
</td>
</tr>
<tr>
<td>Net_TCP_Lost
</td>
<td>Average number of unrecovered recurring timeouts. This number is reset when the recovery finishes or `SND.UNA` is advanced. `SND.UNA` is the sequence number of the first byte of data that has been sent, but not yet acknowledged.
</td>
</tr>
<tr>
<td>Net_TCP_SendCWND
</td>
<td>Average size (bytes) of the sending congestion window.
</td>
</tr>
<tr>
<td>Net_TCP_SSThresh
</td>
<td>Average size (bytes) of the slow start size threshold.
</td>
</tr>
<tr>
<td>Net_PacketRate4
</td>
<td rowspan="5">Direction
</td>
<td>The total number of IPv4 datagrams transmitted/received from/by interfaces per second, including those transmitted or received in error
</td>
</tr>
<tr>
<td>Net_PacketDropRate4
</td>
<td>The total number of IPv4 datagrams transmitted or received in error per second.
</td>
</tr>
<tr>
<td>Net_PacketRate6
</td>
<td>The total number of IPv6 datagrams transmitted or received from or by interfaces per second, including those transmitted or received in error.
</td>
</tr>
<tr>
<td>Net_PacketDropRate6
</td>
<td>The total number of IPv6 datagrams transmitted or received in error per second.
</td>
</tr>
<tr>
<td>Net_Throughput
</td>
<td>The number of bits transmitted or received per second by all network interfaces.
</td>
</tr>
<tr>
<td>ThreadPool_QueueSize
</td>
<td rowspan="4">ThreadPoolType
</td>
<td>The size of the task queue.
</td>
</tr>
<tr>
<td>ThreadPool_RejectedReqs
</td>
<td>The number of rejected executions.
</td>
</tr>
<tr>
<td>ThreadPool_TotalThreads
</td>
<td>The current number of threads in the pool.
</td>
</tr>
<tr>
<td>ThreadPool_ActiveThreads
</td>
<td>The approximate number of threads that are actively executing tasks.
</td>
</tr>
<tr>
<td>Master_PendingQueueSize
</td>
<td>N/A
</td>
<td>The current number of pending tasks in the cluster state update thread. Each node has a cluster state update thread that submits cluster state update tasks (create index, update mapping, allocate shard, fail shard, etc.).
</td>
</tr>
<tr>
<td>HTTP_RequestDocs
</td>
<td rowspan="2">Operation, Exception, Indices, HTTPRespCode
</td>
<td>The number of items in the request (only for `_bulk` request type).
</td>
</tr>
<tr>
<td>HTTP_TotalRequests
</td>
<td>The number of finished requests in the past five seconds.
</td>
</tr>
<tr>
<td>CB_EstimatedSize
</td>
<td rowspan="3">CBType
</td>
<td>The current number of estimated bytes.
</td>
</tr>
<tr>
<td>CB_TrippedEvents
</td>
<td>The number of times the circuit breaker has tripped.
</td>
</tr>
<tr>
<td>CB_ConfiguredSize
</td>
<td>The limit (bytes) for how much memory operations can use.
</td>
</tr>
<tr>
<td>Master_Task_Queue_Time
</td>
<td rowspan="2">MasterTaskInsertOrder, MasterTaskPriority, MasterTaskType, MasterTaskMetadata
</td>
<td>The time (milliseconds) that a master task spent in the queue.
</td>
</tr>
<tr>
<td>Master_Task_Run_Time
</td>
<td>The time (milliseconds) that a master task has been executed.
</td>
</tr>
</tbody>
</table>
## Dimensions reference
Dimension | Return values
:--- | :---
ShardID | ID for the shard (e.g. `1`).
IndexName | Name of the index (e.g. `my-index`).
Operation | Type of operation (e.g. `shardbulk`).
ShardRole | `primary`, `replica`
Exception | OpenSearch exceptions (e.g. `org.opensearch.index_not_found_exception`).
Indices | The list of indices in the request URI.
HTTPRespCode | Response code from OpenSearch (e.g. `200`).
MemType | `totYoungGC`, `totFullGC`, `Survivor`, `PermGen`, `OldGen`, `Eden`, `NonHeap`, `Heap`
DiskName | Name of the disk (e.g. `sda1`).
DestAddr | Destination address (e.g. `010015AC`).
Direction | `in`, `out`
ThreadPoolType | The OpenSearch thread pools (e.g. `index`, `search`,`snapshot`).
CBType | `accounting`, `fielddata`, `in_flight_requests`, `parent`, `request`
MasterTaskInsertOrder | The order in which the task was inserted (e.g. `3691`).
MasterTaskPriority | Priority of the task (e.g. `URGENT`). OpenSearch executes higher priority tasks before lower priority ones, regardless of `insert_order`.
MasterTaskType | `shard-started`, `create-index`, `delete-index`, `refresh-mapping`, `put-mapping`, `CleanupSnapshotRestoreState`, `Update snapshot state`
MasterTaskMetadata | Metadata for the task (if any).

View File

@ -0,0 +1,672 @@
---
layout: default
title: Commands
parent: Piped processing language
nav_order: 4
---
# Commands
Start a PPL query with a `search` command to reference a table to search from. You can have the commands that follow in any order.
In the following example, the `search` command refers to an `accounts` index as the source, then uses `fields` and `where` commands for the conditions:
```sql
search source=accounts
| where age > 18
| fields firstname, lastname
```
In the below examples, we represent required arguments in angle brackets `< >` and optional arguments in square brackets `[ ]`.
{: .note }
## search
Use the `search` command to retrieve a document from an index. You can only use the `search` command as the first command in the PPL query.
### Syntax
```sql
search source=<index> [boolean-expression]
```
Field | Description | Required
:--- | :--- |:---
`search` | Specify search keywords. | Yes
`index` | Specify which index to query from. | No
`bool-expression` | Specify an expression that evaluates to a boolean value. | No
*Example 1*: Get all documents
To get all documents from the `accounts` index:
```sql
search source=accounts;
```
| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname |
:--- | :--- |
| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke
| 6 | Hattie | 671 Bristol Street | 5686 | M | Dante | Netagy | TN | 36 | hattiebond@netagy.com | Bond
| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates
| 18 | Dale | 467 Hutchinson Court | 4180 | M | Orick | null | MD | 33 | daleadams@boink.com | Adams
*Example 2*: Get documents that match a condition
To get all documents from the `accounts` index that have either `account_number` equal to 1 or have `gender` as `F`:
```sql
search source=accounts account_number=1 or gender="F";
```
| account_number | firstname | address | balance | gender | city | employer | state | age | email | lastname |
:--- | :--- |
| 1 | Amber | 880 Holmes Lane | 39225 | M | Brogan | Pyrami | IL | 32 | amberduke@pyrami.com | Duke |
| 13 | Nanette | 789 Madison Street | 32838 | F | Nogal | Quility | VA | 28 | null | Bates |
## dedup
The `dedup` (data deduplication) command removes duplicate documents defined by a field from the search result.
### Syntax
```sql
dedup [int] <field-list> [keepempty=<bool>] [consecutive=<bool>]
```
Field | Description | Type | Required | Default
:--- | :--- |:--- |:--- |:---
`int` | Retain the specified number of duplicate events for each combination. The number must be greater than 0. If you do not specify a number, only the first occurring event is kept and all other duplicates are removed from the results. | `string` | No | 1
`keepempty` | If true, keep the document if any field in the field list has a null value or a field missing. | `nested list of objects` | No | False
`consecutive` | If true, remove only consecutive events with duplicate combinations of values. | No | False | -
`field-list` | Specify a comma-delimited field list. At least one field is required. | Yes | - | -
*Example 1*: Dedup by one field
To remove duplicate documents with the same gender:
```sql
search source=accounts | dedup gender | fields account_number, gender;
```
| account_number | gender
:--- | :--- |
1 | M
13 | F
*Example 2*: Keep two duplicate documents
To keep two duplicate documents with the same gender:
```sql
search source=accounts | dedup 2 gender | fields account_number, gender;
```
| account_number | gender
:--- | :--- |
1 | M
6 | M
13 | F
*Example 3*: Keep or ignore an empty field by default
To keep two duplicate documents with a `null` field value:
```sql
search source=accounts | dedup email keepempty=true | fields account_number, email;
```
| account_number | email
:--- | :--- |
1 | amberduke@pyrami.com
6 | hattiebond@netagy.com
13 | null
18 | daleadams@boink.com
To remove duplicate documents with the `null` field value:
```sql
search source=accounts | dedup email | fields account_number, email;
```
| account_number | email
:--- | :--- |
1 | amberduke@pyrami.com
6 | hattiebond@netagy.com
18 | daleadams@boink.com
*Example 4*: Dedup of consecutive documents
To remove duplicates of consecutive documents:
```sql
search source=accounts | dedup gender consecutive=true | fields account_number, gender;
```
| account_number | gender
:--- | :--- |
1 | M
13 | F
18 | M
## eval
The `eval` command evaluates an expression and appends its result to the search result.
### Syntax
```sql
eval <field>=<expression> ["," <field>=<expression> ]...
```
Field | Description | Required
:--- | :--- |:---
`field` | If a field name does not exist, a new field is added. If the field name already exists, it's overwritten. | Yes
`expression` | Specify any supported expression. | Yes
*Example 1*: Create a new field
To create a new `doubleAge` field for each document. `doubleAge` is the result of `age` multiplied by 2:
```sql
search source=accounts | eval doubleAge = age * 2 | fields age, doubleAge;
```
| age | doubleAge
:--- | :--- |
32 | 64
36 | 72
28 | 56
33 | 66
*Example 2*: Overwrite the existing field
To overwrite the `age` field with `age` plus 1:
```sql
search source=accounts | eval age = age + 1 | fields age;
```
| age
:--- |
| 33
| 37
| 29
| 34
*Example 3*: Create a new field with a field defined with the `eval` command
To create a new field `ddAge`. `ddAge` is the result of `doubleAge` multiplied by 2, where `doubleAge` is defined in the `eval` command:
```sql
search source=accounts | eval doubleAge = age * 2, ddAge = doubleAge * 2 | fields age, doubleAge, ddAge;
```
| age | doubleAge | ddAge
:--- | :--- |
| 32 | 64 | 128
| 36 | 72 | 144
| 28 | 56 | 112
| 33 | 66 | 132
## fields
Use the `field` command to keep or remove fields from a search result.
### Syntax
```sql
field [+|-] <field-list>
```
Field | Description | Required | Default
:--- | :--- |:---|:---
`index` | Plus (+) keeps only fields specified in the field list. Minus (-) removes all fields specified in the field list. | No | +
`field list` | Specify a comma-delimited list of fields. | Yes | No default
*Example 1*: Select specified fields from result
To get `account_number`, `firstname`, and `lastname` fields from a search result:
```sql
search source=accounts | fields account_number, firstname, lastname;
```
| account_number | firstname | lastname
:--- | :--- |
| 1 | Amber | Duke
| 6 | Hattie | Bond
| 13 | Nanette | Bates
| 18 | Dale | Adams
*Example 2*: Remove specified fields from a search result
To remove the `account_number` field from the search results:
```sql
search source=accounts | fields account_number, firstname, lastname | fields - account_number;
```
| firstname | lastname
:--- | :--- |
| Amber | Duke
| Hattie | Bond
| Nanette | Bates
| Dale | Adams
## rename
Use the `rename` command to rename one or more fields in the search result.
### Syntax
```sql
rename <source-field> AS <target-field>["," <source-field> AS <target-field>]...
```
Field | Description | Required
:--- | :--- |:---
`source-field` | The name of the field that you want to rename. | Yes
`target-field` | The name you want to rename to. | Yes
*Example 1*: Rename one field
Rename the `account_number` field as `an`:
```sql
search source=accounts | rename account_number as an | fields an;
```
| an
:--- |
| 1
| 6
| 13
| 18
*Example 2*: Rename multiple fields
Rename the `account_number` field as `an` and `employer` as `emp`:
```sql
search source=accounts | rename account_number as an, employer as emp | fields an, emp;
```
| an | emp
:--- | :--- |
| 1 | Pyrami
| 6 | Netagy
| 13 | Quility
| 18 | null
## sort
Use the `sort` command to sort search results by a specified field.
### Syntax
```sql
sort [count] <[+|-] sort-field>...
```
Field | Description | Required | Default
:--- | :--- |:---
`count` | The maximum number results to return from the sorted result. If count=0, all results are returned. | No | 1000
`[+|-]` | Use plus [+] to sort by ascending order and minus [-] to sort by descending order. | No | Ascending order
`sort-field` | Specify the field that you want to sort by. | Yes | -
*Example 1*: Sort by one field
To sort all documents by the `age` field in ascending order:
```sql
search source=accounts | sort age | fields account_number, age;
```
| account_number | age |
:--- | :--- |
| 13 | 28
| 1 | 32
| 18 | 33
| 6 | 36
*Example 2*: Sort by one field and return all results
To sort all documents by the `age` field in ascending order and specify count as 0 to get back all results:
```sql
search source=accounts | sort 0 age | fields account_number, age;
```
| account_number | age |
:--- | :--- |
| 13 | 28
| 1 | 32
| 18 | 33
| 6 | 36
*Example 3*: Sort by one field in descending order
To sort all documents by the `age` field in descending order:
```sql
search source=accounts | sort - age | fields account_number, age;
```
| account_number | age |
:--- | :--- |
| 6 | 36
| 18 | 33
| 1 | 32
| 13 | 28
*Example 4*: Specify the number of sorted documents to return
To sort all documents by the `age` field in ascending order and specify count as 2 to get back two results:
```sql
search source=accounts | sort 2 age | fields account_number, age;
```
| account_number | age |
:--- | :--- |
| 13 | 28
| 1 | 32
*Example 5*: Sort by multiple fields
To sort all documents by the `gender` field in ascending order and `age` field in descending order:
```sql
search source=accounts | sort + gender, - age | fields account_number, gender, age;
```
| account_number | gender | age |
:--- | :--- | :--- |
| 13 | F | 28
| 6 | M | 36
| 18 | M | 33
| 1 | M | 32
## stats
Use the `stats` command to aggregate from search results.
The following table lists the aggregation functions and also indicates how each one handles null or missing values:
Function | NULL | MISSING
:--- | :--- |:---
`COUNT` | Not counted | Not counted
`SUM` | Ignore | Ignore
`AVG` | Ignore | Ignore
`MAX` | Ignore | Ignore
`MIN` | Ignore | Ignore
### Syntax
```
stats <aggregation>... [by-clause]...
```
Field | Description | Required | Default
:--- | :--- |:---
`aggregation` | Specify a statistical aggregation function. The argument of this function must be a field. | Yes | 1000
`by-clause` | Specify one or more fields to group the results by. If not specified, the `stats` command returns only one row, which is the aggregation over the entire result set. | No | -
*Example 1*: Calculate the average value of a field
To calculate the average `age` of all documents:
```sql
search source=accounts | stats avg(age);
```
| avg(age)
:--- |
| 32.25
*Example 2*: Calculate the average value of a field by group
To calculate the average age grouped by gender:
```sql
search source=accounts | stats avg(age) by gender;
```
| gender | avg(age)
:--- | :--- |
| F | 28.0
| M | 33.666666666666664
*Example 3*: Calculate the average and sum of a field by group
To calculate the average and sum of age grouped by gender:
```sql
search source=accounts | stats avg(age), sum(age) by gender;
```
| gender | avg(age) | sum(age)
:--- | :--- |
| F | 28 | 28
| M | 33.666666666666664 | 101
*Example 4*: Calculate the maximum value of a field
To calculate the maximum age:
```sql
search source=accounts | stats max(age);
```
| max(age)
:--- |
| 36
*Example 5*: Calculate the maximum and minimum value of a field by group
To calculate the maximum and minimum age values grouped by gender:
```sql
search source=accounts | stats max(age), min(age) by gender;
```
| gender | min(age) | max(age)
:--- | :--- | :--- |
| F | 28 | 28
| M | 32 | 36
## where
Use the `where` command with a bool expression to filter the search result. The `where` command only returns the result when the bool expression evaluates to true.
### Syntax
```sql
where <boolean-expression>
```
Field | Description | Required
:--- | :--- |:---
`bool-expression` | An expression that evaluates to a boolean value. | No
*Example 1*: Filter result set with a condition
To get all documents from the `accounts` index where `account_number` is 1 or gender is `F`:
```sql
search source=accounts | where account_number=1 or gender="F" | fields account_number, gender;
```
| account_number | gender
:--- | :--- |
| 1 | M
| 13 | F
## head
Use the `head` command to return the first N number of results in a specified search order.
### Syntax
```sql
head [keeplast = (true | false)] [while "("<boolean-expression>")"] [N]
```
Field | Description | Required | Default
:--- | :--- |:---
`keeplast` | Use along with the `while` argument to check if the last result in the result set is retained. The last result is what caused the `while` condition to evaluate to false or NULL. Set `keeplast` to true to retain the last result and false to discard it. | No | True
`while` | An expression that evaluates to either true or false. You cannot use statistical functions in this expression. | No | False
`N` | Specify the number of results to return. | No | 10
*Example 1*: Get the first 10 results
To get the first 10 results:
```sql
search source=accounts | fields firstname, age | head;
```
| firstname | age
:--- | :--- |
| Amber | 32
| Hattie | 36
| Nanette | 28
*Example 2*: Get the first N results
To get the first two results:
```sql
search source=accounts | fields firstname, age | head 2;
```
| firstname | age
:--- | :--- |
| Amber | 32
| Hattie | 36
*Example 3*: Get the first N results that match a while condition
To get the first 3 results from all accounts with age less than 30:
```sql
search source=accounts | fields firstname, age | sort age | head while(age < 30) 3;
```
| firstname | age
:--- | :--- |
| Nanette | 28
| Amber | 32
*Example 4*: Get the first N results with a while condition with the last result that failed the condition
To get the first 3 results from all accounts with age less than 30 and include the last failed condition:
```sql
search source=accounts | fields firstname, age | sort age | head keeplast=false while(age < 30) 3;
```
| firstname | age
:--- | :--- |
| Nanette | 28
## rare
Use the `rare` command to find the least common values of all fields in a field list.
A maximum of 10 results are returned for each distinct set of values of the group-by fields.
### Syntax
```sql
rare <field-list> [by-clause]
```
Field | Description | Required
:--- | :--- |:---
`field-list` | Specify a comma-delimited list of field names. | No
`by-clause` | Specify one or more fields to group the results by. | No
*Example 1*: Find the least common values in a field
To find the least common values of gender:
```sql
search source=accounts | rare gender;
```
| gender
:--- |
| F
| M
*Example 2*: Find the least common values grouped by gender
To find the least common age grouped by gender:
```sql
search source=accounts | rare age by gender;
```
| gender | age
:--- | :--- |
| F | 28
| M | 32
| M | 33
## top {#top-command}
Use the `top` command to find the most common values of all fields in the field list.
### Syntax
```sql
top [N] <field-list> [by-clause]
```
Field | Description | Default
:--- | :--- |:---
`N` | Specify the number of results to return. | 10
`field-list` | Specify a comma-delimited list of field names. | -
`by-clause` | Specify one or more fields to group the results by. | -
*Example 1*: Find the most common values in a field
To find the most common genders:
```sql
search source=accounts | top gender;
```
| gender
:--- |
| M
| F
*Example 2*: Find the most common value in a field
To find the most common gender:
```sql
search source=accounts | top 1 gender;
```
| gender
:--- |
| M
*Example 2*: Find the most common values grouped by gender
To find the most common age grouped by gender:
```sql
search source=accounts | top 1 age by gender;
```
| gender | age
:--- | :--- |
| F | 28
| M | 32

View File

@ -0,0 +1,36 @@
---
layout: default
title: Data Types
parent: Piped processing language
nav_order: 6
---
# Data types
The following table shows the data types supported by the PPL plugin and how each one maps to OpenSearch and SQL data types:
PPL Type | OpenSearch Type | SQL Type
:--- | :--- | :---
boolean | boolean | BOOLEAN
byte | byte | TINYINT
byte | short | SMALLINT
integer | integer | INTEGER
long | long | BIGINT
float | float | REAL
float | half_float | FLOAT
float | scaled_float | DOUBLE
double | double | DOUBLE
string | keyword | VARCHAR
text | text | VARCHAR
timestamp | date | TIMESTAMP
ip | ip | VARCHAR
timestamp | date | TIMESTAMP
binary | binary | VARBINARY
struct | object | STRUCT
array | nested | STRUCT
In addition to this list, the PPL plugin also supports the `datetime` type, though it doesn't have a corresponding mapping with OpenSearch.
To use a function without a corresponding mapping, you must explicitly convert the data type to one that does.
The PPL plugin supports all SQL date and time types. To learn more, see [SQL Data Types](../../sql/datatypes/).

View File

@ -0,0 +1,22 @@
---
layout: default
title: Endpoint
parent: Piped processing language
nav_order: 1
---
# Endpoint
To send a query request to PPL plugin, use the HTTP POST request.
We recommend a POST request because it doesn't have any length limit and it allows you to pass other parameters to the plugin for other functionality.
Use the explain endpoint for query translation and troubleshooting.
## Request Format
To use the PPL plugin with your own applications, send requests to `_opensearch/_ppl`, with your query in the request body:
```json
curl -H 'Content-Type: application/json' -X POST localhost:9200/_opensearch/_ppl \
... -d '{"query" : "source=accounts | fields firstname, lastname"}'
```

View File

@ -0,0 +1,10 @@
---
layout: default
title: Functions
parent: Piped processing language
nav_order: 10
---
# Functions
The PPL plugin supports all SQL functions. To learn more, see [SQL Functions](../../sql/functions/).

View File

@ -0,0 +1,72 @@
---
layout: default
title: Identifiers
parent: Piped processing language
nav_order: 7
---
# Identifiers
An identifier is an ID to name your database objects, such as index names, field names, aliases, and so on.
OpenSearch supports two types of identifiers: regular identifiers and delimited identifiers.
## Regular identifiers
A regular identifier is a string of characters that starts with an ASCII letter (lower or upper case).
The next character can either be a letter, digit, or underscore (_). It can't be a reserved keyword.
Whitespace and other special characters are also not allowed.
OpenSearch supports the following regular identifiers:
1. Identifiers prefixed by a dot `.` sign. Use to hide an index. For example `.opensearch-dashboards`.
2. Identifiers prefixed by an `@` sign. Use for meta fields generated by Logstash ingestion.
3. Identifiers with hyphen `-` in the middle. Use for index names with date information.
4. Identifiers with star `*` present. Use for wildcard match of index patterns.
For regular identifiers, you can use the name without any back tick or escape characters.
In this example, `source`, `fields`, `account_number`, `firstname`, and `lastname` are all identifiers. Out of these, the `source` field is a reserved identifier.
```sql
source=accounts | fields account_number, firstname, lastname;
```
| account_number | firstname | lastname |
:--- | :--- |
| 1 | Amber | Duke
| 6 | Hattie | Bond
| 13 | Nanette | Bates
| 18 | Dale | Adams
## Delimited identifiers
A delimited identifier can contain special characters not allowed by a regular identifier.
You must enclose delimited identifiers with back ticks (\`\`). Back ticks differentiate the identifier from special characters.
If the index name includes a dot (`.`), for example, `log-2021.01.11`, use delimited identifiers with back ticks to escape it \``log-2021.01.11`\`.
Typical examples of using delimited identifiers:
1. Identifiers with reserved keywords.
2. Identifiers with a `.` present. Similarly, `-` to include date information.
3. Identifiers with other special characters. For example, Unicode characters.
To quote an index name with back ticks:
```sql
source=`accounts` | fields `account_number`;
```
| account_number |
:--- |
| 1 |
| 6 |
| 13 |
| 18 |
## Case sensitivity
Identifiers are case sensitive. They must be exactly the same as what's stored in OpenSearch.
For example, if you run `source=Accounts`, you'll get an index not found exception because the actual index name is in lower case.

View File

@ -0,0 +1,58 @@
---
layout: default
title: Piped processing language
nav_order: 42
has_children: true
has_toc: false
---
# Piped Processing Language
Piped Processing Language (PPL) is a query language that lets you use pipe (`|`) syntax to explore, discover, and query data stored in OpenSearch.
To quickly get up and running with PPL, use **Query Workbench** in OpenSearch Dashboards. To learn more, see [Workbench](../sql/workbench/).
The PPL syntax consists of commands delimited by the pipe character (`|`) where data flows from left to right through each pipeline.
```sql
search command | command 1 | command 2 ...
```
You can only use read-only commands like `search`, `where`, `fields`, `rename`, `dedup`, `stats`, `sort`, `eval`, `head`, `top`, and `rare`.
## Quick start
To get started with PPL, choose **Dev Tools** in OpenSearch Dashboards and use the `bulk` operation to index some sample data:
```json
PUT accounts/_bulk?refresh
{"index":{"_id":"1"}}
{"account_number":1,"balance":39225,"firstname":"Amber","lastname":"Duke","age":32,"gender":"M","address":"880 Holmes Lane","employer":"Pyrami","email":"amberduke@pyrami.com","city":"Brogan","state":"IL"}
{"index":{"_id":"6"}}
{"account_number":6,"balance":5686,"firstname":"Hattie","lastname":"Bond","age":36,"gender":"M","address":"671 Bristol Street","employer":"Netagy","email":"hattiebond@netagy.com","city":"Dante","state":"TN"}
{"index":{"_id":"13"}}
{"account_number":13,"balance":32838,"firstname":"Nanette","lastname":"Bates","age":28,"gender":"F","address":"789 Madison Street","employer":"Quility","city":"Nogal","state":"VA"}
{"index":{"_id":"18"}}
{"account_number":18,"balance":4180,"firstname":"Dale","lastname":"Adams","age":33,"gender":"M","address":"467 Hutchinson Court","email":"daleadams@boink.com","city":"Orick","state":"MD"}
```
Go to **Query Workbench** and select **PPL**.
The following example returns `firstname` and `lastname` fields for documents in an `accounts` index with `age` greater than 18:
```json
search source=accounts
| where age > 18
| fields firstname, lastname
```
#### Sample Response
| id | firstname | lastname |
:--- | :--- | :--- |
| 0 | Amber | Duke
| 1 | Hattie | Bond
| 2 | Nanette | Bates
| 3 | Dale | Adams
![PPL query workbench](../images/ppl.png)

View File

@ -0,0 +1,71 @@
---
layout: default
title: Protocol
parent: Piped processing language
nav_order: 2
---
# Protocol
The PPL plugin provides responses in JDBC format. The JDBC format is widely used because it provides schema information and more functionality such as pagination. Besides JDBC driver, various clients can benefit from the detailed and well formatted response.
## Response Format
The body of HTTP POST request can take a few more additional fields with the PPL query:
```json
curl -H 'Content-Type: application/json' -X POST localhost:9200/_opensearch/_ppl \
... -d '{"query" : "source=accounts | fields firstname, lastname"}'
```
The following example shows a normal response where the schema includes a field name and its type and datarows includes the result set:
```json
{
"schema": [
{
"name": "firstname",
"type": "string"
},
{
"name": "lastname",
"type": "string"
}
],
"datarows": [
[
"Amber",
"Duke"
],
[
"Hattie",
"Bond"
],
[
"Nanette",
"Bates"
],
[
"Dale",
"Adams"
]
],
"total": 4,
"size": 4
}
```
If any error occurred, error message and the cause will be returned instead:
```json
curl -H 'Content-Type: application/json' -X POST localhost:9200/_opensearch/_ppl \
... -d '{"query" : "source=unknown | fields firstname, lastname"}'
{
"error": {
"reason": "Error occurred in OpenSearch engine: no such index [unknown]",
"details": "org.opensearch.index.IndexNotFoundException: no such index [unknown]\nFor more details, please send request for Json format to see the raw response from opensearch engine.",
"type": "IndexNotFoundException"
},
"status": 404
}
```

View File

@ -0,0 +1,35 @@
---
layout: default
title: Settings
parent: Piped processing language
nav_order: 3
---
# Settings
The PPL plugin adds a few settings to the standard OpenSearch cluster settings. Most are dynamic, so you can change the default behavior of the plugin without restarting your cluster.
You can update these settings like any other cluster setting:
```json
PUT _cluster/settings
{
"transient": {
"opensearch": {
"ppl": {
"enabled": "false"
}
}
}
}
```
Requests to `_opensearch/_ppl` include index names in the request body, so they have the same access policy considerations as the `bulk`, `mget`, and `msearch` operations. If you set the `rest.action.multi.allow_explicit_index` parameter to `false`, the PPL plugin is disabled.
You can specify the settings shown in the following table:
Setting | Description | Default
:--- | :--- | :---
`opensearch.ppl.enabled` | Change to `false` to disable the plugin. | True
`opensearch.ppl.query.memory_limit` | Set heap memory usage limit. If a query crosses this limit, it's terminated. | 85%
`opensearch.query.size_limit` | Set the maximum number of results that you want to see. This impacts the accuracy of aggregation operations. For example, if you have 1000 documents in an index, by default, only 200 documents are extracted from the index for aggregation. | 200

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,242 @@
---
layout: default
title: Cross-Cluster Search
parent: Access Control
grand_parent: Security
nav_order: 40
---
# Cross-cluster search
Cross-cluster search is exactly what it sounds like: it lets any node in a cluster execute search requests against other clusters. The security plugin supports cross-cluster search out of the box.
---
#### Table of contents
1. TOC
{:toc}
---
## Authentication flow
When accessing a *remote cluster* from a *coordinating cluster* using cross-cluster search:
1. The security plugin authenticates the user on the coordinating cluster.
1. The security plugin fetches the user's backend roles on the coordinating cluster.
1. The call, including the authenticated user, is forwarded to the remote cluster.
1. The user's permissions are evaluated on the remote cluster.
You can have different authentication and authorization configurations on the remote and coordinating cluster, but we recommend using the same settings on both.
## Permissions
To query indices on remote clusters, users need to have the following permissions for the index, in addition to `READ` or `SEARCH` permissions:
```
indices:admin/shards/search_shards
```
#### Sample roles.yml configuration
```yml
humanresources:
cluster:
- CLUSTER_COMPOSITE_OPS_RO
indices:
'humanresources':
'*':
- READ
- indices:admin/shards/search_shards # needed for CCS
```
#### Sample role in OpenSearch Dashboards
![OpenSearch Dashboards UI for creating a cross-cluster search role](../../../images/security-ccs.png)
## Walkthrough
Save this file as `docker-compose.yml` and run `docker-compose up` to start two single-node clusters on the same network:
```yml
version: '3'
services:
opensearch-node1:
image: opensearchproject/opensearch:{{site.opensearch_version}}
container_name: opensearch-node1
environment:
- cluster.name=opensearch-cluster1
- discovery.type=single-node
- bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
ulimits:
memlock:
soft: -1
hard: -1
volumes:
- opensearch-data1:/usr/share/opensearch/data
ports:
- 9200:9200
- 9600:9600 # required for Performance Analyzer
networks:
- opensearch-net
opensearch-node2:
image: opensearchproject/opensearch:{{site.opensearch_version}}
container_name: opensearch-node2
environment:
- cluster.name=opensearch-cluster2
- discovery.type=single-node
- bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
ulimits:
memlock:
soft: -1
hard: -1
volumes:
- opensearch-data2:/usr/share/opensearch/data
ports:
- 9250:9200
- 9700:9600 # required for Performance Analyzer
networks:
- opensearch-net
volumes:
opensearch-data1:
opensearch-data2:
networks:
opensearch-net:
```
After the clusters start, verify the names of each:
```json
curl -XGET -u 'admin:admin' -k 'https://localhost:9200'
{
"cluster_name" : "opensearch-cluster1",
...
}
curl -XGET -u 'admin:admin' -k 'https://localhost:9250'
{
"cluster_name" : "opensearch-cluster2",
...
}
```
Both clusters run on `localhost`, so the important identifier is the port number. In this case, use port 9200 (`opensearch-node1`) as the remote cluster, and port 9250 (`opensearch-node2`) as the coordinating cluster.
To get the IP address for the remote cluster, first identify its container ID:
```bash
docker ps
CONTAINER ID IMAGE PORTS NAMES
6fe89ebc5a8e opensearchproject/opensearch:{{site.opensearch_version}} 0.0.0.0:9200->9200/tcp, 0.0.0.0:9600->9600/tcp, 9300/tcp opensearch-node1
2da08b6c54d8 opensearchproject/opensearch:{{site.opensearch_version}} 9300/tcp, 0.0.0.0:9250->9200/tcp, 0.0.0.0:9700->9600/tcp opensearch-node2
```
Then get that container's IP address:
```bash
docker inspect --format='{% raw %}{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}{% endraw %}' 6fe89ebc5a8e
172.31.0.3
```
On the coordinating cluster, add the remote cluster name and the IP address (with port 9300) for each "seed node." In this case, you only have one seed node:
```json
curl -k -XPUT -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9250/_cluster/settings' -d '
{
"persistent": {
"search.remote": {
"opensearch-cluster1": {
"seeds": ["172.31.0.3:9300"]
}
}
}
}'
```
On the remote cluster, index a document:
```bash
curl -XPUT -k -H 'Content-Type: application/json' -u 'admin:admin' 'https://localhost:9200/books/_doc/1' -d '{"Dracula": "Bram Stoker"}'
```
At this point, cross-cluster search works. You can test it using the `admin` user:
```bash
curl -XGET -k -u 'admin:admin' 'https://localhost:9250/opensearch-cluster1:books/_search?pretty'
{
...
"hits": [{
"_index": "opensearch-cluster1:books",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"Dracula": "Bram Stoker"
}
}]
}
```
To continue testing, create a new user on both clusters:
```bash
curl -XPUT -k -u 'admin:admin' 'https://localhost:9200/_opensearch/_security/api/internalusers/booksuser' -H 'Content-Type: application/json' -d '{"password":"password"}'
curl -XPUT -k -u 'admin:admin' 'https://localhost:9250/_opensearch/_security/api/internalusers/booksuser' -H 'Content-Type: application/json' -d '{"password":"password"}'
```
Then run the same search as before with `booksuser`:
```json
curl -XGET -k -u booksuser:password 'https://localhost:9250/opensearch-cluster1:books/_search?pretty'
{
"error" : {
"root_cause" : [
{
"type" : "security_exception",
"reason" : "no permissions for [indices:admin/shards/search_shards, indices:data/read/search] and User [name=booksuser, roles=[], requestedTenant=null]"
}
],
"type" : "security_exception",
"reason" : "no permissions for [indices:admin/shards/search_shards, indices:data/read/search] and User [name=booksuser, roles=[], requestedTenant=null]"
},
"status" : 403
}
```
Note the permissions error. On the remote cluster, create a role with the appropriate permissions, and map `booksuser` to that role:
```bash
curl -XPUT -k -u 'admin:admin' -H 'Content-Type: application/json' 'https://localhost:9200/_opensearch/_security/api/roles/booksrole' -d '{"index_permissions":[{"index_patterns":["books"],"allowed_actions":["indices:admin/shards/search_shards","indices:data/read/search"]}]}'
curl -XPUT -k -u 'admin:admin' -H 'Content-Type: application/json' 'https://localhost:9200/_opensearch/_security/api/rolesmapping/booksrole' -d '{"users" : ["booksuser"]}'
```
Both clusters must have the user, but only the remote cluster needs the role and mapping; in this case, the coordinating cluster handles authentication (i.e. "Does this request include valid user credentials?"), and the remote cluster handles authorization (i.e. "Can this user access this data?").
{: .tip }
Finally, repeat the search:
```bash
curl -XGET -k -u booksuser:password 'https://localhost:9250/opensearch-cluster1:books/_search?pretty'
{
...
"hits": [{
"_index": "opensearch-cluster1:books",
"_type": "_doc",
"_id": "1",
"_score": 1.0,
"_source": {
"Dracula": "Bram Stoker"
}
}]
}
```

View File

@ -0,0 +1,54 @@
---
layout: default
title: Default Action Groups
parent: Access Control
grand_parent: Security
nav_order: 51
---
# Default action groups
This page catalogs all default action groups. Often, the most coherent way to create new action groups is to use a combination of these default groups and [individual permissions](../permissions).
## General
Name | Description
:--- | :---
unlimited | Grants complete access. Can be used on an cluster- or index-level. Equates to `"*"`.
{% comment %}kibana_all_read | asdf
kibana_all_write | asdf{% endcomment %}
## Cluster-level
Name | Description
:---| :---
cluster_all | Grants all cluster permissions. Equates to `cluster:*`.
cluster_monitor | Grants all cluster monitoring permissions. Equates to `cluster:monitor/*`.
cluster_composite_ops_ro | Grants read-only permissions to execute requests like `mget`, `msearch`, or `mtv`, plus permissions to query for aliases.
cluster_composite_ops | Same as `CLUSTER_COMPOSITE_OPS_RO`, but also grants `bulk` permissions and all aliases permissions.
manage_snapshots | Grants permissions to manage snapshots and repositories.
cluster_manage_pipelines | Grants permissions to manage ingest pipelines.
cluster_manage_index_templates | Grants permissions to manage index templates.
## Index-level
Name | Description
:--- | :---
indices_all | Grants all permissions on the index. Equates to `indices:*`.
get | Grants permissions to use `get` and `mget` actions only.
read | Grants read permissions such as search, get field mappings, `get`, and `mget`.
write | Grants permissions to create and update documents within *existing indices*. To create new indices, see `create_index`.
delete | Grants permissions to delete documents.
crud | Combines the `read`, `write`, and `delete` action groups. Included in the `data_access` action group.
search | Grants permissions to search documents. Includes `suggest`.
suggest | Grants permissions to use the suggest API. Included in the `read` action group.
create_index | Grants permissions to create indices and mappings.
indices_monitor | Grants permissions to execute all index monitoring actions (e.g. recovery, segments info, index stats, and status).
index | A more limited version of the `write` action group.
data_access | Combines the `crud` action group with `indices:data/*`.
manage_aliases | Grants permissions to manage aliases.
manage | Grants all monitoring and administration permissions for indices.

View File

@ -0,0 +1,127 @@
---
layout: default
title: Document-Level Security
parent: Access Control
grand_parent: Security
nav_order: 10
---
# Document-level security
Document-level security lets you restrict a role to a subset of documents in an index. The easiest way to get started with document- and field-level security is open OpenSearch Dashboards and choose **Security**. Then choose **Roles**, create a new role, and review the **Index permissions** section.
![Document- and field-level security screen in OpenSearch Dashboards](../../../images/security-dls.png)
## Simple roles
Document-level security uses the OpenSearch query DSL to define which documents a role grants access to. In OpenSearch Dashboards, choose an index pattern and provide a query in the **Document level security** section:
```json
{
"bool": {
"must": {
"match": {
"genres": "Comedy"
}
}
}
}
```
This query specifies that for the role to have access to a document, its `genres` field must include `Comedy`.
A typical request to the `_search` API includes `{ "query": { ... } }` around the query, but in this case, you only need to specify the query itself.
In the REST API, you provide the query as a string, so you must escape your quotes. This role allows a user to read any document in any index with the field `public` set to `true`:
```json
PUT _opensearch/_security/api/roles/public_data
{
"cluster_permissions": [
"*"
],
"index_permissions": [{
"index_patterns": [
"pub*"
],
"dls": "{\"term\": { \"public\": true}}",
"allowed_actions": [
"read"
]
}]
}
```
These queries can be as complex as you want, but we recommend keeping them simple to minimize the performance impact that the document-level security feature has on the cluster.
{: .warning }
## Parameter substitution
A number of variables exist that you can use to enforce rules based on the properties of a user. For example, `${user.name}` is replaced with the name of the current user.
This rule allows a user to read any document where the username is a value of the `readable_by` field:
```json
PUT _opensearch/_security/api/roles/user_data
{
"cluster_permissions": [
"*"
],
"index_permissions": [{
"index_patterns": [
"pub*"
],
"dls": "{\"term\": { \"readable_by\": \"${user.name}\"}}",
"allowed_actions": [
"read"
]
}]
}
```
This table lists substitutions.
Term | Replaced with
:--- | :---
`${user.name}` | Username.
`${user.roles}` | A comma-separated, quoted list of user roles.
`${attr.<TYPE>.<NAME>}` | An attribute with name `<NAME>` defined for a user. `<TYPE>` is `internal`, `jwt`, `proxy` or `ldap`
## Attribute-based security
You can use roles and parameter substitution with the `terms_set` query to enable attribute-based security.
> Note that the `security_attributes` of the index need to be of type `keyword`.
#### User definition
```json
PUT _opensearch/_security/api/internalusers/user1
{
"password": "asdf",
"backend_roles": ["abac"],
"attributes": {
"permissions": "\"att1\", \"att2\", \"att3\""
}
}
```
#### Role definition
```json
PUT _opensearch/_security/api/roles/abac
{
"index_permissions": [{
"index_patterns": [
"*"
],
"dls": "{\"terms_set\": {\"security_attributes\": {\"terms\": [${attr.internal.permissions}], \"minimum_should_match_script\": {\"source\": \"doc['security_attributes'].length\"}}}}",
"allowed_actions": [
"read"
]
}]
}
```

View File

@ -0,0 +1,125 @@
---
layout: default
title: Field-Level Security
parent: Access Control
grand_parent: Security
nav_order: 11
---
# Field-level security
Field-level security lets you control which document fields a user can see. Just like [document-level security](../document-level-security/), you control access by index within a role.
The easiest way to get started with document- and field-level security is open OpenSearch Dashboards and choose **Security**. Then choose **Roles**, create a new role, and review the **Index permissions** section.
---
#### Table of contents
1. TOC
{:toc}
---
## Include or exclude fields
You have two options when you configure field-level security: include or exclude fields. If you include fields, users see *only* those fields when they retrieve a document. For example, if you include the `actors`, `title`, and `year` fields, a search result might look like this:
```json
{
"_index": "movies",
"_type": "_doc",
"_source": {
"year": 2013,
"title": "Rush",
"actors": [
"Daniel Brühl",
"Chris Hemsworth",
"Olivia Wilde"
]
}
}
```
If you exclude fields, users see everything *but* those fields when they retrieve a document. For example, if you exclude those same fields, the same search result might look like this:
```json
{
"_index": "movies",
"_type": "_doc",
"_source": {
"directors": [
"Ron Howard"
],
"plot": "A re-creation of the merciless 1970s rivalry between Formula One rivals James Hunt and Niki Lauda.",
"genres": [
"Action",
"Biography",
"Drama",
"Sport"
]
}
}
```
You can achieve the same outcomes using inclusion or exclusion, so choose whichever makes sense for your use case. Mixing the two doesn't make sense and is not supported.
You can specify field-level security settings using OpenSearch Dashboards, `roles.yml`, and the REST API.
- To exclude fields in `roles.yml` or the REST API, add `~` before the field name.
- Field names support wildcards (`*`).
Wildcards are especially useful for excluding *subfields*. For example, if you index a document that has a string (e.g. `{"title": "Thor"}`), OpenSearch creates a `title` field of type `text`, but it also creates a `title.keyword` subfield of type `keyword`. In this example, to prevent unauthorized access to data in the `title` field, you must also exclude the `title.keyword` subfield. Use `title*` to match all fields that begin with `title`.
### OpenSearch Dashboards
1. Choose a role and **Add index permission**.
1. Choose an index pattern.
1. Under **Field level security**, use the drop-down to select your preferred option. Then specify one or more fields and press Enter.
### roles.yml
```yml
someonerole:
cluster: []
indices:
movies:
'*':
- "READ"
_fls_:
- "~actors"
- "~title"
- "~year"
```
### REST API
See [Create role](../api/#create-role).
## Interaction with multiple roles
If you map a user to multiple roles, we recommend that those roles use either include *or* exclude statements for each index. The security plugin evaluates field-level security settings using the `AND` operator, so combining include and exclude statements can lead to neither behavior working properly.
For example, in the `movies` index, if you include `actors`, `title`, and `year` in one role, exclude `actors`, `title`, and `genres` in another role, and then map both roles to the same user, a search result might look like this:
```json
{
"_index": "movies",
"_type": "_doc",
"_source": {
"year": 2013,
"directors": [
"Ron Howard"
],
"plot": "A re-creation of the merciless 1970s rivalry between Formula One rivals James Hunt and Niki Lauda."
}
}
```
## Interaction with document-level security
[Document-level security](../document-level-security/) relies on OpenSearch queries, which means that all fields in the query must be visible in order for it to work properly. If you use field-level security in conjunction with document-level security, make sure you don't restrict access to the fields that document-level security uses.

View File

@ -0,0 +1,126 @@
---
layout: default
title: Field Masking
parent: Access Control
grand_parent: Security
nav_order: 12
---
# Field masking
If you don't want to remove fields from a document using [field-level security](../field-level-security/), you can mask their values. Currently, field masking is only available for string-based fields and replaces the field's value with a cryptographic hash.
Field masking works alongside field-level security on the same per-role, per-index basis. You can allow certain roles to see sensitive fields in plain text and mask them for others. A search result with a masked field might look like this:
```json
{
"_index": "movies",
"_type": "_doc",
"_source": {
"year": 2013,
"directors": [
"Ron Howard"
],
"title": "ca998e768dd2e6cdd84c77015feb29975f9f498a472743f159bec6f1f1db109e"
}
}
```
## Set the salt
You set the salt (a random string used to hash your data) in `opensearch.yml`:
```yml
opensearch_security.compliance.salt: abcdefghijklmnopqrstuvqxyz1234567890
```
Property | Description
:--- | :---
`opensearch_security.compliance.salt` | The salt to use when generating the hash value. Must be at least 32 characters. Only ASCII characters are allowed. Optional.
Setting the salt is optional, but we highly recommend it.
## Configure field masking
You configure field masking using OpenSearch Dashboards, `roles.yml`, or the REST API.
### OpenSearch Dashboards
1. Choose a role.
1. Choose an index permission.
1. For **Anonymization**, specify one or more fields and press Enter.
### roles.yml
```yml
someonerole:
cluster: []
indices:
movies:
_masked_fields_:
- "title"
- "genres"
'*':
- "READ"
```
### REST API
See [Create role](../api/#create-role).
## (Advanced) Use an alternative hash algorithm
By default, the security plugin uses the BLAKE2b algorithm, but you can use any hashing algorithm that your JVM provides. This list typically includes MD5, SHA-1, SHA-384, and SHA-512.
To specify a different algorithm, add it after the masked field:
```yml
someonerole:
cluster: []
indices:
movies:
_masked_fields_:
- "title::SHA-512"
- "genres"
'*':
- "READ"
```
## (Advanced) Pattern-based field masking
Rather than creating a hash, you can use one or more regular expressions and replacement strings to mask a field. The syntax is `<field>::/<regular-expression>/::<replacement-string>`. If you use multiple regular expressions, the results are passed from left to right, like piping in a shell:
```yml
hr_employee:
index_permissions:
- index_patterns:
- 'humanresources'
allowed_actions:
- ...
masked_fields:
- 'lastname::/.*/::*'
- '*ip_source::/[0-9]{1,3}$/::XXX::/^[0-9]{1,3}/::***'
someonerole:
cluster: []
indices:
movies:
_masked_fields_:
- "title::/./::*"
- "genres::/^[a-zA-Z]{1,3}/::XXX::/[a-zA-Z]{1,3}$/::YYY"
'*':
- "READ"
```
The `title` statement changes each character in the field to `*`, so you can still discern the length of the masked string. The `genres` statement changes the first three characters of the string to `XXX` and the last three characters to `YYY`.
## Effect on audit logging
The read history feature lets you track read access to sensitive fields in your documents. For example, you might track access to the email field of your customer records. Access to masked fields are excluded from read history, because the user only saw the hash value, not the clear text value of the field.

View File

@ -0,0 +1,49 @@
---
layout: default
title: User Impersonation
parent: Access Control
grand_parent: Security
nav_order: 20
---
# User impersonation
User impersonation allows specially privileged users to act as another user without knowledge of nor access to the impersonated user's credentials.
Impersonation can be useful for testing and troubleshooting, or for allowing system services to safely act as a user.
Impersonation can occur on either the REST interface or at the transport layer.
## REST interface
To allow one user to impersonate another, add the following to `opensearch.yml`:
```yml
opensearch_security.authcz.rest_impersonation_user:
<AUTHENTICATED_USER>:
- <IMPERSONATED_USER_1>
- <IMPERSONATED_USER_2>
```
The impersonated user field supports wildcards. Setting it to `*` allows `AUTHENTICATED_USER` to impersonate any user.
## Transport interface
In a similar fashion, add the following to enable transport layer impersonation:
```yml
opensearch_security.authcz.impersonation_dn:
"CN=spock,OU=client,O=client,L=Test,C=DE":
- worf
```
## Impersonating Users
To impersonate another user, submit a request to the system with the HTTP header `opensearch_security_impersonate_as` set to the name of the user to be impersonated. A good test is to make a GET request to the `_opensearch/_security/authinfo` URI:
```bash
curl -XGET -u 'admin:admin' -k -H "opensearch_security_impersonate_as: user_1" https://localhost:9200/_opensearch/_security/authinfo?pretty
```

View File

@ -0,0 +1,28 @@
---
layout: default
title: Access Control
nav_order: 10
parent: Security
has_children: true
has_toc: false
---
# Access control
After you [configure the security plugin](../configuration/) to use your own certificates and preferred authentication backend, you can start adding users, creating roles, and mapping roles to users.
This section of the documentation covers what a user is allowed to see and do after successfully authenticating.
## Concepts
Term | Description
:--- | :---
Permission | An individual action, such as creating an index (e.g. `indices:admin/create`). For a complete list, see [Permissions](permissions/).
Action group | A set of permissions. For example, the predefined `SEARCH` action group authorizes roles to use the `_search` and `_msearch` APIs.
Role | Security roles define the scope of a permission or action group: cluster, index, document, or field. For example, a role named `delivery_analyst` might have no cluster permissions, the `READ` action group for all indices that match the `delivery-data-*` pattern, access to all document types within those indices, and access to all fields except `delivery_driver_name`.
Backend role | (Optional) Arbitrary strings that you specify *or* that come from an external authentication system (e.g. LDAP/Active Directory). Backend roles can help simplify the role mapping process. Rather than mapping a role to 100 individual users, you can map the role to a single backend role that all 100 users share.
User | Users make requests to OpenSearch clusters. A user has credentials (e.g. a username and password), zero or more backend roles, and zero or more custom attributes.
Role mapping | Users assume roles after they successfully authenticate. Role mappings, well, map roles to users (or backend roles). For example, a mapping of `kibana_user` (role) to `jdoe` (user) means that John Doe gains all the permissions of `kibana_user` after authenticating. Likewise, a mapping of `all_access` (role) to `admin` (backend role) means that any user with the backend role of `admin` gains all the permissions of `all_access` after authenticating. You can map each role to many users and/or backend roles.
The security plugin comes with a number of [predefined action groups](default-action-groups/), roles, mappings, and users. These entities serve as sensible defaults and are good examples of how to use the plugin.

Some files were not shown because too many files have changed in this diff Show More