LUCENE-9463: Query match region retrieval component, passage scoring and formatting (#1750)

Reviewed by @romseygeek as part of a previous issue.
Dawid Weiss 2020-08-14 14:21:12 +02:00 committed by GitHub
parent a003f64649
commit 150a8dacb5
22 changed files with 2758 additions and 0 deletions

View File

@@ -63,6 +63,9 @@ API Changes
Improvements
* LUCENE-9463: Query match region retrieval component, passage scoring and formatting
for building custom highlighters. (Alan Woodward, Dawid Weiss)
* LUCENE-9370: RegExp query is no longer lenient about inappropriate backslashes and
follows the Java Pattern policy for rejecting illegal syntax. (Mark Harwood)

View File

@@ -28,4 +28,5 @@ dependencies {
testImplementation project(':lucene:test-framework')
testImplementation project(':lucene:analysis:common')
testImplementation project(':lucene:queryparser')
}

View File

@@ -38,6 +38,7 @@
<pathelement path="${memory.jar}"/>
<pathelement path="${queries.jar}"/>
<pathelement path="${analyzers-common.jar}"/>
<pathelement path="${queryparser.jar}"/>
<path refid="test.base.classpath"/>
</path>

View File

@@ -0,0 +1,86 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.text.BreakIterator;
import java.util.Locale;
/**
* A {@link PassageAdjuster} that adjusts the {@link Passage} range to
* word boundaries hinted by the given {@link BreakIterator}.
*/
public class BreakIteratorShrinkingAdjuster implements PassageAdjuster {
private final BreakIterator bi;
private CharSequence value;
public BreakIteratorShrinkingAdjuster() {
this(BreakIterator.getWordInstance(Locale.ROOT));
}
public BreakIteratorShrinkingAdjuster(BreakIterator bi) {
this.bi = bi;
}
@Override
public void currentValue(CharSequence value) {
this.value = value;
bi.setText(new CharSequenceIterator(value));
}
@Override
public OffsetRange adjust(Passage passage) {
int from = passage.from;
if (from > 0) {
while (!bi.isBoundary(from)
|| (from < value.length() && Character.isWhitespace(value.charAt(from)))) {
from = bi.following(from);
if (from == BreakIterator.DONE) {
from = passage.from;
break;
}
}
if (from == value.length()) {
from = passage.from;
}
}
int to = passage.to;
if (to != value.length()) {
while (!bi.isBoundary(to) || (to > 0 && Character.isWhitespace(value.charAt(to - 1)))) {
to = bi.preceding(to);
if (to == BreakIterator.DONE) {
to = passage.to;
break;
}
}
if (to == 0) {
to = passage.to;
}
}
for (OffsetRange r : passage.markers) {
from = Math.min(from, r.from);
to = Math.max(to, r.to);
}
if (from > to) {
from = to;
}
return new OffsetRange(from, to);
}
}
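A minimal usage sketch of the adjuster (hypothetical, not part of this commit); the sample value and offsets are made up for illustration:

// A passage that starts inside "quick" and ends inside "foxes"; no markers.
PassageAdjuster adjuster = new BreakIteratorShrinkingAdjuster();
adjuster.currentValue("quick brown foxes");
OffsetRange adjusted = adjuster.adjust(new Passage(2, 14, java.util.Collections.emptyList()));
// adjusted covers "brown" (offsets 6-11): both edges are shrunk inward to word boundaries.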

View File

@@ -0,0 +1,104 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.text.CharacterIterator;
/**
* A {@link CharacterIterator} over a {@link CharSequence}.
*/
final class CharSequenceIterator implements CharacterIterator {
private final CharSequence text;
private int begin;
private int end;
private int pos;
public CharSequenceIterator(CharSequence text) {
this.text = text;
this.begin = 0;
this.end = text.length();
}
public char first() {
pos = begin;
return current();
}
public char last() {
if (end != begin) {
pos = end - 1;
} else {
pos = end;
}
return current();
}
public char setIndex(int p) {
if (p < begin || p > end) throw new IllegalArgumentException("Invalid index");
pos = p;
return current();
}
public char current() {
if (pos >= begin && pos < end) {
return text.charAt(pos);
} else {
return DONE;
}
}
public char next() {
if (pos < end - 1) {
pos++;
return text.charAt(pos);
} else {
pos = end;
return DONE;
}
}
public char previous() {
if (pos > begin) {
pos--;
return text.charAt(pos);
} else {
return DONE;
}
}
public int getBeginIndex() {
return begin;
}
public int getEndIndex() {
return end;
}
public int getIndex() {
return pos;
}
@Override
public Object clone() {
try {
return super.clone();
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
}
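This adapter lets a BreakIterator scan any CharSequence without first copying it into a String; a short sketch (not part of this commit):

BreakIterator words = BreakIterator.getWordInstance(Locale.ROOT);
words.setText(new CharSequenceIterator("foo bar")); // any CharSequence, no copy
int boundary = words.following(0); // -> 3, the boundary after "foo"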

View File

@@ -0,0 +1,304 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.PrimitiveIterator;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Predicate;
/**
* Utility class to compute a list of "match regions" for a given query, searcher and
* document(s) using the {@link Matches} API.
*/
public class MatchRegionRetriever {
private final List<LeafReaderContext> leaves;
private final Weight weight;
private final TreeSet<String> affectedFields;
private final Map<String, OffsetsRetrievalStrategy> offsetStrategies;
private final Set<String> preloadFields;
/**
* A callback for accepting a single document (and its associated leaf reader, leaf document ID)
* and its match offset ranges, as indicated by the {@link Matches} interface retrieved for
* the query.
*/
@FunctionalInterface
public interface MatchOffsetsConsumer {
void accept(int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits)
throws IOException;
}
/**
* An abstraction that provides document values for a given field. The default implementation,
* {@link DocumentFieldValueProvider}, simply reads from a preloaded {@link Document}. A more
* efficient implementation could work on top of a reusable character buffer
* (reusing the buffer while retrieving hit regions for consecutive documents).
*/
@FunctionalInterface
public interface FieldValueProvider {
List<CharSequence> getValues(String field);
}
/**
* A constructor with the default offset strategy supplier.
*/
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer) throws IOException {
this(searcher, query, analyzer, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
}
/**
* @param searcher Index searcher to be used for retrieving matches.
* @param query The query for which matches should be retrieved. The query should be rewritten
* against the provided searcher.
* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
* in the absence of position offsets in the index. Note that the analyzer must return
* tokens (positions and offsets) identical to the ones stored in the index.
* @param fieldOffsetStrategySupplier A custom supplier of per-field {@link OffsetsRetrievalStrategy}
* instances.
*/
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer,
OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
throws IOException {
leaves = searcher.getIndexReader().leaves();
assert checkOrderConsistency(leaves);
// We need full scoring mode so that we can receive matches from all sub-clauses
// (no optimizations in Boolean queries take place).
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 0);
// Compute the subset of fields affected by this query so that we don't load or scan
// fields that are irrelevant.
affectedFields = new TreeSet<>();
query.visit(
new QueryVisitor() {
@Override
public boolean acceptField(String field) {
affectedFields.add(field);
return false;
}
});
// Compute value offset retrieval strategy for all affected fields.
offsetStrategies = new HashMap<>();
for (String field : affectedFields) {
offsetStrategies.put(field, fieldOffsetStrategySupplier.apply(field));
}
// Ask offset strategies if they'll need field values.
preloadFields = new HashSet<>();
offsetStrategies.forEach(
(field, strategy) -> {
if (strategy.requiresDocument()) {
preloadFields.add(field);
}
});
// Only preload those field values that can be affected by the query and are required
// by strategies.
preloadFields.retainAll(affectedFields);
}
public void highlightDocuments(TopDocs topDocs, MatchOffsetsConsumer consumer) throws IOException {
highlightDocuments(Arrays.stream(topDocs.scoreDocs)
.mapToInt(scoreDoc -> scoreDoc.doc)
.sorted()
.iterator(), consumer);
}
/**
* Low-level, high-efficiency method for highlighting large numbers of documents at once in a
* streaming fashion.
*
* @param docIds A stream of <em>sorted</em> document identifiers for which hit ranges should
* be returned.
* @param consumer A streaming consumer for document-hits pairs.
*/
public void highlightDocuments(PrimitiveIterator.OfInt docIds, MatchOffsetsConsumer consumer)
throws IOException {
if (leaves.isEmpty()) {
return;
}
Iterator<LeafReaderContext> ctx = leaves.iterator();
LeafReaderContext currentContext = ctx.next();
int previousDocId = -1;
Map<String, List<OffsetRange>> highlights = new TreeMap<>();
while (docIds.hasNext()) {
int docId = docIds.nextInt();
if (docId < previousDocId) {
throw new RuntimeException("Input document IDs must be sorted (increasing).");
}
previousDocId = docId;
while (docId >= currentContext.docBase + currentContext.reader().maxDoc()) {
currentContext = ctx.next();
}
int contextRelativeDocId = docId - currentContext.docBase;
// Only preload fields we may potentially need.
FieldValueProvider documentSupplier;
if (preloadFields.isEmpty()) {
documentSupplier = null;
} else {
Document doc = currentContext.reader().document(contextRelativeDocId, preloadFields);
documentSupplier = new DocumentFieldValueProvider(doc);
}
highlights.clear();
highlightDocument(currentContext, contextRelativeDocId, documentSupplier, (field) -> true, highlights);
consumer.accept(docId, currentContext.reader(), contextRelativeDocId, highlights);
}
}
/**
* Low-level method for retrieving hit ranges for a single document. This method can be used with
* custom document {@link FieldValueProvider}.
*/
public void highlightDocument(
LeafReaderContext leafReaderContext,
int contextDocId,
FieldValueProvider doc,
Predicate<String> acceptField,
Map<String, List<OffsetRange>> outputHighlights)
throws IOException {
Matches matches = weight.matches(leafReaderContext, contextDocId);
if (matches == null) {
return;
}
for (String field : affectedFields) {
if (acceptField.test(field)) {
MatchesIterator matchesIterator = matches.getMatches(field);
if (matchesIterator == null) {
// No matches on this field, even though the field was part of the query. This can happen
// with complex queries that source non-text fields (which have no "hit regions" in any
// textual representation). Skip.
} else {
OffsetsRetrievalStrategy offsetStrategy = offsetStrategies.get(field);
if (offsetStrategy == null) {
throw new IOException(
"Non-empty matches but no offset retrieval strategy for field: " + field);
}
List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc);
if (!ranges.isEmpty()) {
outputHighlights.put(field, ranges);
}
}
}
}
}
private boolean checkOrderConsistency(List<LeafReaderContext> leaves) {
for (int i = 1; i < leaves.size(); i++) {
LeafReaderContext prev = leaves.get(i - 1);
LeafReaderContext next = leaves.get(i);
assert prev.docBase <= next.docBase;
assert prev.docBase + prev.reader().maxDoc() == next.docBase;
}
return true;
}
/**
* Compute default strategies for retrieving offsets from {@link MatchesIterator}
* instances for a given set of fields.
*/
public static OffsetsRetrievalStrategySupplier computeOffsetRetrievalStrategies(
IndexReader reader, Analyzer analyzer) {
FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader);
return (field) -> {
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
return (mi, doc) -> {
throw new IOException("FieldInfo is null for field: " + field);
};
}
switch (fieldInfo.getIndexOptions()) {
case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
return new OffsetsFromMatchIterator(field);
case DOCS_AND_FREQS_AND_POSITIONS:
return new OffsetsFromPositions(field, analyzer);
case DOCS_AND_FREQS:
case DOCS:
// By default, retrieve offsets from individual tokens
// returned by the analyzer (possibly narrowed down to
// only those terms that the query hinted at when passed
// a QueryVisitor).
//
// Alternative strategies are also possible and may make sense
// depending on the use case (OffsetsFromValues, for example).
return new OffsetsFromTokens(field, analyzer);
default:
return
(matchesIterator, doc) -> {
throw new IOException(
"Field is indexed without positions and/or offsets: "
+ field
+ ", "
+ fieldInfo.getIndexOptions());
};
}
};
}
/**
* Implements {@link FieldValueProvider} wrapping a preloaded
* {@link Document}.
*/
private static final class DocumentFieldValueProvider implements FieldValueProvider {
private final Document doc;
public DocumentFieldValueProvider(Document doc) {
this.doc = doc;
}
@Override
public List<CharSequence> getValues(String field) {
return Arrays.asList(doc.getValues(field));
}
}
}
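A sketch of typical use (not part of this commit), assuming an already-open IndexSearcher and an analyzer consistent with the index; the "body" field name is made up:

// Rewrite the query against the searcher, per the constructor contract.
Query query = new TermQuery(new Term("body", "fox")).rewrite(searcher.getIndexReader());
MatchRegionRetriever retriever = new MatchRegionRetriever(searcher, query, analyzer);
TopDocs topDocs = searcher.search(query, 10);
retriever.highlightDocuments(topDocs, (docId, leafReader, leafDocId, hits) -> {
  // hits maps each field affected by the query to its match offset ranges.
  hits.forEach((field, ranges) -> System.out.println(field + " -> " + ranges));
});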

View File

@@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.util.Objects;
/**
* A non-empty range of offset positions.
*/
public class OffsetRange {
/** Start index, inclusive. */
public final int from;
/** End index, exclusive. */
public final int to;
/**
* @param from Start index, inclusive.
* @param to End index, exclusive.
*/
public OffsetRange(int from, int to) {
assert from <= to : "A non-empty offset range is required: " + from + "-" + to;
this.from = from;
this.to = to;
}
public int length() {
return to - from;
}
@Override
public String toString() {
return "[from=" + from + ", to=" + to + "]";
}
@Override
public boolean equals(Object other) {
if (other == this) return true;
if (other instanceof OffsetRange) {
OffsetRange that = (OffsetRange) other;
return from == that.from && to == that.to;
} else {
return false;
}
}
@Override
public int hashCode() {
return Objects.hash(from, to);
}
}

View File

@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.search.MatchesIterator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* This strategy retrieves offsets directly from {@link MatchesIterator}.
*/
public final class OffsetsFromMatchIterator implements OffsetsRetrievalStrategy {
private final String field;
OffsetsFromMatchIterator(String field) {
this.field = field;
}
@Override
public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
throws IOException {
ArrayList<OffsetRange> ranges = new ArrayList<>();
while (matchesIterator.next()) {
int from = matchesIterator.startOffset();
int to = matchesIterator.endOffset();
if (from < 0 || to < 0) {
throw new IOException("Matches API returned negative offsets for field: " + field);
}
ranges.add(new OffsetRange(from, to));
}
return ranges;
}
}

View File

@@ -0,0 +1,154 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.MatchesIterator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* This strategy applies to fields with stored positions but no offsets. We re-analyze
* the field's value to find the offsets of match positions.
* <p>
* Note that this may fail if the index data (positions stored in the index) is out of sync
* with the field values or the analyzer. This strategy assumes that never happens.
*/
public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
private final String field;
private final Analyzer analyzer;
OffsetsFromPositions(String field, Analyzer analyzer) {
this.field = field;
this.analyzer = analyzer;
}
@Override
public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
throws IOException {
ArrayList<OffsetRange> ranges = new ArrayList<>();
while (matchesIterator.next()) {
int from = matchesIterator.startPosition();
int to = matchesIterator.endPosition();
if (from < 0 || to < 0) {
throw new IOException("Matches API returned negative positions for field: " + field);
}
ranges.add(new OffsetRange(from, to));
}
// Convert from positions to offsets.
ranges = convertPositionsToOffsets(ranges, analyzer, field, doc.getValues(field));
return ranges;
}
@Override
public boolean requiresDocument() {
return true;
}
private static ArrayList<OffsetRange> convertPositionsToOffsets(
ArrayList<OffsetRange> ranges,
Analyzer analyzer,
String fieldName,
List<CharSequence> values)
throws IOException {
if (ranges.isEmpty()) {
return ranges;
}
class LeftRight {
int left = Integer.MAX_VALUE;
int right = Integer.MIN_VALUE;
@Override
public String toString() {
return "[" + "L: " + left + ", R: " + right + ']';
}
}
Map<Integer, LeftRight> requiredPositionSpans = new HashMap<>();
int minPosition = Integer.MAX_VALUE;
int maxPosition = Integer.MIN_VALUE;
for (OffsetRange range : ranges) {
requiredPositionSpans.computeIfAbsent(range.from, (key) -> new LeftRight());
requiredPositionSpans.computeIfAbsent(range.to, (key) -> new LeftRight());
minPosition = Math.min(minPosition, range.from);
maxPosition = Math.max(maxPosition, range.to);
}
int position = -1;
int valueOffset = 0;
for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) {
final String value = values.get(valueIndex).toString();
final boolean lastValue = valueIndex + 1 == max;
TokenStream ts = analyzer.tokenStream(fieldName, value);
OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
PositionIncrementAttribute posAttr = ts.getAttribute(PositionIncrementAttribute.class);
ts.reset();
while (ts.incrementToken()) {
position += posAttr.getPositionIncrement();
if (position >= minPosition) {
LeftRight leftRight = requiredPositionSpans.get(position);
if (leftRight != null) {
int startOffset = valueOffset + offsetAttr.startOffset();
int endOffset = valueOffset + offsetAttr.endOffset();
leftRight.left = Math.min(leftRight.left, startOffset);
leftRight.right = Math.max(leftRight.right, endOffset);
}
// Only short-circuit if we're on the last value (which should be the common
// case since most fields would only have a single value anyway). We need
// to make sure of this because otherwise offsetAttr would have an incorrect value.
if (position > maxPosition && lastValue) {
break;
}
}
}
ts.end();
position += posAttr.getPositionIncrement() + analyzer.getPositionIncrementGap(fieldName);
valueOffset += offsetAttr.endOffset() + analyzer.getOffsetGap(fieldName);
ts.close();
}
ArrayList<OffsetRange> converted = new ArrayList<>();
for (OffsetRange range : ranges) {
LeftRight left = requiredPositionSpans.get(range.from);
LeftRight right = requiredPositionSpans.get(range.to);
if (left == null
|| right == null
|| left.left == Integer.MAX_VALUE
|| right.right == Integer.MIN_VALUE) {
throw new RuntimeException("Position not properly initialized for range: " + range);
}
converted.add(new OffsetRange(left.left, right.right));
}
return converted;
}
}
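The position-to-offset bookkeeping above follows plain token-stream consumption; for reference (not part of this commit), a standalone sketch using org.apache.lucene.analysis.core.WhitespaceAnalyzer:

try (TokenStream ts = new WhitespaceAnalyzer().tokenStream("f", "quick brown fox")) {
  OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posAttr = ts.getAttribute(PositionIncrementAttribute.class);
  ts.reset();
  int position = -1;
  while (ts.incrementToken()) {
    position += posAttr.getPositionIncrement();
    // Prints: 0 -> [0, 5), 1 -> [6, 11), 2 -> [12, 15)
    System.out.println(position + " -> [" + offsetAttr.startOffset() + ", " + offsetAttr.endOffset() + ")");
  }
  ts.end();
}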

View File

@@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* This strategy works for fields where we know the match occurred but there are
* no known positions or offsets.
* <p>
* We re-analyze field values and return offset ranges of those tokens that
* are also present in the set of terms collected from the query.
*/
public final class OffsetsFromTokens implements OffsetsRetrievalStrategy {
private final String field;
private final Analyzer analyzer;
public OffsetsFromTokens(String field, Analyzer analyzer) {
this.field = field;
this.analyzer = analyzer;
}
@Override
public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException {
List<CharSequence> values = doc.getValues(field);
Set<BytesRef> matchTerms = new HashSet<>();
while (matchesIterator.next()) {
Query q = matchesIterator.getQuery();
q.visit(new QueryVisitor() {
@Override
public void consumeTerms(Query query, Term... terms) {
for (Term t : terms) {
if (field.equals(t.field())) {
matchTerms.add(t.bytes());
}
}
}
});
}
ArrayList<OffsetRange> ranges = new ArrayList<>();
int valueOffset = 0;
for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) {
final String value = values.get(valueIndex).toString();
TokenStream ts = analyzer.tokenStream(field, value);
OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
TermToBytesRefAttribute termAttr = ts.getAttribute(TermToBytesRefAttribute.class);
ts.reset();
while (ts.incrementToken()) {
if (matchTerms.contains(termAttr.getBytesRef())) {
int startOffset = valueOffset + offsetAttr.startOffset();
int endOffset = valueOffset + offsetAttr.endOffset();
ranges.add(new OffsetRange(startOffset, endOffset));
}
}
ts.end();
valueOffset += offsetAttr.endOffset() + analyzer.getOffsetGap(field);
ts.close();
}
return ranges;
}
@Override
public boolean requiresDocument() {
return true;
}
}

View File

@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.search.MatchesIterator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* This strategy works for fields where we know the match occurred but there are
* no known positions or offsets.
* <p>
* We re-analyze field values and return offset ranges for entire values
* (not individual tokens). Re-analysis is required because the analyzer may
* introduce an unknown offset gap between values.
*/
public final class OffsetsFromValues implements OffsetsRetrievalStrategy {
private final String field;
private final Analyzer analyzer;
public OffsetsFromValues(String field, Analyzer analyzer) {
this.field = field;
this.analyzer = analyzer;
}
@Override
public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException {
List<CharSequence> values = doc.getValues(field);
ArrayList<OffsetRange> ranges = new ArrayList<>();
int valueOffset = 0;
for (CharSequence charSequence : values) {
final String value = charSequence.toString();
TokenStream ts = analyzer.tokenStream(field, value);
OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
ts.reset();
int startOffset = valueOffset;
while (ts.incrementToken()) {
// Consume all tokens so that end() below sets the final offset attribute.
}
ts.end();
valueOffset += offsetAttr.endOffset();
ranges.add(new OffsetRange(startOffset, valueOffset));
valueOffset += analyzer.getOffsetGap(field);
ts.close();
}
return ranges;
}
@Override
public boolean requiresDocument() {
return true;
}
}

View File

@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.search.MatchesIterator;
import java.io.IOException;
import java.util.List;
/**
* Determines how match offset regions are computed from {@link MatchesIterator}. Several
* possibilities exist, ranging from retrieving offsets directly from a match instance
* to re-evaluating the document's field and recomputing offsets from there.
*/
public interface OffsetsRetrievalStrategy {
/**
* Return value offsets (match ranges) acquired from the given {@link MatchesIterator}.
*/
List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
throws IOException;
/**
* Whether this strategy requires document field access.
*/
default boolean requiresDocument() {
return false;
}
}

View File

@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.util.function.Function;
/**
* A per-field supplier of {@link OffsetsRetrievalStrategy}.
*/
@FunctionalInterface
public interface OffsetsRetrievalStrategySupplier extends Function<String, OffsetsRetrievalStrategy> {
}
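A sketch (not part of this commit) of composing a custom supplier with the defaults; the "title" field name is hypothetical:

OffsetsRetrievalStrategySupplier defaults =
    MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer);
OffsetsRetrievalStrategySupplier custom =
    (field) -> field.equals("title")
        ? new OffsetsFromValues(field, analyzer) // highlight entire values of titles
        : defaults.apply(field);
MatchRegionRetriever retriever = new MatchRegionRetriever(searcher, query, analyzer, custom);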

View File

@@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.util.List;
/**
* A passage is a fragment of source text, scored and possibly with a list of sub-offsets (markers)
* to be highlighted. The markers can be overlapping or nested, but they're always contained within
* the passage.
*/
public class Passage extends OffsetRange {
public List<OffsetRange> markers;
public Passage(int from, int to, List<OffsetRange> markers) {
super(from, to);
this.markers = markers;
}
@Override
public String toString() {
return "[" + super.toString() + ", markers=" + markers + "]";
}
}

View File

@@ -0,0 +1,27 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
/**
* Adjusts the range of one or more passages over a given value. An adjuster
* could, for example, shift a passage boundary to the next or previous word
* delimiter or whitespace.
*/
public interface PassageAdjuster {
void currentValue(CharSequence value);
OffsetRange adjust(Passage p);
}

View File

@@ -0,0 +1,214 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.RandomAccess;
import java.util.function.Function;
/**
* Formats a collection of {@linkplain Passage passages} over a given string, resolving
* overlapping markers and respecting the permitted sub-ranges of the input string,
* as well as length constraints.
* <p>
* Passages are demarcated with the constructor-provided ellipsis and start/end marker
* sequences.
*/
public class PassageFormatter {
private final String ellipsis;
private final Function<OffsetRange, String> markerStart;
private final Function<OffsetRange, String> markerEnd;
private final ArrayList<OffsetRange> markerStack = new ArrayList<>();
public PassageFormatter(String ellipsis, String markerStart, String markerEnd) {
this(ellipsis, (m) -> markerStart, (m) -> markerEnd);
}
public PassageFormatter(
String ellipsis,
Function<OffsetRange, String> markerStart,
Function<OffsetRange, String> markerEnd) {
this.ellipsis = ellipsis;
this.markerStart = markerStart;
this.markerEnd = markerEnd;
}
public List<String> format(CharSequence value, List<Passage> passages, List<OffsetRange> ranges) {
assert PassageSelector.sortedAndNonOverlapping(passages);
assert PassageSelector.sortedAndNonOverlapping(ranges);
assert ranges instanceof RandomAccess;
if (ranges.isEmpty()) {
return Collections.emptyList();
}
ArrayList<String> result = new ArrayList<>();
StringBuilder buf = new StringBuilder();
int rangeIndex = 0;
OffsetRange range = ranges.get(rangeIndex);
passageFormatting:
for (Passage passage : passages) {
// Move to the range of the current passage.
while (passage.from >= range.to) {
if (++rangeIndex == ranges.size()) {
break passageFormatting;
}
range = ranges.get(rangeIndex);
}
assert range.from <= passage.from && range.to >= passage.to : range + " ? " + passage;
buf.setLength(0);
if (range.from < passage.from) {
buf.append(ellipsis);
}
format(buf, value, passage);
if (range.to > passage.to) {
buf.append(ellipsis);
}
result.add(buf.toString());
}
return result;
}
public StringBuilder format(StringBuilder buf, CharSequence value, final Passage passage) {
switch (passage.markers.size()) {
case 0:
// No markers, full passage appended.
buf.append(value, passage.from, passage.to);
break;
case 1:
// One marker, trivial and frequent case so it's handled separately.
OffsetRange m = passage.markers.iterator().next();
buf.append(value, passage.from, m.from);
buf.append(markerStart.apply(m));
buf.append(value, m.from, m.to);
buf.append(markerEnd.apply(m));
buf.append(value, m.to, passage.to);
break;
default:
// Multiple markers, possibly overlapping or nested.
markerStack.clear();
multipleMarkers(value, passage, buf, markerStack);
break;
}
return buf;
}
/** Handle multiple markers, possibly overlapping or nested. */
private void multipleMarkers(
CharSequence value, final Passage p, StringBuilder b, ArrayList<OffsetRange> markerStack) {
int at = p.from;
int max = p.to;
SlicePoint[] slicePoints = slicePoints(p);
for (SlicePoint slicePoint : slicePoints) {
b.append(value, at, slicePoint.offset);
OffsetRange currentMarker = slicePoint.marker;
switch (slicePoint.type) {
case START:
markerStack.add(currentMarker);
b.append(markerStart.apply(currentMarker));
break;
case END:
int markerIndex = markerStack.lastIndexOf(currentMarker);
for (int k = markerIndex; k < markerStack.size(); k++) {
b.append(markerEnd.apply(markerStack.get(k)));
}
markerStack.remove(markerIndex);
for (int k = markerIndex; k < markerStack.size(); k++) {
b.append(markerStart.apply(markerStack.get(k)));
}
break;
default:
throw new RuntimeException();
}
at = slicePoint.offset;
}
if (at < max) {
b.append(value, at, max);
}
}
private static SlicePoint[] slicePoints(Passage p) {
SlicePoint[] slicePoints = new SlicePoint[p.markers.size() * 2];
int x = 0;
for (OffsetRange m : p.markers) {
slicePoints[x++] = new SlicePoint(SlicePoint.Type.START, m.from, m);
slicePoints[x++] = new SlicePoint(SlicePoint.Type.END, m.to, m);
}
// Order slice points by their offset
Comparator<SlicePoint> c =
Comparator.<SlicePoint>comparingInt(pt -> pt.offset)
.thenComparingInt(pt -> pt.type.ordering)
.thenComparing(
(a, b) -> {
if (a.type == SlicePoint.Type.START) {
// Longer start slice points come first.
return Integer.compare(b.marker.to, a.marker.to);
} else {
// Shorter end slice points come first.
return Integer.compare(b.marker.from, a.marker.from);
}
});
Arrays.sort(slicePoints, c);
return slicePoints;
}
static class SlicePoint {
enum Type {
START(2),
END(1);
private final int ordering;
Type(int ordering) {
this.ordering = ordering;
}
}
public final int offset;
public final Type type;
public final OffsetRange marker;
public SlicePoint(Type t, int offset, OffsetRange m) {
this.type = t;
this.offset = offset;
this.marker = m;
}
@Override
public String toString() {
return "(" + type + ", " + marker + ")";
}
}
}
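A worked example (not part of this commit) of formatting a single passage with two overlapping markers; the value and offsets are hand-picked:

String value = "quick brown fox jumped over";
List<OffsetRange> markers = List.of(new OffsetRange(6, 11), new OffsetRange(6, 15));
PassageFormatter formatter = new PassageFormatter("...", ">", "<");
List<String> out = formatter.format(value, List.of(new Passage(0, 15, markers)),
    List.of(new OffsetRange(0, value.length())));
// out: ["quick >>brown< fox<..."] -- the longer marker opens first, the inner one
// closes first, and the trailing ellipsis marks the truncated remainder of the value.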

View File

@@ -0,0 +1,273 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.PriorityQueue;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.RandomAccess;
/** Selects fragments of text that score best for the given set of highlight markers. */
public class PassageSelector {
public static final Comparator<Passage> DEFAULT_SCORER =
(a, b) -> {
// Compare the number of highlights first.
int v;
v = Integer.compare(a.markers.size(), b.markers.size());
if (v != 0) {
return v;
}
// Total number of characters covered by the highlights.
int len1 = 0, len2 = 0;
for (OffsetRange o : a.markers) {
len1 += o.length();
}
for (OffsetRange o : b.markers) {
len2 += o.length();
}
if (len1 != len2) {
return Integer.compare(len1, len2);
}
return Integer.compare(b.from, a.from);
};
private final Comparator<Passage> passageScorer;
private final PassageAdjuster passageAdjuster;
public PassageSelector() {
this(DEFAULT_SCORER, null);
}
public PassageSelector(Comparator<Passage> passageScorer, PassageAdjuster passageAdjuster) {
this.passageScorer = passageScorer;
this.passageAdjuster = passageAdjuster;
}
public List<Passage> pickBest(
CharSequence value,
List<? extends OffsetRange> markers,
int maxPassageWindow,
int maxPassages) {
return pickBest(
value, markers, maxPassageWindow, maxPassages, List.of(new OffsetRange(0, value.length())));
}
public List<Passage> pickBest(
CharSequence value,
List<? extends OffsetRange> markers,
int maxPassageWindow,
int maxPassages,
List<OffsetRange> permittedPassageRanges) {
assert markers instanceof RandomAccess && permittedPassageRanges instanceof RandomAccess;
// Handle odd special cases early.
if (value.length() == 0 || maxPassageWindow == 0) {
return Collections.emptyList();
}
// Sort markers by their start offset, shortest first.
markers.sort(
(a, b) -> {
int v = Integer.compare(a.from, b.from);
return v != 0 ? v : Integer.compare(a.to, b.to);
});
// Determine a maximum offset window around each highlight marker and
// pick the best scoring passage candidates.
PriorityQueue<Passage> pq =
new PriorityQueue<>(maxPassages) {
@Override
protected boolean lessThan(Passage a, Passage b) {
return passageScorer.compare(a, b) < 0;
}
};
assert sortedAndNonOverlapping(permittedPassageRanges);
final int max = markers.size();
int markerIndex = 0;
nextRange:
for (OffsetRange range : permittedPassageRanges) {
final int rangeTo = Math.min(range.to, value.length());
// Skip ranges that fall outside of the value window.
if (range.from >= rangeTo) {
continue;
}
while (markerIndex < max) {
OffsetRange m = markers.get(markerIndex);
// Markers are sorted, so if the current marker starts at or past the end of this
// range, advance to the next range (and re-check the same marker against it).
if (m.from >= rangeTo) {
continue nextRange;
}
// Check if current marker falls within the range and is smaller than the largest allowed
// passage window.
if (m.from >= range.from && m.to <= rangeTo && m.length() <= maxPassageWindow) {
// Adjust the window range to center the highlight marker.
int from = (m.from + m.to - maxPassageWindow) / 2;
int to = (m.from + m.to + maxPassageWindow) / 2;
if (from < range.from) {
to += range.from - from;
from = range.from;
}
if (to > rangeTo) {
from -= to - rangeTo;
to = rangeTo;
if (from < range.from) {
from = range.from;
}
}
if (from < to && to <= value.length()) {
// Find other markers that are completely inside the passage window.
ArrayList<OffsetRange> inside = new ArrayList<>();
int i = markerIndex;
while (i > 0 && markers.get(i - 1).from >= from) {
i--;
}
OffsetRange c;
for (; i < max && (c = markers.get(i)).from < to; i++) {
if (c.to <= to) {
inside.add(c);
}
}
if (!inside.isEmpty()) {
pq.insertWithOverflow(new Passage(from, to, inside));
}
}
}
// Advance to the next marker.
markerIndex++;
}
}
// Collect from the priority queue (reverse the order so that highest-scoring are first).
Passage[] passages;
if (pq.size() > 0) {
passages = new Passage[pq.size()];
for (int i = pq.size(); --i >= 0; ) {
passages[i] = pq.pop();
}
} else {
// Handle the default, no highlighting markers case.
passages = pickDefaultPassage(value, maxPassageWindow, permittedPassageRanges);
}
// Correct passage boundaries derived from the maximum-width window. Typically this shrinks
// the boundaries until they fall on a proper word/sentence boundary.
if (passageAdjuster != null) {
passageAdjuster.currentValue(value);
for (int x = 0; x < passages.length; x++) {
Passage p = passages[x];
OffsetRange newRange = passageAdjuster.adjust(p);
if (newRange.from != p.from || newRange.to != p.to) {
assert newRange.from >= p.from && newRange.to <= p.to
: "Adjusters must not expand the passage's range: was "
+ p
+ " => changed to "
+ newRange;
passages[x] = new Passage(newRange.from, newRange.to, p.markers);
}
}
}
// Ensure there are no overlaps on passages. In case of conflicts, better score wins.
int last = 0;
for (int i = 0; i < passages.length; i++) {
Passage a = passages[i];
if (a != null && a.length() > 0) {
passages[last++] = a;
for (int j = i + 1; j < passages.length; j++) {
Passage b = passages[j];
if (b != null) {
if (adjacentOrOverlapping(a, b)) {
passages[j] = null;
}
}
}
}
}
// Remove nullified slots.
if (passages.length != last) {
passages = ArrayUtil.copyOfSubArray(passages, 0, last);
}
// Sort in the offset order again.
Arrays.sort(passages, (a, b) -> Integer.compare(a.from, b.from));
return Arrays.asList(passages);
}
static boolean sortedAndNonOverlapping(List<? extends OffsetRange> permittedPassageRanges) {
if (permittedPassageRanges.size() > 1) {
Iterator<? extends OffsetRange> i = permittedPassageRanges.iterator();
for (OffsetRange next, previous = i.next(); i.hasNext(); previous = next) {
next = i.next();
if (previous.to > next.from) {
throw new AssertionError(
"Ranges must be sorted and non-overlapping: " + permittedPassageRanges);
}
}
}
return true;
}
/**
* Invoked when no passages could be selected (due to constraints or lack of highlight markers).
*/
protected Passage[] pickDefaultPassage(
CharSequence value, int maxCharacterWindow, List<OffsetRange> permittedPassageRanges) {
// Search for the first range that is not empty.
for (OffsetRange o : permittedPassageRanges) {
int to = Math.min(value.length(), o.to);
if (o.from < to) {
return new Passage[] {
new Passage(
o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList())
};
}
}
return new Passage[] {};
}
private static boolean adjacentOrOverlapping(Passage a, Passage b) {
if (a.from >= b.from) {
return a.from <= b.to - 1;
} else {
return a.to - 1 >= b.from;
}
}
}
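A small sketch (not part of this commit) of selecting passages around highlight markers; the value and offsets are made up:

String value = "0123 quick brown fox jumped";
// pickBest sorts the marker list in place, so pass a mutable list.
List<OffsetRange> markers = new ArrayList<>(List.of(new OffsetRange(11, 16))); // "brown"
List<Passage> best = new PassageSelector().pickBest(value, markers, 20, 2);
// A single passage of at most 20 characters, centered on the "brown" marker.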

View File

@@ -0,0 +1,28 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This package contains several components useful to build a highlighter
* on top of the {@link org.apache.lucene.search.Matches} API.
*
* {@link org.apache.lucene.search.matchhighlight.MatchRegionRetriever} can be
* used to retrieve hit areas for a given {@link org.apache.lucene.search.Query}
* and one (or more) indexed documents. These hit areas can then be passed to
* {@link org.apache.lucene.search.matchhighlight.PassageSelector} and formatted
* with {@link org.apache.lucene.search.matchhighlight.PassageFormatter}.
*/
package org.apache.lucene.search.matchhighlight;
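Continuing the retriever sketch above (not part of this commit), the three components compose end to end as follows; field values are assumed to be stored, and multi-value offset gaps are ignored for brevity:

retriever.highlightDocuments(topDocs, (docId, leafReader, leafDocId, hits) -> {
  Document doc = leafReader.document(leafDocId);
  PassageSelector selector = new PassageSelector();
  PassageFormatter formatter = new PassageFormatter("...", ">", "<");
  hits.forEach((field, ranges) -> {
    String value = String.join(" ", doc.getValues(field));
    List<Passage> passages = selector.pickBest(value, new ArrayList<>(ranges), 160, 3);
    System.out.println(field + ": "
        + formatter.format(value, passages, List.of(new OffsetRange(0, value.length()))));
  });
});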

View File

@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
* A simple ASCII match range highlighter for tests.
*/
final class AsciiMatchRangeHighlighter {
private final Analyzer analyzer;
private final PassageFormatter passageFormatter;
private final PassageSelector selector;
private int maxPassageWindow = 160;
private int maxPassages = 10;
public AsciiMatchRangeHighlighter(Analyzer analyzer) {
this.passageFormatter = new PassageFormatter("...", ">", "<");
this.selector = new PassageSelector();
this.analyzer = analyzer;
}
public Map<String, List<String>> apply(Document document, Map<String, List<OffsetRange>> fieldHighlights) {
ArrayList<OffsetRange> valueRanges = new ArrayList<>();
Map<String, List<String>> fieldSnippets = new LinkedHashMap<>();
fieldHighlights.forEach(
(field, matchRanges) -> {
int offsetGap = analyzer.getOffsetGap(field);
String[] values = document.getValues(field);
String value;
if (values.length == 1) {
value = values[0];
} else {
// This can be inefficient if the offset gap is large, but recomputing
// offsets in a smarter way doesn't make sense for tests.
String fieldGapPadding = " ".repeat(offsetGap);
value = String.join(fieldGapPadding, values);
}
// Create permitted range windows for passages so that they don't cross
// multi-value boundaries.
valueRanges.clear();
int offset = 0;
for (CharSequence v : values) {
valueRanges.add(new OffsetRange(offset, offset + v.length()));
offset += v.length();
offset += offsetGap;
}
List<Passage> passages =
selector.pickBest(value, matchRanges, maxPassageWindow, maxPassages, valueRanges);
fieldSnippets.put(field, passageFormatter.format(value, passages, valueRanges));
});
return fieldSnippets;
}
}

View File

@@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import java.io.Reader;
/** An {@link Analyzer} that throws a runtime exception when used for anything. */
final class MissingAnalyzer extends Analyzer {
@Override
protected Reader initReader(String fieldName, Reader reader) {
throw new RuntimeException("Field must have an explicit Analyzer: " + fieldName);
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
throw new RuntimeException("Field must have an explicit Analyzer: " + fieldName);
}
@Override
public int getOffsetGap(String fieldName) {
return 0;
}
}

View File

@@ -0,0 +1,767 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.emptyArray;
import static org.hamcrest.Matchers.not;
public class TestMatchRegionRetriever extends LuceneTestCase {
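  // Expected highlights in these tests are encoded as "docId: (field: 'snippet')", with
  // multiple snippets of one field joined by " | " and each matching region wrapped in
  // '>' and '<' markers (see the highlights(...) helpers at the bottom of this class).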
private static final String FLD_ID = "field_id";
private static final String FLD_TEXT_POS_OFFS1 = "field_text_offs1";
private static final String FLD_TEXT_POS_OFFS2 = "field_text_offs2";
private static final String FLD_TEXT_POS_OFFS = "field_text_offs";
private static final String FLD_TEXT_POS = "field_text";
private static final String FLD_TEXT_SYNONYMS_POS_OFFS = "field_text_syns_offs";
private static final String FLD_TEXT_SYNONYMS_POS = "field_text_syns";
private static final String FLD_TEXT_NOPOS = "field_text_nopos";
private static final String FLD_NON_EXISTING = "field_missing";
private FieldType TYPE_STORED_WITH_OFFSETS;
private FieldType TYPE_STORED_NO_POSITIONS;
private Analyzer analyzer;
@Before
public void setup() {
TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
TYPE_STORED_WITH_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_STORED_WITH_OFFSETS.freeze();
TYPE_STORED_NO_POSITIONS = new FieldType(TextField.TYPE_STORED);
TYPE_STORED_NO_POSITIONS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
TYPE_STORED_NO_POSITIONS.freeze();
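    // Randomize the gaps inserted between multiple values of a field so the tests
    // exercise position and offset arithmetic across value boundaries.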
Analyzer whitespaceAnalyzer =
new Analyzer() {
final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
@Override
protected TokenStreamComponents createComponents(String fieldName) {
WhitespaceTokenizer tokenizer =
new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN);
return new TokenStreamComponents(tokenizer);
}
@Override
public int getOffsetGap(String fieldName) {
return offsetGap;
}
@Override
public int getPositionIncrementGap(String fieldName) {
return positionGap;
}
};
Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
fieldAnalyzers.put(FLD_TEXT_POS, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_POS_OFFS, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_POS_OFFS1, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_POS_OFFS2, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_NOPOS, whitespaceAnalyzer);
try {
SynonymMap.Builder b = new SynonymMap.Builder();
b.add(new CharsRef("foo\u0000bar"), new CharsRef("syn1"), true);
b.add(new CharsRef("baz"), new CharsRef("syn2\u0000syn3"), true);
SynonymMap synonymMap = b.build();
Analyzer synonymsAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
return new TokenStreamComponents(tokenizer, tokenStream);
}
};
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
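    // Any field not registered above falls through to MissingAnalyzer and fails fast.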
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
}
BiFunction<String, String, Query> stdQueryParser =
(query, defField) -> {
try {
StandardQueryParser parser = new StandardQueryParser(analyzer);
parser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);
return parser.parse(query, defField);
} catch (QueryNodeException e) {
throw new RuntimeException(e);
}
};
@Test
public void testTermQueryWithOffsets() throws IOException {
checkTermQuery(FLD_TEXT_POS_OFFS);
}
@Test
public void testTermQueryWithPositions() throws IOException {
checkTermQuery(FLD_TEXT_POS);
}
private void checkTermQuery(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "foo"))),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar baz')", field),
fmt("1: (%s: 'bar >foo< baz')", field),
fmt("2: (%s: 'bar baz >foo<')", field)));
});
}
@Test
public void testBooleanMultifieldQueryWithOffsets() throws IOException {
checkBooleanMultifieldQuery(FLD_TEXT_POS_OFFS);
}
@Test
public void testBooleanMultifieldQueryWithPositions() throws IOException {
checkBooleanMultifieldQuery(FLD_TEXT_POS);
}
private void checkBooleanMultifieldQuery(String field) throws IOException {
Query query =
new BooleanQuery.Builder()
.add(new PhraseQuery(1, field, "foo", "baz"), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(FLD_NON_EXISTING, "abc")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(field, "xyz")), BooleanClause.Occur.MUST_NOT)
.build();
withReader(
List.of(
Map.of(field, values("foo bar baz abc")),
Map.of(field, values("bar foo baz def")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
assertThat(highlights(reader, query),
containsInAnyOrder(
fmt("0: (%s: '>foo bar baz< abc')", field),
fmt("1: (%s: 'bar >foo baz< def')", field)));
});
}
@Test
public void testVariousQueryTypesWithOffsets() throws IOException {
checkVariousQueryTypes(FLD_TEXT_POS_OFFS);
}
@Test
public void testVariousQueryTypesWithPositions() throws IOException {
checkVariousQueryTypes(FLD_TEXT_POS);
}
private void checkVariousQueryTypes(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz abc")),
Map.of(field, values("bar foo baz def")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
assertThat(highlights(reader, stdQueryParser.apply("foo baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar >baz< abc')", field),
fmt("1: (%s: 'bar >foo< >baz< def')", field),
fmt("2: (%s: 'bar >baz< >foo< xyz')", field)));
assertThat(highlights(reader, stdQueryParser.apply("foo OR xyz", field)),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar baz abc')", field),
fmt("1: (%s: 'bar >foo< baz def')", field),
fmt("2: (%s: 'bar baz >foo< >xyz<')", field)));
assertThat(highlights(reader, stdQueryParser.apply("bas~2", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >bar< >baz< >abc<')", field),
fmt("1: (%s: '>bar< foo >baz< def')", field),
fmt("2: (%s: '>bar< >baz< foo xyz')", field)));
assertThat(highlights(reader, stdQueryParser.apply("\"foo bar\"", field)),
              containsInAnyOrder(fmt("0: (%s: '>foo bar< baz abc')", field)));
assertThat(highlights(reader, stdQueryParser.apply("\"foo bar\"~3", field)),
containsInAnyOrder(
fmt("0: (%s: '>foo bar< baz abc')", field),
fmt("1: (%s: '>bar foo< baz def')", field),
fmt("2: (%s: '>bar baz foo< xyz')", field)));
assertThat(highlights(reader, stdQueryParser.apply("ba*", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >bar< >baz< abc')", field),
fmt("1: (%s: '>bar< foo >baz< def')", field),
fmt("2: (%s: '>bar< >baz< foo xyz')", field)));
assertThat(highlights(reader, stdQueryParser.apply("[bar TO bas]", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >bar< baz abc')", field),
fmt("1: (%s: '>bar< foo baz def')", field),
fmt("2: (%s: '>bar< baz foo xyz')", field)));
          // Note that document '2' has a 'bar' that isn't highlighted (the -xyz term
          // excludes that document from the first clause).
assertThat(
highlights(reader, stdQueryParser.apply("([bar TO baz] -xyz) OR baz", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >bar< >>baz<< abc')", field),
fmt("1: (%s: '>bar< foo >>baz<< def')", field),
fmt("2: (%s: 'bar >baz< foo xyz')", field)));
assertThat(highlights(reader, new MatchAllDocsQuery()),
Matchers.hasSize(0));
});
withReader(
List.of(
Map.of(field, values("foo baz foo")),
Map.of(field, values("bas baz foo")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
assertThat(
highlights(reader, stdQueryParser.apply("[bar TO baz] -bar", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >baz< foo')", field), fmt("1: (%s: '>bas< >baz< foo')", field)));
});
}
@Test
public void testIntervalQueries() throws IOException {
String field = FLD_TEXT_POS_OFFS;
withReader(
List.of(
Map.of(field, values("foo baz foo")),
Map.of(field, values("bas baz foo")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bas"),
Intervals.term("baz")))),
containsInAnyOrder(
fmt("1: (field_text_offs: '>bas baz foo<')", field)
));
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.maxgaps(1,
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bar"))))),
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.containing(
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bar")),
Intervals.term("foo")))),
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.containedBy(
Intervals.term("foo"),
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bar"))))),
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.overlapping(
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bar")),
Intervals.term("foo")))),
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
});
}
@Test
public void testMultivaluedFieldsWithOffsets() throws IOException {
checkMultivaluedFields(FLD_TEXT_POS_OFFS);
}
@Test
public void testMultivaluedFieldsWithPositions() throws IOException {
checkMultivaluedFields(FLD_TEXT_POS);
}
  private void checkMultivaluedFields(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar", "baz abc", "bad baz")),
Map.of(field, values("bar foo", "baz def")),
Map.of(field, values("bar baz", "foo xyz"))),
reader -> {
assertThat(highlights(reader, stdQueryParser.apply("baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>baz< abc | bad >baz<')", field),
fmt("1: (%s: '>baz< def')", field),
fmt("2: (%s: 'bar >baz<')", field)));
});
}
@Test
public void testMultiFieldHighlights() throws IOException {
for (String[] fields :
new String[][]{
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS}
}) {
String field1 = fields[0];
String field2 = fields[1];
withReader(
List.of(
Map.of(
field1, values("foo bar", "baz abc"),
field2, values("foo baz", "loo bar"))),
reader -> {
String ordered =
Stream.of(fmt("(%s: '>baz< abc')", field1), fmt("(%s: 'loo >bar<')", field2))
.sorted()
.collect(Collectors.joining(""));
assertThat(
highlights(
reader,
stdQueryParser.apply(field1 + ":baz" + " OR " + field2 + ":bar", field1)),
containsInAnyOrder(fmt("0: %s", ordered)));
});
}
}
/**
   * Rewritten Boolean queries may omit matches from {@link
   * org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses. Check that the
   * highlighter still reports such matches.
*/
@Test
public void testNoRewrite() throws IOException {
String field1 = FLD_TEXT_POS_OFFS1;
String field2 = FLD_TEXT_POS_OFFS2;
withReader(
List.of(
Map.of(
field1, values("0100"),
field2, values("loo bar")),
Map.of(
field1, values("0200"),
field2, values("foo bar"))),
reader -> {
String expected = fmt("0: (%s: '>0100<')(%s: 'loo >bar<')", field1, field2);
assertThat(
highlights(
reader,
stdQueryParser.apply(fmt("+%s:01* OR %s:bar", field1, field2), field1)),
containsInAnyOrder(expected));
assertThat(
highlights(
reader,
stdQueryParser.apply(fmt("+%s:01* AND %s:bar", field1, field2), field1)),
containsInAnyOrder(expected));
});
}
@Test
public void testNestedQueryHitsWithOffsets() throws IOException {
checkNestedQueryHits(FLD_TEXT_POS_OFFS);
}
@Test
public void testNestedQueryHitsWithPositions() throws IOException {
checkNestedQueryHits(FLD_TEXT_POS);
}
  private void checkNestedQueryHits(String field) throws IOException {
withReader(
List.of(Map.of(field, values("foo bar baz abc"))),
reader -> {
assertThat(
highlights(
reader,
new BooleanQuery.Builder()
.add(new PhraseQuery(1, field, "foo", "baz"), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(field, "bar")), BooleanClause.Occur.SHOULD)
.build()),
containsInAnyOrder(fmt("0: (%s: '>foo >bar< baz< abc')", field)));
assertThat(
highlights(
reader,
new BooleanQuery.Builder()
.add(new PhraseQuery(1, field, "foo", "baz"), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(field, "bar")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(field, "baz")), BooleanClause.Occur.SHOULD)
.build()),
containsInAnyOrder(fmt("0: (%s: '>foo >bar< >baz<< abc')", field)));
});
}
@Test
public void testGraphQueryWithOffsets() throws Exception {
checkGraphQuery(FLD_TEXT_SYNONYMS_POS_OFFS);
}
@Test
public void testGraphQueryWithPositions() throws Exception {
checkGraphQuery(FLD_TEXT_SYNONYMS_POS);
}
private void checkGraphQuery(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "syn1"))),
containsInAnyOrder(fmt("0: (%s: '>foo bar< baz')", field)));
          // 'baz' is indexed as the synonym sequence [syn2 syn3],
          // so both of the queries below highlight 'baz'.
assertThat(highlights(reader, new TermQuery(new Term(field, "syn3"))),
containsInAnyOrder(
fmt("0: (%s: 'foo bar >baz<')", field),
fmt("1: (%s: 'bar foo >baz<')", field),
fmt("2: (%s: 'bar >baz< foo')", field)));
assertThat(
highlights(reader, stdQueryParser.apply(field + ":\"syn2 syn3\"", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo bar >baz<')", field),
fmt("1: (%s: 'bar foo >baz<')", field),
fmt("2: (%s: 'bar >baz< foo')", field)));
assertThat(
highlights(reader, stdQueryParser.apply(field + ":\"foo syn2 syn3\"", field)),
containsInAnyOrder(fmt("1: (%s: 'bar >foo baz<')", field)));
});
}
@Test
public void testSpanQueryWithOffsets() throws Exception {
checkSpanQueries(FLD_TEXT_POS_OFFS);
}
@Test
public void testSpanQueryWithPositions() throws Exception {
checkSpanQueries(FLD_TEXT_POS);
}
private void checkSpanQueries(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
assertThat(
highlights(
reader,
SpanNearQuery.newOrderedNearQuery(field)
.addClause(new SpanTermQuery(new Term(field, "bar")))
.addClause(new SpanTermQuery(new Term(field, "foo")))
.build()),
containsInAnyOrder(fmt("1: (%s: '>bar foo< baz')", field)));
assertThat(
highlights(
reader,
SpanNearQuery.newOrderedNearQuery(field)
.addClause(new SpanTermQuery(new Term(field, "bar")))
.addGap(1)
.addClause(new SpanTermQuery(new Term(field, "foo")))
.build()),
containsInAnyOrder(fmt("2: (%s: '>bar baz foo<')", field)));
assertThat(
highlights(
reader,
SpanNearQuery.newUnorderedNearQuery(field)
.addClause(new SpanTermQuery(new Term(field, "foo")))
.addClause(new SpanTermQuery(new Term(field, "bar")))
.build()),
containsInAnyOrder(
fmt("0: (%s: '>foo bar< baz')", field), fmt("1: (%s: '>bar foo< baz')", field)));
assertThat(
highlights(
reader,
SpanNearQuery.newUnorderedNearQuery(field)
.addClause(new SpanTermQuery(new Term(field, "foo")))
.addClause(new SpanTermQuery(new Term(field, "bar")))
.setSlop(1)
.build()),
containsInAnyOrder(
fmt("0: (%s: '>foo bar< baz')", field),
fmt("1: (%s: '>bar foo< baz')", field),
fmt("2: (%s: '>bar baz foo<')", field)));
});
}
/**
   * Runs a term query against a field indexed without positions or offsets and
   * checks the {@link OffsetsFromValues} strategy, which returns highlights
   * over entire indexed values.
*/
@Test
public void testTextFieldNoPositionsOffsetFromValues() throws Exception {
String field = FLD_TEXT_NOPOS;
withReader(
List.of(
Map.of(FLD_TEXT_NOPOS, values("foo bar")),
Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz baz"))
),
reader -> {
OffsetsRetrievalStrategySupplier defaults = MatchRegionRetriever
.computeOffsetRetrievalStrategies(reader, analyzer);
OffsetsRetrievalStrategySupplier customSuppliers = (fld) -> {
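            // Use OffsetsFromValues for the field under test; defer to the defaults elsewhere.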
if (fld.equals(field)) {
return new OffsetsFromValues(field, analyzer);
} else {
              return defaults.apply(fld);
}
};
assertThat(
highlights(
customSuppliers,
reader,
new TermQuery(new Term(field, "bar"))),
containsInAnyOrder(
fmt("0: (%s: '>foo bar<')", field),
fmt("1: (%s: '>foo bar< | >baz baz<')", field)));
});
}
/**
   * Runs a term query against a field indexed without positions or offsets; match
   * offsets are derived from tokens of the re-analyzed field values, so individual
   * matching terms are still highlighted.
   * <p>
   * Such a field structure is often useful for multivalued "keyword-like" fields.
*/
@Test
public void testTextFieldNoPositionsOffsetsFromTokens() throws Exception {
String field = FLD_TEXT_NOPOS;
withReader(
List.of(
Map.of(FLD_TEXT_NOPOS, values("foo bar"),
FLD_TEXT_POS, values("bar bar")),
Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz bar"))
),
reader -> {
assertThat(
highlights(
reader,
new TermQuery(new Term(field, "bar"))),
containsInAnyOrder(
fmt("0: (%s: 'foo >bar<')", field),
fmt("1: (%s: 'foo >bar< | baz >bar<')", field)));
});
}
private List<String> highlights(IndexReader reader, Query query) throws IOException {
return highlights(MatchRegionRetriever.computeOffsetRetrievalStrategies(reader, analyzer),
reader, query);
}
private List<String> highlights(OffsetsRetrievalStrategySupplier offsetsStrategySupplier,
IndexReader reader, Query query) throws IOException {
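    // Search with the rewritten query, then let MatchRegionRetriever feed per-document
    // match offsets to a consumer that renders them as "docId: (field: 'snippet')" strings.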
IndexSearcher searcher = new IndexSearcher(reader);
int maxDocs = 1000;
Query rewrittenQuery = searcher.rewrite(query);
TopDocs topDocs = searcher.search(rewrittenQuery, maxDocs);
ArrayList<String> highlights = new ArrayList<>();
AsciiMatchRangeHighlighter formatter = new AsciiMatchRangeHighlighter(analyzer);
MatchRegionRetriever.MatchOffsetsConsumer highlightCollector =
(docId, leafReader, leafDocId, fieldHighlights) -> {
StringBuilder sb = new StringBuilder();
Document document = leafReader.document(leafDocId);
formatter
.apply(document, new TreeMap<>(fieldHighlights))
.forEach(
(field, snippets) -> {
sb.append(
String.format(
Locale.ROOT, "(%s: '%s')", field, String.join(" | ", snippets)));
});
if (sb.length() > 0) {
sb.insert(0, document.get(FLD_ID) + ": ");
highlights.add(sb.toString());
}
};
MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, analyzer,
offsetsStrategySupplier);
highlighter.highlightDocuments(topDocs, highlightCollector);
return highlights;
}
private String[] values(String... values) {
assertThat(values, not(emptyArray()));
return values;
}
private void withReader(
Collection<Map<String, String[]>> docs, IOUtils.IOConsumer<DirectoryReader> block)
throws IOException {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
try (Directory directory = new ByteBuffersDirectory()) {
IndexWriter iw = new IndexWriter(directory, config);
int seq = 0;
for (Map<String, String[]> fields : docs) {
Document doc = new Document();
doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
for (Map.Entry<String, String[]> field : fields.entrySet()) {
for (String value : field.getValue()) {
doc.add(toField(field.getKey(), value));
}
}
iw.addDocument(doc);
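        // Commit at random points so documents are spread across multiple segments.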
if (RandomizedTest.randomBoolean()) {
iw.commit();
}
}
iw.flush();
try (DirectoryReader reader = DirectoryReader.open(iw)) {
block.accept(reader);
}
}
}
private IndexableField toField(String name, String value) {
switch (name) {
case FLD_TEXT_NOPOS:
return new Field(name, value, TYPE_STORED_NO_POSITIONS);
case FLD_TEXT_POS:
case FLD_TEXT_SYNONYMS_POS:
return new TextField(name, value, Field.Store.YES);
case FLD_TEXT_POS_OFFS:
case FLD_TEXT_POS_OFFS1:
case FLD_TEXT_POS_OFFS2:
case FLD_TEXT_SYNONYMS_POS_OFFS:
return new Field(name, value, TYPE_STORED_WITH_OFFSETS);
default:
throw new AssertionError("Don't know how to handle this field: " + name);
}
}
private static String fmt(String string, Object... args) {
return String.format(Locale.ROOT, string, args);
}
}


@ -0,0 +1,284 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomAsciiLettersOfLengthBetween;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomRealisticUnicodeOfCodepointLengthBetween;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Test;
public class TestPassageSelector extends LuceneTestCase {
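  // Expected passages are rendered with PassageFormatter("...", ">", "<"): '>' and '<'
  // delimit highlighted regions, "..." marks truncation at a window edge, and multiple
  // passages are joined with '|' (see getPassages at the bottom of this class).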
@Test
public void checkEmptyExtra() {
checkPassages(
"foo >>bar<< baz abc",
"foo bar baz abc",
300,
100,
new OffsetRange(4, 7),
new OffsetRange(4, 7));
checkPassages(
">foo >bar< >baz<< abc",
"foo bar baz abc",
300,
100,
new OffsetRange(0, 11),
new OffsetRange(4, 7),
new OffsetRange(8, 11));
checkPassages(
">>foo< bar >baz<< abc",
"foo bar baz abc",
300,
100,
new OffsetRange(0, 11),
new OffsetRange(0, 3),
new OffsetRange(8, 11));
}
@Test
public void oneMarker() {
checkPassages(">0<123456789a", "0123456789a", 300, 1, new OffsetRange(0, 1));
checkPassages("0123456789>a<", "0123456789a", 300, 1, new OffsetRange(10, 11));
checkPassages(">0123456789a<", "0123456789a", 300, 1, new OffsetRange(0, 11));
}
@Test
public void noHighlights() {
checkPassages("0123456789a", "0123456789a", 300, 1);
checkPassages("01234...", "0123456789a", 5, 1);
checkPassages(
"0123",
"0123456789a",
15,
2,
new OffsetRange[0],
new OffsetRange[] {new OffsetRange(0, 4), new OffsetRange(4, 9)});
}
@Test
public void oneMarkerTruncated() {
checkPassages(">0<12...", "0123456789a", 4, 1, new OffsetRange(0, 1));
checkPassages("...789>a<", "0123456789a", 4, 1, new OffsetRange(10, 11));
checkPassages("...>3456<...", "0123456789a", 4, 1, new OffsetRange(3, 7));
checkPassages("...3>45<6...", "0123456789a", 4, 1, new OffsetRange(4, 6));
}
@Test
public void highlightLargerThanWindow() {
String value = "0123456789a";
checkPassages("0123...", value, 4, 1, new OffsetRange(0, value.length()));
}
@Test
public void twoMarkers() {
checkPassages(
"0>12<3>45<6789a", "0123456789a", 300, 1, new OffsetRange(1, 3), new OffsetRange(4, 6));
checkPassages(
"0>123<>45<6789a", "0123456789a", 300, 1, new OffsetRange(1, 4), new OffsetRange(4, 6));
}
@Test
public void noMarkers() {
checkPassages("0123456789a", "0123456789a", 300, 1);
checkPassages("0123...", "0123456789a", 4, 1);
}
@Test
public void markersOutsideValue() {
checkPassages("0123456789a", "0123456789a", 300, 1, new OffsetRange(100, 200));
}
@Test
public void twoPassages() {
checkPassages(
"0>12<3...|...6>78<9...",
"0123456789a",
4,
2,
new OffsetRange(1, 3),
new OffsetRange(7, 9));
}
@Test
public void emptyRanges() {
    // The highlight does not fall within any non-empty permitted range, so it is
    // omitted; the first non-empty range supplies the default passage instead.
checkPassages(
"6789...",
"0123456789a",
4,
2,
ranges(new OffsetRange(0, 1)),
ranges(new OffsetRange(0, 0), new OffsetRange(2, 2), new OffsetRange(6, 11)));
}
@Test
public void passageScoring() {
// More highlights per passage -> better passage
checkPassages(
">01<>23<...",
"0123456789a",
4,
1,
new OffsetRange(0, 2),
new OffsetRange(2, 4),
new OffsetRange(8, 10));
checkPassages(
"...>01<23>45<67>89<...",
"__________0123456789a__________",
10,
1,
new OffsetRange(10, 12),
new OffsetRange(14, 16),
new OffsetRange(18, 20));
// ...if tied, the one with longer highlight length overall.
checkPassages(
"...6>789<...", "0123456789a", 4, 1, new OffsetRange(0, 2), new OffsetRange(7, 10));
// ...if tied, the first one in order.
checkPassages(">01<23...", "0123456789a", 4, 1, new OffsetRange(0, 2), new OffsetRange(8, 10));
}
@Test
public void rangeWindows() {
// Add constraint windows to split the three highlights.
checkPassages(
"..._______>01<2",
"__________0123456789a__________",
10,
3,
ranges(new OffsetRange(10, 12), new OffsetRange(14, 16), new OffsetRange(18, 20)),
ranges(new OffsetRange(0, 13)));
checkPassages(
">89<a_______...",
"__________0123456789a__________",
10,
3,
ranges(new OffsetRange(10, 12), new OffsetRange(14, 16), new OffsetRange(18, 20)),
ranges(new OffsetRange(18, Integer.MAX_VALUE)));
checkPassages(
"...________>01<|23>45<67|>89<a_______...",
"__________0123456789a__________",
10,
3,
ranges(new OffsetRange(10, 12), new OffsetRange(14, 16), new OffsetRange(18, 20)),
ranges(
new OffsetRange(0, 12),
new OffsetRange(12, 18),
new OffsetRange(18, Integer.MAX_VALUE)));
}
@Test
public void randomizedSanityCheck() {
PassageSelector selector = new PassageSelector();
PassageFormatter formatter = new PassageFormatter("...", ">", "<");
ArrayList<OffsetRange> highlights = new ArrayList<>();
ArrayList<OffsetRange> ranges = new ArrayList<>();
for (int i = 0; i < 5000; i++) {
String value =
randomBoolean()
? randomAsciiLettersOfLengthBetween(0, 100)
: randomRealisticUnicodeOfCodepointLengthBetween(0, 1000);
ranges.clear();
highlights.clear();
for (int j = randomIntBetween(0, 10); --j >= 0; ) {
int from = randomIntBetween(0, value.length());
highlights.add(new OffsetRange(from, from + randomIntBetween(1, 10)));
}
int charWindow = randomIntBetween(1, 100);
int maxPassages = randomIntBetween(1, 10);
if (randomIntBetween(0, 5) == 0) {
int increment = value.length() / 10;
for (int c = randomIntBetween(0, 20), start = 0; --c >= 0; ) {
int step = randomIntBetween(0, increment);
ranges.add(new OffsetRange(start, start + step));
start += step + randomIntBetween(0, 3);
}
} else {
ranges.add(new OffsetRange(0, value.length()));
}
// Just make sure there are no exceptions.
List<Passage> passages =
selector.pickBest(value, highlights, charWindow, maxPassages, ranges);
formatter.format(value, passages, ranges);
}
}
private void checkPassages(
String expected, String value, int charWindow, int maxPassages, OffsetRange... highlights) {
checkPassages(
expected,
value,
charWindow,
maxPassages,
highlights,
ranges(new OffsetRange(0, value.length())));
}
private void checkPassages(
String expected,
String value,
int charWindow,
int maxPassages,
OffsetRange[] highlights,
OffsetRange[] ranges) {
String result = getPassages(value, charWindow, maxPassages, highlights, ranges);
if (!Objects.equals(result, expected)) {
System.out.println("Value: " + value);
System.out.println("Result: " + result);
System.out.println("Expect: " + expected);
}
assertThat(result, Matchers.equalTo(expected));
}
protected String getPassages(
String value,
int charWindow,
int maxPassages,
OffsetRange[] highlights,
OffsetRange[] ranges) {
PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
PassageSelector selector = new PassageSelector();
List<OffsetRange> hlist = Arrays.asList(highlights);
List<OffsetRange> rangeList = Arrays.asList(ranges);
List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);
return String.join("|", passageFormatter.format(value, passages, rangeList));
}
protected OffsetRange[] ranges(OffsetRange... offsets) {
return offsets;
}
}