LUCENE-9464: Add high(er)-level hit highlighter example that demonstrates and uses low-level components (#1820)

Dawid Weiss 2020-09-10 13:17:13 +02:00 committed by GitHub
parent 8debc9d0c2
commit e2f3f626ee
7 changed files with 1199 additions and 192 deletions

View File

@ -0,0 +1,139 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.BiPredicate;
import java.util.function.Predicate;
/**
* A factory of {@link org.apache.lucene.search.matchhighlight.MatchHighlighter.FieldValueHighlighter} classes
* that cover typical use cases (verbatim values, highlights, abbreviations).
*
* @see MatchHighlighter#appendFieldHighlighter
*/
public final class FieldValueHighlighters {
private FieldValueHighlighters() {
}
private static abstract class AbstractFieldValueHighlighter implements MatchHighlighter.FieldValueHighlighter {
private final BiPredicate<String, Boolean> testPredicate;
protected AbstractFieldValueHighlighter(BiPredicate<String, Boolean> testPredicate) {
this.testPredicate = testPredicate;
}
@Override
public final boolean isApplicable(String field, boolean hasMatches) {
return testPredicate.test(field, hasMatches);
}
}
/**
* Displays up to {@code maxLeadingCharacters} of the field's value, regardless of whether it contained
* highlights or not.
*/
public static MatchHighlighter.FieldValueHighlighter maxLeadingCharacters(int maxLeadingCharacters, String ellipsis, Set<String> fields) {
PassageSelector passageSelector = defaultPassageSelector();
PassageFormatter passageFormatter = new PassageFormatter(ellipsis, "", "");
return new AbstractFieldValueHighlighter((field, hasMatches) -> fields.contains(field)) {
@Override
public List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
List<Passage> bestPassages =
passageSelector.pickBest(contiguousValue, Collections.emptyList(), maxLeadingCharacters, 1, valueRanges);
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
}
@Override
public Collection<String> alwaysFetchedFields() {
return fields;
}
};
}
/**
* Default preconfigured {@link PassageSelector}.
*/
public static PassageSelector defaultPassageSelector() {
return new PassageSelector(
PassageSelector.DEFAULT_SCORER,
new BreakIteratorShrinkingAdjuster());
}
/**
* Highlights fields matching predicate {@code matchFields} only if they contained query matches.
*/
public static MatchHighlighter.FieldValueHighlighter highlighted(
int maxPassageWindow,
int maxPassages,
PassageFormatter passageFormatter,
Predicate<String> matchFields) {
PassageSelector passageSelector = defaultPassageSelector();
return new AbstractFieldValueHighlighter((field, hasMatches) -> matchFields.test(field) && hasMatches) {
@Override
public List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
assert matchOffsets != null;
List<Passage> bestPassages =
passageSelector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges);
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
}
};
}
/**
* Always returns raw field values; no highlighting or value truncation is applied.
*/
public static MatchHighlighter.FieldValueHighlighter verbatimValue(String field, String... moreFields) {
HashSet<String> matchFields = new HashSet<>(Arrays.asList(moreFields));
matchFields.add(field);
return new AbstractFieldValueHighlighter((fld, hasMatches) -> matchFields.contains(fld)) {
@Override
public Collection<String> alwaysFetchedFields() {
return matchFields;
}
@Override
public List<String> format(String field, String[] values, String contiguousValue, List<OffsetRange> valueRanges,
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
return Arrays.asList(values);
}
};
}
/**
* Matches all fields and omits their value in the output (so that no highlight or value is emitted).
*/
public static MatchHighlighter.FieldValueHighlighter skipRemaining() {
return new AbstractFieldValueHighlighter((field, hasMatches) -> true) {
@Override
public List<String> format(String field, String[] values, String contiguousValue, List<OffsetRange> valueRanges,
List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
return null;
}
};
}
}
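
Taken together, these factory methods are meant to be chained on a MatchHighlighter, where the first applicable highlighter wins. A minimal sketch, assuming an IndexSearcher searcher and Analyzer analyzer are in scope; the "id" and "text1" field names are placeholders mirroring the tests further down:

MatchHighlighter highlighter =
    new MatchHighlighter(searcher, analyzer)
        // Always return the identifier field verbatim.
        .appendFieldHighlighter(FieldValueHighlighters.verbatimValue("id"))
        // Show a highlighted passage if "text1" took part in the match...
        .appendFieldHighlighter(FieldValueHighlighters.highlighted(
            80 * 3, 1, new PassageFormatter("...", ">", "<"), "text1"::equals))
        // ...or fall back to its leading characters if it did not.
        .appendFieldHighlighter(FieldValueHighlighters.maxLeadingCharacters(10, "...", Set.of("text1")))
        // Omit all remaining fields from the output.
        .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());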

View File

@ -0,0 +1,308 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.stream.Stream;
/**
* An example highlighter that combines several lower-level highlighting
* utilities in this package into a fully featured, ready-to-use component.
* <p>
* Note that if you need to customize or tweak the details of highlighting,
* it is better to assemble your own highlighter using those low-level
* building blocks, rather than extend or modify this one.
*/
public class MatchHighlighter {
private final IndexSearcher searcher;
private final OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies;
private final Analyzer analyzer;
private final HashSet<String> fieldsAlwaysReturned = new HashSet<>();
private final List<FieldValueHighlighter> fieldHighlighters = new ArrayList<>();
/**
* Actual per-field highlighter. Field highlighters are probed as to whether they
* are applicable to a particular (field, hasMatches) pair. If a highlighter
* declares it is applicable, its {@link #format} method is invoked and the result
* is returned as the field's value.
*
* @see FieldValueHighlighters
*/
public interface FieldValueHighlighter {
/**
* Check if this highlighter can be applied to a given field.
*
* @param field Field name
* @param hasMatches {@code true} if the field has a non-empty set of match regions.
*/
boolean isApplicable(String field, boolean hasMatches);
/**
* Formats the field's values appropriately.
*/
List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<QueryOffsetRange> matchOffsets);
/**
* @return The set of fields that must be fetched for each document, regardless
* of whether they had matches. This is useful to load and return certain fields
* that should always be included (identifiers, document titles, etc.).
*/
default Collection<String> alwaysFetchedFields() {
return Collections.emptyList();
}
/**
* Returns a new field value highlighter that is a combination of this one and another one.
*/
default FieldValueHighlighter or(FieldValueHighlighter other) {
FieldValueHighlighter first = this;
FieldValueHighlighter second = other;
HashSet<String> fieldUnion = new HashSet<>();
fieldUnion.addAll(first.alwaysFetchedFields());
fieldUnion.addAll(second.alwaysFetchedFields());
return new FieldValueHighlighter() {
@Override
public boolean isApplicable(String field, boolean hasMatches) {
return first.isApplicable(field, hasMatches)
|| second.isApplicable(field, hasMatches);
}
@Override
public List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<QueryOffsetRange> matchOffsets) {
FieldValueHighlighter delegate =
first.isApplicable(field, matchOffsets != null && !matchOffsets.isEmpty()) ? first : second;
return delegate.format(field, values, contiguousValue, valueRanges, matchOffsets);
}
@Override
public Collection<String> alwaysFetchedFields() {
return fieldUnion;
}
};
}
}
/**
* Appends a new highlighter to the field highlighter chain. The order of field highlighters
* is important (first-matching wins).
*/
public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) {
fieldHighlighters.add(highlighter);
fieldsAlwaysReturned.addAll(highlighter.alwaysFetchedFields());
return this;
}
/**
* Always fetch the given set of fields for all input documents.
*/
public void alwaysFetchFields(String field, String... otherFields) {
Stream.concat(Stream.of(field), Stream.of(otherFields))
.forEach(fld -> fieldsAlwaysReturned.add(Objects.requireNonNull(fld)));
}
/**
* Single document's highlights.
*/
public static class DocHighlights {
public final int docId;
public final Map<String, List<String>> fields = new LinkedHashMap<>();
public DocHighlights(int docId) {
this.docId = docId;
}
}
/**
* An {@link OffsetRange} of a match, together with the source query that caused it.
*/
public static class QueryOffsetRange extends OffsetRange {
public final Query query;
QueryOffsetRange(Query query, int from, int to) {
super(from, to);
this.query = query;
}
}
private static class DocHit {
final int docId;
private final LeafReader leafReader;
private final int leafDocId;
private final LinkedHashMap<String, List<QueryOffsetRange>> matchRanges
= new LinkedHashMap<>();
DocHit(int docId, LeafReader leafReader, int leafDocId) {
this.docId = docId;
this.leafReader = leafReader;
this.leafDocId = leafDocId;
}
void addMatches(Query query, Map<String, List<OffsetRange>> hits) {
hits.forEach((field, offsets) -> {
List<QueryOffsetRange> target = matchRanges.computeIfAbsent(field, (fld) -> new ArrayList<>());
offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to)));
});
}
Document document(Predicate<String> needsField) throws IOException {
// Only load the fields that have a chance to be highlighted.
DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor() {
@Override
public Status needsField(FieldInfo fieldInfo) {
return (matchRanges.containsKey(fieldInfo.name) ||
needsField.test(fieldInfo.name)) ? Status.YES : Status.NO;
}
};
leafReader.document(leafDocId, visitor);
return visitor.getDocument();
}
}
public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
this(searcher, analyzer, MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
}
public MatchHighlighter(IndexSearcher searcher,
Analyzer analyzer,
OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) {
this.searcher = searcher;
this.offsetsRetrievalStrategies = offsetsRetrievalStrategies;
this.analyzer = analyzer;
}
public Stream<DocHighlights> highlight(TopDocs topDocs, Query... queries) throws IOException {
// We want to preserve topDocs document ordering and MatchRegionRetriever is optimized
// for streaming, so we'll just prepopulate the map in the proper order.
LinkedHashMap<Integer, DocHit> docHits = new LinkedHashMap<>();
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
docHits.put(scoreDoc.doc, null);
}
// Collect match ranges for each query and associate each range to the origin query.
for (Query q : queries) {
MatchRegionRetriever highlighter =
new MatchRegionRetriever(searcher, searcher.rewrite(q), offsetsRetrievalStrategies);
highlighter.highlightDocuments(topDocs,
(int docId, LeafReader leafReader, int leafDocId, Map<String, List<OffsetRange>> hits) -> {
DocHit docHit = docHits.get(docId);
if (docHit == null) {
docHit = new DocHit(docId, leafReader, leafDocId);
docHits.put(docId, docHit);
}
docHit.addMatches(q, hits);
});
}
return docHits.values().stream()
.filter(Objects::nonNull) // This should always be the case?
.map(this::computeDocFieldValues);
}
private DocHighlights computeDocFieldValues(DocHit docHit) {
Document doc;
try {
doc = docHit.document(fieldsAlwaysReturned::contains);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
DocHighlights docHighlights = new DocHighlights(docHit.docId);
HashSet<String> unique = new HashSet<>();
for (IndexableField indexableField : doc) {
String field = indexableField.name();
if (!unique.add(field)) {
continue;
}
String[] values = doc.getValues(field);
String contiguousValue = contiguousFieldValue(field, values);
List<OffsetRange> valueRanges = computeValueRanges(field, values);
List<QueryOffsetRange> offsets = docHit.matchRanges.get(field);
List<String> formattedValues = fieldValueHighlighter(field, offsets != null)
.format(field, values, contiguousValue, valueRanges, offsets);
if (formattedValues != null) {
docHighlights.fields.put(field, formattedValues);
}
}
return docHighlights;
}
private List<OffsetRange> computeValueRanges(String field, String[] values) {
ArrayList<OffsetRange> valueRanges = new ArrayList<>();
int offset = 0;
for (CharSequence v : values) {
valueRanges.add(new OffsetRange(offset, offset + v.length()));
offset += v.length();
offset += analyzer.getOffsetGap(field);
}
return valueRanges;
}
private String contiguousFieldValue(String field, String[] values) {
String value;
if (values.length == 1) {
value = values[0];
} else {
// TODO: This can be inefficient if the offset gap is large, but the logic
// of applying offsets would get much more complicated, so leaving it for now
// (we would have to recalculate all offsets to omit gaps).
String fieldGapPadding = " ".repeat(analyzer.getOffsetGap(field));
value = String.join(fieldGapPadding, values);
}
return value;
}
private FieldValueHighlighter fieldValueHighlighter(String field, boolean hasMatches) {
for (FieldValueHighlighter highlighter : fieldHighlighters) {
if (highlighter.isApplicable(field, hasMatches)) {
return highlighter;
}
}
throw new RuntimeException("No field highlighter could be matched to field: " + field);
}
}
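
To make the offset arithmetic above concrete, here is a worked sketch of what contiguousFieldValue and computeValueRanges produce for a multivalued field, assuming the analyzer reports an offset gap of 1 for it:

// Values of a multivalued field and the analyzer's offset gap for that field.
String[] values = {"foo bar", "baz"};
int offsetGap = 1; // analyzer.getOffsetGap(field)
// contiguousFieldValue joins the values with offsetGap padding spaces:
String contiguousValue = String.join(" ".repeat(offsetGap), values); // "foo bar baz"
// computeValueRanges then yields [0, 7) for "foo bar" and [8, 11) for "baz";
// the padding space at offset 7 belongs to no value, so a match offset
// reported against contiguousValue always falls inside exactly one value range.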

View File

@ -80,22 +80,23 @@ public class MatchRegionRetriever {
/**
* A constructor with the default offset strategy supplier.
*
* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
* in the absence of position offsets in the index. Note that the analyzer must return
* tokens (positions and offsets) identical to the ones stored in the index.
*/
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer) throws IOException {
this(searcher, query, analyzer, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
}
/**
* @param searcher Index searcher to be used for retrieving matches.
* @param query The query for which matches should be retrieved. The query should be rewritten
* against the provided searcher.
* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields
* in the absence of position offsets in the index. Note that the analyzer must return
* tokens (positions and offsets) identical to the ones stored in the index.
* @param fieldOffsetStrategySupplier A custom supplier of per-field {@link OffsetsRetrievalStrategy}
* instances.
*/
public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer,
public MatchRegionRetriever(IndexSearcher searcher, Query query,
OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
throws IOException {
leaves = searcher.getIndexReader().leaves();
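
With the Analyzer parameter dropped from this constructor, callers that need custom behavior now build the strategy supplier up front and pass it in explicitly. A minimal sketch reusing the default per-field strategies, assuming searcher, analyzer and query are in scope; the analyzer is still needed to compute the defaults, it just no longer flows through the retriever constructor:

OffsetsRetrievalStrategySupplier strategies =
    MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer);
MatchRegionRetriever retriever =
    new MatchRegionRetriever(searcher, searcher.rewrite(query), strategies);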

View File

@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
/**
* An analyzer for tests that has a predefined offset and position gap.
*/
class AnalyzerWithGaps extends DelegatingAnalyzerWrapper {
private final Analyzer delegate;
private final int offsetGap;
private final int positionGap;
AnalyzerWithGaps(int offsetGap, int positionGap, Analyzer delegate) {
super(delegate.getReuseStrategy());
this.delegate = delegate;
this.offsetGap = offsetGap;
this.positionGap = positionGap;
}
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return delegate;
}
@Override
public int getOffsetGap(String fieldName) {
return offsetGap;
}
@Override
public int getPositionIncrementGap(String fieldName) {
return positionGap;
}
}
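
For reference, a minimal sketch of how the tests below use this wrapper around a plain whitespace analyzer:

// Index multivalued fields with an offset gap of 1 and a position gap of 100
// between consecutive values; tokenization itself is delegated unchanged.
Analyzer gapped = new AnalyzerWithGaps(1, 100, new WhitespaceAnalyzer());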

View File

@ -0,0 +1,105 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.function.BiFunction;
import java.util.function.Consumer;
/**
* Utility class for building an ephemeral document index
* and running a block of code on its reader.
*/
class IndexBuilder {
public static final String FLD_ID = "id";
public static final String FLD_SORT_ORDER = "id_order";
private final BiFunction<String, String, IndexableField> toField;
private final ArrayList<Document> documents = new ArrayList<>();
private int seq;
class DocFields {
final Document document;
public DocFields(Document doc) {
this.document = doc;
}
public void add(String field, String... values) {
assert values.length > 0 : "At least one field value is required.";
for (String value : values) {
document.add(toField.apply(field, value));
}
}
}
IndexBuilder(BiFunction<String, String, IndexableField> valueToField) {
this.toField = valueToField;
}
public IndexBuilder doc(String field, String... values) {
return doc(fields -> {
fields.add(field, values);
});
}
public IndexBuilder doc(Consumer<DocFields> fields) {
Document doc = new Document();
doc.add(new NumericDocValuesField(FLD_SORT_ORDER, seq));
doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
fields.accept(new DocFields(doc));
documents.add(doc);
return this;
}
public IndexBuilder build(Analyzer analyzer, IOUtils.IOConsumer<DirectoryReader> block) throws IOException {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setIndexSort(new Sort(new SortField(FLD_SORT_ORDER, SortField.Type.LONG)));
try (Directory directory = new ByteBuffersDirectory();
IndexWriter iw = new IndexWriter(directory, config)) {
for (Document doc : documents) {
iw.addDocument(doc);
}
if (RandomizedTest.randomBoolean()) {
iw.commit();
}
iw.flush();
try (DirectoryReader reader = DirectoryReader.open(iw)) {
block.accept(reader);
}
}
return this;
}
}
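
A minimal usage sketch, as the tests below do it; toField maps a field name and a value to a concrete IndexableField, and the FLD_* constants are those tests' field names:

new IndexBuilder(this::toField)
    .doc(FLD_TEXT1, "foo bar baz")   // one field, a single value
    .doc(fields -> {                 // several (possibly multivalued) fields per document
      fields.add(FLD_TEXT1, "multi", "valued");
      fields.add(FLD_TEXT2, "other field");
    })
    .build(analyzer, reader -> {
      // assertions against the ephemeral DirectoryReader go here
    });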

View File

@ -0,0 +1,466 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class TestMatchHighlighter extends LuceneTestCase {
private static final String FLD_ID = "id";
private static final String FLD_TEXT1 = "text1";
private static final String FLD_TEXT2 = "text2";
private FieldType TYPE_TEXT_POSITIONS_OFFSETS;
private FieldType TYPE_TEXT_POSITIONS;
private PerFieldAnalyzerWrapper analyzer;
@Before
public void setup() throws IOException {
TYPE_TEXT_POSITIONS = TextField.TYPE_STORED;
TYPE_TEXT_POSITIONS_OFFSETS = new FieldType(TextField.TYPE_STORED);
TYPE_TEXT_POSITIONS_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_TEXT_POSITIONS_OFFSETS.freeze();
Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
// Create an analyzer with some synonyms, just to showcase them.
SynonymMap synonymMap = buildSynonymMap(new String[][]{
{"moon\u0000shine", "firewater"},
{"firewater", "moon\u0000shine"},
});
// Use a non-empty offset gap so that the break iterator doesn't go haywire on multiple
// values glued together.
final int offsetGap = RandomizedTest.randomIntBetween(1, 2);
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
Analyzer synonymsAnalyzer =
new AnalyzerWithGaps(offsetGap, positionGap, new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
return new TokenStreamComponents(tokenizer, tokenStream);
}
});
fieldAnalyzers.put(FLD_TEXT1, synonymsAnalyzer);
fieldAnalyzers.put(FLD_TEXT2, synonymsAnalyzer);
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
}
static SynonymMap buildSynonymMap(String[][] synonyms) throws IOException {
SynonymMap.Builder builder = new SynonymMap.Builder();
for (String[] pair : synonyms) {
assertThat(pair.length, Matchers.equalTo(2));
builder.add(new CharsRef(pair[0]), new CharsRef(pair[1]), true);
}
return builder.build();
}
@Test
public void testBasicUsage() throws IOException {
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "foo bar baz")
.doc(FLD_TEXT1, "bar foo baz")
.doc(fields -> {
fields.add(FLD_TEXT1, "Very long content but not matching anything.");
fields.add(FLD_TEXT2, "no foo but bar");
})
.build(analyzer, reader -> {
Query query = new BooleanQuery.Builder()
.add(new TermQuery(new Term(FLD_TEXT1, "foo")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(FLD_TEXT2, "bar")), BooleanClause.Occur.SHOULD)
.build();
// In the most basic scenario, we run a search against a query, retrieve
// top docs...
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// ...and would want a fixed set of fields from those documents, some of them
// possibly highlighted if they matched the query.
//
// This configures the highlighter so that the FLD_ID field is always returned verbatim,
// and FLD_TEXT1 is returned *only if it contained a query match*.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
// Note that document field highlights are a stream over documents in topDocs. In the remaining code
// we will just collect them on the fly into preformatted strings.
Stream<MatchHighlighter.DocHighlights> highlights = highlighter.highlight(topDocs, query);
assertHighlights(toDocList(highlights),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2");
// In a more realistic use case, you'd want to show the value of a given field *regardless* of whether it
// contained a highlight or not -- it is odd that document "id: 2" above doesn't have the 'text1' field
// shown because that field wasn't part of the query match.
//
// Let's say the field is also potentially long; if it contains a match,
// we would want to display the contextual snippet surrounding that match. If it does not contain any
// matches, we would want to display its content up to a given number of characters (lead lines).
//
// Let's do this by adding an appropriate field highlighter on FLD_TEXT1.
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.maxLeadingCharacters(10, "...", Set.of(FLD_TEXT1)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2",
" text1: Very long...");
// Field highlighters can apply to multiple fields and be chained for convenience.
// For example, this defines a combined highlighter over both FLD_TEXT1 and FLD_TEXT2.
Set<String> fields = Set.of(FLD_TEXT1, FLD_TEXT2);
MatchHighlighter.FieldValueHighlighter highlightedOrAbbreviated =
FieldValueHighlighters.highlighted(80 * 3, 1, new PassageFormatter("...", ">", "<"), fields::contains)
.or(FieldValueHighlighters.maxLeadingCharacters(10, "...", fields));
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.verbatimValue(FLD_ID))
.appendFieldHighlighter(highlightedOrAbbreviated)
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
" 0. id: 0",
" text1: >foo< bar baz",
" 1. id: 1",
" text1: bar >foo< baz",
" 2. id: 2",
" text1: Very long...",
" text2: no foo but >bar<");
});
}
@Test
public void testSynonymHighlight() throws IOException {
// There is nothing special needed to highlight or process complex queries, synonyms, etc.
// The synonyms are defined in this class's setup method.
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "Where the moon shine falls, firewater flows.")
.build(analyzer, reader -> {
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
Query query = new TermQuery(new Term(FLD_TEXT1, "firewater"));
assertHighlights(toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
"0. text1: Where the >moon shine< falls, >firewater< flows.");
query = new PhraseQuery(FLD_TEXT1, "moon", "shine");
assertHighlights(toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
"0. text1: Where the >moon shine< falls, >firewater< flows.");
});
}
@Test
public void testCustomFieldHighlightHandling() throws IOException {
// The match highlighter is a showcase of the individual components in this package, suitable
// for creating any kind of field-display design.
//
// In this example we will build a custom field highlighting handler that
// highlights matches over a multivalued field, shows the field's values even if it received
// no matches, and limits the number of values displayed to at most 2 (with an appropriate message).
new IndexBuilder(this::toField)
// Just one document, one field, four values.
.doc(FLD_TEXT1, "foo bar", "bar foo baz", "bar baz foo", "baz baz baz")
.build(analyzer, reader -> {
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER;
// Let's start with the simple predefined highlighter so that the field's value is shown
// and highlighted when it is part of the hit.
MatchHighlighter.FieldValueHighlighter highlighted = FieldValueHighlighters.highlighted(
80 * 3, 2, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals);
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(highlighted)
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
Query query = new TermQuery(new Term(FLD_TEXT1, "foo"));
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// Note the highlighter is configured with at most 2 passages, so the match on the
// third value ("bar baz foo") is omitted. No ellipsis is inserted either, because
// the values are displayed in full.
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
"0. text1: >foo< bar, bar >foo< baz");
// So the above works fine if the field received a match but omits the field otherwise. We can
// force the display of this field by chaining it with the verbatim value highlighter:
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(highlighted.or(FieldValueHighlighters.verbatimValue(FLD_TEXT1)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
"0. text1: foo bar, bar foo baz, bar baz foo, baz baz baz");
// But this is not exactly what we'd like because we want to limit the display of values to the first two.
// Let's just write a custom field highlighter handler that does it.
class AtMostNValuesHighlighter implements MatchHighlighter.FieldValueHighlighter {
private final String field;
private final int limit;
AtMostNValuesHighlighter(String field, int limit) {
this.field = field;
this.limit = limit;
}
@Override
public boolean isApplicable(String field, boolean hasMatches) {
return Objects.equals(field, this.field);
}
@Override
public List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
if (values.length <= limit) {
return Arrays.asList(values);
} else {
List<String> collected = Stream.of(values).limit(limit).collect(Collectors.toList());
int remaining = values.length - collected.size();
collected.add(String.format(Locale.ROOT, "[%d omitted]", remaining));
return collected;
}
}
@Override
public Collection<String> alwaysFetchedFields() {
return Collections.singleton(field);
}
}
// We can now chain it as usual and contemplate the result.
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(highlighted.or(new AtMostNValuesHighlighter(FLD_TEXT1, 2)))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
"0. text1: >foo< bar, bar >foo< baz");
assertHighlights(toDocList(highlighter.highlight(topDocs, new MatchAllDocsQuery())),
"0. text1: foo bar, bar foo baz, [2 omitted]");
});
}
@Test
public void testHighlightMoreQueriesAtOnceShowoff() throws IOException {
// The match highlighter's underlying components are powerful enough to build interesting,
// if not always super-practical, things. In this case, we would like to highlight
// a set of matches of *more than one* query over the same set of input documents. This includes
// highest-scoring passage resolution (from multiple hits) and different highlight markers
// for each query.
new IndexBuilder(this::toField)
.doc(FLD_TEXT1, "foo bar baz")
.doc(FLD_TEXT1, "foo baz bar")
.build(analyzer, reader -> {
// Let's start with the two queries. The first one will be an unordered
// query for (foo, baz) with a max gap of 1; let's use intervals for this.
Query q1 = new IntervalQuery(FLD_TEXT1,
Intervals.maxgaps(1,
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("baz"))));
// The second one will be a simpler term query for "bar".
Query q2 = new TermQuery(new Term(FLD_TEXT1, "bar"));
// Let's fetch matching documents by combining the two into a Boolean query.
Query query = new BooleanQuery.Builder()
.add(q1, BooleanClause.Occur.SHOULD)
.add(q2, BooleanClause.Occur.SHOULD)
.build();
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
TopDocs topDocs = searcher.search(query, 10, sortOrder);
// If we use the "regular" highlighter, the result will be slightly odd: a nested
// highlight over "bar" within the first match. Also, you can't distinguish which of the sub-queries
// caused which highlight marker... but if it were HTML then you could give the span
// some semi-translucent background and layered matches would be visible.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", "<span>", "</span>"), FLD_TEXT1::equals))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
assertHighlights(toDocList(highlighter.highlight(topDocs, query)),
"0. text1: <span>foo <span>bar</span> baz</span>",
"1. text1: <span>foo baz</span> <span>bar</span>");
// To separate highlights for multiple queries we'll pass them to the highlighter
// individually and differentiate the highlight markers as they are applied. Let's start with the
// customized field highlighter. It utilizes the fact that match ranges passed from MatchHighlighter
// contain a reference to the original query that produced the match.
class SeparateMarkerFieldHighlighter implements MatchHighlighter.FieldValueHighlighter {
private final String field;
private final Map<Query, String> queryClassMap;
SeparateMarkerFieldHighlighter(String field, Map<Query, String> queryClassMap) {
this.field = field;
this.queryClassMap = queryClassMap;
}
@Override
public boolean isApplicable(String field, boolean hasMatches) {
return Objects.equals(field, this.field) && hasMatches;
}
@Override
public List<String> format(String field, String[] values, String contiguousValue,
List<OffsetRange> valueRanges, List<MatchHighlighter.QueryOffsetRange> matchOffsets) {
PassageSelector passageSelector = new PassageSelector();
int maxPassageWindow = 80;
int maxPassages = 3;
List<Passage> bestPassages =
passageSelector.pickBest(contiguousValue, matchOffsets, maxPassageWindow, maxPassages, valueRanges);
// We know the offset ranges passed to us by MatchHighlighter are instances of QueryOffsetRange,
// so we compute the CSS class from the originating query.
Function<OffsetRange, String> queryToClass =
(range) -> queryClassMap.get(((MatchHighlighter.QueryOffsetRange) range).query);
PassageFormatter passageFormatter = new PassageFormatter("...",
(range) -> "<span class='" + queryToClass.apply(range) + "'>",
(range) -> "</span>");
return passageFormatter.format(contiguousValue, bestPassages, valueRanges);
}
}
// And this is pretty much it. We now set up query classes to display, set up the highlighter...
Map<Query, String> queryClassMap = Map.of(q1, "q1", q2, "q2");
highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(new SeparateMarkerFieldHighlighter(FLD_TEXT1, queryClassMap))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
// ...and run highlighting. Note the queries passed to the highlighter are the individual
// sub-clauses of the Boolean query used to fetch documents.
assertHighlights(toDocList(highlighter.highlight(topDocs, q1, q2)),
"0. text1: <span class='q1'>foo <span class='q2'>bar</span> baz</span>",
"1. text1: <span class='q1'>foo baz</span> <span class='q2'>bar</span>");
});
}
private void assertHighlights(List<List<String>> docList, String... expectedFormattedLines) {
ArrayList<String> actualLines = new ArrayList<>();
for (int doc = 0; doc < docList.size(); doc++) {
List<String> fields = docList.get(doc);
for (int i = 0; i < fields.size(); i++) {
actualLines.add((i == 0 ? String.format(Locale.ROOT, "%2d. ", doc) : " ") + fields.get(i));
}
}
if (!Arrays.equals(
Stream.of(expectedFormattedLines).map(String::trim).toArray(),
actualLines.stream().map(String::trim).toArray())) {
throw new AssertionError("Actual hits were:\n" +
String.join("\n", actualLines) + "\n\n but expected them to be:\n" +
String.join("\n", expectedFormattedLines));
}
}
private List<List<String>> toDocList(Stream<MatchHighlighter.DocHighlights> highlights) {
return highlights.map(docHighlights ->
docHighlights.fields.entrySet().stream()
.map(e -> e.getKey() + ": " + String.join(", ", e.getValue()))
.collect(Collectors.toList())
).collect(Collectors.toList());
}
private IndexableField toField(String name, String value) {
switch (name) {
case FLD_TEXT1:
return new Field(name, value, TYPE_TEXT_POSITIONS_OFFSETS);
case FLD_TEXT2:
return new Field(name, value, TYPE_TEXT_POSITIONS);
default:
throw new AssertionError("Don't know how to handle this field: " + name);
}
}
}

View File

@ -20,21 +20,17 @@ import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.intervals.IntervalQuery;
@ -52,19 +48,13 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@ -75,11 +65,9 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.emptyArray;
import static org.hamcrest.Matchers.not;
public class TestMatchRegionRetriever extends LuceneTestCase {
private static final String FLD_ID = "field_id";
private static final String FLD_ID = IndexBuilder.FLD_ID;
private static final String FLD_TEXT_POS_OFFS1 = "field_text_offs1";
private static final String FLD_TEXT_POS_OFFS2 = "field_text_offs2";
@ -100,7 +88,7 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
private Analyzer analyzer;
@Before
public void setup() {
public void setup() throws IOException {
TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
TYPE_STORED_WITH_OFFSETS.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
TYPE_STORED_WITH_OFFSETS.freeze();
@ -109,26 +97,24 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
TYPE_STORED_NO_POSITIONS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
TYPE_STORED_NO_POSITIONS.freeze();
final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
Analyzer whitespaceAnalyzer =
new Analyzer() {
final int offsetGap = RandomizedTest.randomIntBetween(0, 2);
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
new AnalyzerWithGaps(offsetGap, positionGap,
new WhitespaceAnalyzer(WhitespaceTokenizer.DEFAULT_MAX_WORD_LEN));
SynonymMap synonymMap = TestMatchHighlighter.buildSynonymMap(new String[][] {
{"foo\u0000bar", "syn1"},
{"baz", "syn2\u0000syn3"},
});
Analyzer synonymsAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
WhitespaceTokenizer tokenizer =
new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN);
return new TokenStreamComponents(tokenizer);
}
@Override
public int getOffsetGap(String fieldName) {
return offsetGap;
}
@Override
public int getPositionIncrementGap(String fieldName) {
return positionGap;
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
return new TokenStreamComponents(tokenizer, tokenStream);
}
};
@ -138,26 +124,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
fieldAnalyzers.put(FLD_TEXT_POS_OFFS1, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_POS_OFFS2, whitespaceAnalyzer);
fieldAnalyzers.put(FLD_TEXT_NOPOS, whitespaceAnalyzer);
try {
SynonymMap.Builder b = new SynonymMap.Builder();
b.add(new CharsRef("foo\u0000bar"), new CharsRef("syn1"), true);
b.add(new CharsRef("baz"), new CharsRef("syn2\u0000syn3"), true);
SynonymMap synonymMap = b.build();
Analyzer synonymsAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WhitespaceTokenizer();
TokenStream tokenStream = new SynonymGraphFilter(tokenizer, synonymMap, true);
return new TokenStreamComponents(tokenizer, tokenStream);
}
};
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
} catch (IOException e) {
throw new UncheckedIOException(e);
}
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS_OFFS, synonymsAnalyzer);
fieldAnalyzers.put(FLD_TEXT_SYNONYMS_POS, synonymsAnalyzer);
analyzer = new PerFieldAnalyzerWrapper(new MissingAnalyzer(), fieldAnalyzers);
}
@ -184,13 +152,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkTermQuery(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz")
.doc(field, "bar foo baz")
.doc(field, "bar baz foo")
.doc(field, "bar bar bar irrelevant")
.build(analyzer, reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "foo"))),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar baz')", field),
@ -217,17 +184,17 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
.add(new TermQuery(new Term(field, "xyz")), BooleanClause.Occur.MUST_NOT)
.build();
withReader(
List.of(
Map.of(field, values("foo bar baz abc")),
Map.of(field, values("bar foo baz def")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz abc")
.doc(field, "bar foo baz def")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(highlights(reader, query),
containsInAnyOrder(
fmt("0: (%s: '>foo bar baz< abc')", field),
fmt("1: (%s: 'bar >foo baz< def')", field)));
});
}
);
}
@Test
@ -241,12 +208,11 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkVariousQueryTypes(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz abc")),
Map.of(field, values("bar foo baz def")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz abc")
.doc(field, "bar foo baz def")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(highlights(reader, stdQueryParser.apply("foo baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>foo< bar >baz< abc')", field),
@ -297,31 +263,31 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
assertThat(highlights(reader, new MatchAllDocsQuery()),
Matchers.hasSize(0));
});
}
);
withReader(
List.of(
Map.of(field, values("foo baz foo")),
Map.of(field, values("bas baz foo")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo baz foo")
.doc(field, "bas baz foo")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(
highlights(reader, stdQueryParser.apply("[bar TO baz] -bar", field)),
containsInAnyOrder(
fmt("0: (%s: 'foo >baz< foo')", field), fmt("1: (%s: '>bas< >baz< foo')", field)));
});
}
);
}
@Test
public void testIntervalQueries() throws IOException {
String field = FLD_TEXT_POS_OFFS;
withReader(
List.of(
Map.of(field, values("foo baz foo")),
Map.of(field, values("bas baz foo")),
Map.of(field, values("bar baz foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo baz foo")
.doc(field, "bas baz foo")
.doc(field, "bar baz foo xyz")
.build(analyzer, reader -> {
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.unordered(
@ -374,7 +340,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)
));
});
}
);
}
@Test
@ -388,36 +355,37 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
public void checkMultivaluedFields(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar", "baz abc", "bad baz")),
Map.of(field, values("bar foo", "baz def")),
Map.of(field, values("bar baz", "foo xyz"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar", "baz abc", "bad baz")
.doc(field, "bar foo", "baz def")
.doc(field, "bar baz", "foo xyz")
.build(analyzer, reader -> {
assertThat(highlights(reader, stdQueryParser.apply("baz", field)),
containsInAnyOrder(
fmt("0: (%s: '>baz< abc | bad >baz<')", field),
fmt("1: (%s: '>baz< def')", field),
fmt("2: (%s: 'bar >baz<')", field)));
});
}
);
}
@Test
public void testMultiFieldHighlights() throws IOException {
for (String[] fields :
for (String[] fieldPairs :
new String[][]{
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS, FLD_TEXT_POS_OFFS2},
{FLD_TEXT_POS_OFFS1, FLD_TEXT_POS}
}) {
String field1 = fields[0];
String field2 = fields[1];
withReader(
List.of(
Map.of(
field1, values("foo bar", "baz abc"),
field2, values("foo baz", "loo bar"))),
reader -> {
String field1 = fieldPairs[0];
String field2 = fieldPairs[1];
new IndexBuilder(this::toField)
.doc(fields -> {
fields.add(field1, "foo bar", "baz abc");
fields.add(field2, "foo baz", "loo bar");
})
.build(analyzer, reader -> {
String ordered =
Stream.of(fmt("(%s: '>baz< abc')", field1), fmt("(%s: 'loo >bar<')", field2))
.sorted()
@ -428,7 +396,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
reader,
stdQueryParser.apply(field1 + ":baz" + " OR " + field2 + ":bar", field1)),
containsInAnyOrder(fmt("0: %s", ordered)));
});
}
);
}
}
@ -440,15 +409,17 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testNoRewrite() throws IOException {
String field1 = FLD_TEXT_POS_OFFS1;
String field2 = FLD_TEXT_POS_OFFS2;
withReader(
List.of(
Map.of(
field1, values("0100"),
field2, values("loo bar")),
Map.of(
field1, values("0200"),
field2, values("foo bar"))),
reader -> {
new IndexBuilder(this::toField)
.doc(fields -> {
fields.add(field1, "0100");
fields.add(field2, "loo bar");
})
.doc(fields -> {
fields.add(field1, "0200");
fields.add(field2, "foo bar");
})
.build(analyzer, reader -> {
String expected = fmt("0: (%s: '>0100<')(%s: 'loo >bar<')", field1, field2);
assertThat(
highlights(
@ -461,7 +432,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
reader,
stdQueryParser.apply(fmt("+%s:01* AND %s:bar", field1, field2), field1)),
containsInAnyOrder(expected));
});
}
);
}
@Test
@ -475,9 +447,9 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
public void checkNestedQueryHits(String field) throws IOException {
withReader(
List.of(Map.of(field, values("foo bar baz abc"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz abc")
.build(analyzer, reader -> {
assertThat(
highlights(
reader,
@ -496,7 +468,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
.add(new TermQuery(new Term(field, "baz")), BooleanClause.Occur.SHOULD)
.build()),
containsInAnyOrder(fmt("0: (%s: '>foo >bar< >baz<< abc')", field)));
});
}
);
}
@Test
@ -510,13 +483,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkGraphQuery(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz")
.doc(field, "bar foo baz")
.doc(field, "bar baz foo")
.doc(field, "bar bar bar irrelevant")
.build(analyzer, reader -> {
assertThat(highlights(reader, new TermQuery(new Term(field, "syn1"))),
containsInAnyOrder(fmt("0: (%s: '>foo bar< baz')", field)));
@ -536,7 +508,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
assertThat(
highlights(reader, stdQueryParser.apply(field + ":\"foo syn2 syn3\"", field)),
containsInAnyOrder(fmt("1: (%s: 'bar >foo baz<')", field)));
});
}
);
}
@Test
@ -550,13 +523,12 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
private void checkSpanQueries(String field) throws IOException {
withReader(
List.of(
Map.of(field, values("foo bar baz")),
Map.of(field, values("bar foo baz")),
Map.of(field, values("bar baz foo")),
Map.of(field, values("bar bar bar irrelevant"))),
reader -> {
new IndexBuilder(this::toField)
.doc(field, "foo bar baz")
.doc(field, "bar foo baz")
.doc(field, "bar baz foo")
.doc(field, "bar bar bar irrelevant")
.build(analyzer, reader -> {
assertThat(
highlights(
reader,
@ -598,7 +570,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
fmt("0: (%s: '>foo bar< baz')", field),
fmt("1: (%s: '>bar foo< baz')", field),
fmt("2: (%s: '>bar baz foo<')", field)));
});
}
);
}
/**
@ -610,12 +583,10 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testTextFieldNoPositionsOffsetFromValues() throws Exception {
String field = FLD_TEXT_NOPOS;
withReader(
List.of(
Map.of(FLD_TEXT_NOPOS, values("foo bar")),
Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz baz"))
),
reader -> {
new IndexBuilder(this::toField)
.doc(FLD_TEXT_NOPOS, "foo bar")
.doc(FLD_TEXT_NOPOS, "foo bar", "baz baz")
.build(analyzer, reader -> {
OffsetsRetrievalStrategySupplier defaults = MatchRegionRetriever
.computeOffsetRetrievalStrategies(reader, analyzer);
OffsetsRetrievalStrategySupplier customSuppliers = (fld) -> {
@ -634,7 +605,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("0: (%s: '>foo bar<')", field),
fmt("1: (%s: '>foo bar< | >baz baz<')", field)));
});
}
);
}
/**
@ -648,13 +620,13 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
public void testTextFieldNoPositionsOffsetsFromTokens() throws Exception {
String field = FLD_TEXT_NOPOS;
withReader(
List.of(
Map.of(FLD_TEXT_NOPOS, values("foo bar"),
FLD_TEXT_POS, values("bar bar")),
Map.of(FLD_TEXT_NOPOS, values("foo bar", "baz bar"))
),
reader -> {
new IndexBuilder(this::toField)
.doc(fields -> {
fields.add(FLD_TEXT_NOPOS, "foo bar");
fields.add(FLD_TEXT_POS, "bar bar");
})
.doc(FLD_TEXT_NOPOS, "foo bar", "baz bar")
.build(analyzer, reader -> {
assertThat(
highlights(
reader,
@ -662,7 +634,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
containsInAnyOrder(
fmt("0: (%s: 'foo >bar<')", field),
fmt("1: (%s: 'foo >bar< | baz >bar<')", field)));
});
}
);
}
private List<String> highlights(IndexReader reader, Query query) throws IOException {
@ -702,46 +675,14 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
}
};
MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, analyzer,
offsetsStrategySupplier);
MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, rewrittenQuery, offsetsStrategySupplier);
highlighter.highlightDocuments(topDocs, highlightCollector);
return highlights;
}
private String[] values(String... values) {
assertThat(values, not(emptyArray()));
return values;
}
private void withReader(
Collection<Map<String, String[]>> docs, IOUtils.IOConsumer<DirectoryReader> block)
throws IOException {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
try (Directory directory = new ByteBuffersDirectory()) {
IndexWriter iw = new IndexWriter(directory, config);
int seq = 0;
for (Map<String, String[]> fields : docs) {
Document doc = new Document();
doc.add(new StringField(FLD_ID, Integer.toString(seq++), Field.Store.YES));
for (Map.Entry<String, String[]> field : fields.entrySet()) {
for (String value : field.getValue()) {
doc.add(toField(field.getKey(), value));
}
}
iw.addDocument(doc);
if (RandomizedTest.randomBoolean()) {
iw.commit();
}
}
iw.flush();
try (DirectoryReader reader = DirectoryReader.open(iw)) {
block.accept(reader);
}
}
private static String fmt(String string, Object... args) {
return String.format(Locale.ROOT, string, args);
}
private IndexableField toField(String name, String value) {
@ -760,8 +701,4 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
throw new AssertionError("Don't know how to handle this field: " + name);
}
}
private static String fmt(String string, Object... args) {
return String.format(Locale.ROOT, string, args);
}
}