From 8fa0a8dd1e5eb3a5e2553c346372d203d00e575b Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 11 Jan 2017 12:07:11 +0000 Subject: [PATCH] LUCENE-7627: Add #intersect(CompiledAutomaton) to Sorted*DocValues --- lucene/CHANGES.txt | 4 +++ .../apache/lucene/index/SortedDocValues.java | 22 +++++++++++++ .../lucene/index/SortedSetDocValues.java | 22 +++++++++++++ .../index/BaseDocValuesFormatTestCase.java | 32 +++++++++++++++++++ 4 files changed, 80 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 30943d2a9d2..58201d62919 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -126,6 +126,10 @@ New features concurrently across all segments in the index (Emmanuel Keller via Mike McCandless) +* LUCENE-7627: Added .intersect methods to SortedDocValues and + SortedSetDocValues to allow filtering their TermsEnums with a + CompiledAutomaton (Alan Woodward, Mike McCandless) + Bug Fixes * LUCENE-7547: JapaneseTokenizerFactory was failing to close the diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java index e2d7dfd4692..087e4871ed4 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java @@ -20,6 +20,7 @@ package org.apache.lucene.index; import java.io.IOException; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; /** * A per-document byte[] with presorted values. This is fundamentally an @@ -110,4 +111,25 @@ public abstract class SortedDocValues extends BinaryDocValues { return new SortedDocValuesTermsEnum(this); } + /** + * Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}. + * The enum supports {@link TermsEnum#ord()}. 
+ */ + public TermsEnum intersect(CompiledAutomaton automaton) throws IOException { + TermsEnum in = termsEnum(); + switch (automaton.type) { + case NONE: + return TermsEnum.EMPTY; + case ALL: + return in; + case SINGLE: + return new SingleTermsEnum(in, automaton.term); + case NORMAL: + return new AutomatonTermsEnum(in, automaton); + default: + // unreachable + throw new RuntimeException("unhandled case"); + } + } + } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java index 6d02c250863..9e1c6a395f9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java @@ -20,6 +20,7 @@ package org.apache.lucene.index; import java.io.IOException; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; /** * A multi-valued version of {@link SortedDocValues}. @@ -102,4 +103,25 @@ public abstract class SortedSetDocValues extends DocValuesIterator { public TermsEnum termsEnum() throws IOException { return new SortedSetDocValuesTermsEnum(this); } + + /** + * Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}. + * The enum supports {@link TermsEnum#ord()}. 
+ */ + public TermsEnum intersect(CompiledAutomaton automaton) throws IOException { + TermsEnum in = termsEnum(); + switch (automaton.type) { + case NONE: + return TermsEnum.EMPTY; + case ALL: + return in; + case SINGLE: + return new SingleTermsEnum(in, automaton.term); + case NORMAL: + return new AutomatonTermsEnum(in, automaton); + default: + // unreachable + throw new RuntimeException("unhandled case"); + } + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index d55f212b160..8cb666570eb 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -67,6 +67,8 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.TestUtil; import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RegExp; import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; @@ -906,6 +908,21 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes termsEnum.seekExact(2); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); + + // NORMAL automaton + termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton())); + assertEquals("hello", termsEnum.next().utf8ToString()); + assertEquals(1, termsEnum.ord()); + assertEquals("world", termsEnum.next().utf8ToString()); + assertEquals(2, termsEnum.ord()); + assertNull(termsEnum.next()); + + // SINGLE automaton + termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton())); + assertEquals("hello", termsEnum.next().utf8ToString()); + assertEquals(1, termsEnum.ord()); 
+ assertNull(termsEnum.next()); + ireader.close(); directory.close(); } @@ -2057,6 +2074,21 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes termsEnum.seekExact(2); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); + + // NORMAL automaton + termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton())); + assertEquals("hello", termsEnum.next().utf8ToString()); + assertEquals(1, termsEnum.ord()); + assertEquals("world", termsEnum.next().utf8ToString()); + assertEquals(2, termsEnum.ord()); + assertNull(termsEnum.next()); + + // SINGLE automaton + termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton())); + assertEquals("hello", termsEnum.next().utf8ToString()); + assertEquals(1, termsEnum.ord()); + assertNull(termsEnum.next()); + ireader.close(); directory.close(); }