mirror of https://github.com/apache/lucene.git
LUCENE-7627: Add #intersect(CompiledAutomaton) to Sorted*DocValues
This commit is contained in:
parent
53d5af17da
commit
8fa0a8dd1e
|
@ -126,6 +126,10 @@ New features
|
|||
concurrently across all segments in the index (Emmanuel Keller via
|
||||
Mike McCandless)
|
||||
|
||||
* LUCENE-7627: Added .intersect methods to SortedDocValues and
|
||||
SortedSetDocValues to allow filtering their TermsEnums with a
|
||||
CompiledAutomaton (Alan Woodward, Mike McCandless)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-7547: JapaneseTokenizerFactory was failing to close the
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
|
||||
/**
|
||||
* A per-document byte[] with presorted values. This is fundamentally an
|
||||
|
@ -110,4 +111,25 @@ public abstract class SortedDocValues extends BinaryDocValues {
|
|||
return new SortedDocValuesTermsEnum(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}
|
||||
* The enum supports {@link TermsEnum#ord()}.
|
||||
*/
|
||||
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
|
||||
TermsEnum in = termsEnum();
|
||||
switch (automaton.type) {
|
||||
case NONE:
|
||||
return TermsEnum.EMPTY;
|
||||
case ALL:
|
||||
return in;
|
||||
case SINGLE:
|
||||
return new SingleTermsEnum(in, automaton.term);
|
||||
case NORMAL:
|
||||
return new AutomatonTermsEnum(in, automaton);
|
||||
default:
|
||||
// unreachable
|
||||
throw new RuntimeException("unhandled case");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
|
||||
/**
|
||||
* A multi-valued version of {@link SortedDocValues}.
|
||||
|
@ -102,4 +103,25 @@ public abstract class SortedSetDocValues extends DocValuesIterator {
|
|||
public TermsEnum termsEnum() throws IOException {
|
||||
// Each call hands back a new SortedSetDocValuesTermsEnum constructed around this instance.
return new SortedSetDocValuesTermsEnum(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}
|
||||
* The enum supports {@link TermsEnum#ord()}.
|
||||
*/
|
||||
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
|
||||
TermsEnum in = termsEnum();
|
||||
switch (automaton.type) {
|
||||
case NONE:
|
||||
return TermsEnum.EMPTY;
|
||||
case ALL:
|
||||
return in;
|
||||
case SINGLE:
|
||||
return new SingleTermsEnum(in, automaton.term);
|
||||
case NORMAL:
|
||||
return new AutomatonTermsEnum(in, automaton);
|
||||
default:
|
||||
// unreachable
|
||||
throw new RuntimeException("unhandled case");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -67,6 +67,8 @@ import org.apache.lucene.util.IOUtils;
|
|||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
|
||||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||
|
@ -906,6 +908,21 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
|
|||
termsEnum.seekExact(2);
|
||||
assertEquals("world", termsEnum.term().utf8ToString());
|
||||
assertEquals(2, termsEnum.ord());
|
||||
|
||||
// NORMAL automaton
|
||||
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
|
||||
assertEquals("hello", termsEnum.next().utf8ToString());
|
||||
assertEquals(1, termsEnum.ord());
|
||||
assertEquals("world", termsEnum.next().utf8ToString());
|
||||
assertEquals(2, termsEnum.ord());
|
||||
assertNull(termsEnum.next());
|
||||
|
||||
// SINGLE automaton
|
||||
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
|
||||
assertEquals("hello", termsEnum.next().utf8ToString());
|
||||
assertEquals(1, termsEnum.ord());
|
||||
assertNull(termsEnum.next());
|
||||
|
||||
ireader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
@ -2057,6 +2074,21 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
|
|||
termsEnum.seekExact(2);
|
||||
assertEquals("world", termsEnum.term().utf8ToString());
|
||||
assertEquals(2, termsEnum.ord());
|
||||
|
||||
// NORMAL automaton
|
||||
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
|
||||
assertEquals("hello", termsEnum.next().utf8ToString());
|
||||
assertEquals(1, termsEnum.ord());
|
||||
assertEquals("world", termsEnum.next().utf8ToString());
|
||||
assertEquals(2, termsEnum.ord());
|
||||
assertNull(termsEnum.next());
|
||||
|
||||
// SINGLE automaton
|
||||
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
|
||||
assertEquals("hello", termsEnum.next().utf8ToString());
|
||||
assertEquals(1, termsEnum.ord());
|
||||
assertNull(termsEnum.next());
|
||||
|
||||
ireader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue