LUCENE-7627: Add #intersect(CompiledAutomaton) to Sorted*DocValues

This commit is contained in:
Alan Woodward 2017-01-11 12:07:11 +00:00
parent 53d5af17da
commit 8fa0a8dd1e
4 changed files with 80 additions and 0 deletions

View File

@ -126,6 +126,10 @@ New features
concurrently across all segments in the index (Emmanuel Keller via
Mike McCandless)
* LUCENE-7627: Added .intersect methods to SortedDocValues and
SortedSetDocValues to allow filtering their TermsEnums with a
CompiledAutomaton (Alan Woodward, Mike McCandless)
Bug Fixes
* LUCENE-7547: JapaneseTokenizerFactory was failing to close the

View File

@ -20,6 +20,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
/**
* A per-document byte[] with presorted values. This is fundamentally an
@ -110,4 +111,25 @@ public abstract class SortedDocValues extends BinaryDocValues {
return new SortedDocValuesTermsEnum(this);
}
/**
 * Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}.
 * The enum supports {@link TermsEnum#ord()}.
 *
 * <p>The result depends on {@code automaton.type}: {@code NONE} yields
 * {@link TermsEnum#EMPTY}, {@code ALL} yields the plain {@link #termsEnum()},
 * {@code SINGLE} wraps it in a {@link SingleTermsEnum} for {@code automaton.term},
 * and {@code NORMAL} wraps it in an {@link AutomatonTermsEnum}.
 *
 * @param automaton the compiled automaton used to filter the values
 * @return a terms enum restricted according to {@code automaton}
 * @throws IOException declared for the underlying {@link #termsEnum()} call
 */
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  // The unfiltered enum is always obtained first, whatever the automaton type.
  final TermsEnum allValues = termsEnum();
  final TermsEnum filtered;
  switch (automaton.type) {
    case NORMAL:
      filtered = new AutomatonTermsEnum(allValues, automaton);
      break;
    case SINGLE:
      filtered = new SingleTermsEnum(allValues, automaton.term);
      break;
    case ALL:
      filtered = allValues;
      break;
    case NONE:
      filtered = TermsEnum.EMPTY;
      break;
    default:
      // unreachable: the four cases above are the ones this method handles
      throw new RuntimeException("unhandled case");
  }
  return filtered;
}
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
/**
* A multi-valued version of {@link SortedDocValues}.
@ -102,4 +103,25 @@ public abstract class SortedSetDocValues extends DocValuesIterator {
public TermsEnum termsEnum() throws IOException {
  // Expose this doc-values instance through its TermsEnum adapter.
  final TermsEnum valuesEnum = new SortedSetDocValuesTermsEnum(this);
  return valuesEnum;
}
/**
 * Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}.
 * The enum supports {@link TermsEnum#ord()}.
 *
 * <p>The result depends on {@code automaton.type}: {@code NONE} yields
 * {@link TermsEnum#EMPTY}, {@code ALL} yields the plain {@link #termsEnum()},
 * {@code SINGLE} wraps it in a {@link SingleTermsEnum} for {@code automaton.term},
 * and {@code NORMAL} wraps it in an {@link AutomatonTermsEnum}.
 *
 * @param automaton the compiled automaton used to filter the values
 * @return a terms enum restricted according to {@code automaton}
 * @throws IOException declared for the underlying {@link #termsEnum()} call
 */
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  // The unfiltered enum is always obtained first, whatever the automaton type.
  final TermsEnum allValues = termsEnum();
  final TermsEnum filtered;
  switch (automaton.type) {
    case NORMAL:
      filtered = new AutomatonTermsEnum(allValues, automaton);
      break;
    case SINGLE:
      filtered = new SingleTermsEnum(allValues, automaton.term);
      break;
    case ALL:
      filtered = allValues;
      break;
    case NONE:
      filtered = TermsEnum.EMPTY;
      break;
    default:
      // unreachable: the four cases above are the ones this method handles
      throw new RuntimeException("unhandled case");
  }
  return filtered;
}
}

View File

@ -67,6 +67,8 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
@ -906,6 +908,21 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
termsEnum.seekExact(2);
assertEquals("world", termsEnum.term().utf8ToString());
assertEquals(2, termsEnum.ord());
// NORMAL automaton
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
assertEquals("hello", termsEnum.next().utf8ToString());
assertEquals(1, termsEnum.ord());
assertEquals("world", termsEnum.next().utf8ToString());
assertEquals(2, termsEnum.ord());
assertNull(termsEnum.next());
// SINGLE automaton
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
assertEquals("hello", termsEnum.next().utf8ToString());
assertEquals(1, termsEnum.ord());
assertNull(termsEnum.next());
ireader.close();
directory.close();
}
@ -2057,6 +2074,21 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
termsEnum.seekExact(2);
assertEquals("world", termsEnum.term().utf8ToString());
assertEquals(2, termsEnum.ord());
// NORMAL automaton
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
assertEquals("hello", termsEnum.next().utf8ToString());
assertEquals(1, termsEnum.ord());
assertEquals("world", termsEnum.next().utf8ToString());
assertEquals(2, termsEnum.ord());
assertNull(termsEnum.next());
// SINGLE automaton
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
assertEquals("hello", termsEnum.next().utf8ToString());
assertEquals(1, termsEnum.ord());
assertNull(termsEnum.next());
ireader.close();
directory.close();
}