mirror of https://github.com/apache/lucene.git
LUCENE-7627: Add #intersect(CompiledAutomaton) to Sorted*DocValues
This commit is contained in:
parent
53d5af17da
commit
8fa0a8dd1e
|
@ -126,6 +126,10 @@ New features
|
||||||
concurrently across all segments in the index (Emmanuel Keller via
|
concurrently across all segments in the index (Emmanuel Keller via
|
||||||
Mike McCandless)
|
Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-7627: Added .intersect methods to SortedDocValues and
|
||||||
|
SortedSetDocValues to allow filtering their TermsEnums with a
|
||||||
|
CompiledAutomaton (Alan Woodward, Mike McCandless)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
|
|
||||||
* LUCENE-7547: JapaneseTokenizerFactory was failing to close the
|
* LUCENE-7547: JapaneseTokenizerFactory was failing to close the
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A per-document byte[] with presorted values. This is fundamentally an
|
* A per-document byte[] with presorted values. This is fundamentally an
|
||||||
|
@ -110,4 +111,25 @@ public abstract class SortedDocValues extends BinaryDocValues {
|
||||||
return new SortedDocValuesTermsEnum(this);
|
return new SortedDocValuesTermsEnum(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}
|
||||||
|
* The enum supports {@link TermsEnum#ord()}.
|
||||||
|
*/
|
||||||
|
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
|
||||||
|
TermsEnum in = termsEnum();
|
||||||
|
switch (automaton.type) {
|
||||||
|
case NONE:
|
||||||
|
return TermsEnum.EMPTY;
|
||||||
|
case ALL:
|
||||||
|
return in;
|
||||||
|
case SINGLE:
|
||||||
|
return new SingleTermsEnum(in, automaton.term);
|
||||||
|
case NORMAL:
|
||||||
|
return new AutomatonTermsEnum(in, automaton);
|
||||||
|
default:
|
||||||
|
// unreachable
|
||||||
|
throw new RuntimeException("unhandled case");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A multi-valued version of {@link SortedDocValues}.
|
* A multi-valued version of {@link SortedDocValues}.
|
||||||
|
@ -102,4 +103,25 @@ public abstract class SortedSetDocValues extends DocValuesIterator {
|
||||||
public TermsEnum termsEnum() throws IOException {
|
public TermsEnum termsEnum() throws IOException {
|
||||||
return new SortedSetDocValuesTermsEnum(this);
|
return new SortedSetDocValuesTermsEnum(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}
|
||||||
|
* The enum supports {@link TermsEnum#ord()}.
|
||||||
|
*/
|
||||||
|
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
|
||||||
|
TermsEnum in = termsEnum();
|
||||||
|
switch (automaton.type) {
|
||||||
|
case NONE:
|
||||||
|
return TermsEnum.EMPTY;
|
||||||
|
case ALL:
|
||||||
|
return in;
|
||||||
|
case SINGLE:
|
||||||
|
return new SingleTermsEnum(in, automaton.term);
|
||||||
|
case NORMAL:
|
||||||
|
return new AutomatonTermsEnum(in, automaton);
|
||||||
|
default:
|
||||||
|
// unreachable
|
||||||
|
throw new RuntimeException("unhandled case");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -67,6 +67,8 @@ import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.TestUtil;
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||||
|
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||||
|
import org.apache.lucene.util.automaton.RegExp;
|
||||||
|
|
||||||
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
|
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
|
||||||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||||
|
@ -906,6 +908,21 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
|
||||||
termsEnum.seekExact(2);
|
termsEnum.seekExact(2);
|
||||||
assertEquals("world", termsEnum.term().utf8ToString());
|
assertEquals("world", termsEnum.term().utf8ToString());
|
||||||
assertEquals(2, termsEnum.ord());
|
assertEquals(2, termsEnum.ord());
|
||||||
|
|
||||||
|
// NORMAL automaton
|
||||||
|
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
|
||||||
|
assertEquals("hello", termsEnum.next().utf8ToString());
|
||||||
|
assertEquals(1, termsEnum.ord());
|
||||||
|
assertEquals("world", termsEnum.next().utf8ToString());
|
||||||
|
assertEquals(2, termsEnum.ord());
|
||||||
|
assertNull(termsEnum.next());
|
||||||
|
|
||||||
|
// SINGLE automaton
|
||||||
|
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
|
||||||
|
assertEquals("hello", termsEnum.next().utf8ToString());
|
||||||
|
assertEquals(1, termsEnum.ord());
|
||||||
|
assertNull(termsEnum.next());
|
||||||
|
|
||||||
ireader.close();
|
ireader.close();
|
||||||
directory.close();
|
directory.close();
|
||||||
}
|
}
|
||||||
|
@ -2057,6 +2074,21 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
|
||||||
termsEnum.seekExact(2);
|
termsEnum.seekExact(2);
|
||||||
assertEquals("world", termsEnum.term().utf8ToString());
|
assertEquals("world", termsEnum.term().utf8ToString());
|
||||||
assertEquals(2, termsEnum.ord());
|
assertEquals(2, termsEnum.ord());
|
||||||
|
|
||||||
|
// NORMAL automaton
|
||||||
|
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
|
||||||
|
assertEquals("hello", termsEnum.next().utf8ToString());
|
||||||
|
assertEquals(1, termsEnum.ord());
|
||||||
|
assertEquals("world", termsEnum.next().utf8ToString());
|
||||||
|
assertEquals(2, termsEnum.ord());
|
||||||
|
assertNull(termsEnum.next());
|
||||||
|
|
||||||
|
// SINGLE automaton
|
||||||
|
termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
|
||||||
|
assertEquals("hello", termsEnum.next().utf8ToString());
|
||||||
|
assertEquals(1, termsEnum.ord());
|
||||||
|
assertNull(termsEnum.next());
|
||||||
|
|
||||||
ireader.close();
|
ireader.close();
|
||||||
directory.close();
|
directory.close();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue