mirror of https://github.com/apache/lucene.git
synced 2025-02-16 15:06:41 +00:00

LUCENE-2380: add getTermsEnum for FieldCache entry and use it for fieldcache merging in Solr faceting

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@954904 13f79535-47bb-0310-9956-ffa450edef68

This commit is contained in:
parent 3915a393f3
commit 98b1313d34

Changed paths: lucene, solr/src/java/org/apache/solr/request
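In short, the change: a FieldCache entry can now hand back a TermsEnum, so consumers (like Solr's per-segment faceting merge below) can walk cached term values in order instead of calling lookup(ord, ...) per value. A minimal usage sketch against this 2010-era trunk API; getTermsIndex, getTermsEnum, and lookup appear in the diff, while the DocTermsIndex type name is assumed from the surrounding trunk code and may differ in other versions:

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.FieldCache;
    import org.apache.lucene.util.BytesRef;

    public class FieldCacheEnumExample {
      // Walks every cached term value for a field, in term order.
      public static void dumpFieldValues(IndexReader reader, String field) throws Exception {
        FieldCache.DocTermsIndex index = FieldCache.DEFAULT.getTermsIndex(reader, field);
        TermsEnum te = index.getTermsEnum();  // the method this commit introduces
        for (BytesRef term = te.next(); term != null; term = te.next()) {
          System.out.println(term.utf8ToString());
        }
      }
    }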
@@ -79,7 +79,7 @@ Changes in backwards compatibility policy
   (getTerms, getTermsIndex). Also, the sort values (returned in
   FieldDoc.fields) when sorting by SortField.STRING or
   SortField.STRING_VAL are now BytesRef instances. See MIGRATE.txt
-  for more details. (Mike McCandless)
+  for more details. (yonik, Mike McCandless)
 
 * LUCENE-2480: Though not a change in backwards compatibility policy, pre-3.0
   indexes are no longer supported. You should upgrade to 3.x first, then run
@@ -18,6 +18,7 @@ package org.apache.lucene.search;
  */
 
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.NumericUtils;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.BytesRef;
@@ -530,6 +531,9 @@ public interface FieldCache {
 
     /** Number of documents */
     public abstract int size();
+
+    /** Returns a TermsEnum that can iterate over the values in this index entry */
+    public abstract TermsEnum getTermsEnum();
   }
 
   /** Checks the internal cache for an appropriate entry, and if none
@@ -19,17 +19,9 @@ package org.apache.lucene.search;
 
 import java.io.IOException;
 import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.WeakHashMap;
+import java.util.*;
 
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.*;
 import org.apache.lucene.util.PagedBytes;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.GrowableWriter;
@@ -674,6 +666,105 @@ class FieldCacheImpl implements FieldCache {
       public BytesRef lookup(int ord, BytesRef ret) {
         return bytes.fillUsingLengthPrefix(ret, termOrdToBytesOffset.get(ord));
       }
+
+      @Override
+      public TermsEnum getTermsEnum() {
+        return this.new DocTermsIndexEnum();
+      }
+
+      class DocTermsIndexEnum extends TermsEnum {
+        int currentOrd;
+        int currentBlockNumber;
+        int end;  // end position in the current block
+        final byte[][] blocks;
+        final int[] blockEnds;
+
+        final BytesRef term = new BytesRef();
+
+        public DocTermsIndexEnum() {
+          currentOrd = 0;
+          currentBlockNumber = 0;
+          blocks = bytes.getBlocks();
+          blockEnds = bytes.getBlockEnds();
+          currentBlockNumber = bytes.fillUsingLengthPrefix2(term, termOrdToBytesOffset.get(0));
+          end = blockEnds[currentBlockNumber];
+        }
+
+        @Override
+        public SeekStatus seek(BytesRef text, boolean useCache) throws IOException {
+          // TODO - we can support this with binary search
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public SeekStatus seek(long ord) throws IOException {
+          assert ord >= 0 && ord <= numOrd;
+          // TODO: if the gap is small, could iterate from the current position? Or let the caller decide?
+          currentBlockNumber = bytes.fillUsingLengthPrefix2(term, termOrdToBytesOffset.get((int) ord));
+          end = blockEnds[currentBlockNumber];
+          currentOrd = (int) ord;
+          return SeekStatus.FOUND;
+        }
+
+        @Override
+        public BytesRef next() throws IOException {
+          int start = term.offset + term.length;
+          if (start >= end) {
+            // switch byte blocks
+            if (currentBlockNumber + 1 >= blocks.length) {
+              return null;
+            }
+            currentBlockNumber++;
+            term.bytes = blocks[currentBlockNumber];
+            end = blockEnds[currentBlockNumber];
+            start = 0;
+            if (end <= 0) return null;  // special case of empty last array
+          }
+
+          currentOrd++;
+
+          byte[] block = term.bytes;
+          if ((block[start] & 128) == 0) {
+            term.length = block[start];
+            term.offset = start + 1;
+          } else {
+            term.length = (((int) (block[start] & 0x7f)) << 8) | (block[1 + start] & 0xff);
+            term.offset = start + 2;
+          }
+
+          return term;
+        }
+
+        @Override
+        public BytesRef term() throws IOException {
+          return term;
+        }
+
+        @Override
+        public long ord() throws IOException {
+          return currentOrd;
+        }
+
+        @Override
+        public int docFreq() {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public Comparator<BytesRef> getComparator() throws IOException {
+          throw new UnsupportedOperationException();
+        }
+      }
     }
 
   private static boolean DEFAULT_FASTER_BUT_MORE_RAM = true;
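Both DocTermsIndexEnum.next() above and PagedBytes.fillUsingLengthPrefix2 below decode the same 1-or-2-byte length prefix: if the high bit of the first byte is clear, that byte is the length (0-127); otherwise the low 7 bits form the high byte of a 15-bit length, giving a maximum of 32767. A self-contained sketch of the scheme under those rules; encodeLength/decodeLength are illustrative names, not Lucene API:

    // Sketch of the length-prefix scheme decoded by next() and
    // fillUsingLengthPrefix2; run with -ea to enable the asserts.
    public class LengthPrefix {
      /** Writes len as a 1- or 2-byte prefix at offset; returns bytes written. */
      static int encodeLength(byte[] block, int offset, int len) {
        if (len < 128) {
          block[offset] = (byte) len;                  // high bit clear: 1-byte form
          return 1;
        }
        assert len <= 0x7fff;                          // only 15 bits available
        block[offset] = (byte) (0x80 | (len >> 8));    // high bit set: 2-byte form
        block[offset + 1] = (byte) (len & 0xff);
        return 2;
      }

      /** Mirrors the decode above: returns {length, prefixSize}. */
      static int[] decodeLength(byte[] block, int offset) {
        if ((block[offset] & 128) == 0) {
          return new int[] {block[offset], 1};
        }
        int len = ((block[offset] & 0x7f) << 8) | (block[offset + 1] & 0xff);
        return new int[] {len, 2};
      }

      public static void main(String[] args) {
        byte[] buf = new byte[4];
        for (int len : new int[] {5, 127, 128, 300, 32767}) {
          int n = encodeLength(buf, 0, len);
          int[] d = decodeLength(buf, 0);
          assert d[0] == len && d[1] == n;
          System.out.println(len + " -> " + n + " byte prefix");
        }
      }
    }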
@@ -32,6 +32,7 @@ import java.io.IOException;
  * <p>@lucene.internal</p>*/
 public final class PagedBytes {
   private final List<byte[]> blocks = new ArrayList<byte[]>();
+  private final List<Integer> blockEnd = new ArrayList<Integer>();
   private final int blockSize;
   private final int blockBits;
   private final int blockMask;
@@ -42,6 +43,7 @@ public final class PagedBytes {
 
   public final static class Reader implements Closeable {
     private final byte[][] blocks;
+    private final int[] blockEnds;
     private final int blockBits;
     private final int blockMask;
     private final int blockSize;
@@ -52,6 +54,10 @@ public final class PagedBytes {
       for(int i=0;i<blocks.length;i++) {
         blocks[i] = pagedBytes.blocks.get(i);
       }
+      blockEnds = new int[blocks.length];
+      for(int i=0;i<blockEnds.length;i++) {
+        blockEnds[i] = pagedBytes.blockEnd.get(i);
+      }
       blockBits = pagedBytes.blockBits;
       blockMask = pagedBytes.blockMask;
       blockSize = pagedBytes.blockSize;
@@ -102,6 +108,34 @@ public final class PagedBytes {
       return b;
     }
 
+    /** Reads the length as a 1- or 2-byte vInt prefix, starting at start.
+     *  Returns the block number of the term.
+     *  @lucene.internal */
+    public int fillUsingLengthPrefix2(BytesRef b, long start) {
+      final int index = (int) (start >> blockBits);
+      final int offset = (int) (start & blockMask);
+      final byte[] block = b.bytes = blocks[index];
+
+      if ((block[offset] & 128) == 0) {
+        b.length = block[offset];
+        b.offset = offset + 1;
+      } else {
+        b.length = (((int) (block[offset] & 0x7f)) << 8) | (block[1 + offset] & 0xff);
+        b.offset = offset + 2;
+        assert b.length > 0;
+      }
+      return index;
+    }
+
+    /** @lucene.internal */
+    public byte[][] getBlocks() {
+      return blocks;
+    }
+
+    /** @lucene.internal */
+    public int[] getBlockEnds() {
+      return blockEnds;
+    }
+
     public void close() {
       threadBuffers.close();
     }
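The first two lines of fillUsingLengthPrefix2 are the standard power-of-two block addressing trick: with blockSize = 1 << blockBits and blockMask = blockSize - 1, a global byte address splits into a block index (shift) and an in-block offset (mask). A small check of the arithmetic; the constants here are assumed for illustration, not read from PagedBytes:

    // Illustrates the shift/mask address split used by fillUsingLengthPrefix2.
    public class BlockAddressing {
      public static void main(String[] args) {
        final int blockBits = 15;                  // e.g. 32 KB blocks (assumed)
        final int blockSize = 1 << blockBits;
        final int blockMask = blockSize - 1;

        long start = 100_000L;                     // a global byte address
        int index = (int) (start >> blockBits);    // which block
        int offset = (int) (start & blockMask);    // where inside that block

        assert (long) index * blockSize + offset == start;
        System.out.println("block " + index + ", offset " + offset);
      }
    }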
@@ -123,6 +157,7 @@ public final class PagedBytes {
     if (left == 0) {
       if (currentBlock != null) {
         blocks.add(currentBlock);
+        blockEnd.add(upto);
       }
       currentBlock = new byte[blockSize];
       upto = 0;
@@ -149,6 +184,7 @@ public final class PagedBytes {
     if (left == 0) {
       if (currentBlock != null) {
         blocks.add(currentBlock);
+        blockEnd.add(upto);
       }
       currentBlock = new byte[blockSize];
       upto = 0;
@@ -178,6 +214,7 @@ public final class PagedBytes {
       currentBlock = EMPTY_BYTES;
     }
     blocks.add(currentBlock);
+    blockEnd.add(upto);
     currentBlock = null;
     return new Reader(this);
   }
@@ -200,6 +237,7 @@ public final class PagedBytes {
     }
     if (currentBlock != null) {
       blocks.add(currentBlock);
+      blockEnd.add(upto);
     }
     currentBlock = new byte[blockSize];
     upto = 0;
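The four PagedBytes hunks above all add the same bookkeeping: whenever the writer finishes a block (or flushes the partially filled tail on freeze), it records in blockEnd how many bytes of that block were actually used, which is exactly what the new enumerator reads back as blockEnds[n] to know where valid data stops. A toy version of the pattern, simplified from the real PagedBytes writer:

    import java.util.ArrayList;
    import java.util.List;

    // Toy version of the blocks/blockEnd bookkeeping PagedBytes gains here.
    public class BlockWriter {
      static final int BLOCK_SIZE = 8;
      final List<byte[]> blocks = new ArrayList<>();
      final List<Integer> blockEnd = new ArrayList<>();  // bytes used per finished block
      byte[] currentBlock = new byte[BLOCK_SIZE];
      int upto = 0;

      void writeByte(byte b) {
        if (upto == BLOCK_SIZE) {          // roll to a fresh block
          blocks.add(currentBlock);
          blockEnd.add(upto);              // the bookkeeping this commit adds
          currentBlock = new byte[BLOCK_SIZE];
          upto = 0;
        }
        currentBlock[upto++] = b;
      }

      void finish() {                      // flush the partially filled tail block
        blocks.add(currentBlock);
        blockEnd.add(upto);
        currentBlock = null;
      }

      public static void main(String[] args) {
        BlockWriter w = new BlockWriter();
        for (int i = 0; i < 11; i++) w.writeByte((byte) i);
        w.finish();
        System.out.println(w.blockEnd);    // prints [8, 3]
      }
    }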
@@ -22,6 +22,7 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.store.MockRAMDirectory;
 import org.apache.lucene.store.Directory;
@@ -167,6 +168,20 @@ public class TestFieldCache extends LuceneTestCase {
       final String s = term == null ? null : term.utf8ToString();
       assertTrue("for doc " + i + ": " + s + " does not equal: " + unicodeStrings[i], unicodeStrings[i] == null || unicodeStrings[i].equals(s));
     }
+
+    int nTerms = termsIndex.numOrd();
+    // System.out.println("nTerms="+nTerms);
+
+    TermsEnum tenum = termsIndex.getTermsEnum();
+    BytesRef val = new BytesRef();
+    for (int i=1; i<nTerms; i++) {
+      BytesRef val1 = tenum.next();
+      BytesRef val2 = termsIndex.lookup(i,val);
+      // System.out.println("i="+i);
+      assertEquals(val2, val1);
+    }
+
 
     // test bad field
     termsIndex = cache.getTermsIndex(reader, "bogusfield");
@@ -1,5 +1,6 @@
 package org.apache.solr.request;
 
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.FieldCache;
@@ -137,7 +138,9 @@ class PerSegmentSingleValuedFaceting {
         seg.pos = seg.startTermIndex;
       }
       if (seg.pos < seg.endTermIndex) {
-        seg.si.lookup(seg.pos, seg.tempBR);
+        seg.tenum = seg.si.getTermsEnum();
+        seg.tenum.seek(seg.pos);
+        seg.tempBR = seg.tenum.term();
         queue.add(seg);
       }
     }
@@ -156,7 +159,6 @@ class PerSegmentSingleValuedFaceting {
       SegFacet seg = queue.top();
 
       // make a shallow copy
-      // Is this always safe? Or could the byte[] be changed?
       val.bytes = seg.tempBR.bytes;
       val.offset = seg.tempBR.offset;
       val.length = seg.tempBR.length;
@@ -173,7 +175,7 @@ class PerSegmentSingleValuedFaceting {
           queue.pop();
           seg = queue.top();
         } else {
-          seg.si.lookup(seg.pos, seg.tempBR);
+          seg.tempBR = seg.tenum.next();
           seg = queue.updateTop();
         }
       } while (seg != null && val.compareTo(seg.tempBR) == 0);
@@ -215,9 +217,10 @@ class PerSegmentSingleValuedFaceting {
     int endTermIndex;
     int[] counts;
 
-    int pos; // only used during merge with other segments
+    int pos; // only used when merging
+    TermsEnum tenum; // only used when merging
 
-    final BytesRef tempBR = new BytesRef();
+    BytesRef tempBR = new BytesRef();
 
     void countTerms() throws IOException {
       si = FieldCache.DEFAULT.getTermsIndex(reader, fieldName);
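The faceting hunks above swap per-ord lookup() calls for a per-segment TermsEnum cursor inside what is a classic k-way merge: each segment sits in a priority queue keyed by its current term, equal terms across segments have their counts summed, and each drained cursor advances and re-enters the queue. A standalone sketch of that merge pattern; Cursor stands in for Solr's SegFacet and none of these names are Solr API:

    import java.util.PriorityQueue;

    // Standalone sketch of the k-way merge used by PerSegmentSingleValuedFaceting.
    public class KWayMerge {
      static final class Cursor {
        final String[] terms;   // sorted term values for one "segment"
        final int[] counts;
        int pos = -1;
        Cursor(String[] terms, int[] counts) { this.terms = terms; this.counts = counts; }
        boolean advance() { return ++pos < terms.length; }
        String term() { return terms[pos]; }
      }

      public static void main(String[] args) {
        PriorityQueue<Cursor> queue =
            new PriorityQueue<>((a, b) -> a.term().compareTo(b.term()));
        Cursor[] segs = {
          new Cursor(new String[] {"apple", "pear"}, new int[] {3, 1}),
          new Cursor(new String[] {"apple", "plum"}, new int[] {2, 5}),
        };
        for (Cursor c : segs) if (c.advance()) queue.add(c);

        while (!queue.isEmpty()) {
          String val = queue.peek().term();
          int count = 0;
          // drain every cursor currently positioned on val, like the do/while above
          while (!queue.isEmpty() && queue.peek().term().equals(val)) {
            Cursor c = queue.poll();
            count += c.counts[c.pos];
            if (c.advance()) queue.add(c);  // re-insert, analogous to updateTop()
          }
          System.out.println(val + " -> " + count);
        }
      }
    }

The real code keeps the popped segment on top and calls updateTop() instead of poll/add pairs, which avoids one queue reshuffle per step; the sketch trades that optimization for brevity.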