
LUCENE-2380: add getTermsEnum for FieldCache entry and use it for fieldcache merging in Solr faceting

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@954904 13f79535-47bb-0310-9956-ffa450edef68
commit 98b1313d34
parent 3915a393f3
Author: Yonik Seeley
Date:   2010-06-15 14:15:58 +00:00

6 changed files with 167 additions and 16 deletions
Changed files:
  lucene/CHANGES.txt
  lucene/src/java/org/apache/lucene/search/FieldCache.java
  lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
  lucene/src/java/org/apache/lucene/util/PagedBytes.java
  lucene/src/test/org/apache/lucene/search/TestFieldCache.java
  solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java

@@ -79,7 +79,7 @@ Changes in backwards compatibility policy
(getTerms, getTermsIndex). Also, the sort values (returned in
FieldDoc.fields) when sorting by SortField.STRING or
SortField.STRING_VAL are now BytesRef instances. See MIGRATE.txt
-  for more details. (Mike McCandless)
+  for more details. (yonik, Mike McCandless)
* LUCENE-2480: Though not a change in backwards compatibility policy, pre-3.0
indexes are no longer supported. You should upgrade to 3.x first, then run

@@ -18,6 +18,7 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRef;
@@ -530,6 +531,9 @@ public interface FieldCache {
/** Number of documents */
public abstract int size();
/** Returns a TermsEnum that can iterate over the values in this index entry */
public abstract TermsEnum getTermsEnum();
}
/** Checks the internal cache for an appropriate entry, and if none
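For context, a minimal sketch of how a caller might use the new accessor. It assumes the entry type is FieldCache.DocTermsIndex and relies only on names that appear in this commit (getTermsIndex, getTermsEnum, lookup, ord); the wrapper class is illustrative, not part of the change.

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.util.BytesRef;

public class CachedTermsIteration {
  /** Prints every cached term for a field, in ord order. */
  public static void dumpTerms(IndexReader reader, String field) throws Exception {
    FieldCache.DocTermsIndex termsIndex = FieldCache.DEFAULT.getTermsIndex(reader, field);
    TermsEnum tenum = termsIndex.getTermsEnum();
    // Ord 0 is reserved for documents with no term, so next() hands back
    // ord 1 first (the test below iterates from i=1 for the same reason).
    for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
      System.out.println(tenum.ord() + " -> " + term.utf8ToString());
    }
  }
}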

@@ -19,17 +19,9 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.WeakHashMap;
+import java.util.*;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.*;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.GrowableWriter;
@@ -674,6 +666,105 @@ class FieldCacheImpl implements FieldCache {
public BytesRef lookup(int ord, BytesRef ret) {
return bytes.fillUsingLengthPrefix(ret, termOrdToBytesOffset.get(ord));
}
@Override
public TermsEnum getTermsEnum() {
return this.new DocTermsIndexEnum();
}
class DocTermsIndexEnum extends TermsEnum {
int currentOrd;
int currentBlockNumber;
int end; // end position in the current block
final byte[][] blocks;
final int[] blockEnds;
final BytesRef term = new BytesRef();
public DocTermsIndexEnum() {
currentOrd = 0;
currentBlockNumber = 0;
blocks = bytes.getBlocks();
blockEnds = bytes.getBlockEnds();
currentBlockNumber = bytes.fillUsingLengthPrefix2(term, termOrdToBytesOffset.get(0));
end = blockEnds[currentBlockNumber];
}
@Override
public SeekStatus seek(BytesRef text, boolean useCache) throws IOException {
// TODO - we can support with binary search
throw new UnsupportedOperationException();
}
@Override
public SeekStatus seek(long ord) throws IOException {
assert(ord >= 0 && ord <= numOrd);
// TODO: if gap is small, could iterate from current position? Or let user decide that?
currentBlockNumber = bytes.fillUsingLengthPrefix2(term, termOrdToBytesOffset.get((int)ord));
end = blockEnds[currentBlockNumber];
currentOrd = (int)ord;
return SeekStatus.FOUND;
}
@Override
public BytesRef next() throws IOException {
int start = term.offset + term.length;
if (start >= end) {
// switch byte blocks
if (currentBlockNumber +1 >= blocks.length) {
return null;
}
currentBlockNumber++;
term.bytes = blocks[currentBlockNumber];
end = blockEnds[currentBlockNumber];
start = 0;
if (end<=0) return null; // special case of empty last array
}
currentOrd++;
byte[] block = term.bytes;
if ((block[start] & 128) == 0) {
term.length = block[start];
term.offset = start+1;
} else {
term.length = (((int) (block[start] & 0x7f)) << 8) | (block[1+start] & 0xff);
term.offset = start+2;
}
return term;
}
@Override
public BytesRef term() throws IOException {
return term;
}
@Override
public long ord() throws IOException {
return currentOrd;
}
@Override
public int docFreq() {
throw new UnsupportedOperationException();
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
throw new UnsupportedOperationException();
}
}
}
private static boolean DEFAULT_FASTER_BUT_MORE_RAM = true;
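The 1-or-2-byte length prefix that next() decodes above (and that fillUsingLengthPrefix2 reads below) is simple enough to show standalone. A hedged sketch with illustrative names: lengths below 128 take a single byte; longer values set the high bit and spread a 15-bit length over two bytes.

public class LengthPrefix {
  /** Writes a prefix for length (must be < 32768); returns bytes used. */
  static int write(byte[] block, int pos, int length) {
    if (length < 128) {
      block[pos] = (byte) length;                  // one byte, high bit clear
      return 1;
    } else {
      block[pos] = (byte) (0x80 | (length >> 8));  // high bit set + upper 7 bits
      block[pos + 1] = (byte) (length & 0xff);     // lower 8 bits
      return 2;
    }
  }

  /** Mirrors the decode in DocTermsIndexEnum.next(): [0]=length, [1]=prefix size. */
  static int[] read(byte[] block, int pos) {
    if ((block[pos] & 128) == 0) {
      return new int[] { block[pos], 1 };
    }
    return new int[] { ((block[pos] & 0x7f) << 8) | (block[pos + 1] & 0xff), 2 };
  }
}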

@@ -32,6 +32,7 @@ import java.io.IOException;
* <p>@lucene.internal</p>*/
public final class PagedBytes {
private final List<byte[]> blocks = new ArrayList<byte[]>();
private final List<Integer> blockEnd = new ArrayList<Integer>();
private final int blockSize;
private final int blockBits;
private final int blockMask;
@@ -42,6 +43,7 @@ public final class PagedBytes {
public final static class Reader implements Closeable {
private final byte[][] blocks;
private final int[] blockEnds;
private final int blockBits;
private final int blockMask;
private final int blockSize;
@@ -52,6 +54,10 @@ public final class PagedBytes {
for(int i=0;i<blocks.length;i++) {
blocks[i] = pagedBytes.blocks.get(i);
}
blockEnds = new int[blocks.length];
for(int i=0;i< blockEnds.length;i++) {
blockEnds[i] = pagedBytes.blockEnd.get(i);
}
blockBits = pagedBytes.blockBits;
blockMask = pagedBytes.blockMask;
blockSize = pagedBytes.blockSize;
@@ -102,6 +108,34 @@ public final class PagedBytes {
return b;
}
/** @lucene.internal Reads length as 1 or 2 byte vInt prefix, starting @ start. Returns the block number of the term. */
public int fillUsingLengthPrefix2(BytesRef b, long start) {
final int index = (int) (start >> blockBits);
final int offset = (int) (start & blockMask);
final byte[] block = b.bytes = blocks[index];
if ((block[offset] & 128) == 0) {
b.length = block[offset];
b.offset = offset+1;
} else {
b.length = (((int) (block[offset] & 0x7f)) << 8) | (block[1+offset] & 0xff);
b.offset = offset+2;
assert b.length > 0;
}
return index;
}
/** @lucene.internal */
public byte[][] getBlocks() {
return blocks;
}
/** @lucene.internal */
public int[] getBlockEnds() {
return blockEnds;
}
public void close() {
threadBuffers.close();
}
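The index/offset arithmetic in fillUsingLengthPrefix2 relies on blockSize being a power of two: the high bits of a global byte offset select the block, the low bits select the position inside it. A small illustrative sketch (the class and method names are assumptions, not PagedBytes API):

public class PagedAddress {
  final int blockBits;  // log2 of the block size
  final int blockMask;  // blockSize - 1, masks out the in-block offset

  PagedAddress(int blockBits) {
    this.blockBits = blockBits;
    this.blockMask = (1 << blockBits) - 1;
  }

  int blockIndex(long start)    { return (int) (start >> blockBits); }
  int inBlockOffset(long start) { return (int) (start & blockMask); }
}

For example, with blockBits = 15 (32 KB blocks), global offset 70000 resolves to block 2 at offset 4464.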
@@ -123,6 +157,7 @@ public final class PagedBytes {
if (left == 0) {
if (currentBlock != null) {
blocks.add(currentBlock);
blockEnd.add(upto);
}
currentBlock = new byte[blockSize];
upto = 0;
@@ -149,6 +184,7 @@ public final class PagedBytes {
if (left == 0) {
if (currentBlock != null) {
blocks.add(currentBlock);
blockEnd.add(upto);
}
currentBlock = new byte[blockSize];
upto = 0;
@@ -178,6 +214,7 @@ public final class PagedBytes {
currentBlock = EMPTY_BYTES;
}
blocks.add(currentBlock);
blockEnd.add(upto);
currentBlock = null;
return new Reader(this);
}
@@ -200,6 +237,7 @@ public final class PagedBytes {
}
if (currentBlock != null) {
blocks.add(currentBlock);
blockEnd.add(upto);
}
currentBlock = new byte[blockSize];
upto = 0;
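Why blockEnd is recorded at each of these points: a value and its length prefix are always read from a single block (fillUsingLengthPrefix2 fills b.bytes from exactly one block), so a block can end with unused tail bytes. An iterator like DocTermsIndexEnum needs the per-block end to know when to hop blocks (its next() checks start >= end). A hedged sketch of a consumer, assuming only the getBlocks()/getBlockEnds() accessors added above:

/** Sums the valid bytes across all blocks (illustrative helper). */
static long validBytes(byte[][] blocks, int[] blockEnds) {
  long total = 0;
  for (int i = 0; i < blocks.length; i++) {
    total += blockEnds[i];  // bytes actually written into block i; any tail is unused
  }
  return total;
}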

@@ -22,6 +22,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.store.Directory;
@@ -167,6 +168,20 @@ public class TestFieldCache extends LuceneTestCase {
final String s = term == null ? null : term.utf8ToString();
assertTrue("for doc " + i + ": " + s + " does not equal: " + unicodeStrings[i], unicodeStrings[i] == null || unicodeStrings[i].equals(s));
}
int nTerms = termsIndex.numOrd();
// System.out.println("nTerms="+nTerms);
TermsEnum tenum = termsIndex.getTermsEnum();
BytesRef val = new BytesRef();
for (int i=1; i<nTerms; i++) {
BytesRef val1 = tenum.next();
BytesRef val2 = termsIndex.lookup(i,val);
// System.out.println("i="+i);
assertEquals(val2, val1);
}
// test bad field
termsIndex = cache.getTermsIndex(reader, "bogusfield");
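The loop above checks next() against lookup(); the new seek-by-ord path could be spot-checked the same way. A hypothetical extension of the test, relying only on seek(long) as implemented in this commit:

TermsEnum tenum2 = termsIndex.getTermsEnum();
BytesRef expected = new BytesRef();
for (int i = 1; i < nTerms; i += 7) {  // sample a few ords rather than all
  tenum2.seek(i);                      // positions directly on ord i
  assertEquals(termsIndex.lookup(i, expected), tenum2.term());
}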

@@ -1,5 +1,6 @@
package org.apache.solr.request;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
@@ -137,7 +138,9 @@ class PerSegmentSingleValuedFaceting {
seg.pos = seg.startTermIndex;
}
if (seg.pos < seg.endTermIndex) {
-seg.si.lookup(seg.pos, seg.tempBR);
+seg.tenum = seg.si.getTermsEnum();
+seg.tenum.seek(seg.pos);
+seg.tempBR = seg.tenum.term();
queue.add(seg);
}
}
@@ -156,7 +159,6 @@
SegFacet seg = queue.top();
// make a shallow copy
// Is this always safe? Or could the byte[] be changed?
val.bytes = seg.tempBR.bytes;
val.offset = seg.tempBR.offset;
val.length = seg.tempBR.length;
@@ -173,7 +175,7 @@
queue.pop();
seg = queue.top();
} else {
-seg.si.lookup(seg.pos, seg.tempBR);
+seg.tempBR = seg.tenum.next();
seg = queue.updateTop();
}
} while (seg != null && val.compareTo(seg.tempBR) == 0);
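The control flow above is a classic k-way merge: each segment's TermsEnum is positioned on its first term, a priority queue keyed on the current term pops the smallest, and equal terms from other segments are drained before advancing. A simplified, hypothetical sketch of that shape (count accumulation elided; none of these names are the Solr ones):

import java.io.IOException;
import java.util.Comparator;
import java.util.PriorityQueue;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class SegCursor {
  TermsEnum tenum;
  BytesRef current;  // term this cursor is positioned on

  boolean advance() throws IOException {
    current = tenum.next();  // null when the segment is exhausted
    return current != null;
  }
}

class TermMerger {
  final PriorityQueue<SegCursor> queue = new PriorityQueue<SegCursor>(16,
      new Comparator<SegCursor>() {
        public int compare(SegCursor a, SegCursor b) {
          return a.current.compareTo(b.current);
        }
      });

  /** Returns the next distinct term across all segments, or null when done. */
  BytesRef next() throws IOException {
    SegCursor cur = queue.poll();
    if (cur == null) return null;
    // Deep-copy the term: the enum reuses its internal bytes on advance
    // (the same hazard the shallow-copy comment above asks about).
    BytesRef term = new BytesRef();
    term.bytes = new byte[cur.current.length];
    System.arraycopy(cur.current.bytes, cur.current.offset, term.bytes, 0, cur.current.length);
    term.length = cur.current.length;
    while (true) {
      if (cur.advance()) {
        queue.add(cur);  // reinsert at its next (strictly greater) term
      }
      SegCursor top = queue.peek();
      if (top == null || term.compareTo(top.current) != 0) {
        return term;
      }
      cur = queue.poll();  // another segment holds the same term; drain it too
    }
  }
}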
@@ -215,9 +217,10 @@
int endTermIndex;
int[] counts;
-int pos; // only used during merge with other segments
+int pos; // only used when merging
+TermsEnum tenum; // only used when merging
-final BytesRef tempBR = new BytesRef();
+BytesRef tempBR = new BytesRef();
void countTerms() throws IOException {
si = FieldCache.DEFAULT.getTermsIndex(reader, fieldName);