lucene 4: Upgraded the simple id cache.

Author: Martijn van Groningen, 2012-10-26 16:51:45 +02:00
Committed by: Shay Banon
parent 683be6fc64
commit 24ef987624
5 changed files with 108 additions and 96 deletions

View File: src/main/java/org/elasticsearch/index/cache/id/IdCache.java

@@ -19,10 +19,13 @@
 package org.elasticsearch.index.cache.id;
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.IndexReader;
 import org.elasticsearch.common.component.CloseableComponent;
 import org.elasticsearch.index.IndexComponent;
+import java.util.List;
 /**
  *
  */
@@ -32,7 +35,7 @@ public interface IdCache extends IndexComponent, CloseableComponent, Iterable<Id
     void clear(IndexReader reader);
-    void refresh(IndexReader[] readers) throws Exception;
+    void refresh(List<AtomicReaderContext> readers) throws Exception;
     IdReaderCache reader(IndexReader reader);

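The interface change above is the heart of the upgrade: Lucene 4 no longer hands out the segment readers as a flat IndexReader[], so refresh() now receives the per-segment AtomicReaderContext leaves. A minimal sketch (not the real implementation: the cache value type and buildEntryFor() are placeholders) of how an implementation keyed by segment core cache keys adopts the new signature, using the same double-checked refresh pattern as SimpleIdCache below:

```java
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

import org.apache.lucene.index.AtomicReaderContext;

class SketchIdCache {

    private final ConcurrentMap<Object, Object> idReaders = new ConcurrentHashMap<Object, Object>();

    public void refresh(List<AtomicReaderContext> leaves) throws Exception {
        if (refreshNeeded(leaves)) {
            synchronized (idReaders) {
                if (!refreshNeeded(leaves)) {
                    return; // another thread loaded the missing segments first
                }
                for (AtomicReaderContext context : leaves) {
                    // entries are keyed by the segment core cache key, so a
                    // reopened top-level reader reuses unchanged segments
                    Object key = context.reader().getCoreCacheKey();
                    if (!idReaders.containsKey(key)) {
                        idReaders.put(key, buildEntryFor(context));
                    }
                }
            }
        }
    }

    private boolean refreshNeeded(List<AtomicReaderContext> leaves) {
        for (AtomicReaderContext context : leaves) {
            if (!idReaders.containsKey(context.reader().getCoreCacheKey())) {
                return true;
            }
        }
        return false;
    }

    private Object buildEntryFor(AtomicReaderContext context) {
        return new Object(); // placeholder for the per-segment id maps
    }
}
```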
View File: src/main/java/org/elasticsearch/index/cache/id/simple/SimpleIdCache.java

@@ -20,10 +20,11 @@
 package org.elasticsearch.index.cache.id.simple;
 import gnu.trove.impl.Constants;
 import gnu.trove.map.hash.TIntObjectHashMap;
 import org.apache.lucene.index.*;
-import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.ElasticSearchException;
 import org.elasticsearch.ElasticSearchIllegalArgumentException;
+import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.bytes.HashedBytesArray;
 import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.common.inject.Inject;
@@ -39,10 +40,7 @@ import org.elasticsearch.index.mapper.internal.ParentFieldMapper;
 import org.elasticsearch.index.mapper.internal.UidFieldMapper;
 import org.elasticsearch.index.settings.IndexSettings;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
+import java.util.*;
 import java.util.concurrent.ConcurrentMap;
 /**
@@ -91,20 +89,20 @@ public class SimpleIdCache extends AbstractIndexComponent implements IdCache, Se
     @SuppressWarnings({"StringEquality"})
     @Override
-    public void refresh(IndexReader[] readers) throws Exception {
+    public void refresh(List<AtomicReaderContext> atomicReaderContexts) throws Exception {
         // do a quick check for the common case, that all are there
-        if (refreshNeeded(readers)) {
+        if (refreshNeeded(atomicReaderContexts)) {
             synchronized (idReaders) {
-                if (!refreshNeeded(readers)) {
+                if (!refreshNeeded(atomicReaderContexts)) {
                     return;
                 }
                 // do the refresh
-                Map<Object, Map<String, TypeBuilder>> builders = new HashMap<Object, Map<String, TypeBuilder>>();
+                Map<Object, Map<BytesReference, TypeBuilder>> builders = new HashMap<Object, Map<BytesReference, TypeBuilder>>();
                 // first, go over and load all the id->doc map for all types
-                for (IndexReader reader : readers) {
+                for (AtomicReaderContext context : atomicReaderContexts) {
+                    AtomicReader reader = context.reader();
                     if (idReaders.containsKey(reader.getCoreCacheKey())) {
                         // no need, continue
                         continue;
@@ -113,98 +111,84 @@ public class SimpleIdCache extends AbstractIndexComponent implements IdCache, Se
                     if (reader instanceof SegmentReader) {
                         ((SegmentReader) reader).addCoreClosedListener(this);
                     }
-                    HashMap<String, TypeBuilder> readerBuilder = new HashMap<String, TypeBuilder>();
+                    Map<BytesReference, TypeBuilder> readerBuilder = new HashMap<BytesReference, TypeBuilder>();
                     builders.put(reader.getCoreCacheKey(), readerBuilder);
-                    String field = StringHelper.intern(UidFieldMapper.NAME);
-                    TermDocs termDocs = reader.termDocs();
-                    TermEnum termEnum = reader.terms(new Term(field));
-                    try {
-                        do {
-                            Term term = termEnum.term();
-                            if (term == null || term.field() != field) break;
-                            // TODO we can optimize this, since type is the prefix, and we get terms ordered
-                            // so, only need to move to the next type once its different
-                            Uid uid = Uid.createUid(term.text());
-                            TypeBuilder typeBuilder = readerBuilder.get(uid.type());
-                            if (typeBuilder == null) {
-                                typeBuilder = new TypeBuilder(reader);
-                                readerBuilder.put(StringHelper.intern(uid.type()), typeBuilder);
-                            }
+                    Terms terms = reader.terms(UidFieldMapper.NAME);
+                    if (terms == null) { // Should not happen
+                        throw new ElasticSearchIllegalArgumentException("Id cache needs _uid field");
+                    }
-                            HashedBytesArray idAsBytes = checkIfCanReuse(builders, new HashedBytesArray(uid.id()));
-                            termDocs.seek(termEnum);
-                            while (termDocs.next()) {
-                                // when traversing, make sure to ignore deleted docs, so the key->docId will be correct
-                                if (!reader.isDeleted(termDocs.doc())) {
-                                    typeBuilder.idToDoc.put(idAsBytes, termDocs.doc());
-                                    typeBuilder.docToId[termDocs.doc()] = idAsBytes;
-                                }
-                            }
-                        } while (termEnum.next());
-                    } finally {
-                        termDocs.close();
-                        termEnum.close();
+                    TermsEnum termsEnum = terms.iterator(null);
+                    DocsEnum docsEnum = null;
+                    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
+                        HashedBytesArray[] typeAndId = splitUidIntoTypeAndId(term);
+                        TypeBuilder typeBuilder = readerBuilder.get(typeAndId[0]);
+                        if (typeBuilder == null) {
+                            typeBuilder = new TypeBuilder(reader);
+                            readerBuilder.put(typeAndId[0], typeBuilder);
+                        }
+                        HashedBytesArray idAsBytes = checkIfCanReuse(builders, typeAndId[1]);
+                        docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0);
+                        for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
+                            typeBuilder.idToDoc.put(idAsBytes, docId);
+                            typeBuilder.docToId[docId] = idAsBytes;
+                        }
+                    }
                 }
                 // now, go and load the docId->parentId map
-                for (IndexReader reader : readers) {
+                for (AtomicReaderContext context : atomicReaderContexts) {
+                    AtomicReader reader = context.reader();
                     if (idReaders.containsKey(reader.getCoreCacheKey())) {
                         // no need, continue
                         continue;
                     }
-                    Map<String, TypeBuilder> readerBuilder = builders.get(reader.getCoreCacheKey());
+                    Map<BytesReference, TypeBuilder> readerBuilder = builders.get(reader.getCoreCacheKey());
-                    String field = StringHelper.intern(ParentFieldMapper.NAME);
-                    TermDocs termDocs = reader.termDocs();
-                    TermEnum termEnum = reader.terms(new Term(field));
-                    try {
-                        do {
-                            Term term = termEnum.term();
-                            if (term == null || term.field() != field) break;
-                            // TODO we can optimize this, since type is the prefix, and we get terms ordered
-                            // so, only need to move to the next type once its different
-                            Uid uid = Uid.createUid(term.text());
+                    Terms terms = reader.terms(ParentFieldMapper.NAME);
+                    if (terms == null) { // Should not happen
+                        throw new ElasticSearchIllegalArgumentException("Id cache needs _parent field");
+                    }
-                            TypeBuilder typeBuilder = readerBuilder.get(uid.type());
-                            if (typeBuilder == null) {
-                                typeBuilder = new TypeBuilder(reader);
-                                readerBuilder.put(StringHelper.intern(uid.type()), typeBuilder);
+                    TermsEnum termsEnum = terms.iterator(null);
+                    DocsEnum docsEnum = null;
+                    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
+                        HashedBytesArray[] typeAndId = splitUidIntoTypeAndId(term);
+                        TypeBuilder typeBuilder = readerBuilder.get(typeAndId[0]);
+                        if (typeBuilder == null) {
+                            typeBuilder = new TypeBuilder(reader);
+                            readerBuilder.put(typeAndId[0], typeBuilder);
+                        }
+                        HashedBytesArray idAsBytes = checkIfCanReuse(builders, typeAndId[1]);
+                        boolean added = false; // optimize for when all the docs are deleted for this id
+                        docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0);
+                        for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
+                            if (!added) {
+                                typeBuilder.parentIdsValues.add(idAsBytes);
+                                added = true;
+                            }
+                            typeBuilder.parentIdsOrdinals[docId] = typeBuilder.t;
+                        }
-                            HashedBytesArray idAsBytes = checkIfCanReuse(builders, new HashedBytesArray(uid.id()));
-                            boolean added = false; // optimize for when all the docs are deleted for this id
-                            termDocs.seek(termEnum);
-                            while (termDocs.next()) {
-                                // ignore deleted docs while we are at it
-                                if (!reader.isDeleted(termDocs.doc())) {
-                                    if (!added) {
-                                        typeBuilder.parentIdsValues.add(idAsBytes);
-                                        added = true;
-                                    }
-                                    typeBuilder.parentIdsOrdinals[termDocs.doc()] = typeBuilder.t;
-                                }
-                            }
-                            if (added) {
-                                typeBuilder.t++;
-                            }
-                        } while (termEnum.next());
-                    } finally {
-                        termDocs.close();
-                        termEnum.close();
+                        if (added) {
+                            typeBuilder.t++;
+                        }
                     }
                 }
                 // now, build it back
-                for (Map.Entry<Object, Map<String, TypeBuilder>> entry : builders.entrySet()) {
-                    MapBuilder<String, SimpleIdReaderTypeCache> types = MapBuilder.newMapBuilder();
-                    for (Map.Entry<String, TypeBuilder> typeBuilderEntry : entry.getValue().entrySet()) {
+                for (Map.Entry<Object, Map<BytesReference, TypeBuilder>> entry : builders.entrySet()) {
+                    MapBuilder<BytesReference, SimpleIdReaderTypeCache> types = MapBuilder.newMapBuilder();
+                    for (Map.Entry<BytesReference, TypeBuilder> typeBuilderEntry : entry.getValue().entrySet()) {
                         types.put(typeBuilderEntry.getKey(), new SimpleIdReaderTypeCache(typeBuilderEntry.getKey(),
                                 typeBuilderEntry.getValue().idToDoc,
                                 typeBuilderEntry.getValue().docToId,
@@ -226,7 +210,7 @@ public class SimpleIdCache extends AbstractIndexComponent implements IdCache, Se
         return sizeInBytes;
     }
-    private HashedBytesArray checkIfCanReuse(Map<Object, Map<String, TypeBuilder>> builders, HashedBytesArray idAsBytes) {
+    private HashedBytesArray checkIfCanReuse(Map<Object, Map<BytesReference, TypeBuilder>> builders, HashedBytesArray idAsBytes) {
         HashedBytesArray finalIdAsBytes;
         // go over and see if we can reuse this id
         for (SimpleIdReaderCache idReaderCache : idReaders.values()) {
@@ -235,7 +219,7 @@ public class SimpleIdCache extends AbstractIndexComponent implements IdCache, Se
                 return finalIdAsBytes;
             }
         }
-        for (Map<String, TypeBuilder> map : builders.values()) {
+        for (Map<BytesReference, TypeBuilder> map : builders.values()) {
             for (TypeBuilder typeBuilder : map.values()) {
                 finalIdAsBytes = typeBuilder.canReuse(idAsBytes);
                 if (finalIdAsBytes != null) {
@@ -246,15 +230,37 @@ public class SimpleIdCache extends AbstractIndexComponent implements IdCache, Se
         return idAsBytes;
     }
-    private boolean refreshNeeded(IndexReader[] readers) {
-        for (IndexReader reader : readers) {
-            if (!idReaders.containsKey(reader.getCoreCacheKey())) {
+    private boolean refreshNeeded(List<AtomicReaderContext> atomicReaderContexts) {
+        for (AtomicReaderContext atomicReaderContext : atomicReaderContexts) {
+            if (!idReaders.containsKey(atomicReaderContext.reader().getCoreCacheKey())) {
                 return true;
             }
         }
         return false;
     }
+    // LUCENE 4 UPGRADE: This logic should go to the Uid class, and Uid should be BytesRef based instead of String based
+    private static HashedBytesArray[] splitUidIntoTypeAndId(BytesRef term) {
+        int loc = -1;
+        for (int i = term.offset; i < term.offset + term.length; i++) {
+            if (term.bytes[i] == 0x23) { // 0x23 is equal to '#'
+                loc = i;
+                break;
+            }
+        }
+        if (loc == -1) {
+            return null;
+        }
+        byte[] type = new byte[loc - term.offset];
+        System.arraycopy(term.bytes, term.offset, type, 0, type.length);
+        byte[] id = new byte[term.length - type.length - 1];
+        System.arraycopy(term.bytes, loc + 1, id, 0, id.length);
+        return new HashedBytesArray[]{new HashedBytesArray(type), new HashedBytesArray(id)};
+    }
     static class TypeBuilder {
         final ExtTObjectIntHasMap<HashedBytesArray> idToDoc = new ExtTObjectIntHasMap<HashedBytesArray>(Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, -1);
         final HashedBytesArray[] docToId;

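The recurring pattern in the SimpleIdCache rewrite above is the Lucene 3-to-4 enumeration change: TermEnum/TermDocs become per-field Terms/TermsEnum/DocsEnum, and the per-document isDeleted() check disappears in favour of passing the segment's live docs into the enumeration. A self-contained sketch of that loop against the Lucene 4.0 API (class name and the visit step are placeholders):

```java
import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class TermDocsWalker {

    // Visits every (term, liveDocId) pair of one field in one segment,
    // using the same calls as the refreshed SimpleIdCache.
    static void walk(AtomicReader reader, String field) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
            return; // segment has no postings for this field
        }
        TermsEnum termsEnum = terms.iterator(null);
        DocsEnum docsEnum = null; // reused across terms to avoid reallocations
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            // Passing the live docs replaces the old reader.isDeleted(doc)
            // check: deleted documents are skipped during iteration.
            docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0);
            for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                visit(term, docId);
            }
        }
    }

    static void visit(BytesRef term, int docId) {
        // placeholder for per-document work (e.g. filling the id maps)
    }
}
```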
View File: src/main/java/org/elasticsearch/index/cache/id/simple/SimpleIdReaderCache.java

@@ -20,6 +20,8 @@
 package org.elasticsearch.index.cache.id.simple;
 import com.google.common.collect.ImmutableMap;
+import org.elasticsearch.common.bytes.BytesArray;
+import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.bytes.HashedBytesArray;
 import org.elasticsearch.index.cache.id.IdReaderCache;
 import org.elasticsearch.index.cache.id.IdReaderTypeCache;
@@ -31,9 +33,9 @@ public class SimpleIdReaderCache implements IdReaderCache {
     private final Object readerCacheKey;
-    private final ImmutableMap<String, SimpleIdReaderTypeCache> types;
+    private final ImmutableMap<BytesReference, SimpleIdReaderTypeCache> types;
-    public SimpleIdReaderCache(Object readerCacheKey, ImmutableMap<String, SimpleIdReaderTypeCache> types) {
+    public SimpleIdReaderCache(Object readerCacheKey, ImmutableMap<BytesReference, SimpleIdReaderTypeCache> types) {
         this.readerCacheKey = readerCacheKey;
         this.types = types;
     }
@@ -45,12 +47,12 @@ public class SimpleIdReaderCache implements IdReaderCache {
     @Override
     public IdReaderTypeCache type(String type) {
-        return types.get(type);
+        return types.get(new BytesArray(type));
     }
     @Override
     public HashedBytesArray parentIdByDoc(String type, int docId) {
-        SimpleIdReaderTypeCache typeCache = types.get(type);
+        SimpleIdReaderTypeCache typeCache = types.get(new BytesArray(type));
         if (typeCache != null) {
             return typeCache.parentIdByDoc(docId);
         }
@@ -59,7 +61,7 @@ public class SimpleIdReaderCache implements IdReaderCache {
     @Override
     public int docById(String type, HashedBytesArray id) {
-        SimpleIdReaderTypeCache typeCache = types.get(type);
+        SimpleIdReaderTypeCache typeCache = types.get(new BytesArray(type));
         if (typeCache != null) {
             return typeCache.docById(id);
         }

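The lookups above wrap the String type in a fresh BytesArray because the types map is now keyed by BytesReference. That only works if BytesArray compares by content rather than identity, which the change itself presupposes; a tiny sketch of that assumption (names invented for the example):

```java
import java.util.HashMap;
import java.util.Map;

import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;

class BytesKeyDemo {
    public static void main(String[] args) {
        Map<BytesReference, String> types = new HashMap<BytesReference, String>();
        types.put(new BytesArray("blog"), "type cache for blog");
        // A fresh wrapper around the same characters must find the entry,
        // i.e. equals/hashCode are assumed to be content-based.
        System.out.println(types.get(new BytesArray("blog")));
    }
}
```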
View File: src/main/java/org/elasticsearch/index/cache/id/simple/SimpleIdReaderTypeCache.java

@@ -21,6 +21,7 @@ package org.elasticsearch.index.cache.id.simple;
 import gnu.trove.impl.hash.TObjectHash;
 import org.elasticsearch.common.RamUsage;
+import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.bytes.HashedBytesArray;
 import org.elasticsearch.common.trove.ExtTObjectIntHasMap;
 import org.elasticsearch.index.cache.id.IdReaderTypeCache;
@@ -30,7 +31,7 @@ import org.elasticsearch.index.cache.id.IdReaderTypeCache;
  */
 public class SimpleIdReaderTypeCache implements IdReaderTypeCache {
-    private final String type;
+    private final BytesReference type;
     private final ExtTObjectIntHasMap<HashedBytesArray> idToDoc;
@@ -42,7 +43,7 @@ public class SimpleIdReaderTypeCache implements IdReaderTypeCache {
     private long sizeInBytes = -1;
-    public SimpleIdReaderTypeCache(String type, ExtTObjectIntHasMap<HashedBytesArray> idToDoc, HashedBytesArray[] docIdToId,
+    public SimpleIdReaderTypeCache(BytesReference type, ExtTObjectIntHasMap<HashedBytesArray> idToDoc, HashedBytesArray[] docIdToId,
                                    HashedBytesArray[] parentIdsValues, int[] parentIdsOrdinals) {
         this.type = type;
         this.idToDoc = idToDoc;
@@ -52,7 +53,7 @@ public class SimpleIdReaderTypeCache implements IdReaderTypeCache {
         this.parentIdsOrdinals = parentIdsOrdinals;
     }
-    public String type() {
+    public BytesReference type() {
         return this.type;
     }

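The BytesReference type stored here is the type half that SimpleIdCache's splitUidIntoTypeAndId carves out of each _uid term ("type#id"). For illustration, a hypothetical standalone version of that split over a plain byte[] (class and method names invented for the example):

```java
final class UidSplitter {

    // Splits a uid like "blog#123" into ["blog", "123"] at the first '#'.
    static byte[][] split(byte[] uid) {
        int loc = -1;
        for (int i = 0; i < uid.length; i++) {
            if (uid[i] == '#') { // type names may not contain '#'
                loc = i;
                break;
            }
        }
        if (loc == -1) {
            return null; // not a well-formed uid
        }
        byte[] type = new byte[loc];
        System.arraycopy(uid, 0, type, 0, type.length);
        byte[] id = new byte[uid.length - loc - 1];
        System.arraycopy(uid, loc + 1, id, 0, id.length);
        return new byte[][]{type, id};
    }

    public static void main(String[] args) {
        byte[][] parts = split("blog#123".getBytes(java.nio.charset.StandardCharsets.UTF_8));
        System.out.println(new String(parts[0]) + " / " + new String(parts[1])); // blog / 123
    }
}
```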
View File: src/main/java/org/elasticsearch/search/query/QueryPhase.java

@@ -84,7 +84,7 @@ public class QueryPhase implements SearchPhase {
         if (searchContext.scopePhases() != null) {
             // we have scoped queries, refresh the id cache
             try {
-                searchContext.idCache().refresh(searchContext.searcher().subReaders());
+                searchContext.idCache().refresh(searchContext.searcher().getTopReaderContext().leaves());
             } catch (Exception e) {
                 throw new QueryPhaseExecutionException(searchContext, "Failed to refresh id cache for child queries", e);
            }
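
On the query-phase side this one-line change is the whole migration: the old searcher().subReaders() array is gone, and the per-segment leaves are reached through the reader context tree instead. A minimal sketch of the equivalent call against a plain Lucene 4.0 IndexSearcher (helper name invented):

```java
import java.util.List;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.IndexSearcher;

final class SegmentLeaves {

    static List<AtomicReaderContext> leavesOf(IndexSearcher searcher) {
        // getTopReaderContext() returns the composite reader's context;
        // leaves() flattens it into the per-segment (atomic) contexts.
        return searcher.getTopReaderContext().leaves();
    }
}
```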