LUCENE-2919: add PKIndexSplitter, to split index into two by term

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1137064 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2011-06-17 22:52:55 +00:00
parent f32a6aa455
commit bba6b01826
3 changed files with 233 additions and 0 deletions

View File

@ -79,6 +79,10 @@ New Features
facilitate doing grouping in a distributed environment (Uwe
Schindler, Mike McCandless)
* LUCENE-2919: Added PKIndexSplitter, which splits an index in two
according to a middle term in a specified field. (Jason Rutherglen via
Mike McCandless)
API Changes
* LUCENE-3141: add getter method to access fragInfos in FieldFragList.

View File

@ -0,0 +1,136 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.Version;
/**
* Split an index based on a given primary key term
* and a 'middle' term. If the middle term is present, it's
* sent to dir2.
*/
public class PKIndexSplitter {
private Term midTerm;
Directory input;
Directory dir1;
Directory dir2;
public PKIndexSplitter(Term midTerm, Directory input,
Directory dir1, Directory dir2) {
this.midTerm = midTerm;
this.input = input;
this.dir1 = dir1;
this.dir2 = dir2;
}
public void split() throws IOException {
IndexReader reader = IndexReader.open(input);
OpenBitSet lowDels = setDeletes(reader, null, midTerm.bytes());
OpenBitSet hiDels = setDeletes(reader, midTerm.bytes(), null);
createIndex(dir1, reader, lowDels);
createIndex(dir2, reader, hiDels);
reader.close();
}
private void createIndex(Directory target, IndexReader reader, OpenBitSet bv) throws IOException {
IndexWriter w = new IndexWriter(target, new IndexWriterConfig(
Version.LUCENE_CURRENT,
new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
.setOpenMode(OpenMode.CREATE));
w.addIndexes(new DeletesIndexReader(reader, bv));
w.close();
}
private OpenBitSet setDeletes(IndexReader reader, BytesRef startTerm,
BytesRef endTermExcl) throws IOException {
OpenBitSet incl = new OpenBitSet(reader.maxDoc());
Terms terms = MultiFields.getTerms(reader, midTerm.field());
TermsEnum te = terms.iterator();
if (startTerm != null) {
te.seek(startTerm);
}
while (true) {
final BytesRef term = te.next();
if (term == null) {
break;
}
if (endTermExcl != null && term.compareTo(endTermExcl) >= 0) {
break;
}
DocsEnum docs = MultiFields.getTermDocsEnum(reader,
MultiFields.getDeletedDocs(reader), midTerm.field(), term);
while (true) {
final int doc = docs.nextDoc();
if (doc != DocsEnum.NO_MORE_DOCS) {
incl.set(doc);
} else break;
}
}
OpenBitSet dels = new OpenBitSet(reader.maxDoc());
for (int x=0; x < reader.maxDoc(); x++) {
if (!incl.get(x)) {
dels.set(x);
}
}
return dels;
}
public static class DeletesIndexReader extends FilterIndexReader {
OpenBitSet readerDels;
public DeletesIndexReader(IndexReader reader, OpenBitSet deletes) {
super(new SlowMultiReaderWrapper(reader));
readerDels = new OpenBitSet(reader.maxDoc());
if (in.hasDeletions()) {
final Bits oldDelBits = MultiFields.getDeletedDocs(in);
assert oldDelBits != null;
for (int i = 0; i < in.maxDoc(); i++) {
if (oldDelBits.get(i) || deletes.get(i)) {
readerDels.set(i);
}
}
} else {
readerDels = deletes;
}
}
@Override
public int numDocs() {
return in.maxDoc() - (int)readerDels.cardinality();
}
@Override
public boolean hasDeletions() {
return (int)readerDels.cardinality() > 0;
}
@Override
public Bits getDeletedDocs() {
return readerDels;
}
}
}

View File

@ -0,0 +1,93 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
import java.text.DecimalFormat;
import java.text.NumberFormat;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Checks that PKIndexSplitter distributes every document of a small index
 * over the two target directories according to the split term.
 */
public class TestPKIndexSplitter extends LuceneTestCase {

  /**
   * Indexes ids 0..9 and 15..19, splits at id 11 and verifies the number of
   * documents on each side of the split.
   */
  public void testSplit() throws Exception {
    // zero-padded ids so that term order equals numeric order
    NumberFormat format = new DecimalFormat("000000000");
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(
        Version.LUCENE_CURRENT,
        new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
        .setOpenMode(OpenMode.CREATE));
    // ten documents below the split term
    for (int x = 0; x < 10; x++) {
      Document doc = createDocument(x, "1", 3, format);
      w.addDocument(doc);
    }
    // five documents above the split term
    for (int x = 15; x < 20; x++) {
      Document doc = createDocument(x, "2", 3, format);
      w.addDocument(doc);
    }
    w.close();

    Directory dir1 = newDirectory();
    Directory dir2 = newDirectory();
    // the split term itself (id 11) is not present in the index
    Term splitTerm = new Term("id", new BytesRef(format.format(11)));
    PKIndexSplitter splitter = new PKIndexSplitter(splitTerm,
        dir, dir1, dir2);
    splitter.split();

    IndexReader ir1 = IndexReader.open(dir1);
    IndexReader ir2 = IndexReader.open(dir2);
    // ids 0..9 sort before the split term
    assertEquals(10, ir1.maxDoc());
    // ids 15..19 all sort at or after the split term, so all five belong here
    assertEquals(5, ir2.maxDoc());
    // no document may be lost by the split
    assertEquals(15, ir1.numDocs() + ir2.numDocs());
    ir1.close();
    ir2.close();
    dir1.close();
    dir2.close();
    dir.close();
  }

  /**
   * Builds a document with an "id" field (formatted <code>n</code>), an
   * "indexname" field and <code>numFields</code> text fields named
   * field1..fieldN.
   */
  public Document createDocument(int n, String indexName,
      int numFields, NumberFormat format) {
    StringBuilder sb = new StringBuilder();
    Document doc = new Document();
    String id = format.format(n);
    doc.add(new Field("id", id, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("indexname", indexName, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
    sb.append("a");
    sb.append(n);
    doc.add(new Field("field1", sb.toString(), Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
    sb.append(" b");
    sb.append(n);
    // field2..fieldN all carry the same "a<n> b<n>" text
    for (int i = 1; i < numFields; i++) {
      doc.add(new Field("field" + (i + 1), sb.toString(), Store.YES,
          Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
    }
    return doc;
  }
}