mirror of https://github.com/apache/lucene.git
LUCENE-2919: add PKIndexSplitter, to split index into two by term
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1137064 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f32a6aa455
commit
bba6b01826
|
@ -79,6 +79,10 @@ New Features
|
|||
facilitate doing grouping in a distributed environment (Uwe
|
||||
Schindler, Mike McCandless)
|
||||
|
||||
* LUCENE-2919: Added PKIndexSplitter, that splits an index according
|
||||
to a middle term in a specified field. (Jason Rutherglen via Mike
|
||||
McCandless)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-3141: add getter method to access fragInfos in FieldFragList.
|
||||
|
|
|
@ -0,0 +1,136 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Split an index based on a given primary key term
|
||||
* and a 'middle' term. If the middle term is present, it's
|
||||
* sent to dir2.
|
||||
*/
|
||||
public class PKIndexSplitter {
|
||||
private Term midTerm;
|
||||
Directory input;
|
||||
Directory dir1;
|
||||
Directory dir2;
|
||||
|
||||
public PKIndexSplitter(Term midTerm, Directory input,
|
||||
Directory dir1, Directory dir2) {
|
||||
this.midTerm = midTerm;
|
||||
this.input = input;
|
||||
this.dir1 = dir1;
|
||||
this.dir2 = dir2;
|
||||
}
|
||||
|
||||
public void split() throws IOException {
|
||||
IndexReader reader = IndexReader.open(input);
|
||||
OpenBitSet lowDels = setDeletes(reader, null, midTerm.bytes());
|
||||
OpenBitSet hiDels = setDeletes(reader, midTerm.bytes(), null);
|
||||
|
||||
createIndex(dir1, reader, lowDels);
|
||||
createIndex(dir2, reader, hiDels);
|
||||
reader.close();
|
||||
}
|
||||
|
||||
private void createIndex(Directory target, IndexReader reader, OpenBitSet bv) throws IOException {
|
||||
IndexWriter w = new IndexWriter(target, new IndexWriterConfig(
|
||||
Version.LUCENE_CURRENT,
|
||||
new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
|
||||
.setOpenMode(OpenMode.CREATE));
|
||||
w.addIndexes(new DeletesIndexReader(reader, bv));
|
||||
w.close();
|
||||
}
|
||||
|
||||
private OpenBitSet setDeletes(IndexReader reader, BytesRef startTerm,
|
||||
BytesRef endTermExcl) throws IOException {
|
||||
OpenBitSet incl = new OpenBitSet(reader.maxDoc());
|
||||
Terms terms = MultiFields.getTerms(reader, midTerm.field());
|
||||
TermsEnum te = terms.iterator();
|
||||
if (startTerm != null) {
|
||||
te.seek(startTerm);
|
||||
}
|
||||
while (true) {
|
||||
final BytesRef term = te.next();
|
||||
if (term == null) {
|
||||
break;
|
||||
}
|
||||
if (endTermExcl != null && term.compareTo(endTermExcl) >= 0) {
|
||||
break;
|
||||
}
|
||||
DocsEnum docs = MultiFields.getTermDocsEnum(reader,
|
||||
MultiFields.getDeletedDocs(reader), midTerm.field(), term);
|
||||
while (true) {
|
||||
final int doc = docs.nextDoc();
|
||||
if (doc != DocsEnum.NO_MORE_DOCS) {
|
||||
incl.set(doc);
|
||||
} else break;
|
||||
}
|
||||
}
|
||||
OpenBitSet dels = new OpenBitSet(reader.maxDoc());
|
||||
for (int x=0; x < reader.maxDoc(); x++) {
|
||||
if (!incl.get(x)) {
|
||||
dels.set(x);
|
||||
}
|
||||
}
|
||||
return dels;
|
||||
}
|
||||
|
||||
public static class DeletesIndexReader extends FilterIndexReader {
|
||||
OpenBitSet readerDels;
|
||||
|
||||
public DeletesIndexReader(IndexReader reader, OpenBitSet deletes) {
|
||||
super(new SlowMultiReaderWrapper(reader));
|
||||
readerDels = new OpenBitSet(reader.maxDoc());
|
||||
if (in.hasDeletions()) {
|
||||
final Bits oldDelBits = MultiFields.getDeletedDocs(in);
|
||||
assert oldDelBits != null;
|
||||
for (int i = 0; i < in.maxDoc(); i++) {
|
||||
if (oldDelBits.get(i) || deletes.get(i)) {
|
||||
readerDels.set(i);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
readerDels = deletes;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int numDocs() {
|
||||
return in.maxDoc() - (int)readerDels.cardinality();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasDeletions() {
|
||||
return (int)readerDels.cardinality() > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Bits getDeletedDocs() {
|
||||
return readerDels;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,93 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with this
|
||||
* work for additional information regarding copyright ownership. The ASF
|
||||
* licenses this file to You under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
* License for the specific language governing permissions and limitations under
|
||||
* the License.
|
||||
*/
|
||||
|
||||
import java.text.DecimalFormat;
|
||||
import java.text.NumberFormat;
|
||||
|
||||
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.Field.Index;
|
||||
import org.apache.lucene.document.Field.Store;
|
||||
import org.apache.lucene.document.Field.TermVector;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
|
||||
public class TestPKIndexSplitter extends LuceneTestCase {
|
||||
public void testSplit() throws Exception {
|
||||
NumberFormat format = new DecimalFormat("000000000");
|
||||
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(
|
||||
Version.LUCENE_CURRENT,
|
||||
new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
|
||||
.setOpenMode(OpenMode.CREATE));
|
||||
for (int x=0; x < 10; x++) {
|
||||
Document doc = createDocument(x, "1", 3, format);
|
||||
w.addDocument(doc);
|
||||
}
|
||||
for (int x=15; x < 20; x++) {
|
||||
Document doc = createDocument(x, "2", 3, format);
|
||||
w.addDocument(doc);
|
||||
}
|
||||
w.close();
|
||||
|
||||
Directory dir1 = newDirectory();
|
||||
Directory dir2 = newDirectory();
|
||||
Term splitTerm = new Term("id", new BytesRef(format.format(11)));
|
||||
PKIndexSplitter splitter = new PKIndexSplitter(splitTerm,
|
||||
dir, dir1, dir2);
|
||||
splitter.split();
|
||||
|
||||
IndexReader ir1 = IndexReader.open(dir1);
|
||||
IndexReader ir2 = IndexReader.open(dir2);
|
||||
assertEquals(10, ir1.maxDoc());
|
||||
assertEquals(4, ir2.maxDoc());
|
||||
|
||||
ir1.close();
|
||||
ir2.close();
|
||||
|
||||
dir1.close();
|
||||
dir2.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public Document createDocument(int n, String indexName,
|
||||
int numFields, NumberFormat format) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
Document doc = new Document();
|
||||
String id = format.format(n);
|
||||
doc.add(new Field("id", id, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
|
||||
doc.add(new Field("indexname", indexName, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
|
||||
sb.append("a");
|
||||
sb.append(n);
|
||||
doc.add(new Field("field1", sb.toString(), Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
|
||||
sb.append(" b");
|
||||
sb.append(n);
|
||||
for (int i = 1; i < numFields; i++) {
|
||||
doc.add(new Field("field" + (i + 1), sb.toString(), Store.YES,
|
||||
Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
|
||||
}
|
||||
return doc;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue