LUCENE-3291: clarify 2.1GB byte[] limit in FST/MemCodec

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145233 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2011-07-11 16:38:20 +00:00
parent 30b3cc881d
commit fe93de7319
3 changed files with 110 additions and 0 deletions


@@ -74,6 +74,10 @@ import org.apache.lucene.util.fst.FST;
 * queries that rely on advance (AND BooleanQuery,
 * PhraseQuery) will be relatively slow!
*
 * <p><b>NOTE</b>: this codec cannot address more than ~2.1 GB
 * of postings, because the underlying FST uses an int
 * to address the byte[].
*
* @lucene.experimental */
public class MemoryCodec extends Codec {
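For reference, the "~2.1 GB" figure is simply Integer.MAX_VALUE bytes: a Java array index is a signed 32-bit int. A quick back-of-the-envelope check in plain Java (illustrative only, not part of this commit):

    // The largest offset an int can address into a byte[] is
    // Integer.MAX_VALUE, which is where "~2.1 GB" comes from.
    long maxBytes = Integer.MAX_VALUE;                          // 2147483647 bytes
    System.out.println(maxBytes / 1e9);                         // ~2.147 GB (decimal)
    System.out.println(maxBytes / (1024.0 * 1024.0 * 1024.0));  // ~2.0 GiB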


@@ -33,6 +33,9 @@ import org.apache.lucene.util.fst.Builder.UnCompiledNode;
* <p> The format is similar to what's used by Morfologik
* (http://sourceforge.net/projects/morfologik).
*
 * <p><b>NOTE</b>: the FST cannot be larger than ~2.1 GB
 * because it uses an int to address the byte[].
*
* @lucene.experimental
*/
public class FST<T> {
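The cap follows from the FST's storage layout: roughly speaking, nodes and arcs are packed into one flat byte[], and arcs refer to their target nodes by int offset into that array. A minimal sketch of that int-addressed pattern (FlatStore is a hypothetical stand-in, not the real FST internals):

    import java.util.Arrays;

    // Hypothetical illustration: append-only storage addressed by int,
    // so total capacity is capped at Integer.MAX_VALUE (~2.1 GB).
    class FlatStore {
      private byte[] bytes = new byte[16];
      private int upto; // next write position; an int, so <= 2^31-1

      int append(byte[] src) { // returns the int "address" of src
        if (upto + src.length > bytes.length) {
          // grow geometrically (overflow ignored for this sketch)
          bytes = Arrays.copyOf(bytes, Math.max(bytes.length * 2, upto + src.length));
        }
        System.arraycopy(src, 0, bytes, upto, src.length);
        int addr = upto;
        upto += src.length;
        return addr;
      }

      byte read(int addr) {
        return bytes[addr];
      }
    }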


@@ -0,0 +1,103 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.Ignore;
/**
* Test indexes ~82M docs with 26 terms each, so you get > Integer.MAX_VALUE terms/docs pairs
* @lucene.experimental
*/
public class Test2BPostings extends LuceneTestCase {

  @Ignore("Must run with large (14 GB) java heap, and not Memory nor SimpleText codec!")
  public void test() throws Exception {
    MockDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BPostings"));
    dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER);
    dir.setCheckIndexOnClose(false); // don't double-checkindex

    IndexWriter w = new IndexWriter(dir,
        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
            .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
            .setRAMBufferSizeMB(256.0)
            .setMergeScheduler(new ConcurrentMergeScheduler())
            .setMergePolicy(newLogMergePolicy(false, 10))
            .setOpenMode(IndexWriterConfig.OpenMode.CREATE));

    MergePolicy mp = w.getConfig().getMergePolicy();
    if (mp instanceof LogByteSizeMergePolicy) {
      // 1 petabyte:
      ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);
    }

    Document doc = new Document();
    Field field = new Field("field", new MyTokenStream());
    field.setOmitTermFreqAndPositions(true);
    field.setOmitNorms(true);
    doc.add(field);

    final int numDocs = (Integer.MAX_VALUE / 26) + 1;
    for (int i = 0; i < numDocs; i++) {
      w.addDocument(doc);
      if (i % 100000 == 0) {
        System.out.println(i + " of " + numDocs + "...");
      }
    }
    w.optimize();
    w.close();

    CheckIndex ci = new CheckIndex(dir);
    ci.setInfoStream(System.out);
    ci.checkIndex();
    dir.close();
  }

  public static final class MyTokenStream extends TokenStream {

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final char buffer[];
    int index;

    public MyTokenStream() {
      termAtt.setLength(1);
      buffer = termAtt.buffer();
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (index <= 'z') {
        buffer[0] = (char) index++;
        return true;
      }
      return false;
    }

    @Override
    public void reset() throws IOException {
      index = 'a';
    }
  }
}
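For a sense of scale: each pass over MyTokenStream emits the 26 single-character terms 'a' through 'z', so indexing (Integer.MAX_VALUE / 26) + 1 documents yields just over Integer.MAX_VALUE term/doc pairs. A hypothetical driver (not part of the commit) showing the stream's output:

    MyTokenStream ts = new MyTokenStream();
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    ts.reset();                                // rewinds the stream to 'a'
    while (ts.incrementToken()) {
      System.out.print(term.toString() + " "); // a b c ... z (26 terms)
    }
    ts.close();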