mirror of https://github.com/apache/lucene.git
LUCENE-3291: clarify 2.1GB byte[] limit in FST/MemCodec
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145233 13f79535-47bb-0310-9956-ffa450edef68
parent 30b3cc881d
commit fe93de7319
@@ -74,6 +74,10 @@ import org.apache.lucene.util.fst.FST;
 * queries that rely on advance (AND BooleanQuery,
 * PhraseQuery) will be relatively slow!
 *
 * <p><b>NOTE</b>: this codec cannot address more than ~2.1 GB
 * of postings, because the underlying FST uses an int
 * to address the underlying byte[].
 *
 * @lucene.experimental */

public class MemoryCodec extends Codec {
@@ -33,6 +33,9 @@ import org.apache.lucene.util.fst.Builder.UnCompiledNode;
 * <p> The format is similar to what's used by Morfologik
 * (http://sourceforge.net/projects/morfologik).
 *
 * <p><b>NOTE</b>: the FST cannot be larger than ~2.1 GB
 * because it uses an int to address the byte[].
 *
 * @lucene.experimental
 */
public class FST<T> {
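Both NOTEs come down to the same arithmetic: Java indexes a byte[] with a signed 32-bit int, so the highest addressable offset is Integer.MAX_VALUE = 2^31 - 1 = 2,147,483,647 bytes, i.e. ~2.1 GB. A minimal sketch of that bound (illustrative only; the class name is ours, not part of this commit):

public class ByteArrayAddressLimit {
  public static void main(String[] args) {
    // byte[] offsets are signed 32-bit ints, so the largest addressable
    // position is Integer.MAX_VALUE:
    long maxAddressable = Integer.MAX_VALUE;           // 2^31 - 1 = 2,147,483,647 bytes
    System.out.println(maxAddressable / 1e9 + " GB");  // prints 2.147483647 GB
  }
}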
@@ -0,0 +1,103 @@
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

import org.junit.Ignore;
/**
 * Test indexes ~82M docs with 26 terms each, so you get > Integer.MAX_VALUE terms/docs pairs.
 * @lucene.experimental
 */
public class Test2BPostings extends LuceneTestCase {
@Ignore("Must run with large (14 GB) java heap, and not Memory nor SimpleText codec!")
|
||||
public void test() throws Exception {
|
||||
MockDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BPostings"));
|
||||
dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER);
|
||||
dir.setCheckIndexOnClose(false); // don't double-checkindex
|
||||
|
||||
IndexWriter w = new IndexWriter(dir,
|
||||
new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
|
||||
.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
|
||||
.setRAMBufferSizeMB(256.0)
|
||||
.setMergeScheduler(new ConcurrentMergeScheduler())
|
||||
.setMergePolicy(newLogMergePolicy(false, 10))
|
||||
.setOpenMode(IndexWriterConfig.OpenMode.CREATE));
|
||||
|
||||
MergePolicy mp = w.getConfig().getMergePolicy();
|
||||
if (mp instanceof LogByteSizeMergePolicy) {
|
||||
// 1 petabyte:
|
||||
((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);
|
||||
}
|
||||
|
||||
Document doc = new Document();
|
||||
Field field = new Field("field", new MyTokenStream());
|
||||
field.setOmitTermFreqAndPositions(true);
|
||||
field.setOmitNorms(true);
|
||||
doc.add(field);
|
||||
|
||||
    // (Integer.MAX_VALUE / 26) + 1 = 82,595,525 docs; at 26 terms per doc that
    // is 2,147,483,650 term/doc pairs, just over Integer.MAX_VALUE:
    final int numDocs = (Integer.MAX_VALUE / 26) + 1;
    for (int i = 0; i < numDocs; i++) {
      w.addDocument(doc);
      if (i % 100000 == 0) {
        System.out.println(i + " of " + numDocs + "...");
      }
    }
    w.optimize();
    w.close();
    CheckIndex ci = new CheckIndex(dir);
    ci.setInfoStream(System.out);
    ci.checkIndex();
    dir.close();
  }

  public static final class MyTokenStream extends TokenStream {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final char buffer[];
    int index;

    public MyTokenStream() {
      // every term is a single char; write directly into the attribute's buffer
      termAtt.setLength(1);
      buffer = termAtt.buffer();
    }

    @Override
    public boolean incrementToken() throws IOException {
      // emit the 26 single-char terms 'a'..'z', then signal end of stream
      if (index <= 'z') {
        buffer[0] = (char) index++;
        return true;
      }
      return false;
    }

    @Override
    public void reset() throws IOException {
      index = 'a';
    }
  }
}
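A quick sanity check of where the 26 comes from (an illustrative fragment, not part of the commit; assumes an enclosing method that throws IOException): reset() rewinds MyTokenStream to 'a', and incrementToken() then yields exactly one single-char term per letter through 'z':

Test2BPostings.MyTokenStream ts = new Test2BPostings.MyTokenStream();
ts.reset();                    // index = 'a'
int count = 0;
while (ts.incrementToken()) {  // fills the term buffer with 'a'..'z' in turn
  count++;
}
System.out.println(count);     // 26
// 82,595,525 docs * 26 terms = 2,147,483,650 pairs > Integer.MAX_VALUE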