make sure max length term works

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1081829 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2011-03-15 16:11:21 +00:00
parent ac3ef52bd0
commit 987a9ef84c
1 changed file with 134 additions and 11 deletions


@@ -21,53 +21,57 @@ import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.NoLockFactory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SingleInstanceLockFactory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
public class TestIndexWriter extends LuceneTestCase {
@@ -2912,4 +2916,123 @@ public class TestIndexWriter extends LuceneTestCase {
    w.close();
    dir.close();
  }
  private static class StringSplitAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new StringSplitTokenizer(reader);
    }
  }
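  // Splits the input only on single spaces and imposes no token-length limit,
  // so the oversized term built below reaches IndexWriter intact.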
  private static class StringSplitTokenizer extends Tokenizer {
    private final String[] tokens;
    private int upto = 0;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public StringSplitTokenizer(Reader r) {
      try {
        final StringBuilder b = new StringBuilder();
        final char[] buffer = new char[1024];
        int n;
        while ((n = r.read(buffer)) != -1) {
          b.append(buffer, 0, n);
        }
        tokens = b.toString().split(" ");
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    }

    @Override
    public final boolean incrementToken() throws IOException {
      clearAttributes();
      if (upto < tokens.length) {
        termAtt.setEmpty();
        termAtt.append(tokens[upto]);
        upto++;
        return true;
      } else {
        return false;
      }
    }
  }
  /**
   * Make sure we skip wicked long terms.
   */
  public void testWickedLongTerm() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random, dir, new StringSplitAnalyzer());

    char[] chars = new char[DocumentsWriter.MAX_TERM_LENGTH_UTF8];
    Arrays.fill(chars, 'x');
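    // 'x' encodes to one UTF-8 byte, so bigTerm is exactly MAX_TERM_LENGTH_UTF8
    // bytes long; the extra 'x' prepended below pushes the first big token one
    // byte over the limit.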
    Document doc = new Document();
    final String bigTerm = new String(chars);
    final BytesRef bigTermBytesRef = new BytesRef(bigTerm);

    // This content produces a too-long term:
    String contents = "abc xyz x" + bigTerm + " another term";
    doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
    w.addDocument(doc);

    // Make sure we can add another normal document
    doc = new Document();
    doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
    w.addDocument(doc);

    IndexReader reader = w.getReader();
    w.close();

    // Make sure all terms < max size were indexed
    assertEquals(2, reader.docFreq(new Term("content", "abc")));
    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
    assertEquals(1, reader.docFreq(new Term("content", "term")));
    assertEquals(1, reader.docFreq(new Term("content", "another")));

    // Make sure position is still incremented when
    // massive term is skipped:
    DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"));
    assertEquals(0, tps.nextDoc());
    assertEquals(1, tps.freq());
    assertEquals(3, tps.nextPosition());
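    // Token positions are abc=0, xyz=1, skipped long term=2, another=3, term=4,
    // so finding "another" at position 3 confirms the skipped term still
    // consumed a position.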
    // Make sure the doc that has the massive term is in
    // the index:
    assertEquals("document with wicked long term is not in the index!", 2, reader.numDocs());

    reader.close();
    dir.close();

    dir = newDirectory();

    // Make sure we can add a document with exactly the
    // maximum length term, and search on that term:
    doc = new Document();
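    // NOT_ANALYZED indexes the whole field value as a single term, so the
    // exactly-max-length bigTerm is indexed verbatim rather than tokenized.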
    Field contentField = new Field("content", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
    doc.add(contentField);

    w = new RandomIndexWriter(random, dir);

    contentField.setValue("other");
    w.addDocument(doc);

    contentField.setValue("term");
    w.addDocument(doc);

    contentField.setValue(bigTerm);
    w.addDocument(doc);

    contentField.setValue("zzz");
    w.addDocument(doc);

    reader = w.getReader();
    w.close();
    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));

    FieldCache.DocTermsIndex dti = FieldCache.DEFAULT.getTermsIndex(reader, "content", random.nextBoolean());
    assertEquals(5, dti.numOrd()); // +1 for null ord
    assertEquals(4, dti.size());
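    // The four values sort as other(1), term(2), bigTerm(3), zzz(4), with ord 0
    // reserved for the null term, so bigTerm is looked up at ord 3.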
    assertEquals(bigTermBytesRef, dti.lookup(3, new BytesRef()));
    reader.close();
    dir.close();
  }
}