mirror of https://github.com/apache/lucene.git
make sure max length term works
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1081829 13f79535-47bb-0310-9956-ffa450edef68
parent ac3ef52bd0
commit 987a9ef84c
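The new test below exercises the indexer's term-length limit by filling a char[] of length DocumentsWriter.MAX_TERM_LENGTH_UTF8 with 'x': since 'x' encodes to a single UTF-8 byte, the character count equals the encoded byte count the limit is measured in. A minimal standalone sketch of that idea follows; TERM_LIMIT and the class name are hypothetical placeholders, not part of Lucene.

import java.io.UnsupportedEncodingException;
import java.util.Arrays;

// Standalone sketch, not part of this commit: shows why filling a term with 'x'
// makes its char length equal its UTF-8 byte length. TERM_LIMIT is a placeholder
// standing in for DocumentsWriter.MAX_TERM_LENGTH_UTF8.
public class Utf8TermLengthSketch {
  private static final int TERM_LIMIT = 1024; // placeholder value only

  public static void main(String[] args) throws UnsupportedEncodingException {
    char[] chars = new char[TERM_LIMIT];
    Arrays.fill(chars, 'x');          // 'x' encodes to exactly one UTF-8 byte
    String term = new String(chars);
    int utf8Bytes = term.getBytes("UTF-8").length;
    System.out.println(term.length() + " chars -> " + utf8Bytes + " UTF-8 bytes");
    // A multi-byte character would break the equality:
    String nonAscii = "é";            // 2 bytes in UTF-8
    System.out.println(nonAscii.length() + " char -> " + nonAscii.getBytes("UTF-8").length + " UTF-8 bytes");
  }
}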
@@ -21,53 +21,57 @@ import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.NoLockFactory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SingleInstanceLockFactory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;

public class TestIndexWriter extends LuceneTestCase {
@@ -2912,4 +2916,123 @@ public class TestIndexWriter extends LuceneTestCase {
    w.close();
    dir.close();
  }

  private static class StringSplitAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new StringSplitTokenizer(reader);
    }
  }

  private static class StringSplitTokenizer extends Tokenizer {
    private final String[] tokens;
    private int upto = 0;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public StringSplitTokenizer(Reader r) {
      try {
        // Read the whole input and split on single spaces only, so an
        // arbitrarily long token reaches the indexer without being clipped.
        final StringBuilder b = new StringBuilder();
        final char[] buffer = new char[1024];
        int n;
        while ((n = r.read(buffer)) != -1) {
          b.append(buffer, 0, n);
        }
        tokens = b.toString().split(" ");
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    }

    @Override
    public final boolean incrementToken() throws IOException {
      clearAttributes();
      if (upto < tokens.length) {
        termAtt.setEmpty();
        termAtt.append(tokens[upto]);
        upto++;
        return true;
      } else {
        return false;
      }
    }
  }

  /**
   * Make sure we skip wicked long terms.
   */
  public void testWickedLongTerm() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random, dir, new StringSplitAnalyzer());

    char[] chars = new char[DocumentsWriter.MAX_TERM_LENGTH_UTF8];
    Arrays.fill(chars, 'x');
    Document doc = new Document();
    final String bigTerm = new String(chars);
    final BytesRef bigTermBytesRef = new BytesRef(bigTerm);

    // These contents produce a too-long term (the "x" prefix pushes the third
    // token one byte past the limit):
    String contents = "abc xyz x" + bigTerm + " another term";
    doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
    w.addDocument(doc);

    // Make sure we can add another normal document
    doc = new Document();
    doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
    w.addDocument(doc);

    IndexReader reader = w.getReader();
    w.close();

    // Make sure all terms < max size were indexed
    assertEquals(2, reader.docFreq(new Term("content", "abc")));
    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
    assertEquals(1, reader.docFreq(new Term("content", "term")));
    assertEquals(1, reader.docFreq(new Term("content", "another")));

    // Make sure position is still incremented when
    // massive term is skipped:
    DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"));
    assertEquals(0, tps.nextDoc());
    assertEquals(1, tps.freq());
    assertEquals(3, tps.nextPosition());

    // Make sure the doc that has the massive term is in
    // the index:
    assertEquals("document with wicked long term is not in the index!", 2, reader.numDocs());

    reader.close();
    dir.close();
    dir = newDirectory();

    // Make sure we can add a document with exactly the
    // maximum length term, and search on that term:
    doc = new Document();
    Field contentField = new Field("content", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
    doc.add(contentField);

    w = new RandomIndexWriter(random, dir);

    contentField.setValue("other");
    w.addDocument(doc);

    contentField.setValue("term");
    w.addDocument(doc);

    contentField.setValue(bigTerm);
    w.addDocument(doc);

    contentField.setValue("zzz");
    w.addDocument(doc);

    reader = w.getReader();
    w.close();
    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));

    FieldCache.DocTermsIndex dti = FieldCache.DEFAULT.getTermsIndex(reader, "content", random.nextBoolean());
    assertEquals(5, dti.numOrd()); // +1 for null ord
    assertEquals(4, dti.size());
    // bigTerm ("xxx...") sorts third among the four indexed values:
    assertEquals(bigTermBytesRef, dti.lookup(3, new BytesRef()));
    reader.close();
    dir.close();
  }
}
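The test verifies the exactly-max-length term through docFreq and the FieldCache ord lookups; it does not run an actual query on it. A rough sketch of such an extra check, assuming the trunk-era IndexSearcher API of the time (constructor taking an IndexReader, explicit close()) and reusing the reader and bigTerm names from the test above:

  // Sketch only, not part of this commit: a direct search on the maximum-length
  // term in the same "content" field. IndexSearcher, TermQuery and TopDocs are
  // already imported by the test class; Term and IndexReader are in its package.
  private void assertBigTermSearchable(IndexReader reader, String bigTerm) throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(new TermQuery(new Term("content", bigTerm)), 1);
    assertEquals("exactly-max-length term should be searchable", 1, hits.totalHits);
    searcher.close();
  }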