diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
index 2e94b81b315..c5430c2c4b9 100644
--- a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -21,53 +21,57 @@
 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.PrintStream;
+import java.io.Reader;
 import java.io.StringReader;
-import java.util.List;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Set;
-import java.util.HashSet;
+import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.Random;
-import java.util.Collections;
+import java.util.Set;
 import java.util.concurrent.atomic.AtomicBoolean;
 
-import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.document.Field.Index;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.Lock;
 import org.apache.lucene.store.LockFactory;
-import org.apache.lucene.store.NoLockFactory;
 import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.store.NoLockFactory;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.store.SingleInstanceLockFactory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.ThreadInterruptedException;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
-import org.apache.lucene.util.ThreadInterruptedException;
-import org.apache.lucene.util.BytesRef;
 
 public class TestIndexWriter extends LuceneTestCase {
@@ -2912,4 +2916,126 @@ public class TestIndexWriter extends LuceneTestCase {
     w.close();
     dir.close();
   }
+
+  private static class StringSplitAnalyzer extends Analyzer {
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new StringSplitTokenizer(reader);
+    }
+  }
+
+  private static class StringSplitTokenizer extends Tokenizer {
+    private final String[] tokens;
+    private int upto = 0;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
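+
+    /** Reads the entire Reader up front and splits only on single
+     *  spaces; unlike most tokenizers it never truncates a token,
+     *  which lets the test feed IndexWriter an over-length term. */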
+    public StringSplitTokenizer(Reader r) {
+      try {
+        final StringBuilder b = new StringBuilder();
+        final char[] buffer = new char[1024];
+        int n;
+        while ((n = r.read(buffer)) != -1) {
+          b.append(buffer, 0, n);
+        }
+        tokens = b.toString().split(" ");
+      } catch (IOException ioe) {
+        throw new RuntimeException(ioe);
+      }
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+      clearAttributes();
+      if (upto < tokens.length) {
+        termAtt.setEmpty();
+        termAtt.append(tokens[upto]);
+        upto++;
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
+  /**
+   * Make sure we skip wicked long terms.
+   */
+  public void testWickedLongTerm() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random, dir, new StringSplitAnalyzer());
+
+    char[] chars = new char[DocumentsWriter.MAX_TERM_LENGTH_UTF8];
+    Arrays.fill(chars, 'x');
+    Document doc = new Document();
+    final String bigTerm = new String(chars);
+    final BytesRef bigTermBytesRef = new BytesRef(bigTerm);
+
+    // This content produces a too-long term:
+    String contents = "abc xyz x" + bigTerm + " another term";
+    doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
+    w.addDocument(doc);
+
+    // Make sure we can add another normal document
+    doc = new Document();
+    doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
+    w.addDocument(doc);
+
+    IndexReader reader = w.getReader();
+    w.close();
+
+    // Make sure all terms < max size were indexed
+    assertEquals(2, reader.docFreq(new Term("content", "abc")));
+    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
+    assertEquals(1, reader.docFreq(new Term("content", "term")));
+    assertEquals(1, reader.docFreq(new Term("content", "another")));
+
+    // Make sure position is still incremented when
+    // massive term is skipped:
+    DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"));
+    assertEquals(0, tps.nextDoc());
+    assertEquals(1, tps.freq());
+    assertEquals(3, tps.nextPosition());
+
+    // Make sure the doc that has the massive term is in
+    // the index:
+    assertEquals("document with wicked long term is not in the index!", 2, reader.numDocs());
+
+    reader.close();
+    dir.close();
+    dir = newDirectory();
+
+    // Make sure we can add a document with exactly the
+    // maximum length term, and search on that term:
+    doc = new Document();
+    Field contentField = new Field("content", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
+    doc.add(contentField);
+
+    w = new RandomIndexWriter(random, dir);
+
+    contentField.setValue("other");
+    w.addDocument(doc);
+
+    contentField.setValue("term");
+    w.addDocument(doc);
+
+    contentField.setValue(bigTerm);
+    w.addDocument(doc);
+
+    contentField.setValue("zzz");
+    w.addDocument(doc);
+
+    reader = w.getReader();
+    w.close();
+    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
+
+    FieldCache.DocTermsIndex dti = FieldCache.DEFAULT.getTermsIndex(reader, "content", random.nextBoolean());
+    assertEquals(5, dti.numOrd()); // +1 for the implicit null ord (ord 0)
+    assertEquals(4, dti.size());
+    assertEquals(bigTermBytesRef, dti.lookup(3, new BytesRef()));
+    reader.close();
+    dir.close();
+  }
 }