From a7c49eeb8cbd2d76559c0430d991200a48e7b3ff Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 1 Feb 2013 10:06:53 +0000 Subject: [PATCH 1/2] LUCENE-4733: Refactor term vectors formats tests around a BaseTermVectorsFormatTestCase. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1441367 13f79535-47bb-0310-9956-ffa450edef68 --- .../TestSimpleTextTermVectorsFormat.java | 30 + .../TestCompressingTermVectorsFormat.java | 30 + .../TestLucene40TermVectorsFormat.java | 30 + .../lucene/index/TestPayloadsOnVectors.java | 326 --------- .../apache/lucene/search/TestTermVectors.java | 309 +-------- .../index/BaseTermVectorsFormatTestCase.java | 632 ++++++++++++++++++ 6 files changed, 731 insertions(+), 626 deletions(-) create mode 100644 lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextTermVectorsFormat.java create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40TermVectorsFormat.java create mode 100644 lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextTermVectorsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextTermVectorsFormat.java new file mode 100644 index 00000000000..b0683c66d30 --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextTermVectorsFormat.java @@ -0,0 +1,30 @@ +package org.apache.lucene.codecs.simpletext; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.BaseTermVectorsFormatTestCase; + +public class TestSimpleTextTermVectorsFormat extends BaseTermVectorsFormatTestCase { + + @Override + protected Codec getCodec() { + return new SimpleTextCodec(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java new file mode 100644 index 00000000000..7a9536210a9 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestCompressingTermVectorsFormat.java @@ -0,0 +1,30 @@ +package org.apache.lucene.codecs.compressing; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.BaseTermVectorsFormatTestCase; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestCompressingTermVectorsFormat extends BaseTermVectorsFormatTestCase { + + @Override + protected Codec getCodec() { + return CompressingCodec.randomInstance(random()); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40TermVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40TermVectorsFormat.java new file mode 100644 index 00000000000..5a00b1cce0b --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40TermVectorsFormat.java @@ -0,0 +1,30 @@ +package org.apache.lucene.codecs.lucene40; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.BaseTermVectorsFormatTestCase; + +public class TestLucene40TermVectorsFormat extends BaseTermVectorsFormatTestCase { + + @Override + protected Codec getCodec() { + return new Lucene40Codec(); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java b/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java index d497c117814..d1bbcca4dae 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPayloadsOnVectors.java @@ -17,38 +17,21 @@ package org.apache.lucene.index; * limitations under the License. 
*/ -import java.io.IOException; import java.io.StringReader; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.IntField; import org.apache.lucene.document.TextField; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util._TestUtil; - -import com.carrotsearch.randomizedtesting.generators.RandomInts; -import com.carrotsearch.randomizedtesting.generators.RandomPicks; public class TestPayloadsOnVectors extends LuceneTestCase { @@ -158,314 +141,5 @@ public class TestPayloadsOnVectors extends LuceneTestCase { writer.close(); dir.close(); } - - // custom impl to test cases that are forbidden by the default OffsetAttribute impl - static class PermissiveOffsetAttributeImpl extends AttributeImpl implements OffsetAttribute { - int start, end; - - @Override - public int startOffset() { - return start; - } - - @Override - public int endOffset() { - return end; - } - - @Override - public void setOffset(int startOffset, int endOffset) { - // no check! 
- start = startOffset; - end = endOffset; - } - - @Override - public void clear() { - start = end = 0; - } - - @Override - public boolean equals(Object other) { - if (other == this) { - return true; - } - - if (other instanceof PermissiveOffsetAttributeImpl) { - PermissiveOffsetAttributeImpl o = (PermissiveOffsetAttributeImpl) other; - return o.start == start && o.end == end; - } - - return false; - } - - @Override - public int hashCode() { - return start + 31 * end; - } - - @Override - public void copyTo(AttributeImpl target) { - OffsetAttribute t = (OffsetAttribute) target; - t.setOffset(start, end); - } - - } - - static BytesRef randomPayload() { - final int len = random().nextInt(5); - if (len == 0) { - return null; - } - final BytesRef payload = new BytesRef(len); - random().nextBytes(payload.bytes); - payload.length = len; - return payload; - } - - class RandomTokenStream extends TokenStream { - - final String[] terms; - final int[] positionsIncrements; - final int[] positions; - final int[] startOffsets, endOffsets; - final BytesRef[] payloads; - - final Map> positionToTerms; - final Map> startOffsetToTerms; - - final CharTermAttribute termAtt; - final PositionIncrementAttribute piAtt; - final OffsetAttribute oAtt; - final PayloadAttribute pAtt; - int i = 0; - - RandomTokenStream(int len, String[] sampleTerms, boolean weird) { - terms = new String[len]; - positionsIncrements = new int[len]; - positions = new int[len]; - startOffsets = new int[len]; - endOffsets = new int[len]; - payloads = new BytesRef[len]; - for (int i = 0; i < len; ++i) { - terms[i] = RandomPicks.randomFrom(random(), sampleTerms); - if (weird) { - positionsIncrements[i] = _TestUtil.nextInt(random(), 1, 1 << 18); - startOffsets[i] = random().nextInt(); - endOffsets[i] = random().nextInt(); - } else if (i == 0) { - positionsIncrements[i] = _TestUtil.nextInt(random(), 1, 1 << 5); - startOffsets[i] = _TestUtil.nextInt(random(), 0, 1 << 16); - endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20); - } else { - positionsIncrements[i] = _TestUtil.nextInt(random(), 0, 1 << 5); - startOffsets[i] = startOffsets[i-1] + _TestUtil.nextInt(random(), 0, 1 << 16); - endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 
1 << 10 : 20); - } - } - for (int i = 0; i < len; ++i) { - if (i == 0) { - positions[i] = positionsIncrements[i] - 1; - } else { - positions[i] = positions[i - 1] + positionsIncrements[i]; - } - } - if (rarely()) { - Arrays.fill(payloads, randomPayload()); - } else { - for (int i = 0; i < len; ++i) { - payloads[i] = randomPayload(); - } - } - - positionToTerms = new HashMap>(); - startOffsetToTerms = new HashMap>(); - for (int i = 0; i < len; ++i) { - if (!positionToTerms.containsKey(positions[i])) { - positionToTerms.put(positions[i], new HashSet(1)); - } - positionToTerms.get(positions[i]).add(i); - if (!startOffsetToTerms.containsKey(startOffsets[i])) { - startOffsetToTerms.put(startOffsets[i], new HashSet(1)); - } - startOffsetToTerms.get(startOffsets[i]).add(i); - } - - addAttributeImpl(new PermissiveOffsetAttributeImpl()); - - termAtt = addAttribute(CharTermAttribute.class); - piAtt = addAttribute(PositionIncrementAttribute.class); - oAtt = addAttribute(OffsetAttribute.class); - pAtt = addAttribute(PayloadAttribute.class); - } - - @Override - public final boolean incrementToken() throws IOException { - if (i < terms.length) { - termAtt.setLength(0).append(terms[i]); - piAtt.setPositionIncrement(positionsIncrements[i]); - oAtt.setOffset(startOffsets[i], endOffsets[i]); - pAtt.setPayload(payloads[i]); - ++i; - return true; - } else { - return false; - } - } - - } - - static FieldType randomFieldType() { - FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); - ft.setStoreTermVectors(true); - ft.setStoreTermVectorPositions(random().nextBoolean()); - ft.setStoreTermVectorOffsets(random().nextBoolean()); - if (random().nextBoolean()) { - ft.setStoreTermVectorPositions(true); - ft.setStoreTermVectorPayloads(true); - } - ft.freeze(); - return ft; - } - - public void testRandomVectors() throws IOException { - Directory dir = newDirectory(); - IndexWriterConfig iwConf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); - iwConf.setMaxBufferedDocs(RandomInts.randomIntBetween(random(), 2, 30)); - RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf); - String[] sampleTerms = new String[RandomInts.randomIntBetween(random(), 20, 50)]; - for (int i = 0; i < sampleTerms.length; ++i) { - sampleTerms[i] = _TestUtil.randomUnicodeString(random()); - } - FieldType ft = randomFieldType(); - // generate random documents and index them - final String[] fieldNames = new String[_TestUtil.nextInt(random(), 1, 200)]; - for (int i = 0; i < fieldNames.length; ++i) { - String fieldName; - do { - fieldName = _TestUtil.randomSimpleString(random()); - } while ("id".equals(fieldName)); - fieldNames[i] = fieldName; - } - final int numDocs = _TestUtil.nextInt(random(), 10, 100); - @SuppressWarnings("unchecked") - final Map[] fieldValues = new Map[numDocs]; - for (int i = 0; i < numDocs; ++i) { - fieldValues[i] = new HashMap(); - final int numFields = _TestUtil.nextInt(random(), 0, rarely() ? fieldNames.length : 5); - for (int j = 0; j < numFields; ++j) { - final String fieldName = fieldNames[(i+j*31) % fieldNames.length]; - final int tokenStreamLen = _TestUtil.nextInt(random(), 1, rarely() ? 
300 : 5); - fieldValues[i].put(fieldName, new RandomTokenStream(tokenStreamLen, sampleTerms, rarely())); - } - } - - // index them - for (int i = 0; i < numDocs; ++i) { - Document doc = new Document(); - doc.add(new IntField("id", i, Store.YES)); - for (Map.Entry entry : fieldValues[i].entrySet()) { - doc.add(new Field(entry.getKey(), entry.getValue(), ft)); - } - iw.addDocument(doc); - } - - iw.commit(); - // make sure the format can merge - iw.forceMerge(2); - - // read term vectors - final DirectoryReader reader = DirectoryReader.open(dir); - for (int i = 0; i < 100; ++i) { - final int docID = random().nextInt(numDocs); - final Map fvs = fieldValues[reader.document(docID).getField("id").numericValue().intValue()]; - final Fields fields = reader.getTermVectors(docID); - if (fvs.isEmpty()) { - assertNull(fields); - } else { - Set fns = new HashSet(); - for (String field : fields) { - fns.add(field); - } - assertEquals(fields.size(), fns.size()); - assertEquals(fvs.keySet(), fns); - for (String field : fields) { - final RandomTokenStream tk = fvs.get(field); - assert tk != null; - final Terms terms = fields.terms(field); - assertEquals(ft.storeTermVectorPositions(), terms.hasPositions()); - assertEquals(ft.storeTermVectorOffsets(), terms.hasOffsets()); - assertEquals(1, terms.getDocCount()); - final TermsEnum termsEnum = terms.iterator(null); - while (termsEnum.next() != null) { - assertEquals(1, termsEnum.docFreq()); - final DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.docsAndPositions(null, null); - final DocsEnum docsEnum = docsAndPositionsEnum == null ? termsEnum.docs(null, null) : docsAndPositionsEnum; - if (ft.storeTermVectorOffsets() || ft.storeTermVectorPositions()) { - assertNotNull(docsAndPositionsEnum); - } - assertEquals(0, docsEnum.nextDoc()); - if (terms.hasPositions() || terms.hasOffsets()) { - final int freq = docsEnum.freq(); - assertTrue(freq >= 1); - if (docsAndPositionsEnum != null) { - for (int k = 0; k < freq; ++k) { - final int position = docsAndPositionsEnum.nextPosition(); - final Set indexes; - if (terms.hasPositions()) { - indexes = tk.positionToTerms.get(position); - assertNotNull(tk.positionToTerms.keySet().toString() + " does not contain " + position, indexes); - } else { - indexes = tk.startOffsetToTerms.get(docsAndPositionsEnum.startOffset()); - assertNotNull(indexes); - } - if (terms.hasPositions()) { - boolean foundPosition = false; - for (int index : indexes) { - if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && tk.positions[index] == position) { - foundPosition = true; - break; - } - } - assertTrue(foundPosition); - } - if (terms.hasOffsets()) { - boolean foundOffset = false; - for (int index : indexes) { - if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && tk.startOffsets[index] == docsAndPositionsEnum.startOffset() && tk.endOffsets[index] == docsAndPositionsEnum.endOffset()) { - foundOffset = true; - break; - } - } - assertTrue(foundOffset); - } - if (terms.hasPayloads()) { - boolean foundPayload = false; - for (int index : indexes) { - if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && equals(tk.payloads[index], docsAndPositionsEnum.getPayload())) { - foundPayload = true; - break; - } - } - assertTrue(foundPayload); - } - } - } - } - assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc()); - } - } - } - } - IOUtils.close(reader, iw, dir); - } - - private static boolean equals(Object o1, Object o2) { - if (o1 == null) { - return o2 == null; - } else { - return o1.equals(o2); - } - } } diff --git 
a/lucene/core/src/test/org/apache/lucene/search/TestTermVectors.java b/lucene/core/src/test/org/apache/lucene/search/TestTermVectors.java index 4d60d9025c0..ebef3bd1335 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTermVectors.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermVectors.java @@ -18,9 +18,6 @@ package org.apache.lucene.search; */ import java.io.IOException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; @@ -28,19 +25,24 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; -import org.apache.lucene.index.*; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.Directory; import org.apache.lucene.util.English; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util._TestUtil; import org.junit.AfterClass; import org.junit.BeforeClass; public class TestTermVectors extends LuceneTestCase { - private static IndexSearcher searcher; private static IndexReader reader; private static Directory directory; @@ -75,7 +77,6 @@ public class TestTermVectors extends LuceneTestCase { } reader = writer.getReader(); writer.close(); - searcher = newSearcher(reader); } @AfterClass @@ -84,300 +85,8 @@ public class TestTermVectors extends LuceneTestCase { directory.close(); reader = null; directory = null; - searcher = null; } - public void test() { - assertTrue(searcher != null); - } - - public void testTermVectors() throws IOException { - Query query = new TermQuery(new Term("field", "seventy")); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(100, hits.length); - - for (int i = 0; i < hits.length; i++) { - Fields vectors = searcher.reader.getTermVectors(hits[i].doc); - assertNotNull(vectors); - assertEquals("doc=" + hits[i].doc + " tv=" + vectors, 1, vectors.size()); - } - Terms vector; - vector = searcher.reader.getTermVectors(hits[0].doc).terms("noTV"); - assertNull(vector); - } - - public void testTermVectorsFieldOrder() throws IOException { - Directory dir = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); - Document doc = new Document(); - FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setStoreTermVectors(true); - ft.setStoreTermVectorOffsets(true); - ft.setStoreTermVectorPositions(true); - doc.add(newField("c", "some content here", ft)); - doc.add(newField("a", "some content here", ft)); - doc.add(newField("b", "some content here", ft)); - doc.add(newField("x", "some content here", ft)); - writer.addDocument(doc); - IndexReader reader = writer.getReader(); - writer.close(); - Fields v = reader.getTermVectors(0); - assertEquals(4, v.size()); - String[] expectedFields = new String[]{"a", "b", "c", "x"}; - int[] expectedPositions 
= new int[]{1, 2, 0}; - Iterator fieldsEnum = v.iterator(); - for(int i=0;i test4Map = new HashMap(); - test4Map.put("chocolate", Integer.valueOf(3)); - test4Map.put("lab", Integer.valueOf(2)); - test4Map.put("eating", Integer.valueOf(1)); - test4Map.put("computer", Integer.valueOf(1)); - test4Map.put("with", Integer.valueOf(1)); - test4Map.put("a", Integer.valueOf(1)); - test4Map.put("colored", Integer.valueOf(1)); - test4Map.put("in", Integer.valueOf(1)); - test4Map.put("an", Integer.valueOf(1)); - test4Map.put("computer", Integer.valueOf(1)); - test4Map.put("old", Integer.valueOf(1)); - - Document testDoc1 = new Document(); - setupDoc(testDoc1, test1); - Document testDoc2 = new Document(); - setupDoc(testDoc2, test2); - Document testDoc3 = new Document(); - setupDoc(testDoc3, test3); - Document testDoc4 = new Document(); - setupDoc(testDoc4, test4); - - Directory dir = newDirectory(); - - RandomIndexWriter writer = new RandomIndexWriter(random(), dir, - newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) - .setOpenMode(OpenMode.CREATE) - .setMergePolicy(newLogMergePolicy()) - .setSimilarity(new DefaultSimilarity())); - writer.addDocument(testDoc1); - writer.addDocument(testDoc2); - writer.addDocument(testDoc3); - writer.addDocument(testDoc4); - IndexReader reader = writer.getReader(); - writer.close(); - IndexSearcher knownSearcher = newSearcher(reader); - knownSearcher.setSimilarity(new DefaultSimilarity()); - Fields fields = MultiFields.getFields(knownSearcher.reader); - - DocsEnum docs = null; - for (String fieldName : fields) { - Terms terms = fields.terms(fieldName); - assertNotNull(terms); // NOTE: kinda sketchy assumptions, but ideally we would fix fieldsenum api... - TermsEnum termsEnum = terms.iterator(null); - - while (termsEnum.next() != null) { - String text = termsEnum.term().utf8ToString(); - docs = _TestUtil.docs(random(), termsEnum, MultiFields.getLiveDocs(knownSearcher.reader), docs, DocsEnum.FLAG_FREQS); - - while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - int docId = docs.docID(); - int freq = docs.freq(); - //System.out.println("Doc Id: " + docId + " freq " + freq); - Terms vector = knownSearcher.reader.getTermVectors(docId).terms("field"); - //float tf = sim.tf(freq); - //float idf = sim.idf(knownSearcher.docFreq(term), knownSearcher.maxDoc()); - //float qNorm = sim.queryNorm() - //This is fine since we don't have stop words - //float lNorm = sim.lengthNorm("field", vector.getTerms().length); - //float coord = sim.coord() - //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm); - assertNotNull(vector); - TermsEnum termsEnum2 = vector.iterator(null); - - while(termsEnum2.next() != null) { - if (text.equals(termsEnum2.term().utf8ToString())) { - assertEquals(freq, termsEnum2.totalTermFreq()); - } - } - } - } - //System.out.println("--------"); - } - Query query = new TermQuery(new Term("field", "chocolate")); - ScoreDoc[] hits = knownSearcher.search(query, null, 1000).scoreDocs; - //doc 3 should be the first hit b/c it is the shortest match - assertTrue(hits.length == 3); - /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString()); - System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0))); - System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString()); - System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1))); - System.out.println("Hit 2: " + 
hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString()); - System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/ - assertTrue(hits[0].doc == 2); - assertTrue(hits[1].doc == 3); - assertTrue(hits[2].doc == 0); - Terms vector = knownSearcher.reader.getTermVectors(hits[1].doc).terms("field"); - assertNotNull(vector); - //System.out.println("Vector: " + vector); - assertEquals(10, vector.size()); - TermsEnum termsEnum = vector.iterator(null); - while(termsEnum.next() != null) { - String term = termsEnum.term().utf8ToString(); - //System.out.println("Term: " + term); - int freq = (int) termsEnum.totalTermFreq(); - assertTrue(test4.indexOf(term) != -1); - Integer freqInt = test4Map.get(term); - assertTrue(freqInt != null); - assertEquals(freqInt.intValue(), freq); - } - reader.close(); - dir.close(); - } - - private void setupDoc(Document doc, String text) - { - FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setStoreTermVectors(true); - ft.setStoreTermVectorOffsets(true); - ft.setStoreTermVectorPositions(true); - FieldType ft2 = new FieldType(TextField.TYPE_STORED); - ft2.setStoreTermVectors(true); - doc.add(newField("field2", text, ft)); - doc.add(newField("field", text, ft2)); - //System.out.println("Document: " + doc); - } - - // Test only a few docs having vectors - public void testRareVectors() throws IOException { - RandomIndexWriter writer = new RandomIndexWriter(random(), directory, - newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) - .setOpenMode(OpenMode.CREATE)); - if (VERBOSE) { - System.out.println("TEST: now add non-vectors"); - } - for (int i = 0; i < 100; i++) { - Document doc = new Document(); - doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES)); - writer.addDocument(doc); - } - if (VERBOSE) { - System.out.println("TEST: now add vectors"); - } - FieldType ft = new FieldType(TextField.TYPE_STORED); - ft.setStoreTermVectors(true); - ft.setStoreTermVectorOffsets(true); - ft.setStoreTermVectorPositions(true); - for(int i=0;i<10;i++) { - Document doc = new Document(); - doc.add(newField("field", English.intToEnglish(100+i), ft)); - writer.addDocument(doc); - } - - if (VERBOSE) { - System.out.println("TEST: now getReader"); - } - IndexReader reader = writer.getReader(); - writer.close(); - IndexSearcher searcher = newSearcher(reader); - - Query query = new TermQuery(new Term("field", "hundred")); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(10, hits.length); - for (int i = 0; i < hits.length; i++) { - - Fields vectors = searcher.reader.getTermVectors(hits[i].doc); - assertNotNull(vectors); - assertEquals(1, vectors.size()); - } - reader.close(); - } - - // In a single doc, for the same field, mix the term // vectors up public void testMixedVectrosVectors() throws IOException { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java new file mode 100644 index 00000000000..af93522eb76 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java @@ -0,0 +1,632 @@ +package org.apache.lucene.index; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +import com.carrotsearch.randomizedtesting.generators.RandomPicks; + +/** + * Base class aiming at testing {@link TermVectorsFormat term vectors formats}. + * To test a new format, all you need is to register a new {@link Codec} which + * uses it and extend this class and override {@link #getCodec()}. + * @lucene.experimental + */ +public abstract class BaseTermVectorsFormatTestCase extends LuceneTestCase { + + private Codec savedCodec; + + /** + * Returns the Codec to run tests against + */ + protected abstract Codec getCodec(); + + public void setUp() throws Exception { + super.setUp(); + // set the default codec, so adding test cases to this isn't fragile + savedCodec = Codec.getDefault(); + Codec.setDefault(getCodec()); + } + + public void tearDown() throws Exception { + Codec.setDefault(savedCodec); // restore + super.tearDown(); + } + + /** + * A combination of term vectors options. 
+ */ + protected enum Options { + NONE(false, false, false), + POSITIONS(true, false, false), + OFFSETS(false, true, false), + POSITIONS_AND_OFFSETS(true, true, false), + POSITIONS_AND_PAYLOADS(true, false, true), + POSITIONS_AND_OFFSETS_AND_PAYLOADS(true, true, true); + final boolean positions, offsets, payloads; + private Options(boolean positions, boolean offsets, boolean payloads) { + this.positions = positions; + this.offsets = offsets; + this.payloads = payloads; + } + } + + protected Set validOptions() { + return EnumSet.allOf(Options.class); + } + + protected Options randomOptions() { + return RandomPicks.randomFrom(random(), new ArrayList(validOptions())); + } + + protected FieldType fieldType(Options options) { + FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); + ft.setStoreTermVectors(true); + ft.setStoreTermVectorPositions(options.positions); + ft.setStoreTermVectorOffsets(options.offsets); + ft.setStoreTermVectorPayloads(options.payloads); + ft.freeze(); + return ft; + } + + protected BytesRef randomPayload() { + final int len = random().nextInt(5); + if (len == 0) { + return null; + } + final BytesRef payload = new BytesRef(len); + random().nextBytes(payload.bytes); + payload.length = len; + return payload; + } + + // custom impl to test cases that are forbidden by the default OffsetAttribute impl + private static class PermissiveOffsetAttributeImpl extends AttributeImpl implements OffsetAttribute { + + int start, end; + + @Override + public int startOffset() { + return start; + } + + @Override + public int endOffset() { + return end; + } + + @Override + public void setOffset(int startOffset, int endOffset) { + // no check! + start = startOffset; + end = endOffset; + } + + @Override + public void clear() { + start = end = 0; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof PermissiveOffsetAttributeImpl) { + PermissiveOffsetAttributeImpl o = (PermissiveOffsetAttributeImpl) other; + return o.start == start && o.end == end; + } + + return false; + } + + @Override + public int hashCode() { + return start + 31 * end; + } + + @Override + public void copyTo(AttributeImpl target) { + OffsetAttribute t = (OffsetAttribute) target; + t.setOffset(start, end); + } + + } + + // TODO: use CannedTokenStream? + protected class RandomTokenStream extends TokenStream { + + final String[] terms; + final BytesRef[] termBytes; + final int[] positionsIncrements; + final int[] positions; + final int[] startOffsets, endOffsets; + final BytesRef[] payloads; + + final Map freqs; + final Map> positionToTerms; + final Map> startOffsetToTerms; + + final CharTermAttribute termAtt; + final PositionIncrementAttribute piAtt; + final OffsetAttribute oAtt; + final PayloadAttribute pAtt; + int i = 0; + + protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) { + this(len, sampleTerms, sampleTermBytes, rarely()); + } + + protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) { + terms = new String[len]; + termBytes = new BytesRef[len]; + positionsIncrements = new int[len]; + positions = new int[len]; + startOffsets = new int[len]; + endOffsets = new int[len]; + payloads = new BytesRef[len]; + for (int i = 0; i < len; ++i) { + final int o = random().nextInt(sampleTerms.length); + terms[i] = sampleTerms[o]; + termBytes[i] = sampleTermBytes[o]; + positionsIncrements[i] = _TestUtil.nextInt(random(), i == 0 ? 
1 : 0, 10); + if (offsetsGoBackwards) { + startOffsets[i] = random().nextInt(); + endOffsets[i] = random().nextInt(); + } else { + if (i == 0) { + startOffsets[i] = _TestUtil.nextInt(random(), 0, 1 << 16); + } else { + startOffsets[i] = startOffsets[i-1] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20); + } + endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20); + } + } + + for (int i = 0; i < len; ++i) { + if (i == 0) { + positions[i] = positionsIncrements[i] - 1; + } else { + positions[i] = positions[i - 1] + positionsIncrements[i]; + } + } + if (rarely()) { + Arrays.fill(payloads, randomPayload()); + } else { + for (int i = 0; i < len; ++i) { + payloads[i] = randomPayload(); + } + } + + positionToTerms = new HashMap>(len); + startOffsetToTerms = new HashMap>(len); + for (int i = 0; i < len; ++i) { + if (!positionToTerms.containsKey(positions[i])) { + positionToTerms.put(positions[i], new HashSet(1)); + } + positionToTerms.get(positions[i]).add(i); + if (!startOffsetToTerms.containsKey(startOffsets[i])) { + startOffsetToTerms.put(startOffsets[i], new HashSet(1)); + } + startOffsetToTerms.get(startOffsets[i]).add(i); + } + + freqs = new HashMap(); + for (String term : terms) { + if (freqs.containsKey(term)) { + freqs.put(term, freqs.get(term) + 1); + } else { + freqs.put(term, 1); + } + } + + addAttributeImpl(new PermissiveOffsetAttributeImpl()); + + termAtt = addAttribute(CharTermAttribute.class); + piAtt = addAttribute(PositionIncrementAttribute.class); + oAtt = addAttribute(OffsetAttribute.class); + pAtt = addAttribute(PayloadAttribute.class); + } + + public boolean hasPayloads() { + for (BytesRef payload : payloads) { + if (payload != null && payload.length > 0) { + return true; + } + } + return false; + } + + @Override + public final boolean incrementToken() throws IOException { + if (i < terms.length) { + termAtt.setLength(0).append(terms[i]); + piAtt.setPositionIncrement(positionsIncrements[i]); + oAtt.setOffset(startOffsets[i], endOffsets[i]); + pAtt.setPayload(payloads[i]); + ++i; + return true; + } else { + return false; + } + } + + } + + protected class RandomDocument { + + private final String[] fieldNames; + private final FieldType[] fieldTypes; + private final RandomTokenStream[] tokenStreams; + + protected RandomDocument(int fieldCount, int maxTermCount, Options options, String[] fieldNames, String[] sampleTerms, BytesRef[] sampleTermBytes) { + if (fieldCount > fieldNames.length) { + throw new IllegalArgumentException(); + } + this.fieldNames = new String[fieldCount]; + fieldTypes = new FieldType[fieldCount]; + tokenStreams = new RandomTokenStream[fieldCount]; + Arrays.fill(fieldTypes, fieldType(options)); + final Set usedFileNames = new HashSet(); + for (int i = 0; i < fieldCount; ++i) { + do { + this.fieldNames[i] = RandomPicks.randomFrom(random(), fieldNames); + } while (usedFileNames.contains(this.fieldNames[i])); + usedFileNames.add(this.fieldNames[i]); + tokenStreams[i] = new RandomTokenStream(_TestUtil.nextInt(random(), 1, maxTermCount), sampleTerms, sampleTermBytes); + } + } + + public Document toDocument() { + final Document doc = new Document(); + for (int i = 0; i < fieldNames.length; ++i) { + doc.add(new Field(fieldNames[i], tokenStreams[i], fieldTypes[i])); + } + return doc; + } + + } + + protected class RandomDocumentFactory { + + private final String[] fieldNames; + private final String[] terms; + private final BytesRef[] termBytes; + + protected RandomDocumentFactory(int distinctFieldNames, int disctinctTerms) { 
+ final Set fieldNames = new HashSet(); + while (fieldNames.size() < distinctFieldNames) { + fieldNames.add(_TestUtil.randomSimpleString(random())); + fieldNames.remove("id"); + } + this.fieldNames = fieldNames.toArray(new String[0]); + terms = new String[disctinctTerms]; + termBytes = new BytesRef[disctinctTerms]; + for (int i = 0; i < disctinctTerms; ++i) { + terms[i] = _TestUtil.randomRealisticUnicodeString(random()); + termBytes[i] = new BytesRef(terms[i]); + } + } + + public RandomDocument newDocument(int fieldCount, int maxTermCount, Options options) { + return new RandomDocument(fieldCount, maxTermCount, options, fieldNames, terms, termBytes); + } + + } + + protected void assertEquals(RandomDocument doc, Fields fields) throws IOException { + // compare field names + assertEquals(doc == null, fields == null); + assertEquals(doc.fieldNames.length, fields.size()); + final Set fields1 = new HashSet(); + final Set fields2 = new HashSet(); + for (int i = 0; i < doc.fieldNames.length; ++i) { + fields1.add(doc.fieldNames[i]); + } + for (String field : fields) { + fields2.add(field); + } + assertEquals(fields1, fields2); + + for (int i = 0; i < doc.fieldNames.length; ++i) { + assertEquals(doc.tokenStreams[i], doc.fieldTypes[i], fields.terms(doc.fieldNames[i])); + } + } + + protected static boolean equals(Object o1, Object o2) { + if (o1 == null) { + return o2 == null; + } else { + return o1.equals(o2); + } + } + + // to test reuse + private TermsEnum termsEnum = null; + private DocsEnum docsEnum = null; + private DocsAndPositionsEnum docsAndPositionsEnum = null; + + protected void assertEquals(RandomTokenStream tk, FieldType ft, Terms terms) throws IOException { + assertEquals(1, terms.getDocCount()); + final int termCount = new HashSet(Arrays.asList(tk.terms)).size(); + assertEquals(termCount, terms.size()); + assertEquals(termCount, terms.getSumDocFreq()); + assertEquals(ft.storeTermVectorPositions(), terms.hasPositions()); + assertEquals(ft.storeTermVectorOffsets(), terms.hasOffsets()); + assertEquals(ft.storeTermVectorPayloads() && tk.hasPayloads(), terms.hasPayloads()); + final Set uniqueTerms = new HashSet(); + for (String term : tk.freqs.keySet()) { + uniqueTerms.add(new BytesRef(term)); + } + final BytesRef[] sortedTerms = uniqueTerms.toArray(new BytesRef[0]); + Arrays.sort(sortedTerms, terms.getComparator()); + termsEnum = terms.iterator(random().nextBoolean() ? null : termsEnum); + for (int i = 0; i < sortedTerms.length; ++i) { + final BytesRef nextTerm = termsEnum.next(); + assertEquals(sortedTerms[i], nextTerm); + assertEquals(sortedTerms[i], termsEnum.term()); + assertEquals(1, termsEnum.docFreq()); + + final FixedBitSet bits = new FixedBitSet(1); + docsEnum = termsEnum.docs(bits, random().nextBoolean() ? null : docsEnum); + assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc()); + bits.set(0); + + docsEnum = termsEnum.docs(random().nextBoolean() ? bits : null, random().nextBoolean() ? null : docsEnum); + assertNotNull(docsEnum); + assertEquals(0, docsEnum.nextDoc()); + assertEquals(0, docsEnum.docID()); + assertEquals(tk.freqs.get(termsEnum.term().utf8ToString()), (Integer) docsEnum.freq()); + assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc()); + + bits.clear(0); + docsAndPositionsEnum = termsEnum.docsAndPositions(bits, random().nextBoolean() ? 
null : docsAndPositionsEnum); + assertEquals(ft.storeTermVectorOffsets() || ft.storeTermVectorPositions(), docsAndPositionsEnum != null); + if (docsAndPositionsEnum != null) { + assertEquals(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc()); + } + bits.set(0); + + docsAndPositionsEnum = termsEnum.docsAndPositions(random().nextBoolean() ? bits : null, random().nextBoolean() ? null : docsAndPositionsEnum); + assertEquals(ft.storeTermVectorOffsets() || ft.storeTermVectorPositions(), docsAndPositionsEnum != null); + if (terms.hasPositions() || terms.hasOffsets()) { + assertEquals(0, docsAndPositionsEnum.nextDoc()); + final int freq = docsAndPositionsEnum.freq(); + assertEquals(tk.freqs.get(termsEnum.term().utf8ToString()), (Integer) freq); + if (docsAndPositionsEnum != null) { + for (int k = 0; k < freq; ++k) { + final int position = docsAndPositionsEnum.nextPosition(); + final Set indexes; + if (terms.hasPositions()) { + indexes = tk.positionToTerms.get(position); + assertNotNull(indexes); + } else { + indexes = tk.startOffsetToTerms.get(docsAndPositionsEnum.startOffset()); + assertNotNull(indexes); + } + if (terms.hasPositions()) { + boolean foundPosition = false; + for (int index : indexes) { + if (tk.termBytes[index].equals(termsEnum.term()) && tk.positions[index] == position) { + foundPosition = true; + break; + } + } + assertTrue(foundPosition); + } + if (terms.hasOffsets()) { + boolean foundOffset = false; + for (int index : indexes) { + if (tk.termBytes[index].equals(termsEnum.term()) && tk.startOffsets[index] == docsAndPositionsEnum.startOffset() && tk.endOffsets[index] == docsAndPositionsEnum.endOffset()) { + foundOffset = true; + break; + } + } + assertTrue(foundOffset); + } + if (terms.hasPayloads()) { + boolean foundPayload = false; + for (int index : indexes) { + if (tk.termBytes[index].equals(termsEnum.term()) && equals(tk.payloads[index], docsAndPositionsEnum.getPayload())) { + foundPayload = true; + break; + } + } + assertTrue(foundPayload); + } + } + try { + docsAndPositionsEnum.nextPosition(); + fail(); + } catch (Exception e) { + // ok + } catch (AssertionError e) { + // ok + } + } + assertEquals(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc()); + } + } + assertNull(termsEnum.next()); + for (int i = 0; i < 5; ++i) { + if (random().nextBoolean()) { + assertTrue(termsEnum.seekExact(RandomPicks.randomFrom(random(), tk.termBytes), random().nextBoolean())); + } else { + assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(RandomPicks.randomFrom(random(), tk.termBytes), random().nextBoolean())); + } + } + } + + protected Document addId(Document doc, String id) { + doc.add(new StringField("id", id, Store.NO)); + return doc; + } + + protected int docID(IndexReader reader, String id) throws IOException { + return new IndexSearcher(reader).search(new TermQuery(new Term("id", id)), 1).scoreDocs[0].doc; + } + + // only one doc with vectors + public void testRareVectors() throws IOException { + final RandomDocumentFactory docFactory = new RandomDocumentFactory(10, 20); + for (Options options : validOptions()) { + final int numDocs = _TestUtil.nextInt(random(), 10, 10000); + final int docWithVectors = random().nextInt(numDocs); + final Document emptyDoc = new Document(); + final Directory dir = newDirectory(); + final RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 3), 20, options); + for (int i = 0; i < numDocs; ++i) { + if (i == docWithVectors) { + 
writer.addDocument(addId(doc.toDocument(), "42")); + } else { + writer.addDocument(emptyDoc); + } + } + final IndexReader reader = writer.getReader(); + final int docWithVectorsID = docID(reader, "42"); + for (int i = 0; i < 10; ++i) { + final int docID = random().nextInt(numDocs); + final Fields fields = reader.getTermVectors(docID); + if (docID == docWithVectorsID) { + assertEquals(doc, fields); + } else { + assertNull(fields); + } + } + final Fields fields = reader.getTermVectors(docWithVectorsID); + assertEquals(doc, fields); + reader.close(); + writer.close(); + dir.close(); + } + } + + public void testHighFreqs() throws IOException { + final RandomDocumentFactory docFactory = new RandomDocumentFactory(3, 5); + for (Options options : validOptions()) { + if (options == Options.NONE) { + continue; + } + final Directory dir = newDirectory(); + final RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 2), _TestUtil.nextInt(random(), 50000, 100000), options); + writer.addDocument(doc.toDocument()); + final IndexReader reader = writer.getReader(); + assertEquals(doc, reader.getTermVectors(0)); + reader.close(); + writer.close(); + dir.close(); + } + } + + public void testLotsOfFields() throws IOException { + final RandomDocumentFactory docFactory = new RandomDocumentFactory(5000, 10); + for (Options options : validOptions()) { + final Directory dir = newDirectory(); + final RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 500, 1000), 5, options); + writer.addDocument(doc.toDocument()); + final IndexReader reader = writer.getReader(); + assertEquals(doc, reader.getTermVectors(0)); + reader.close(); + writer.close(); + dir.close(); + } + } + + // different options for the same field + public void testMixedOptions() throws IOException { + final int numFields = _TestUtil.nextInt(random(), 1, 3); + final RandomDocumentFactory docFactory = new RandomDocumentFactory(numFields, 10); + for (Options options1 : validOptions()) { + for (Options options2 : validOptions()) { + if (options1 == options2) { + continue; + } + final Directory dir = newDirectory(); + final RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + final RandomDocument doc1 = docFactory.newDocument(numFields, 20, options1); + final RandomDocument doc2 = docFactory.newDocument(numFields, 20, options2); + writer.addDocument(addId(doc1.toDocument(), "1")); + writer.addDocument(addId(doc2.toDocument(), "2")); + final IndexReader reader = writer.getReader(); + final int doc1ID = docID(reader, "1"); + assertEquals(doc1, reader.getTermVectors(doc1ID)); + final int doc2ID = docID(reader, "2"); + assertEquals(doc2, reader.getTermVectors(doc2ID)); + reader.close(); + writer.close(); + dir.close(); + } + } + } + + public void testRandom() throws IOException { + final RandomDocumentFactory docFactory = new RandomDocumentFactory(5, 20); + final int numDocs = _TestUtil.nextInt(random(), 100, 1000); + final RandomDocument[] docs = new RandomDocument[numDocs]; + for (int i = 0; i < numDocs; ++i) { + docs[i] = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 3), _TestUtil.nextInt(random(), 10, 50), randomOptions()); + } + final Directory dir = newDirectory(); + final RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + for (int i = 0; i < numDocs; ++i) { + writer.addDocument(docs[i].toDocument()); + } + final IndexReader reader = 
writer.getReader(); + for (int i = 0; i < numDocs; ++i) { + assertEquals(docs[i], reader.getTermVectors(i)); + } + reader.close(); + writer.close(); + dir.close(); + } + +} From 6ae5a4e52d0c46923573643481fe79685aca9967 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 1 Feb 2013 11:39:58 +0000 Subject: [PATCH 2/2] Fix test bug (used to rely on internal doc IDs). git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1441395 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/index/BaseTermVectorsFormatTestCase.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java index af93522eb76..d4c4a6857ce 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java @@ -618,11 +618,12 @@ public abstract class BaseTermVectorsFormatTestCase extends LuceneTestCase { final Directory dir = newDirectory(); final RandomIndexWriter writer = new RandomIndexWriter(random(), dir); for (int i = 0; i < numDocs; ++i) { - writer.addDocument(docs[i].toDocument()); + writer.addDocument(addId(docs[i].toDocument(), ""+i)); } final IndexReader reader = writer.getReader(); for (int i = 0; i < numDocs; ++i) { - assertEquals(docs[i], reader.getTermVectors(i)); + final int docID = docID(reader, ""+i); + assertEquals(docs[i], reader.getTermVectors(docID)); } reader.close(); writer.close();
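
To wire another codec's term vectors format into this suite, the pattern is the same as the three test classes added above: extend BaseTermVectorsFormatTestCase and return the codec from getCodec(); formats that only support a subset of positions/offsets/payloads can additionally override the protected validOptions() hook. A minimal sketch, assuming a hypothetical MyCodec and test class name (illustration only, not part of this patch):

    package org.apache.lucene.codecs.mycodec;

    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.index.BaseTermVectorsFormatTestCase;

    public class TestMyCodecTermVectorsFormat extends BaseTermVectorsFormatTestCase {

      @Override
      protected Codec getCodec() {
        // Any Codec whose TermVectorsFormat should be exercised by the base
        // class's tests (testRareVectors, testHighFreqs, testLotsOfFields,
        // testMixedOptions, testRandom). MyCodec is a placeholder name.
        return new MyCodec();
      }

      // If the format does not support every combination of positions,
      // offsets and payloads, restrict the randomized options here:
      // @Override
      // protected Set<Options> validOptions() {
      //   return EnumSet.of(Options.NONE, Options.POSITIONS, Options.POSITIONS_AND_OFFSETS);
      // }
    }

Because setUp() installs the codec returned by getCodec() as the default and tearDown() restores the previous default, such a subclass needs no further plumbing to run against its own format.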