Merged /lucene/dev/trunk:r1441297-1441422

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1441423 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2013-02-01 12:31:48 +00:00
commit fc98c11fb9
6 changed files with 732 additions and 626 deletions

@@ -0,0 +1,30 @@
package org.apache.lucene.codecs.simpletext;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
public class TestSimpleTextTermVectorsFormat extends BaseTermVectorsFormatTestCase {
@Override
protected Codec getCodec() {
return new SimpleTextCodec();
}
}

@@ -0,0 +1,30 @@
package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
public class TestCompressingTermVectorsFormat extends BaseTermVectorsFormatTestCase {
@Override
protected Codec getCodec() {
return CompressingCodec.randomInstance(random());
}
}

@@ -0,0 +1,30 @@
package org.apache.lucene.codecs.lucene40;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
public class TestLucene40TermVectorsFormat extends BaseTermVectorsFormatTestCase {
@Override
protected Codec getCodec() {
return new Lucene40RWCodec();
}
}

@@ -17,38 +17,21 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import com.carrotsearch.randomizedtesting.generators.RandomInts;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
public class TestPayloadsOnVectors extends LuceneTestCase {
@@ -158,314 +141,5 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
writer.close();
dir.close();
}
// custom impl to test cases that are forbidden by the default OffsetAttribute impl
static class PermissiveOffsetAttributeImpl extends AttributeImpl implements OffsetAttribute {
int start, end;
@Override
public int startOffset() {
return start;
}
@Override
public int endOffset() {
return end;
}
@Override
public void setOffset(int startOffset, int endOffset) {
// no check!
start = startOffset;
end = endOffset;
}
@Override
public void clear() {
start = end = 0;
}
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
if (other instanceof PermissiveOffsetAttributeImpl) {
PermissiveOffsetAttributeImpl o = (PermissiveOffsetAttributeImpl) other;
return o.start == start && o.end == end;
}
return false;
}
@Override
public int hashCode() {
return start + 31 * end;
}
@Override
public void copyTo(AttributeImpl target) {
OffsetAttribute t = (OffsetAttribute) target;
t.setOffset(start, end);
}
}
static BytesRef randomPayload() {
final int len = random().nextInt(5);
if (len == 0) {
return null;
}
final BytesRef payload = new BytesRef(len);
random().nextBytes(payload.bytes);
payload.length = len;
return payload;
}
class RandomTokenStream extends TokenStream {
final String[] terms;
final int[] positionsIncrements;
final int[] positions;
final int[] startOffsets, endOffsets;
final BytesRef[] payloads;
final Map<Integer, Set<Integer>> positionToTerms;
final Map<Integer, Set<Integer>> startOffsetToTerms;
final CharTermAttribute termAtt;
final PositionIncrementAttribute piAtt;
final OffsetAttribute oAtt;
final PayloadAttribute pAtt;
int i = 0;
RandomTokenStream(int len, String[] sampleTerms, boolean weird) {
terms = new String[len];
positionsIncrements = new int[len];
positions = new int[len];
startOffsets = new int[len];
endOffsets = new int[len];
payloads = new BytesRef[len];
for (int i = 0; i < len; ++i) {
terms[i] = RandomPicks.randomFrom(random(), sampleTerms);
if (weird) {
positionsIncrements[i] = _TestUtil.nextInt(random(), 1, 1 << 18);
startOffsets[i] = random().nextInt();
endOffsets[i] = random().nextInt();
} else if (i == 0) {
positionsIncrements[i] = _TestUtil.nextInt(random(), 1, 1 << 5);
startOffsets[i] = _TestUtil.nextInt(random(), 0, 1 << 16);
endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
} else {
positionsIncrements[i] = _TestUtil.nextInt(random(), 0, 1 << 5);
startOffsets[i] = startOffsets[i-1] + _TestUtil.nextInt(random(), 0, 1 << 16);
endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
}
}
for (int i = 0; i < len; ++i) {
if (i == 0) {
positions[i] = positionsIncrements[i] - 1;
} else {
positions[i] = positions[i - 1] + positionsIncrements[i];
}
}
if (rarely()) {
Arrays.fill(payloads, randomPayload());
} else {
for (int i = 0; i < len; ++i) {
payloads[i] = randomPayload();
}
}
positionToTerms = new HashMap<Integer, Set<Integer>>();
startOffsetToTerms = new HashMap<Integer, Set<Integer>>();
for (int i = 0; i < len; ++i) {
if (!positionToTerms.containsKey(positions[i])) {
positionToTerms.put(positions[i], new HashSet<Integer>(1));
}
positionToTerms.get(positions[i]).add(i);
if (!startOffsetToTerms.containsKey(startOffsets[i])) {
startOffsetToTerms.put(startOffsets[i], new HashSet<Integer>(1));
}
startOffsetToTerms.get(startOffsets[i]).add(i);
}
addAttributeImpl(new PermissiveOffsetAttributeImpl());
termAtt = addAttribute(CharTermAttribute.class);
piAtt = addAttribute(PositionIncrementAttribute.class);
oAtt = addAttribute(OffsetAttribute.class);
pAtt = addAttribute(PayloadAttribute.class);
}
@Override
public final boolean incrementToken() throws IOException {
if (i < terms.length) {
termAtt.setLength(0).append(terms[i]);
piAtt.setPositionIncrement(positionsIncrements[i]);
oAtt.setOffset(startOffsets[i], endOffsets[i]);
pAtt.setPayload(payloads[i]);
++i;
return true;
} else {
return false;
}
}
}
static FieldType randomFieldType() {
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(random().nextBoolean());
ft.setStoreTermVectorOffsets(random().nextBoolean());
if (random().nextBoolean()) {
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorPayloads(true);
}
ft.freeze();
return ft;
}
public void testRandomVectors() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwConf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwConf.setMaxBufferedDocs(RandomInts.randomIntBetween(random(), 2, 30));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);
String[] sampleTerms = new String[RandomInts.randomIntBetween(random(), 20, 50)];
for (int i = 0; i < sampleTerms.length; ++i) {
sampleTerms[i] = _TestUtil.randomUnicodeString(random());
}
FieldType ft = randomFieldType();
// generate random documents and index them
final String[] fieldNames = new String[_TestUtil.nextInt(random(), 1, 200)];
for (int i = 0; i < fieldNames.length; ++i) {
String fieldName;
do {
fieldName = _TestUtil.randomSimpleString(random());
} while ("id".equals(fieldName));
fieldNames[i] = fieldName;
}
final int numDocs = _TestUtil.nextInt(random(), 10, 100);
@SuppressWarnings("unchecked")
final Map<String, RandomTokenStream>[] fieldValues = new Map[numDocs];
for (int i = 0; i < numDocs; ++i) {
fieldValues[i] = new HashMap<String, RandomTokenStream>();
final int numFields = _TestUtil.nextInt(random(), 0, rarely() ? fieldNames.length : 5);
for (int j = 0; j < numFields; ++j) {
final String fieldName = fieldNames[(i+j*31) % fieldNames.length];
final int tokenStreamLen = _TestUtil.nextInt(random(), 1, rarely() ? 300 : 5);
fieldValues[i].put(fieldName, new RandomTokenStream(tokenStreamLen, sampleTerms, rarely()));
}
}
// index them
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
doc.add(new IntField("id", i, Store.YES));
for (Map.Entry<String, RandomTokenStream> entry : fieldValues[i].entrySet()) {
doc.add(new Field(entry.getKey(), entry.getValue(), ft));
}
iw.addDocument(doc);
}
iw.commit();
// make sure the format can merge
iw.forceMerge(2);
// read term vectors
final DirectoryReader reader = DirectoryReader.open(dir);
for (int i = 0; i < 100; ++i) {
final int docID = random().nextInt(numDocs);
final Map<String, RandomTokenStream> fvs = fieldValues[reader.document(docID).getField("id").numericValue().intValue()];
final Fields fields = reader.getTermVectors(docID);
if (fvs.isEmpty()) {
assertNull(fields);
} else {
Set<String> fns = new HashSet<String>();
for (String field : fields) {
fns.add(field);
}
assertEquals(fields.size(), fns.size());
assertEquals(fvs.keySet(), fns);
for (String field : fields) {
final RandomTokenStream tk = fvs.get(field);
assert tk != null;
final Terms terms = fields.terms(field);
assertEquals(ft.storeTermVectorPositions(), terms.hasPositions());
assertEquals(ft.storeTermVectorOffsets(), terms.hasOffsets());
assertEquals(1, terms.getDocCount());
final TermsEnum termsEnum = terms.iterator(null);
while (termsEnum.next() != null) {
assertEquals(1, termsEnum.docFreq());
final DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
final DocsEnum docsEnum = docsAndPositionsEnum == null ? termsEnum.docs(null, null) : docsAndPositionsEnum;
if (ft.storeTermVectorOffsets() || ft.storeTermVectorPositions()) {
assertNotNull(docsAndPositionsEnum);
}
assertEquals(0, docsEnum.nextDoc());
if (terms.hasPositions() || terms.hasOffsets()) {
final int freq = docsEnum.freq();
assertTrue(freq >= 1);
if (docsAndPositionsEnum != null) {
for (int k = 0; k < freq; ++k) {
final int position = docsAndPositionsEnum.nextPosition();
final Set<Integer> indexes;
if (terms.hasPositions()) {
indexes = tk.positionToTerms.get(position);
assertNotNull(tk.positionToTerms.keySet().toString() + " does not contain " + position, indexes);
} else {
indexes = tk.startOffsetToTerms.get(docsAndPositionsEnum.startOffset());
assertNotNull(indexes);
}
if (terms.hasPositions()) {
boolean foundPosition = false;
for (int index : indexes) {
if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && tk.positions[index] == position) {
foundPosition = true;
break;
}
}
assertTrue(foundPosition);
}
if (terms.hasOffsets()) {
boolean foundOffset = false;
for (int index : indexes) {
if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && tk.startOffsets[index] == docsAndPositionsEnum.startOffset() && tk.endOffsets[index] == docsAndPositionsEnum.endOffset()) {
foundOffset = true;
break;
}
}
assertTrue(foundOffset);
}
if (terms.hasPayloads()) {
boolean foundPayload = false;
for (int index : indexes) {
if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && equals(tk.payloads[index], docsAndPositionsEnum.getPayload())) {
foundPayload = true;
break;
}
}
assertTrue(foundPayload);
}
}
}
}
assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());
}
}
}
}
IOUtils.close(reader, iw, dir);
}
private static boolean equals(Object o1, Object o2) {
if (o1 == null) {
return o2 == null;
} else {
return o1.equals(o2);
}
}
}

@@ -18,9 +18,6 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
@@ -28,19 +25,24 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
public class TestTermVectors extends LuceneTestCase {
private static IndexSearcher searcher;
private static IndexReader reader;
private static Directory directory;
@@ -75,7 +77,6 @@ public class TestTermVectors extends LuceneTestCase {
}
reader = writer.getReader();
writer.close();
searcher = newSearcher(reader);
}
@AfterClass
@@ -84,300 +85,8 @@ public class TestTermVectors extends LuceneTestCase {
directory.close();
reader = null;
directory = null;
searcher = null;
}
public void test() {
assertTrue(searcher != null);
}
public void testTermVectors() throws IOException {
Query query = new TermQuery(new Term("field", "seventy"));
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(100, hits.length);
for (int i = 0; i < hits.length; i++) {
Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
assertNotNull(vectors);
assertEquals("doc=" + hits[i].doc + " tv=" + vectors, 1, vectors.size());
}
Terms vector;
vector = searcher.reader.getTermVectors(hits[0].doc).terms("noTV");
assertNull(vector);
}
public void testTermVectorsFieldOrder() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
doc.add(newField("c", "some content here", ft));
doc.add(newField("a", "some content here", ft));
doc.add(newField("b", "some content here", ft));
doc.add(newField("x", "some content here", ft));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
writer.close();
Fields v = reader.getTermVectors(0);
assertEquals(4, v.size());
String[] expectedFields = new String[]{"a", "b", "c", "x"};
int[] expectedPositions = new int[]{1, 2, 0};
Iterator<String> fieldsEnum = v.iterator();
for(int i=0;i<expectedFields.length;i++) {
assertEquals(expectedFields[i], fieldsEnum.next());
assertEquals(3, v.terms(expectedFields[i]).size());
DocsAndPositionsEnum dpEnum = null;
Terms terms = v.terms(expectedFields[i]);
assertNotNull(terms);
TermsEnum termsEnum = terms.iterator(null);
assertEquals("content", termsEnum.next().utf8ToString());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(expectedPositions[0], dpEnum.nextPosition());
assertEquals("here", termsEnum.next().utf8ToString());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(expectedPositions[1], dpEnum.nextPosition());
assertEquals("some", termsEnum.next().utf8ToString());
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(1, dpEnum.freq());
assertEquals(expectedPositions[2], dpEnum.nextPosition());
assertNull(termsEnum.next());
}
reader.close();
dir.close();
}
public void testTermPositionVectors() throws IOException {
Query query = new TermQuery(new Term("field", "zero"));
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length);
DocsAndPositionsEnum dpEnum = null;
for (int i = 0; i < hits.length; i++) {
Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
assertNotNull(vectors);
assertEquals(1, vectors.size());
TermsEnum termsEnum = vectors.terms("field").iterator(null);
assertNotNull(termsEnum.next());
boolean shouldBePosVector = hits[i].doc % 2 == 0;
boolean shouldBeOffVector = hits[i].doc % 3 == 0;
if (shouldBePosVector || shouldBeOffVector) {
while(true) {
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
assertNotNull(dpEnum);
assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
dpEnum.nextPosition();
if (shouldBeOffVector) {
assertTrue(dpEnum.startOffset() != -1);
assertTrue(dpEnum.endOffset() != -1);
}
if (termsEnum.next() == null) {
break;
}
}
} else {
fail();
}
}
}
public void testTermOffsetVectors() throws IOException {
Query query = new TermQuery(new Term("field", "fifty"));
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(100, hits.length);
for (int i = 0; i < hits.length; i++) {
Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
assertNotNull(vectors);
assertEquals(1, vectors.size());
}
}
public void testKnownSetOfDocuments() throws IOException {
String test1 = "eating chocolate in a computer lab"; //6 terms
String test2 = "computer in a computer lab"; //5 terms
String test3 = "a chocolate lab grows old"; //5 terms
String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
Map<String,Integer> test4Map = new HashMap<String,Integer>();
test4Map.put("chocolate", Integer.valueOf(3));
test4Map.put("lab", Integer.valueOf(2));
test4Map.put("eating", Integer.valueOf(1));
test4Map.put("computer", Integer.valueOf(1));
test4Map.put("with", Integer.valueOf(1));
test4Map.put("a", Integer.valueOf(1));
test4Map.put("colored", Integer.valueOf(1));
test4Map.put("in", Integer.valueOf(1));
test4Map.put("an", Integer.valueOf(1));
test4Map.put("computer", Integer.valueOf(1));
test4Map.put("old", Integer.valueOf(1));
Document testDoc1 = new Document();
setupDoc(testDoc1, test1);
Document testDoc2 = new Document();
setupDoc(testDoc2, test2);
Document testDoc3 = new Document();
setupDoc(testDoc3, test3);
Document testDoc4 = new Document();
setupDoc(testDoc4, test4);
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
.setOpenMode(OpenMode.CREATE)
.setMergePolicy(newLogMergePolicy())
.setSimilarity(new DefaultSimilarity()));
writer.addDocument(testDoc1);
writer.addDocument(testDoc2);
writer.addDocument(testDoc3);
writer.addDocument(testDoc4);
IndexReader reader = writer.getReader();
writer.close();
IndexSearcher knownSearcher = newSearcher(reader);
knownSearcher.setSimilarity(new DefaultSimilarity());
Fields fields = MultiFields.getFields(knownSearcher.reader);
DocsEnum docs = null;
for (String fieldName : fields) {
Terms terms = fields.terms(fieldName);
assertNotNull(terms); // NOTE: kinda sketchy assumptions, but ideally we would fix fieldsenum api...
TermsEnum termsEnum = terms.iterator(null);
while (termsEnum.next() != null) {
String text = termsEnum.term().utf8ToString();
docs = _TestUtil.docs(random(), termsEnum, MultiFields.getLiveDocs(knownSearcher.reader), docs, DocsEnum.FLAG_FREQS);
while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int docId = docs.docID();
int freq = docs.freq();
//System.out.println("Doc Id: " + docId + " freq " + freq);
Terms vector = knownSearcher.reader.getTermVectors(docId).terms("field");
//float tf = sim.tf(freq);
//float idf = sim.idf(knownSearcher.docFreq(term), knownSearcher.maxDoc());
//float qNorm = sim.queryNorm()
//This is fine since we don't have stop words
//float lNorm = sim.lengthNorm("field", vector.getTerms().length);
//float coord = sim.coord()
//System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
assertNotNull(vector);
TermsEnum termsEnum2 = vector.iterator(null);
while(termsEnum2.next() != null) {
if (text.equals(termsEnum2.term().utf8ToString())) {
assertEquals(freq, termsEnum2.totalTermFreq());
}
}
}
}
//System.out.println("--------");
}
Query query = new TermQuery(new Term("field", "chocolate"));
ScoreDoc[] hits = knownSearcher.search(query, null, 1000).scoreDocs;
//doc 3 should be the first hit b/c it is the shortest match
assertTrue(hits.length == 3);
/*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
assertTrue(hits[0].doc == 2);
assertTrue(hits[1].doc == 3);
assertTrue(hits[2].doc == 0);
Terms vector = knownSearcher.reader.getTermVectors(hits[1].doc).terms("field");
assertNotNull(vector);
//System.out.println("Vector: " + vector);
assertEquals(10, vector.size());
TermsEnum termsEnum = vector.iterator(null);
while(termsEnum.next() != null) {
String term = termsEnum.term().utf8ToString();
//System.out.println("Term: " + term);
int freq = (int) termsEnum.totalTermFreq();
assertTrue(test4.indexOf(term) != -1);
Integer freqInt = test4Map.get(term);
assertTrue(freqInt != null);
assertEquals(freqInt.intValue(), freq);
}
reader.close();
dir.close();
}
private void setupDoc(Document doc, String text)
{
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
FieldType ft2 = new FieldType(TextField.TYPE_STORED);
ft2.setStoreTermVectors(true);
doc.add(newField("field2", text, ft));
doc.add(newField("field", text, ft2));
//System.out.println("Document: " + doc);
}
// Test only a few docs having vectors
public void testRareVectors() throws IOException {
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
.setOpenMode(OpenMode.CREATE));
if (VERBOSE) {
System.out.println("TEST: now add non-vectors");
}
for (int i = 0; i < 100; i++) {
Document doc = new Document();
doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES));
writer.addDocument(doc);
}
if (VERBOSE) {
System.out.println("TEST: now add vectors");
}
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
for(int i=0;i<10;i++) {
Document doc = new Document();
doc.add(newField("field", English.intToEnglish(100+i), ft));
writer.addDocument(doc);
}
if (VERBOSE) {
System.out.println("TEST: now getReader");
}
IndexReader reader = writer.getReader();
writer.close();
IndexSearcher searcher = newSearcher(reader);
Query query = new TermQuery(new Term("field", "hundred"));
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(10, hits.length);
for (int i = 0; i < hits.length; i++) {
Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
assertNotNull(vectors);
assertEquals(1, vectors.size());
}
reader.close();
}
// In a single doc, for the same field, mix the term
// vectors up
public void testMixedVectrosVectors() throws IOException {

@@ -0,0 +1,633 @@
package org.apache.lucene.index;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
/**
* Base class aiming at testing {@link TermVectorsFormat term vectors formats}.
* To test a new format, register a new {@link Codec} that uses it, then extend
* this class and override {@link #getCodec()}.
* @lucene.experimental
*/
public abstract class BaseTermVectorsFormatTestCase extends LuceneTestCase {
private Codec savedCodec;
/**
* Returns the Codec to run tests against
*/
protected abstract Codec getCodec();
public void setUp() throws Exception {
super.setUp();
// set the default codec, so adding test cases to this isn't fragile
savedCodec = Codec.getDefault();
Codec.setDefault(getCodec());
}
public void tearDown() throws Exception {
Codec.setDefault(savedCodec); // restore
super.tearDown();
}
/**
* A combination of term vectors options.
*/
protected enum Options {
NONE(false, false, false),
POSITIONS(true, false, false),
OFFSETS(false, true, false),
POSITIONS_AND_OFFSETS(true, true, false),
POSITIONS_AND_PAYLOADS(true, false, true),
POSITIONS_AND_OFFSETS_AND_PAYLOADS(true, true, true);
final boolean positions, offsets, payloads;
private Options(boolean positions, boolean offsets, boolean payloads) {
this.positions = positions;
this.offsets = offsets;
this.payloads = payloads;
}
}
protected Set<Options> validOptions() {
return EnumSet.allOf(Options.class);
}
protected Options randomOptions() {
return RandomPicks.randomFrom(random(), new ArrayList<Options>(validOptions()));
}
protected FieldType fieldType(Options options) {
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(options.positions);
ft.setStoreTermVectorOffsets(options.offsets);
ft.setStoreTermVectorPayloads(options.payloads);
ft.freeze();
return ft;
}
protected BytesRef randomPayload() {
final int len = random().nextInt(5);
if (len == 0) {
return null;
}
final BytesRef payload = new BytesRef(len);
random().nextBytes(payload.bytes);
payload.length = len;
return payload;
}
// custom impl to test cases that are forbidden by the default OffsetAttribute impl
private static class PermissiveOffsetAttributeImpl extends AttributeImpl implements OffsetAttribute {
int start, end;
@Override
public int startOffset() {
return start;
}
@Override
public int endOffset() {
return end;
}
@Override
public void setOffset(int startOffset, int endOffset) {
// no check!
start = startOffset;
end = endOffset;
}
@Override
public void clear() {
start = end = 0;
}
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
if (other instanceof PermissiveOffsetAttributeImpl) {
PermissiveOffsetAttributeImpl o = (PermissiveOffsetAttributeImpl) other;
return o.start == start && o.end == end;
}
return false;
}
@Override
public int hashCode() {
return start + 31 * end;
}
@Override
public void copyTo(AttributeImpl target) {
OffsetAttribute t = (OffsetAttribute) target;
t.setOffset(start, end);
}
}
// TODO: use CannedTokenStream?
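// Emits a pre-generated random sequence of terms with random position increments,
// offsets and payloads, and records position->term and startOffset->term maps so
// the vectors written from this stream can be verified later.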
protected class RandomTokenStream extends TokenStream {
final String[] terms;
final BytesRef[] termBytes;
final int[] positionsIncrements;
final int[] positions;
final int[] startOffsets, endOffsets;
final BytesRef[] payloads;
final Map<String, Integer> freqs;
final Map<Integer, Set<Integer>> positionToTerms;
final Map<Integer, Set<Integer>> startOffsetToTerms;
final CharTermAttribute termAtt;
final PositionIncrementAttribute piAtt;
final OffsetAttribute oAtt;
final PayloadAttribute pAtt;
int i = 0;
protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
this(len, sampleTerms, sampleTermBytes, rarely());
}
protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) {
terms = new String[len];
termBytes = new BytesRef[len];
positionsIncrements = new int[len];
positions = new int[len];
startOffsets = new int[len];
endOffsets = new int[len];
payloads = new BytesRef[len];
for (int i = 0; i < len; ++i) {
final int o = random().nextInt(sampleTerms.length);
terms[i] = sampleTerms[o];
termBytes[i] = sampleTermBytes[o];
positionsIncrements[i] = _TestUtil.nextInt(random(), i == 0 ? 1 : 0, 10);
if (offsetsGoBackwards) {
startOffsets[i] = random().nextInt();
endOffsets[i] = random().nextInt();
} else {
if (i == 0) {
startOffsets[i] = _TestUtil.nextInt(random(), 0, 1 << 16);
} else {
startOffsets[i] = startOffsets[i-1] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20);
}
endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
}
}
for (int i = 0; i < len; ++i) {
if (i == 0) {
positions[i] = positionsIncrements[i] - 1;
} else {
positions[i] = positions[i - 1] + positionsIncrements[i];
}
}
if (rarely()) {
Arrays.fill(payloads, randomPayload());
} else {
for (int i = 0; i < len; ++i) {
payloads[i] = randomPayload();
}
}
positionToTerms = new HashMap<Integer, Set<Integer>>(len);
startOffsetToTerms = new HashMap<Integer, Set<Integer>>(len);
for (int i = 0; i < len; ++i) {
if (!positionToTerms.containsKey(positions[i])) {
positionToTerms.put(positions[i], new HashSet<Integer>(1));
}
positionToTerms.get(positions[i]).add(i);
if (!startOffsetToTerms.containsKey(startOffsets[i])) {
startOffsetToTerms.put(startOffsets[i], new HashSet<Integer>(1));
}
startOffsetToTerms.get(startOffsets[i]).add(i);
}
freqs = new HashMap<String, Integer>();
for (String term : terms) {
if (freqs.containsKey(term)) {
freqs.put(term, freqs.get(term) + 1);
} else {
freqs.put(term, 1);
}
}
addAttributeImpl(new PermissiveOffsetAttributeImpl());
termAtt = addAttribute(CharTermAttribute.class);
piAtt = addAttribute(PositionIncrementAttribute.class);
oAtt = addAttribute(OffsetAttribute.class);
pAtt = addAttribute(PayloadAttribute.class);
}
public boolean hasPayloads() {
for (BytesRef payload : payloads) {
if (payload != null && payload.length > 0) {
return true;
}
}
return false;
}
@Override
public final boolean incrementToken() throws IOException {
if (i < terms.length) {
termAtt.setLength(0).append(terms[i]);
piAtt.setPositionIncrement(positionsIncrements[i]);
oAtt.setOffset(startOffsets[i], endOffsets[i]);
pAtt.setPayload(payloads[i]);
++i;
return true;
} else {
return false;
}
}
}
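// A synthetic document: a set of distinct vectored fields, each backed by its own RandomTokenStream.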
protected class RandomDocument {
private final String[] fieldNames;
private final FieldType[] fieldTypes;
private final RandomTokenStream[] tokenStreams;
protected RandomDocument(int fieldCount, int maxTermCount, Options options, String[] fieldNames, String[] sampleTerms, BytesRef[] sampleTermBytes) {
if (fieldCount > fieldNames.length) {
throw new IllegalArgumentException();
}
this.fieldNames = new String[fieldCount];
fieldTypes = new FieldType[fieldCount];
tokenStreams = new RandomTokenStream[fieldCount];
Arrays.fill(fieldTypes, fieldType(options));
final Set<String> usedFileNames = new HashSet<String>();
for (int i = 0; i < fieldCount; ++i) {
do {
this.fieldNames[i] = RandomPicks.randomFrom(random(), fieldNames);
} while (usedFileNames.contains(this.fieldNames[i]));
usedFileNames.add(this.fieldNames[i]);
tokenStreams[i] = new RandomTokenStream(_TestUtil.nextInt(random(), 1, maxTermCount), sampleTerms, sampleTermBytes);
}
}
public Document toDocument() {
final Document doc = new Document();
for (int i = 0; i < fieldNames.length; ++i) {
doc.add(new Field(fieldNames[i], tokenStreams[i], fieldTypes[i]));
}
return doc;
}
}
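// Builds RandomDocuments from a shared pool of field names (never "id") and sample terms.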
protected class RandomDocumentFactory {
private final String[] fieldNames;
private final String[] terms;
private final BytesRef[] termBytes;
protected RandomDocumentFactory(int distinctFieldNames, int disctinctTerms) {
final Set<String> fieldNames = new HashSet<String>();
while (fieldNames.size() < distinctFieldNames) {
fieldNames.add(_TestUtil.randomSimpleString(random()));
fieldNames.remove("id");
}
this.fieldNames = fieldNames.toArray(new String[0]);
terms = new String[disctinctTerms];
termBytes = new BytesRef[disctinctTerms];
for (int i = 0; i < disctinctTerms; ++i) {
terms[i] = _TestUtil.randomRealisticUnicodeString(random());
termBytes[i] = new BytesRef(terms[i]);
}
}
public RandomDocument newDocument(int fieldCount, int maxTermCount, Options options) {
return new RandomDocument(fieldCount, maxTermCount, options, fieldNames, terms, termBytes);
}
}
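// Asserts that the term vectors read back for a document expose exactly the fields
// that were indexed, then checks each field in detail.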
protected void assertEquals(RandomDocument doc, Fields fields) throws IOException {
// compare field names
assertEquals(doc == null, fields == null);
assertEquals(doc.fieldNames.length, fields.size());
final Set<String> fields1 = new HashSet<String>();
final Set<String> fields2 = new HashSet<String>();
for (int i = 0; i < doc.fieldNames.length; ++i) {
fields1.add(doc.fieldNames[i]);
}
for (String field : fields) {
fields2.add(field);
}
assertEquals(fields1, fields2);
for (int i = 0; i < doc.fieldNames.length; ++i) {
assertEquals(doc.tokenStreams[i], doc.fieldTypes[i], fields.terms(doc.fieldNames[i]));
}
}
protected static boolean equals(Object o1, Object o2) {
if (o1 == null) {
return o2 == null;
} else {
return o1.equals(o2);
}
}
// to test reuse
private TermsEnum termsEnum = null;
private DocsEnum docsEnum = null;
private DocsAndPositionsEnum docsAndPositionsEnum = null;
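// Checks one field's term vector against the RandomTokenStream that produced it: doc/term
// statistics, the sorted term dictionary, frequencies, and (depending on the FieldType)
// positions, offsets and payloads, randomly reusing the enums above.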
protected void assertEquals(RandomTokenStream tk, FieldType ft, Terms terms) throws IOException {
assertEquals(1, terms.getDocCount());
final int termCount = new HashSet<String>(Arrays.asList(tk.terms)).size();
assertEquals(termCount, terms.size());
assertEquals(termCount, terms.getSumDocFreq());
assertEquals(ft.storeTermVectorPositions(), terms.hasPositions());
assertEquals(ft.storeTermVectorOffsets(), terms.hasOffsets());
assertEquals(ft.storeTermVectorPayloads() && tk.hasPayloads(), terms.hasPayloads());
final Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
for (String term : tk.freqs.keySet()) {
uniqueTerms.add(new BytesRef(term));
}
final BytesRef[] sortedTerms = uniqueTerms.toArray(new BytesRef[0]);
Arrays.sort(sortedTerms, terms.getComparator());
termsEnum = terms.iterator(random().nextBoolean() ? null : termsEnum);
for (int i = 0; i < sortedTerms.length; ++i) {
final BytesRef nextTerm = termsEnum.next();
assertEquals(sortedTerms[i], nextTerm);
assertEquals(sortedTerms[i], termsEnum.term());
assertEquals(1, termsEnum.docFreq());
final FixedBitSet bits = new FixedBitSet(1);
docsEnum = termsEnum.docs(bits, random().nextBoolean() ? null : docsEnum);
assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());
bits.set(0);
docsEnum = termsEnum.docs(random().nextBoolean() ? bits : null, random().nextBoolean() ? null : docsEnum);
assertNotNull(docsEnum);
assertEquals(0, docsEnum.nextDoc());
assertEquals(0, docsEnum.docID());
assertEquals(tk.freqs.get(termsEnum.term().utf8ToString()), (Integer) docsEnum.freq());
assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());
bits.clear(0);
docsAndPositionsEnum = termsEnum.docsAndPositions(bits, random().nextBoolean() ? null : docsAndPositionsEnum);
assertEquals(ft.storeTermVectorOffsets() || ft.storeTermVectorPositions(), docsAndPositionsEnum != null);
if (docsAndPositionsEnum != null) {
assertEquals(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
}
bits.set(0);
docsAndPositionsEnum = termsEnum.docsAndPositions(random().nextBoolean() ? bits : null, random().nextBoolean() ? null : docsAndPositionsEnum);
assertEquals(ft.storeTermVectorOffsets() || ft.storeTermVectorPositions(), docsAndPositionsEnum != null);
if (terms.hasPositions() || terms.hasOffsets()) {
assertEquals(0, docsAndPositionsEnum.nextDoc());
final int freq = docsAndPositionsEnum.freq();
assertEquals(tk.freqs.get(termsEnum.term().utf8ToString()), (Integer) freq);
if (docsAndPositionsEnum != null) {
for (int k = 0; k < freq; ++k) {
final int position = docsAndPositionsEnum.nextPosition();
final Set<Integer> indexes;
if (terms.hasPositions()) {
indexes = tk.positionToTerms.get(position);
assertNotNull(indexes);
} else {
indexes = tk.startOffsetToTerms.get(docsAndPositionsEnum.startOffset());
assertNotNull(indexes);
}
if (terms.hasPositions()) {
boolean foundPosition = false;
for (int index : indexes) {
if (tk.termBytes[index].equals(termsEnum.term()) && tk.positions[index] == position) {
foundPosition = true;
break;
}
}
assertTrue(foundPosition);
}
if (terms.hasOffsets()) {
boolean foundOffset = false;
for (int index : indexes) {
if (tk.termBytes[index].equals(termsEnum.term()) && tk.startOffsets[index] == docsAndPositionsEnum.startOffset() && tk.endOffsets[index] == docsAndPositionsEnum.endOffset()) {
foundOffset = true;
break;
}
}
assertTrue(foundOffset);
}
if (terms.hasPayloads()) {
boolean foundPayload = false;
for (int index : indexes) {
if (tk.termBytes[index].equals(termsEnum.term()) && equals(tk.payloads[index], docsAndPositionsEnum.getPayload())) {
foundPayload = true;
break;
}
}
assertTrue(foundPayload);
}
}
try {
docsAndPositionsEnum.nextPosition();
fail();
} catch (Exception e) {
// ok
} catch (AssertionError e) {
// ok
}
}
assertEquals(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
}
}
assertNull(termsEnum.next());
for (int i = 0; i < 5; ++i) {
if (random().nextBoolean()) {
assertTrue(termsEnum.seekExact(RandomPicks.randomFrom(random(), tk.termBytes), random().nextBoolean()));
} else {
assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(RandomPicks.randomFrom(random(), tk.termBytes), random().nextBoolean()));
}
}
}
protected Document addId(Document doc, String id) {
doc.add(new StringField("id", id, Store.NO));
return doc;
}
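// Looks up the docID of the document that was tagged with the given id via addId.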
protected int docID(IndexReader reader, String id) throws IOException {
return new IndexSearcher(reader).search(new TermQuery(new Term("id", id)), 1).scoreDocs[0].doc;
}
// only one doc with vectors
public void testRareVectors() throws IOException {
final RandomDocumentFactory docFactory = new RandomDocumentFactory(10, 20);
for (Options options : validOptions()) {
final int numDocs = _TestUtil.nextInt(random(), 10, 10000);
final int docWithVectors = random().nextInt(numDocs);
final Document emptyDoc = new Document();
final Directory dir = newDirectory();
final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 3), 20, options);
for (int i = 0; i < numDocs; ++i) {
if (i == docWithVectors) {
writer.addDocument(addId(doc.toDocument(), "42"));
} else {
writer.addDocument(emptyDoc);
}
}
final IndexReader reader = writer.getReader();
final int docWithVectorsID = docID(reader, "42");
for (int i = 0; i < 10; ++i) {
final int docID = random().nextInt(numDocs);
final Fields fields = reader.getTermVectors(docID);
if (docID == docWithVectorsID) {
assertEquals(doc, fields);
} else {
assertNull(fields);
}
}
final Fields fields = reader.getTermVectors(docWithVectorsID);
assertEquals(doc, fields);
reader.close();
writer.close();
dir.close();
}
}
public void testHighFreqs() throws IOException {
final RandomDocumentFactory docFactory = new RandomDocumentFactory(3, 5);
for (Options options : validOptions()) {
if (options == Options.NONE) {
continue;
}
final Directory dir = newDirectory();
final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 2), _TestUtil.nextInt(random(), 50000, 100000), options);
writer.addDocument(doc.toDocument());
final IndexReader reader = writer.getReader();
assertEquals(doc, reader.getTermVectors(0));
reader.close();
writer.close();
dir.close();
}
}
public void testLotsOfFields() throws IOException {
final RandomDocumentFactory docFactory = new RandomDocumentFactory(5000, 10);
for (Options options : validOptions()) {
final Directory dir = newDirectory();
final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 500, 1000), 5, options);
writer.addDocument(doc.toDocument());
final IndexReader reader = writer.getReader();
assertEquals(doc, reader.getTermVectors(0));
reader.close();
writer.close();
dir.close();
}
}
// different options for the same field
public void testMixedOptions() throws IOException {
final int numFields = _TestUtil.nextInt(random(), 1, 3);
final RandomDocumentFactory docFactory = new RandomDocumentFactory(numFields, 10);
for (Options options1 : validOptions()) {
for (Options options2 : validOptions()) {
if (options1 == options2) {
continue;
}
final Directory dir = newDirectory();
final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
final RandomDocument doc1 = docFactory.newDocument(numFields, 20, options1);
final RandomDocument doc2 = docFactory.newDocument(numFields, 20, options2);
writer.addDocument(addId(doc1.toDocument(), "1"));
writer.addDocument(addId(doc2.toDocument(), "2"));
final IndexReader reader = writer.getReader();
final int doc1ID = docID(reader, "1");
assertEquals(doc1, reader.getTermVectors(doc1ID));
final int doc2ID = docID(reader, "2");
assertEquals(doc2, reader.getTermVectors(doc2ID));
reader.close();
writer.close();
dir.close();
}
}
}
public void testRandom() throws IOException {
final RandomDocumentFactory docFactory = new RandomDocumentFactory(5, 20);
final int numDocs = _TestUtil.nextInt(random(), 100, 1000);
final RandomDocument[] docs = new RandomDocument[numDocs];
for (int i = 0; i < numDocs; ++i) {
docs[i] = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 3), _TestUtil.nextInt(random(), 10, 50), randomOptions());
}
final Directory dir = newDirectory();
final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
for (int i = 0; i < numDocs; ++i) {
writer.addDocument(addId(docs[i].toDocument(), ""+i));
}
final IndexReader reader = writer.getReader();
for (int i = 0; i < numDocs; ++i) {
final int docID = docID(reader, ""+i);
assertEquals(docs[i], reader.getTermVectors(docID));
}
reader.close();
writer.close();
dir.close();
}
}