Mirror of https://github.com/apache/lucene.git
LUCENE-4733: Refactor term vectors formats tests around a BaseTermVectorsFormatTestCase.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1441367 13f79535-47bb-0310-9956-ffa450edef68
parent d5a3a2a7e4
commit a7c49eeb8c
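The refactor reduces each per-format term vectors test to a thin subclass that only supplies the Codec under test; all randomized indexing and verification now lives in BaseTermVectorsFormatTestCase. A minimal sketch of what a test for a hypothetical codec looks like under this scheme (MyCodec and the class name are illustrative, not part of this commit; the three real subclasses appear in the diff below):

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;

public class TestMyTermVectorsFormat extends BaseTermVectorsFormatTestCase {

  @Override
  protected Codec getCodec() {
    return new MyCodec(); // hypothetical codec wired to the term vectors format under test
  }

}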
@@ -0,0 +1,30 @@
package org.apache.lucene.codecs.simpletext;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;

public class TestSimpleTextTermVectorsFormat extends BaseTermVectorsFormatTestCase {

  @Override
  protected Codec getCodec() {
    return new SimpleTextCodec();
  }

}
@@ -0,0 +1,30 @@
package org.apache.lucene.codecs.compressing;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class TestCompressingTermVectorsFormat extends BaseTermVectorsFormatTestCase {

  @Override
  protected Codec getCodec() {
    return CompressingCodec.randomInstance(random());
  }

}
@@ -0,0 +1,30 @@
package org.apache.lucene.codecs.lucene40;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;

public class TestLucene40TermVectorsFormat extends BaseTermVectorsFormatTestCase {

  @Override
  protected Codec getCodec() {
    return new Lucene40Codec();
  }

}
@@ -17,38 +17,21 @@ package org.apache.lucene.index;
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

import com.carrotsearch.randomizedtesting.generators.RandomInts;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;

public class TestPayloadsOnVectors extends LuceneTestCase {

@@ -158,314 +141,5 @@ public class TestPayloadsOnVectors extends LuceneTestCase {
    writer.close();
    dir.close();
  }

  // custom impl to test cases that are forbidden by the default OffsetAttribute impl
  static class PermissiveOffsetAttributeImpl extends AttributeImpl implements OffsetAttribute {

    int start, end;

    @Override
    public int startOffset() {
      return start;
    }

    @Override
    public int endOffset() {
      return end;
    }

    @Override
    public void setOffset(int startOffset, int endOffset) {
      // no check!
      start = startOffset;
      end = endOffset;
    }

    @Override
    public void clear() {
      start = end = 0;
    }

    @Override
    public boolean equals(Object other) {
      if (other == this) {
        return true;
      }

      if (other instanceof PermissiveOffsetAttributeImpl) {
        PermissiveOffsetAttributeImpl o = (PermissiveOffsetAttributeImpl) other;
        return o.start == start && o.end == end;
      }

      return false;
    }

    @Override
    public int hashCode() {
      return start + 31 * end;
    }

    @Override
    public void copyTo(AttributeImpl target) {
      OffsetAttribute t = (OffsetAttribute) target;
      t.setOffset(start, end);
    }

  }

  static BytesRef randomPayload() {
    final int len = random().nextInt(5);
    if (len == 0) {
      return null;
    }
    final BytesRef payload = new BytesRef(len);
    random().nextBytes(payload.bytes);
    payload.length = len;
    return payload;
  }

  class RandomTokenStream extends TokenStream {

    final String[] terms;
    final int[] positionsIncrements;
    final int[] positions;
    final int[] startOffsets, endOffsets;
    final BytesRef[] payloads;

    final Map<Integer, Set<Integer>> positionToTerms;
    final Map<Integer, Set<Integer>> startOffsetToTerms;

    final CharTermAttribute termAtt;
    final PositionIncrementAttribute piAtt;
    final OffsetAttribute oAtt;
    final PayloadAttribute pAtt;
    int i = 0;

    RandomTokenStream(int len, String[] sampleTerms, boolean weird) {
      terms = new String[len];
      positionsIncrements = new int[len];
      positions = new int[len];
      startOffsets = new int[len];
      endOffsets = new int[len];
      payloads = new BytesRef[len];
      for (int i = 0; i < len; ++i) {
        terms[i] = RandomPicks.randomFrom(random(), sampleTerms);
        if (weird) {
          positionsIncrements[i] = _TestUtil.nextInt(random(), 1, 1 << 18);
          startOffsets[i] = random().nextInt();
          endOffsets[i] = random().nextInt();
        } else if (i == 0) {
          positionsIncrements[i] = _TestUtil.nextInt(random(), 1, 1 << 5);
          startOffsets[i] = _TestUtil.nextInt(random(), 0, 1 << 16);
          endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
        } else {
          positionsIncrements[i] = _TestUtil.nextInt(random(), 0, 1 << 5);
          startOffsets[i] = startOffsets[i-1] + _TestUtil.nextInt(random(), 0, 1 << 16);
          endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
        }
      }
      for (int i = 0; i < len; ++i) {
        if (i == 0) {
          positions[i] = positionsIncrements[i] - 1;
        } else {
          positions[i] = positions[i - 1] + positionsIncrements[i];
        }
      }
      if (rarely()) {
        Arrays.fill(payloads, randomPayload());
      } else {
        for (int i = 0; i < len; ++i) {
          payloads[i] = randomPayload();
        }
      }

      positionToTerms = new HashMap<Integer, Set<Integer>>();
      startOffsetToTerms = new HashMap<Integer, Set<Integer>>();
      for (int i = 0; i < len; ++i) {
        if (!positionToTerms.containsKey(positions[i])) {
          positionToTerms.put(positions[i], new HashSet<Integer>(1));
        }
        positionToTerms.get(positions[i]).add(i);
        if (!startOffsetToTerms.containsKey(startOffsets[i])) {
          startOffsetToTerms.put(startOffsets[i], new HashSet<Integer>(1));
        }
        startOffsetToTerms.get(startOffsets[i]).add(i);
      }

      addAttributeImpl(new PermissiveOffsetAttributeImpl());

      termAtt = addAttribute(CharTermAttribute.class);
      piAtt = addAttribute(PositionIncrementAttribute.class);
      oAtt = addAttribute(OffsetAttribute.class);
      pAtt = addAttribute(PayloadAttribute.class);
    }

    @Override
    public final boolean incrementToken() throws IOException {
      if (i < terms.length) {
        termAtt.setLength(0).append(terms[i]);
        piAtt.setPositionIncrement(positionsIncrements[i]);
        oAtt.setOffset(startOffsets[i], endOffsets[i]);
        pAtt.setPayload(payloads[i]);
        ++i;
        return true;
      } else {
        return false;
      }
    }

  }

  static FieldType randomFieldType() {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(random().nextBoolean());
    ft.setStoreTermVectorOffsets(random().nextBoolean());
    if (random().nextBoolean()) {
      ft.setStoreTermVectorPositions(true);
      ft.setStoreTermVectorPayloads(true);
    }
    ft.freeze();
    return ft;
  }

  public void testRandomVectors() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwConf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    iwConf.setMaxBufferedDocs(RandomInts.randomIntBetween(random(), 2, 30));
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwConf);
    String[] sampleTerms = new String[RandomInts.randomIntBetween(random(), 20, 50)];
    for (int i = 0; i < sampleTerms.length; ++i) {
      sampleTerms[i] = _TestUtil.randomUnicodeString(random());
    }
    FieldType ft = randomFieldType();
    // generate random documents and index them
    final String[] fieldNames = new String[_TestUtil.nextInt(random(), 1, 200)];
    for (int i = 0; i < fieldNames.length; ++i) {
      String fieldName;
      do {
        fieldName = _TestUtil.randomSimpleString(random());
      } while ("id".equals(fieldName));
      fieldNames[i] = fieldName;
    }
    final int numDocs = _TestUtil.nextInt(random(), 10, 100);
    @SuppressWarnings("unchecked")
    final Map<String, RandomTokenStream>[] fieldValues = new Map[numDocs];
    for (int i = 0; i < numDocs; ++i) {
      fieldValues[i] = new HashMap<String, RandomTokenStream>();
      final int numFields = _TestUtil.nextInt(random(), 0, rarely() ? fieldNames.length : 5);
      for (int j = 0; j < numFields; ++j) {
        final String fieldName = fieldNames[(i+j*31) % fieldNames.length];
        final int tokenStreamLen = _TestUtil.nextInt(random(), 1, rarely() ? 300 : 5);
        fieldValues[i].put(fieldName, new RandomTokenStream(tokenStreamLen, sampleTerms, rarely()));
      }
    }

    // index them
    for (int i = 0; i < numDocs; ++i) {
      Document doc = new Document();
      doc.add(new IntField("id", i, Store.YES));
      for (Map.Entry<String, RandomTokenStream> entry : fieldValues[i].entrySet()) {
        doc.add(new Field(entry.getKey(), entry.getValue(), ft));
      }
      iw.addDocument(doc);
    }

    iw.commit();
    // make sure the format can merge
    iw.forceMerge(2);

    // read term vectors
    final DirectoryReader reader = DirectoryReader.open(dir);
    for (int i = 0; i < 100; ++i) {
      final int docID = random().nextInt(numDocs);
      final Map<String, RandomTokenStream> fvs = fieldValues[reader.document(docID).getField("id").numericValue().intValue()];
      final Fields fields = reader.getTermVectors(docID);
      if (fvs.isEmpty()) {
        assertNull(fields);
      } else {
        Set<String> fns = new HashSet<String>();
        for (String field : fields) {
          fns.add(field);
        }
        assertEquals(fields.size(), fns.size());
        assertEquals(fvs.keySet(), fns);
        for (String field : fields) {
          final RandomTokenStream tk = fvs.get(field);
          assert tk != null;
          final Terms terms = fields.terms(field);
          assertEquals(ft.storeTermVectorPositions(), terms.hasPositions());
          assertEquals(ft.storeTermVectorOffsets(), terms.hasOffsets());
          assertEquals(1, terms.getDocCount());
          final TermsEnum termsEnum = terms.iterator(null);
          while (termsEnum.next() != null) {
            assertEquals(1, termsEnum.docFreq());
            final DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
            final DocsEnum docsEnum = docsAndPositionsEnum == null ? termsEnum.docs(null, null) : docsAndPositionsEnum;
            if (ft.storeTermVectorOffsets() || ft.storeTermVectorPositions()) {
              assertNotNull(docsAndPositionsEnum);
            }
            assertEquals(0, docsEnum.nextDoc());
            if (terms.hasPositions() || terms.hasOffsets()) {
              final int freq = docsEnum.freq();
              assertTrue(freq >= 1);
              if (docsAndPositionsEnum != null) {
                for (int k = 0; k < freq; ++k) {
                  final int position = docsAndPositionsEnum.nextPosition();
                  final Set<Integer> indexes;
                  if (terms.hasPositions()) {
                    indexes = tk.positionToTerms.get(position);
                    assertNotNull(tk.positionToTerms.keySet().toString() + " does not contain " + position, indexes);
                  } else {
                    indexes = tk.startOffsetToTerms.get(docsAndPositionsEnum.startOffset());
                    assertNotNull(indexes);
                  }
                  if (terms.hasPositions()) {
                    boolean foundPosition = false;
                    for (int index : indexes) {
                      if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && tk.positions[index] == position) {
                        foundPosition = true;
                        break;
                      }
                    }
                    assertTrue(foundPosition);
                  }
                  if (terms.hasOffsets()) {
                    boolean foundOffset = false;
                    for (int index : indexes) {
                      if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && tk.startOffsets[index] == docsAndPositionsEnum.startOffset() && tk.endOffsets[index] == docsAndPositionsEnum.endOffset()) {
                        foundOffset = true;
                        break;
                      }
                    }
                    assertTrue(foundOffset);
                  }
                  if (terms.hasPayloads()) {
                    boolean foundPayload = false;
                    for (int index : indexes) {
                      if (new BytesRef(tk.terms[index]).equals(termsEnum.term()) && equals(tk.payloads[index], docsAndPositionsEnum.getPayload())) {
                        foundPayload = true;
                        break;
                      }
                    }
                    assertTrue(foundPayload);
                  }
                }
              }
            }
            assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());
          }
        }
      }
    }
    IOUtils.close(reader, iw, dir);
  }

  private static boolean equals(Object o1, Object o2) {
    if (o1 == null) {
      return o2 == null;
    } else {
      return o1.equals(o2);
    }
  }
}
@@ -18,9 +18,6 @@ package org.apache.lucene.search;
 */

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
@@ -28,19 +25,24 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;

public class TestTermVectors extends LuceneTestCase {
  private static IndexSearcher searcher;
  private static IndexReader reader;
  private static Directory directory;

@@ -75,7 +77,6 @@ public class TestTermVectors extends LuceneTestCase {
    }
    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
  }

  @AfterClass
@@ -84,300 +85,8 @@ public class TestTermVectors extends LuceneTestCase {
    directory.close();
    reader = null;
    directory = null;
    searcher = null;
  }

  public void test() {
    assertTrue(searcher != null);
  }

  public void testTermVectors() throws IOException {
    Query query = new TermQuery(new Term("field", "seventy"));
    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(100, hits.length);

    for (int i = 0; i < hits.length; i++) {
      Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
      assertNotNull(vectors);
      assertEquals("doc=" + hits[i].doc + " tv=" + vectors, 1, vectors.size());
    }
    Terms vector;
    vector = searcher.reader.getTermVectors(hits[0].doc).terms("noTV");
    assertNull(vector);
  }

  public void testTermVectorsFieldOrder() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    doc.add(newField("c", "some content here", ft));
    doc.add(newField("a", "some content here", ft));
    doc.add(newField("b", "some content here", ft));
    doc.add(newField("x", "some content here", ft));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    writer.close();
    Fields v = reader.getTermVectors(0);
    assertEquals(4, v.size());
    String[] expectedFields = new String[]{"a", "b", "c", "x"};
    int[] expectedPositions = new int[]{1, 2, 0};
    Iterator<String> fieldsEnum = v.iterator();
    for(int i=0;i<expectedFields.length;i++) {
      assertEquals(expectedFields[i], fieldsEnum.next());
      assertEquals(3, v.terms(expectedFields[i]).size());

      DocsAndPositionsEnum dpEnum = null;
      Terms terms = v.terms(expectedFields[i]);
      assertNotNull(terms);
      TermsEnum termsEnum = terms.iterator(null);
      assertEquals("content", termsEnum.next().utf8ToString());
      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(1, dpEnum.freq());
      assertEquals(expectedPositions[0], dpEnum.nextPosition());

      assertEquals("here", termsEnum.next().utf8ToString());
      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(1, dpEnum.freq());
      assertEquals(expectedPositions[1], dpEnum.nextPosition());

      assertEquals("some", termsEnum.next().utf8ToString());
      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
      assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(1, dpEnum.freq());
      assertEquals(expectedPositions[2], dpEnum.nextPosition());

      assertNull(termsEnum.next());
    }
    reader.close();
    dir.close();
  }

  public void testTermPositionVectors() throws IOException {
    Query query = new TermQuery(new Term("field", "zero"));
    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);

    DocsAndPositionsEnum dpEnum = null;
    for (int i = 0; i < hits.length; i++) {
      Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
      assertNotNull(vectors);
      assertEquals(1, vectors.size());

      TermsEnum termsEnum = vectors.terms("field").iterator(null);
      assertNotNull(termsEnum.next());

      boolean shouldBePosVector = hits[i].doc % 2 == 0;
      boolean shouldBeOffVector = hits[i].doc % 3 == 0;

      if (shouldBePosVector || shouldBeOffVector) {
        while(true) {
          dpEnum = termsEnum.docsAndPositions(null, dpEnum);
          assertNotNull(dpEnum);
          assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

          dpEnum.nextPosition();

          if (shouldBeOffVector) {
            assertTrue(dpEnum.startOffset() != -1);
            assertTrue(dpEnum.endOffset() != -1);
          }

          if (termsEnum.next() == null) {
            break;
          }
        }
      } else {
        fail();
      }
    }
  }

  public void testTermOffsetVectors() throws IOException {
    Query query = new TermQuery(new Term("field", "fifty"));
    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(100, hits.length);

    for (int i = 0; i < hits.length; i++) {
      Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
      assertNotNull(vectors);
      assertEquals(1, vectors.size());
    }
  }

  public void testKnownSetOfDocuments() throws IOException {
    String test1 = "eating chocolate in a computer lab"; //6 terms
    String test2 = "computer in a computer lab"; //5 terms
    String test3 = "a chocolate lab grows old"; //5 terms
    String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
    Map<String,Integer> test4Map = new HashMap<String,Integer>();
    test4Map.put("chocolate", Integer.valueOf(3));
    test4Map.put("lab", Integer.valueOf(2));
    test4Map.put("eating", Integer.valueOf(1));
    test4Map.put("computer", Integer.valueOf(1));
    test4Map.put("with", Integer.valueOf(1));
    test4Map.put("a", Integer.valueOf(1));
    test4Map.put("colored", Integer.valueOf(1));
    test4Map.put("in", Integer.valueOf(1));
    test4Map.put("an", Integer.valueOf(1));
    test4Map.put("computer", Integer.valueOf(1));
    test4Map.put("old", Integer.valueOf(1));

    Document testDoc1 = new Document();
    setupDoc(testDoc1, test1);
    Document testDoc2 = new Document();
    setupDoc(testDoc2, test2);
    Document testDoc3 = new Document();
    setupDoc(testDoc3, test3);
    Document testDoc4 = new Document();
    setupDoc(testDoc4, test4);

    Directory dir = newDirectory();

    RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
            .setOpenMode(OpenMode.CREATE)
            .setMergePolicy(newLogMergePolicy())
            .setSimilarity(new DefaultSimilarity()));
    writer.addDocument(testDoc1);
    writer.addDocument(testDoc2);
    writer.addDocument(testDoc3);
    writer.addDocument(testDoc4);
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher knownSearcher = newSearcher(reader);
    knownSearcher.setSimilarity(new DefaultSimilarity());
    Fields fields = MultiFields.getFields(knownSearcher.reader);

    DocsEnum docs = null;
    for (String fieldName : fields) {
      Terms terms = fields.terms(fieldName);
      assertNotNull(terms); // NOTE: kinda sketchy assumptions, but ideally we would fix fieldsenum api...
      TermsEnum termsEnum = terms.iterator(null);

      while (termsEnum.next() != null) {
        String text = termsEnum.term().utf8ToString();
        docs = _TestUtil.docs(random(), termsEnum, MultiFields.getLiveDocs(knownSearcher.reader), docs, DocsEnum.FLAG_FREQS);

        while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int docId = docs.docID();
          int freq = docs.freq();
          //System.out.println("Doc Id: " + docId + " freq " + freq);
          Terms vector = knownSearcher.reader.getTermVectors(docId).terms("field");
          //float tf = sim.tf(freq);
          //float idf = sim.idf(knownSearcher.docFreq(term), knownSearcher.maxDoc());
          //float qNorm = sim.queryNorm()
          //This is fine since we don't have stop words
          //float lNorm = sim.lengthNorm("field", vector.getTerms().length);
          //float coord = sim.coord()
          //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
          assertNotNull(vector);
          TermsEnum termsEnum2 = vector.iterator(null);

          while(termsEnum2.next() != null) {
            if (text.equals(termsEnum2.term().utf8ToString())) {
              assertEquals(freq, termsEnum2.totalTermFreq());
            }
          }
        }
      }
      //System.out.println("--------");
    }
    Query query = new TermQuery(new Term("field", "chocolate"));
    ScoreDoc[] hits = knownSearcher.search(query, null, 1000).scoreDocs;
    //doc 3 should be the first hit b/c it is the shortest match
    assertTrue(hits.length == 3);
    /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
      System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
      System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
      System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
      System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
      System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
    assertTrue(hits[0].doc == 2);
    assertTrue(hits[1].doc == 3);
    assertTrue(hits[2].doc == 0);
    Terms vector = knownSearcher.reader.getTermVectors(hits[1].doc).terms("field");
    assertNotNull(vector);
    //System.out.println("Vector: " + vector);
    assertEquals(10, vector.size());
    TermsEnum termsEnum = vector.iterator(null);
    while(termsEnum.next() != null) {
      String term = termsEnum.term().utf8ToString();
      //System.out.println("Term: " + term);
      int freq = (int) termsEnum.totalTermFreq();
      assertTrue(test4.indexOf(term) != -1);
      Integer freqInt = test4Map.get(term);
      assertTrue(freqInt != null);
      assertEquals(freqInt.intValue(), freq);
    }
    reader.close();
    dir.close();
  }

  private void setupDoc(Document doc, String text)
  {
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    FieldType ft2 = new FieldType(TextField.TYPE_STORED);
    ft2.setStoreTermVectors(true);
    doc.add(newField("field2", text, ft));
    doc.add(newField("field", text, ft2));
    //System.out.println("Document: " + doc);
  }

  // Test only a few docs having vectors
  public void testRareVectors() throws IOException {
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
            .setOpenMode(OpenMode.CREATE));
    if (VERBOSE) {
      System.out.println("TEST: now add non-vectors");
    }
    for (int i = 0; i < 100; i++) {
      Document doc = new Document();
      doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES));
      writer.addDocument(doc);
    }
    if (VERBOSE) {
      System.out.println("TEST: now add vectors");
    }
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    for(int i=0;i<10;i++) {
      Document doc = new Document();
      doc.add(newField("field", English.intToEnglish(100+i), ft));
      writer.addDocument(doc);
    }

    if (VERBOSE) {
      System.out.println("TEST: now getReader");
    }
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(reader);

    Query query = new TermQuery(new Term("field", "hundred"));
    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals(10, hits.length);
    for (int i = 0; i < hits.length; i++) {

      Fields vectors = searcher.reader.getTermVectors(hits[i].doc);
      assertNotNull(vectors);
      assertEquals(1, vectors.size());
    }
    reader.close();
  }

  // In a single doc, for the same field, mix the term
  // vectors up
  public void testMixedVectrosVectors() throws IOException {
@@ -0,0 +1,632 @@
package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

import com.carrotsearch.randomizedtesting.generators.RandomPicks;

/**
 * Base class aiming at testing {@link TermVectorsFormat term vectors formats}.
 * To test a new format, all you need is to register a new {@link Codec} which
 * uses it and extend this class and override {@link #getCodec()}.
 * @lucene.experimental
 */
public abstract class BaseTermVectorsFormatTestCase extends LuceneTestCase {

  private Codec savedCodec;

  /**
   * Returns the Codec to run tests against
   */
  protected abstract Codec getCodec();

  public void setUp() throws Exception {
    super.setUp();
    // set the default codec, so adding test cases to this isn't fragile
    savedCodec = Codec.getDefault();
    Codec.setDefault(getCodec());
  }

  public void tearDown() throws Exception {
    Codec.setDefault(savedCodec); // restore
    super.tearDown();
  }

  /**
   * A combination of term vectors options.
   */
  protected enum Options {
    NONE(false, false, false),
    POSITIONS(true, false, false),
    OFFSETS(false, true, false),
    POSITIONS_AND_OFFSETS(true, true, false),
    POSITIONS_AND_PAYLOADS(true, false, true),
    POSITIONS_AND_OFFSETS_AND_PAYLOADS(true, true, true);
    final boolean positions, offsets, payloads;
    private Options(boolean positions, boolean offsets, boolean payloads) {
      this.positions = positions;
      this.offsets = offsets;
      this.payloads = payloads;
    }
  }

  protected Set<Options> validOptions() {
    return EnumSet.allOf(Options.class);
  }

  protected Options randomOptions() {
    return RandomPicks.randomFrom(random(), new ArrayList<Options>(validOptions()));
  }

  protected FieldType fieldType(Options options) {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(options.positions);
    ft.setStoreTermVectorOffsets(options.offsets);
    ft.setStoreTermVectorPayloads(options.payloads);
    ft.freeze();
    return ft;
  }

  protected BytesRef randomPayload() {
    final int len = random().nextInt(5);
    if (len == 0) {
      return null;
    }
    final BytesRef payload = new BytesRef(len);
    random().nextBytes(payload.bytes);
    payload.length = len;
    return payload;
  }

  // custom impl to test cases that are forbidden by the default OffsetAttribute impl
  private static class PermissiveOffsetAttributeImpl extends AttributeImpl implements OffsetAttribute {

    int start, end;

    @Override
    public int startOffset() {
      return start;
    }

    @Override
    public int endOffset() {
      return end;
    }

    @Override
    public void setOffset(int startOffset, int endOffset) {
      // no check!
      start = startOffset;
      end = endOffset;
    }

    @Override
    public void clear() {
      start = end = 0;
    }

    @Override
    public boolean equals(Object other) {
      if (other == this) {
        return true;
      }

      if (other instanceof PermissiveOffsetAttributeImpl) {
        PermissiveOffsetAttributeImpl o = (PermissiveOffsetAttributeImpl) other;
        return o.start == start && o.end == end;
      }

      return false;
    }

    @Override
    public int hashCode() {
      return start + 31 * end;
    }

    @Override
    public void copyTo(AttributeImpl target) {
      OffsetAttribute t = (OffsetAttribute) target;
      t.setOffset(start, end);
    }

  }

  // TODO: use CannedTokenStream?
  protected class RandomTokenStream extends TokenStream {

    final String[] terms;
    final BytesRef[] termBytes;
    final int[] positionsIncrements;
    final int[] positions;
    final int[] startOffsets, endOffsets;
    final BytesRef[] payloads;

    final Map<String, Integer> freqs;
    final Map<Integer, Set<Integer>> positionToTerms;
    final Map<Integer, Set<Integer>> startOffsetToTerms;

    final CharTermAttribute termAtt;
    final PositionIncrementAttribute piAtt;
    final OffsetAttribute oAtt;
    final PayloadAttribute pAtt;
    int i = 0;

    protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
      this(len, sampleTerms, sampleTermBytes, rarely());
    }

    protected RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) {
      terms = new String[len];
      termBytes = new BytesRef[len];
      positionsIncrements = new int[len];
      positions = new int[len];
      startOffsets = new int[len];
      endOffsets = new int[len];
      payloads = new BytesRef[len];
      for (int i = 0; i < len; ++i) {
        final int o = random().nextInt(sampleTerms.length);
        terms[i] = sampleTerms[o];
        termBytes[i] = sampleTermBytes[o];
        positionsIncrements[i] = _TestUtil.nextInt(random(), i == 0 ? 1 : 0, 10);
        if (offsetsGoBackwards) {
          startOffsets[i] = random().nextInt();
          endOffsets[i] = random().nextInt();
        } else {
          if (i == 0) {
            startOffsets[i] = _TestUtil.nextInt(random(), 0, 1 << 16);
          } else {
            startOffsets[i] = startOffsets[i-1] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20);
          }
          endOffsets[i] = startOffsets[i] + _TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
        }
      }

      for (int i = 0; i < len; ++i) {
        if (i == 0) {
          positions[i] = positionsIncrements[i] - 1;
        } else {
          positions[i] = positions[i - 1] + positionsIncrements[i];
        }
      }
      if (rarely()) {
        Arrays.fill(payloads, randomPayload());
      } else {
        for (int i = 0; i < len; ++i) {
          payloads[i] = randomPayload();
        }
      }

      positionToTerms = new HashMap<Integer, Set<Integer>>(len);
      startOffsetToTerms = new HashMap<Integer, Set<Integer>>(len);
      for (int i = 0; i < len; ++i) {
        if (!positionToTerms.containsKey(positions[i])) {
          positionToTerms.put(positions[i], new HashSet<Integer>(1));
        }
        positionToTerms.get(positions[i]).add(i);
        if (!startOffsetToTerms.containsKey(startOffsets[i])) {
          startOffsetToTerms.put(startOffsets[i], new HashSet<Integer>(1));
        }
        startOffsetToTerms.get(startOffsets[i]).add(i);
      }

      freqs = new HashMap<String, Integer>();
      for (String term : terms) {
        if (freqs.containsKey(term)) {
          freqs.put(term, freqs.get(term) + 1);
        } else {
          freqs.put(term, 1);
        }
      }

      addAttributeImpl(new PermissiveOffsetAttributeImpl());

      termAtt = addAttribute(CharTermAttribute.class);
      piAtt = addAttribute(PositionIncrementAttribute.class);
      oAtt = addAttribute(OffsetAttribute.class);
      pAtt = addAttribute(PayloadAttribute.class);
    }

    public boolean hasPayloads() {
      for (BytesRef payload : payloads) {
        if (payload != null && payload.length > 0) {
          return true;
        }
      }
      return false;
    }

    @Override
    public final boolean incrementToken() throws IOException {
      if (i < terms.length) {
        termAtt.setLength(0).append(terms[i]);
        piAtt.setPositionIncrement(positionsIncrements[i]);
        oAtt.setOffset(startOffsets[i], endOffsets[i]);
        pAtt.setPayload(payloads[i]);
        ++i;
        return true;
      } else {
        return false;
      }
    }

  }

  protected class RandomDocument {

    private final String[] fieldNames;
    private final FieldType[] fieldTypes;
    private final RandomTokenStream[] tokenStreams;

    protected RandomDocument(int fieldCount, int maxTermCount, Options options, String[] fieldNames, String[] sampleTerms, BytesRef[] sampleTermBytes) {
      if (fieldCount > fieldNames.length) {
        throw new IllegalArgumentException();
      }
      this.fieldNames = new String[fieldCount];
      fieldTypes = new FieldType[fieldCount];
      tokenStreams = new RandomTokenStream[fieldCount];
      Arrays.fill(fieldTypes, fieldType(options));
      final Set<String> usedFileNames = new HashSet<String>();
      for (int i = 0; i < fieldCount; ++i) {
        do {
          this.fieldNames[i] = RandomPicks.randomFrom(random(), fieldNames);
        } while (usedFileNames.contains(this.fieldNames[i]));
        usedFileNames.add(this.fieldNames[i]);
        tokenStreams[i] = new RandomTokenStream(_TestUtil.nextInt(random(), 1, maxTermCount), sampleTerms, sampleTermBytes);
      }
    }

    public Document toDocument() {
      final Document doc = new Document();
      for (int i = 0; i < fieldNames.length; ++i) {
        doc.add(new Field(fieldNames[i], tokenStreams[i], fieldTypes[i]));
      }
      return doc;
    }

  }

  protected class RandomDocumentFactory {

    private final String[] fieldNames;
    private final String[] terms;
    private final BytesRef[] termBytes;

    protected RandomDocumentFactory(int distinctFieldNames, int disctinctTerms) {
      final Set<String> fieldNames = new HashSet<String>();
      while (fieldNames.size() < distinctFieldNames) {
        fieldNames.add(_TestUtil.randomSimpleString(random()));
        fieldNames.remove("id");
      }
      this.fieldNames = fieldNames.toArray(new String[0]);
      terms = new String[disctinctTerms];
      termBytes = new BytesRef[disctinctTerms];
      for (int i = 0; i < disctinctTerms; ++i) {
        terms[i] = _TestUtil.randomRealisticUnicodeString(random());
        termBytes[i] = new BytesRef(terms[i]);
      }
    }

    public RandomDocument newDocument(int fieldCount, int maxTermCount, Options options) {
      return new RandomDocument(fieldCount, maxTermCount, options, fieldNames, terms, termBytes);
    }

  }

  protected void assertEquals(RandomDocument doc, Fields fields) throws IOException {
    // compare field names
    assertEquals(doc == null, fields == null);
    assertEquals(doc.fieldNames.length, fields.size());
    final Set<String> fields1 = new HashSet<String>();
    final Set<String> fields2 = new HashSet<String>();
    for (int i = 0; i < doc.fieldNames.length; ++i) {
      fields1.add(doc.fieldNames[i]);
    }
    for (String field : fields) {
      fields2.add(field);
    }
    assertEquals(fields1, fields2);

    for (int i = 0; i < doc.fieldNames.length; ++i) {
      assertEquals(doc.tokenStreams[i], doc.fieldTypes[i], fields.terms(doc.fieldNames[i]));
    }
  }

  protected static boolean equals(Object o1, Object o2) {
    if (o1 == null) {
      return o2 == null;
    } else {
      return o1.equals(o2);
    }
  }

  // to test reuse
  private TermsEnum termsEnum = null;
  private DocsEnum docsEnum = null;
  private DocsAndPositionsEnum docsAndPositionsEnum = null;

  protected void assertEquals(RandomTokenStream tk, FieldType ft, Terms terms) throws IOException {
    assertEquals(1, terms.getDocCount());
    final int termCount = new HashSet<String>(Arrays.asList(tk.terms)).size();
    assertEquals(termCount, terms.size());
    assertEquals(termCount, terms.getSumDocFreq());
    assertEquals(ft.storeTermVectorPositions(), terms.hasPositions());
    assertEquals(ft.storeTermVectorOffsets(), terms.hasOffsets());
    assertEquals(ft.storeTermVectorPayloads() && tk.hasPayloads(), terms.hasPayloads());
    final Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
    for (String term : tk.freqs.keySet()) {
      uniqueTerms.add(new BytesRef(term));
    }
    final BytesRef[] sortedTerms = uniqueTerms.toArray(new BytesRef[0]);
    Arrays.sort(sortedTerms, terms.getComparator());
    termsEnum = terms.iterator(random().nextBoolean() ? null : termsEnum);
    for (int i = 0; i < sortedTerms.length; ++i) {
      final BytesRef nextTerm = termsEnum.next();
      assertEquals(sortedTerms[i], nextTerm);
      assertEquals(sortedTerms[i], termsEnum.term());
      assertEquals(1, termsEnum.docFreq());

      final FixedBitSet bits = new FixedBitSet(1);
      docsEnum = termsEnum.docs(bits, random().nextBoolean() ? null : docsEnum);
      assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());
      bits.set(0);

      docsEnum = termsEnum.docs(random().nextBoolean() ? bits : null, random().nextBoolean() ? null : docsEnum);
      assertNotNull(docsEnum);
      assertEquals(0, docsEnum.nextDoc());
      assertEquals(0, docsEnum.docID());
      assertEquals(tk.freqs.get(termsEnum.term().utf8ToString()), (Integer) docsEnum.freq());
      assertEquals(DocsEnum.NO_MORE_DOCS, docsEnum.nextDoc());

      bits.clear(0);
      docsAndPositionsEnum = termsEnum.docsAndPositions(bits, random().nextBoolean() ? null : docsAndPositionsEnum);
      assertEquals(ft.storeTermVectorOffsets() || ft.storeTermVectorPositions(), docsAndPositionsEnum != null);
      if (docsAndPositionsEnum != null) {
        assertEquals(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
      }
      bits.set(0);

      docsAndPositionsEnum = termsEnum.docsAndPositions(random().nextBoolean() ? bits : null, random().nextBoolean() ? null : docsAndPositionsEnum);
      assertEquals(ft.storeTermVectorOffsets() || ft.storeTermVectorPositions(), docsAndPositionsEnum != null);
      if (terms.hasPositions() || terms.hasOffsets()) {
        assertEquals(0, docsAndPositionsEnum.nextDoc());
        final int freq = docsAndPositionsEnum.freq();
        assertEquals(tk.freqs.get(termsEnum.term().utf8ToString()), (Integer) freq);
        if (docsAndPositionsEnum != null) {
          for (int k = 0; k < freq; ++k) {
            final int position = docsAndPositionsEnum.nextPosition();
            final Set<Integer> indexes;
            if (terms.hasPositions()) {
              indexes = tk.positionToTerms.get(position);
              assertNotNull(indexes);
            } else {
              indexes = tk.startOffsetToTerms.get(docsAndPositionsEnum.startOffset());
              assertNotNull(indexes);
            }
            if (terms.hasPositions()) {
              boolean foundPosition = false;
              for (int index : indexes) {
                if (tk.termBytes[index].equals(termsEnum.term()) && tk.positions[index] == position) {
                  foundPosition = true;
                  break;
                }
              }
              assertTrue(foundPosition);
            }
            if (terms.hasOffsets()) {
              boolean foundOffset = false;
              for (int index : indexes) {
                if (tk.termBytes[index].equals(termsEnum.term()) && tk.startOffsets[index] == docsAndPositionsEnum.startOffset() && tk.endOffsets[index] == docsAndPositionsEnum.endOffset()) {
                  foundOffset = true;
                  break;
                }
              }
              assertTrue(foundOffset);
            }
            if (terms.hasPayloads()) {
              boolean foundPayload = false;
              for (int index : indexes) {
                if (tk.termBytes[index].equals(termsEnum.term()) && equals(tk.payloads[index], docsAndPositionsEnum.getPayload())) {
                  foundPayload = true;
                  break;
                }
              }
              assertTrue(foundPayload);
            }
          }
          try {
            docsAndPositionsEnum.nextPosition();
            fail();
          } catch (Exception e) {
            // ok
          } catch (AssertionError e) {
            // ok
          }
        }
        assertEquals(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.nextDoc());
      }
    }
    assertNull(termsEnum.next());
    for (int i = 0; i < 5; ++i) {
      if (random().nextBoolean()) {
        assertTrue(termsEnum.seekExact(RandomPicks.randomFrom(random(), tk.termBytes), random().nextBoolean()));
      } else {
        assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(RandomPicks.randomFrom(random(), tk.termBytes), random().nextBoolean()));
      }
    }
  }

  protected Document addId(Document doc, String id) {
    doc.add(new StringField("id", id, Store.NO));
    return doc;
  }

  protected int docID(IndexReader reader, String id) throws IOException {
    return new IndexSearcher(reader).search(new TermQuery(new Term("id", id)), 1).scoreDocs[0].doc;
  }

  // only one doc with vectors
  public void testRareVectors() throws IOException {
    final RandomDocumentFactory docFactory = new RandomDocumentFactory(10, 20);
    for (Options options : validOptions()) {
      final int numDocs = _TestUtil.nextInt(random(), 10, 10000);
      final int docWithVectors = random().nextInt(numDocs);
      final Document emptyDoc = new Document();
      final Directory dir = newDirectory();
      final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
      final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 3), 20, options);
      for (int i = 0; i < numDocs; ++i) {
        if (i == docWithVectors) {
          writer.addDocument(addId(doc.toDocument(), "42"));
        } else {
          writer.addDocument(emptyDoc);
        }
      }
      final IndexReader reader = writer.getReader();
      final int docWithVectorsID = docID(reader, "42");
      for (int i = 0; i < 10; ++i) {
        final int docID = random().nextInt(numDocs);
        final Fields fields = reader.getTermVectors(docID);
        if (docID == docWithVectorsID) {
          assertEquals(doc, fields);
        } else {
          assertNull(fields);
        }
      }
      final Fields fields = reader.getTermVectors(docWithVectorsID);
      assertEquals(doc, fields);
      reader.close();
      writer.close();
      dir.close();
    }
  }

  public void testHighFreqs() throws IOException {
    final RandomDocumentFactory docFactory = new RandomDocumentFactory(3, 5);
    for (Options options : validOptions()) {
      if (options == Options.NONE) {
        continue;
      }
      final Directory dir = newDirectory();
      final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
      final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 2), _TestUtil.nextInt(random(), 50000, 100000), options);
      writer.addDocument(doc.toDocument());
      final IndexReader reader = writer.getReader();
      assertEquals(doc, reader.getTermVectors(0));
      reader.close();
      writer.close();
      dir.close();
    }
  }

  public void testLotsOfFields() throws IOException {
    final RandomDocumentFactory docFactory = new RandomDocumentFactory(5000, 10);
    for (Options options : validOptions()) {
      final Directory dir = newDirectory();
      final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
      final RandomDocument doc = docFactory.newDocument(_TestUtil.nextInt(random(), 500, 1000), 5, options);
      writer.addDocument(doc.toDocument());
      final IndexReader reader = writer.getReader();
      assertEquals(doc, reader.getTermVectors(0));
      reader.close();
      writer.close();
      dir.close();
    }
  }

  // different options for the same field
  public void testMixedOptions() throws IOException {
    final int numFields = _TestUtil.nextInt(random(), 1, 3);
    final RandomDocumentFactory docFactory = new RandomDocumentFactory(numFields, 10);
    for (Options options1 : validOptions()) {
      for (Options options2 : validOptions()) {
        if (options1 == options2) {
          continue;
        }
        final Directory dir = newDirectory();
        final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
        final RandomDocument doc1 = docFactory.newDocument(numFields, 20, options1);
        final RandomDocument doc2 = docFactory.newDocument(numFields, 20, options2);
        writer.addDocument(addId(doc1.toDocument(), "1"));
        writer.addDocument(addId(doc2.toDocument(), "2"));
        final IndexReader reader = writer.getReader();
        final int doc1ID = docID(reader, "1");
        assertEquals(doc1, reader.getTermVectors(doc1ID));
        final int doc2ID = docID(reader, "2");
        assertEquals(doc2, reader.getTermVectors(doc2ID));
        reader.close();
        writer.close();
        dir.close();
      }
    }
  }

  public void testRandom() throws IOException {
    final RandomDocumentFactory docFactory = new RandomDocumentFactory(5, 20);
    final int numDocs = _TestUtil.nextInt(random(), 100, 1000);
    final RandomDocument[] docs = new RandomDocument[numDocs];
    for (int i = 0; i < numDocs; ++i) {
      docs[i] = docFactory.newDocument(_TestUtil.nextInt(random(), 1, 3), _TestUtil.nextInt(random(), 10, 50), randomOptions());
    }
    final Directory dir = newDirectory();
    final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    for (int i = 0; i < numDocs; ++i) {
      writer.addDocument(docs[i].toDocument());
    }
    final IndexReader reader = writer.getReader();
    for (int i = 0; i < numDocs; ++i) {
      assertEquals(docs[i], reader.getTermVectors(i));
    }
    reader.close();
    writer.close();
    dir.close();
  }

}
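Beyond getCodec(), the base class exposes validOptions() as an extension point: a subclass can narrow the set of term-vector option combinations that the randomized tests exercise, e.g. for a format that cannot store payloads. A minimal sketch, with hypothetical class and codec names that are not part of this commit:

import java.util.EnumSet;
import java.util.Set;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;

public class TestNoPayloadsTermVectorsFormat extends BaseTermVectorsFormatTestCase {

  @Override
  protected Codec getCodec() {
    return new NoPayloadsCodec(); // hypothetical codec using the format under test
  }

  @Override
  protected Set<Options> validOptions() {
    // only advertise the combinations this format actually supports
    return EnumSet.of(Options.NONE, Options.POSITIONS, Options.OFFSETS, Options.POSITIONS_AND_OFFSETS);
  }

}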