mirror of https://github.com/apache/lucene.git
LUCENE-1016: TermVectorAccessor, transparent vector space access via stored term vectors or by resolving them from the inverted index.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@688745 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
85102bd84a
commit
82c70c018e
|
@ -32,6 +32,9 @@ New features
|
|||
might not be compatible with these updated classes as some algorithms have changed.
|
||||
(Karl Wettin)
|
||||
|
||||
3. LUCENE-1016: TermVectorAccessor, transparent vector space access via stored vectors
|
||||
or by resolving the inverted index. (Karl Wettin)
|
||||
|
||||
Documentation
|
||||
|
||||
(None)
|
||||
|
|
|
@ -0,0 +1,168 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
/*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Transparent access to the vector space model,
|
||||
* either via TermFreqVector or by resolving it from the inverted index.
|
||||
* <p/>
|
||||
* Resolving a term vector from a large index can be a time consuming process.
|
||||
* <p/>
|
||||
* Warning! This class is not thread safe!
|
||||
*/
|
||||
public class TermVectorAccessor {
|
||||
|
||||
public TermVectorAccessor() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Instance reused to save garbage collector some time
|
||||
*/
|
||||
private TermVectorMapperDecorator decoratedMapper = new TermVectorMapperDecorator();
|
||||
|
||||
|
||||
/**
|
||||
* Visits the TermVectorMapper and populates it with terms available for a given document,
|
||||
* either via a vector created at index time or by resolving them from the inverted index.
|
||||
*
|
||||
* @param indexReader Index source
|
||||
* @param documentNumber Source document to access
|
||||
* @param fieldName Field to resolve
|
||||
* @param mapper Mapper to be mapped with data
|
||||
* @throws IOException
|
||||
*/
|
||||
public void accept(IndexReader indexReader, int documentNumber, String fieldName, TermVectorMapper mapper) throws IOException {
|
||||
|
||||
fieldName = fieldName.intern();
|
||||
|
||||
decoratedMapper.decorated = mapper;
|
||||
decoratedMapper.termVectorStored = false;
|
||||
|
||||
indexReader.getTermFreqVector(documentNumber, fieldName, decoratedMapper);
|
||||
|
||||
if (!decoratedMapper.termVectorStored) {
|
||||
mapper.setDocumentNumber(documentNumber);
|
||||
build(indexReader, fieldName, mapper, documentNumber);
|
||||
}
|
||||
}
|
||||
|
||||
/** Instance reused to save garbage collector some time */
|
||||
private List/*<String>*/ tokens;
|
||||
|
||||
/** Instance reused to save garbage collector some time */
|
||||
private List/*<int[]>*/ positions;
|
||||
|
||||
/** Instance reused to save garbage collector some time */
|
||||
private List/*<Integer>*/ frequencies;
|
||||
|
||||
|
||||
/**
|
||||
* Populates the mapper with terms available for the given field in a document
|
||||
* by resolving the inverted index.
|
||||
*
|
||||
* @param indexReader
|
||||
* @param field interned field name
|
||||
* @param mapper
|
||||
* @param documentNumber
|
||||
* @throws IOException
|
||||
*/
|
||||
private void build(IndexReader indexReader, String field, TermVectorMapper mapper, int documentNumber) throws IOException {
|
||||
|
||||
if (tokens == null) {
|
||||
tokens = new ArrayList/*<String>*/(500);
|
||||
positions = new ArrayList/*<int[]>*/(500);
|
||||
frequencies = new ArrayList/*<Integer>*/(500);
|
||||
} else {
|
||||
tokens.clear();
|
||||
frequencies.clear();
|
||||
positions.clear();
|
||||
}
|
||||
|
||||
TermEnum termEnum = indexReader.terms();
|
||||
if (termEnum.skipTo(new Term(field, ""))) {
|
||||
|
||||
while (termEnum.term().field() == field) {
|
||||
TermPositions termPositions = indexReader.termPositions(termEnum.term());
|
||||
if (termPositions.skipTo(documentNumber)) {
|
||||
|
||||
frequencies.add(new Integer(termPositions.freq()));
|
||||
tokens.add(termEnum.term().text());
|
||||
|
||||
|
||||
if (!mapper.isIgnoringPositions()) {
|
||||
int[] positions = new int[termPositions.freq()];
|
||||
for (int i = 0; i < positions.length; i++) {
|
||||
positions[i] = termPositions.nextPosition();
|
||||
}
|
||||
this.positions.add(positions);
|
||||
} else {
|
||||
positions.add(null);
|
||||
}
|
||||
}
|
||||
termPositions.close();
|
||||
if (!termEnum.next()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
mapper.setDocumentNumber(documentNumber);
|
||||
mapper.setExpectations(field, tokens.size(), false, !mapper.isIgnoringPositions());
|
||||
for (int i = 0; i < tokens.size(); i++) {
|
||||
mapper.map((String) tokens.get(i), ((Integer) frequencies.get(i)).intValue(), (TermVectorOffsetInfo[]) null, (int[]) positions.get(i));
|
||||
}
|
||||
|
||||
}
|
||||
termEnum.close();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static class TermVectorMapperDecorator extends TermVectorMapper {
|
||||
|
||||
private TermVectorMapper decorated;
|
||||
|
||||
public boolean isIgnoringPositions() {
|
||||
return decorated.isIgnoringPositions();
|
||||
}
|
||||
|
||||
public boolean isIgnoringOffsets() {
|
||||
return decorated.isIgnoringOffsets();
|
||||
}
|
||||
|
||||
private boolean termVectorStored = false;
|
||||
|
||||
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
|
||||
decorated.setExpectations(field, numTerms, storeOffsets, storePositions);
|
||||
termVectorStored = true;
|
||||
}
|
||||
|
||||
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
|
||||
decorated.map(term, frequency, offsets, positions);
|
||||
}
|
||||
|
||||
public void setDocumentNumber(int documentNumber) {
|
||||
decorated.setDocumentNumber(documentNumber);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,111 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
|
||||
import java.util.Collections;
|
||||
/*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
public class TestTermVectorAccessor extends TestCase {
|
||||
|
||||
public void test() throws Exception {
|
||||
|
||||
Directory dir = new RAMDirectory();
|
||||
IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(Collections.EMPTY_SET), true);
|
||||
|
||||
Document doc;
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
||||
doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
||||
doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
||||
iw.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
|
||||
doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
|
||||
doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
|
||||
iw.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
|
||||
doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
|
||||
doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
|
||||
iw.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
|
||||
doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
|
||||
doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
|
||||
iw.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
||||
doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
|
||||
doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
|
||||
iw.addDocument(doc);
|
||||
|
||||
iw.close();
|
||||
|
||||
IndexReader ir = IndexReader.open(dir);
|
||||
|
||||
TermVectorAccessor accessor = new TermVectorAccessor();
|
||||
|
||||
ParallelArrayTermVectorMapper mapper;
|
||||
TermFreqVector tfv;
|
||||
|
||||
for (int i = 0; i < ir.maxDoc(); i++) {
|
||||
|
||||
mapper = new ParallelArrayTermVectorMapper();
|
||||
accessor.accept(ir, i, "a", mapper);
|
||||
tfv = mapper.materializeVector();
|
||||
assertEquals("doc " + i, "a", tfv.getTerms()[0]);
|
||||
assertEquals("doc " + i, 8, tfv.getTermFrequencies()[0]);
|
||||
|
||||
mapper = new ParallelArrayTermVectorMapper();
|
||||
accessor.accept(ir, i, "b", mapper);
|
||||
tfv = mapper.materializeVector();
|
||||
assertEquals("doc " + i, 8, tfv.getTermFrequencies().length);
|
||||
assertEquals("doc " + i, "b", tfv.getTerms()[1]);
|
||||
assertEquals("doc " + i, 7, tfv.getTermFrequencies()[1]);
|
||||
|
||||
mapper = new ParallelArrayTermVectorMapper();
|
||||
accessor.accept(ir, i, "c", mapper);
|
||||
tfv = mapper.materializeVector();
|
||||
assertEquals("doc " + i, 8, tfv.getTermFrequencies().length);
|
||||
assertEquals("doc " + i, "c", tfv.getTerms()[2]);
|
||||
assertEquals("doc " + i, 7, tfv.getTermFrequencies()[2]);
|
||||
|
||||
mapper = new ParallelArrayTermVectorMapper();
|
||||
accessor.accept(ir, i, "q", mapper);
|
||||
tfv = mapper.materializeVector();
|
||||
assertNull("doc " + i, tfv);
|
||||
|
||||
}
|
||||
|
||||
ir.close();
|
||||
|
||||
dir.close();
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue