LUCENE-1016 : TermVectorAccessor, transparent vector space access via stored vectors or by resolving the inverted index.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@688745 13f79535-47bb-0310-9956-ffa450edef68
2008-08-25 15:02:20 +00:00 · 2008-08-25 15:02:20 +00:00 · 82c70c018e
parent 85102bd84a
commit 82c70c018e
3 changed files with 282 additions and 0 deletions
--- a/contrib/CHANGES.txt
+++ b/contrib/CHANGES.txt
@ -32,6 +32,9 @@ New features
    might not be compatible with these updated classes as some algorithms have changed.
    (Karl Wettin)

+ 3. LUCENE-1016: TermVectorAccessor, transparent vector space access via stored vectors
+    or by resolving the inverted index. (Karl Wettin) 
+
 Documentation

 (None)
--- a/contrib/miscellaneous/src/java/org/apache/lucene/index/TermVectorAccessor.java
+++ b/contrib/miscellaneous/src/java/org/apache/lucene/index/TermVectorAccessor.java
@ -0,0 +1,168 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Collection;
+import java.util.Iterator;
+/*
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+
+
+/**
+ * Transparent access to the vector space model,
+ * either via TermFreqVector or by resolving it from the inverted index.
+ * <p/>
+ * Resolving a term vector from a large index can be a time consuming process.
+ * <p/>
+ * Warning! This class is not thread safe!
+ */
+public class TermVectorAccessor {
+
+  public TermVectorAccessor() {
+  }
+
+  /**
+   * Instance reused to save garbage collector some time
+   */
+  private TermVectorMapperDecorator decoratedMapper = new TermVectorMapperDecorator();
+
+
+  /**
+   * Visits the TermVectorMapper and populates it with terms available for a given document,
+   * either via a vector created at index time or by resolving them from the inverted index.
+   *
+   * @param indexReader    Index source
+   * @param documentNumber Source document to access
+   * @param fieldName      Field to resolve
+   * @param mapper         Mapper to be mapped with data
+   * @throws IOException
+   */
+  public void accept(IndexReader indexReader, int documentNumber, String fieldName, TermVectorMapper mapper) throws IOException {
+
+    fieldName = fieldName.intern();
+
+    decoratedMapper.decorated = mapper;
+    decoratedMapper.termVectorStored = false;
+
+    indexReader.getTermFreqVector(documentNumber, fieldName, decoratedMapper);
+
+    if (!decoratedMapper.termVectorStored) {
+      mapper.setDocumentNumber(documentNumber);
+      build(indexReader, fieldName, mapper, documentNumber);
+    }
+  }
+
+  /** Instance reused to save garbage collector some time */
+  private List/*<String>*/ tokens;
+
+  /** Instance reused to save garbage collector some time */
+  private List/*<int[]>*/ positions;
+
+  /** Instance reused to save garbage collector some time */
+  private List/*<Integer>*/ frequencies;
+
+
+  /**
+   * Populates the mapper with terms available for the given field in a document
+   * by resolving the inverted index.
+   *
+   * @param indexReader
+   * @param field interned field name
+   * @param mapper
+   * @param documentNumber
+   * @throws IOException
+   */
+  private void build(IndexReader indexReader, String field, TermVectorMapper mapper, int documentNumber) throws IOException {
+
+    if (tokens == null) {
+      tokens = new ArrayList/*<String>*/(500);
+      positions = new ArrayList/*<int[]>*/(500);
+      frequencies = new ArrayList/*<Integer>*/(500);
+    } else {
+      tokens.clear();
+      frequencies.clear();
+      positions.clear();
+    }
+
+    TermEnum termEnum = indexReader.terms();
+    if (termEnum.skipTo(new Term(field, ""))) {
+
+      while (termEnum.term().field() == field) {
+        TermPositions termPositions = indexReader.termPositions(termEnum.term());
+        if (termPositions.skipTo(documentNumber)) {
+
+          frequencies.add(new Integer(termPositions.freq()));
+          tokens.add(termEnum.term().text());
+
+
+          if (!mapper.isIgnoringPositions()) {
+            int[] positions = new int[termPositions.freq()];
+            for (int i = 0; i < positions.length; i++) {
+              positions[i] = termPositions.nextPosition();
+            }
+            this.positions.add(positions);
+          } else {
+            positions.add(null);
+          }
+        }
+        termPositions.close();
+        if (!termEnum.next()) {
+          break;
+        }
+      }
+
+      mapper.setDocumentNumber(documentNumber);
+      mapper.setExpectations(field, tokens.size(), false, !mapper.isIgnoringPositions());
+      for (int i = 0; i < tokens.size(); i++) {
+        mapper.map((String) tokens.get(i), ((Integer) frequencies.get(i)).intValue(), (TermVectorOffsetInfo[]) null, (int[]) positions.get(i));
+      }
+
+    }
+    termEnum.close();
+
+
+  }
+
+
+  private static class TermVectorMapperDecorator extends TermVectorMapper {
+
+    private TermVectorMapper decorated;
+
+    public boolean isIgnoringPositions() {
+      return decorated.isIgnoringPositions();
+    }
+
+    public boolean isIgnoringOffsets() {
+      return decorated.isIgnoringOffsets();
+    }
+
+    private boolean termVectorStored = false;
+
+    public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+      decorated.setExpectations(field, numTerms, storeOffsets, storePositions);
+      termVectorStored = true;
+    }
+
+    public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+      decorated.map(term, frequency, offsets, positions);
+    }
+
+    public void setDocumentNumber(int documentNumber) {
+      decorated.setDocumentNumber(documentNumber);
+    }
+  }
+
+}
--- a/contrib/miscellaneous/src/test/org/apache/lucene/index/TestTermVectorAccessor.java
+++ b/contrib/miscellaneous/src/test/org/apache/lucene/index/TestTermVectorAccessor.java
@ -0,0 +1,111 @@
+package org.apache.lucene.index;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+import java.util.Collections;
+/*
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+
+
+public class TestTermVectorAccessor extends TestCase {
+
+  public void test() throws Exception {
+
+    Directory dir = new RAMDirectory();
+    IndexWriter iw = new IndexWriter(dir, new StandardAnalyzer(Collections.EMPTY_SET), true);
+
+    Document doc;
+
+    doc = new Document();
+    doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
+    doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
+    doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
+    doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
+    doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
+    doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
+    doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
+    iw.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("a", "a b a c a d a e a f a g a h a", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("b", "a b c b d b e b f b g b h b", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
+    doc.add(new Field("c", "a c b c d c e c f c g c h c", Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
+    iw.addDocument(doc);
+
+    iw.close();
+
+    IndexReader ir = IndexReader.open(dir);
+
+    TermVectorAccessor accessor = new TermVectorAccessor();
+
+    ParallelArrayTermVectorMapper mapper;
+    TermFreqVector tfv;
+
+    for (int i = 0; i < ir.maxDoc(); i++) {
+
+      mapper = new ParallelArrayTermVectorMapper();
+      accessor.accept(ir, i, "a", mapper);
+      tfv = mapper.materializeVector();
+      assertEquals("doc " + i, "a", tfv.getTerms()[0]);
+      assertEquals("doc " + i, 8, tfv.getTermFrequencies()[0]);
+
+      mapper = new ParallelArrayTermVectorMapper();
+      accessor.accept(ir, i, "b", mapper);
+      tfv = mapper.materializeVector();
+      assertEquals("doc " + i, 8, tfv.getTermFrequencies().length);
+      assertEquals("doc " + i, "b", tfv.getTerms()[1]);
+      assertEquals("doc " + i, 7, tfv.getTermFrequencies()[1]);
+
+      mapper = new ParallelArrayTermVectorMapper();
+      accessor.accept(ir, i, "c", mapper);
+      tfv = mapper.materializeVector();
+      assertEquals("doc " + i, 8, tfv.getTermFrequencies().length);
+      assertEquals("doc " + i, "c", tfv.getTerms()[2]);
+      assertEquals("doc " + i, 7, tfv.getTermFrequencies()[2]);
+
+      mapper = new ParallelArrayTermVectorMapper();
+      accessor.accept(ir, i, "q", mapper);
+      tfv = mapper.materializeVector();
+      assertNull("doc " + i, tfv);
+
+    }
+
+    ir.close();
+
+    dir.close();
+
+
+  }
+
+}