LUCENE-975: New PositionBasedTermVectorMapper for getting term vector information on a position by position basis.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@565994 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2007-08-15 01:14:54 +00:00
parent a94db219bd
commit f38b1cf2f8
3 changed files with 276 additions and 0 deletions

View File

@ -83,6 +83,8 @@ New features
This implementation contains several extensions of the new abstract TermVectorMapper class. The new API should be back-compatible. No changes to the
actual storage of Term Vectors have taken place.
4. LUCENE-975: Added PositionBasedTermVectorMapper that allows for position based lookup of term vector information. See item #3 above (LUCENE-868).
Optimizations
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the

View File

@ -0,0 +1,167 @@
package org.apache.lucene.index;
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * A {@link TermVectorMapper} that, for each field, stores term vector information keyed by
 * position instead of by term. Frequency information is ignored.
 * <p/>
 * Results accumulate across calls to
 * {@link #setExpectations(String, int, boolean, boolean)}: every field announced to this
 * mapper stays available from {@link #getFieldToTerms()}. If the same field name is announced
 * again, its previous entry is replaced.
 * <p/>
 * This is not thread-safe.
 */
public class PositionBasedTermVectorMapper extends TermVectorMapper{
  /**
   * Field name -> (position -> TVPositionInfo). Created lazily on the first call to
   * setExpectations, so it is null until a field has been announced.
   */
  private Map/*<String, Map<Integer, TVPositionInfo>>*/ fieldToTerms;

  /** Name of the field most recently passed to setExpectations. */
  private String currentField;

  /**
   * A Map of Integer and TVPositionInfo for the field currently being read
   */
  private Map/*<Integer, TVPositionInfo>*/ currentPositions;

  /** Whether the field currently being read was announced as having offsets. */
  private boolean storeOffsets;

  /**
   * Creates a mapper, passing <code>false</code> for both flags of the superclass constructor.
   */
  public PositionBasedTermVectorMapper() {
    super(false, false);
  }

  /**
   * Creates a mapper, passing <code>false</code> as the first flag of the superclass
   * constructor and <code>ignoringOffsets</code> as the second.
   *
   * @param ignoringOffsets forwarded unchanged to the superclass constructor
   */
  public PositionBasedTermVectorMapper(boolean ignoringOffsets)
  {
    super(false, ignoringOffsets);
  }

  /**
   * Never ignores positions. This mapper doesn't make much sense unless there are positions
   * @return always <code>false</code>
   */
  public boolean isIgnoringPositions() {
    return false;
  }

  /**
   * Callback for the TermVectorReader. Registers <code>term</code> at each of the given
   * positions for the field announced by the most recent call to
   * {@link #setExpectations(String, int, boolean, boolean)}.
   *
   * @param term The term to record
   * @param frequency Not used by this mapper
   * @param offsets Offsets parallel to <code>positions</code>; may be null, in which case a
   *        null offset is recorded for each position when offsets are being stored
   * @param positions The positions at which the term occurs
   */
  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
    for (int i = 0; i < positions.length; i++) {
      Integer posVal = new Integer(positions[i]);
      TVPositionInfo pos = (TVPositionInfo) currentPositions.get(posVal);
      if (pos == null) {
        // First term seen at this position: create its container.
        pos = new TVPositionInfo(positions[i], storeOffsets);
        currentPositions.put(posVal, pos);
      }
      pos.addTerm(term, offsets != null ? offsets[i] : null);
    }
  }

  /**
   * Callback mechanism used by the TermVectorReader. Starts a fresh position map for
   * <code>field</code> and registers it in the field map without discarding the fields that
   * were announced earlier.
   *
   * @param field The field being read
   * @param numTerms The number of terms in the vector (not used by this mapper)
   * @param storeOffsets Whether offsets are available
   * @param storePositions Whether positions are available
   * @throws RuntimeException if <code>storePositions</code> is false
   */
  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
    if (storePositions == false)
    {
      throw new RuntimeException("You must store positions in order to use this Mapper");
    }
    // Create the field map only once. Re-creating it on every call would drop the
    // positions already collected for previously announced fields.
    if (fieldToTerms == null)
    {
      fieldToTerms = new HashMap();
    }
    this.storeOffsets = storeOffsets;
    currentField = field;
    currentPositions = new HashMap();
    fieldToTerms.put(currentField, currentPositions);
  }

  /**
   * Get the mapping between fields and the position information collected for them.
   *
   * @return A map between field names and a Map. The sub-Map key is the position as the integer, the value is {@link org.apache.lucene.index.PositionBasedTermVectorMapper.TVPositionInfo}.
   *         Returns null if {@link #setExpectations(String, int, boolean, boolean)} has not completed successfully yet.
   */
  public Map getFieldToTerms() {
    return fieldToTerms;
  }

  /**
   * Container for the term(s) found at a single position
   */
  public static class TVPositionInfo{
    private int position;
    //a list of Strings
    private List terms;
    //A list of TermVectorOffsetInfo; stays null when offsets are not stored
    private List offsets;

    /**
     * @param position The position this container describes
     * @param storeOffsets If true, an offsets list is kept parallel to the terms list
     */
    public TVPositionInfo(int position, boolean storeOffsets) {
      this.position = position;
      terms = new ArrayList();
      if (storeOffsets) {
        offsets = new ArrayList();
      }
    }

    /**
     * Appends a term and, when offsets are being stored, its offset info. The info may be
     * null; it is still appended so that the two lists stay parallel.
     */
    void addTerm(String term, TermVectorOffsetInfo info)
    {
      terms.add(term);
      if (offsets != null) {
        offsets.add(info);
      }
    }

    /**
     *
     * @return The position of the term
     */
    public int getPosition() {
      return position;
    }

    /**
     * Note, there may be multiple terms at the same position
     * @return A List of Strings
     */
    public List getTerms() {
      return terms;
    }

    /**
     * Parallel list (to {@link #getTerms()}) of TermVectorOffsetInfo objects. There may be multiple entries since there may be multiple terms at a position
     * @return A List of TermVectorOffsetInfo objects, if offsets are stored; null otherwise. Entries may be null when no offset was supplied for a term.
     */
    public List getOffsets() {
      return offsets;
    }
  }
}

View File

@ -0,0 +1,107 @@
package org.apache.lucene.index;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import java.io.IOException;
import java.util.BitSet;
import java.util.Iterator;
import java.util.Map;
/**
 * Exercises PositionBasedTermVectorMapper with a single field: several tokens occupying
 * distinct positions, plus one extra token that shares position 0 with the first token.
 */
public class TestPositionBasedTermVectorMapper extends TestCase {
  protected String[] tokens;
  protected int[][] thePositions;
  protected TermVectorOffsetInfo[][] offsets;
  protected int numPositions;

  public TestPositionBasedTermVectorMapper(String s) {
    super(s);
  }

  /**
   * Builds the fixture. Token i (all but the last) gets 2*i+1 consecutive, globally unique
   * positions; the last token is placed at position 0 so that one position holds two terms.
   */
  protected void setUp() {
    tokens = new String[]{"here", "is", "some", "text", "to", "test", "extra"};
    final int last = tokens.length - 1;
    thePositions = new int[tokens.length][];
    offsets = new TermVectorOffsetInfo[tokens.length][];
    numPositions = 0;

    // The last token is handled separately below so it can reuse a known position.
    for (int tok = 0; tok < last; tok++) {
      int count = 2 * tok + 1; // give 'em all some positions
      int[] tokPositions = new int[count];
      TermVectorOffsetInfo[] tokOffsets = new TermVectorOffsetInfo[count];
      for (int k = 0; k < count; k++) {
        tokPositions[k] = numPositions++;
        tokOffsets[k] = new TermVectorOffsetInfo(k, k + 1); // the actual value here doesn't much matter
      }
      thePositions[tok] = tokPositions;
      offsets[tok] = tokOffsets;
    }

    // Put the final token at the same position as "here".
    thePositions[last] = new int[]{0};
    offsets[last] = new TermVectorOffsetInfo[]{new TermVectorOffsetInfo(0, 1)};
  }

  protected void tearDown() {
  }

  /**
   * Maps every token into one field and verifies that each position is present exactly once,
   * that position 0 carries two terms and every other position carries one. Offsets are
   * announced as stored but passed as null, so the offsets lists are expected to be non-null
   * and parallel in size to the terms lists.
   */
  public void test() throws IOException {
    PositionBasedTermVectorMapper mapper = new PositionBasedTermVectorMapper();
    mapper.setExpectations("test", tokens.length, true, true);
    for (int tok = 0; tok < tokens.length; tok++) {
      mapper.map(tokens[tok], 1, null, thePositions[tok]);
    }

    Map fieldMap = mapper.getFieldToTerms();
    assertTrue("map is null and it shouldn't be", fieldMap != null);
    assertTrue("map Size: " + fieldMap.size() + " is not: " + 1, fieldMap.size() == 1);

    Map byPosition = (Map) fieldMap.get("test");
    assertTrue("thePositions is null and it shouldn't be", byPosition != null);
    assertTrue("thePositions Size: " + byPosition.size() + " is not: " + numPositions, byPosition.size() == numPositions);

    // Tracks which positions were encountered; all of them must be seen.
    BitSet seen = new BitSet(numPositions);
    Iterator entries = byPosition.entrySet().iterator();
    while (entries.hasNext()) {
      Map.Entry entry = (Map.Entry) entries.next();
      PositionBasedTermVectorMapper.TVPositionInfo info = (PositionBasedTermVectorMapper.TVPositionInfo) entry.getValue();
      assertTrue("info is null and it shouldn't be", info != null);

      int pos = ((Integer) entry.getKey()).intValue();
      seen.set(pos);
      assertTrue(info.getPosition() + " does not equal: " + pos, info.getPosition() == pos);
      assertTrue("info.getOffsets() is null and it shouldn't be", info.getOffsets() != null);

      // Position 0 is shared by two tokens; every other position has exactly one.
      int expected = (pos == 0) ? 2 : 1;
      assertTrue("info.getTerms() Size: " + info.getTerms().size() + " is not: " + expected, info.getTerms().size() == expected);
      assertTrue("info.getOffsets() Size: " + info.getOffsets().size() + " is not: " + expected, info.getOffsets().size() == expected);
    }
    assertTrue("Bits are not all on", seen.cardinality() == numPositions);
  }
}