mirror of https://github.com/apache/lucene.git
LUCENE-975: New PositionBasedTermVectorMapper for getting term vector information on a position by position basis.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@565994 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a94db219bd
commit
f38b1cf2f8
|
@ -83,6 +83,8 @@ New features
|
|||
This implementation contains several extensions of the new abstract TermVectorMapper class. The new API should be back-compatible. No changes in the
|
||||
actual storage of Term Vectors has taken place.
|
||||
|
||||
4. LUCENE-975: Added PositionBasedTermVectorMapper that allows for position based lookup of term vector information. See item #3 above (LUCENE-868).
|
||||
|
||||
Optimizations
|
||||
|
||||
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the
|
||||
|
|
|
@ -0,0 +1,167 @@
|
|||
package org.apache.lucene.index;
|
||||
/**
|
||||
* Copyright 2007 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* For each Field, store position by position information. It ignores frequency information
|
||||
* <p/>
|
||||
* This is not thread-safe.
|
||||
*/
|
||||
public class PositionBasedTermVectorMapper extends TermVectorMapper{
|
||||
private Map/*<String, Map<Integer, TVPositionInfo>>*/ fieldToTerms;
|
||||
|
||||
private String currentField;
|
||||
/**
|
||||
* A Map of Integer and TVPositionInfo
|
||||
*/
|
||||
private Map/*<Integer, TVPositionInfo>*/ currentPositions;
|
||||
private boolean storeOffsets;
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
*/
|
||||
public PositionBasedTermVectorMapper() {
|
||||
super(false, false);
|
||||
}
|
||||
|
||||
public PositionBasedTermVectorMapper(boolean ignoringOffsets)
|
||||
{
|
||||
super(false, ignoringOffsets);
|
||||
}
|
||||
|
||||
/**
|
||||
* Never ignores positions. This mapper doesn't make much sense unless there are positions
|
||||
* @return
|
||||
*/
|
||||
public boolean isIgnoringPositions() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback for the TermVectorReader.
|
||||
* @param term
|
||||
* @param frequency
|
||||
* @param offsets
|
||||
* @param positions
|
||||
*/
|
||||
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
|
||||
for (int i = 0; i < positions.length; i++) {
|
||||
Integer posVal = new Integer(positions[i]);
|
||||
TVPositionInfo pos = (TVPositionInfo) currentPositions.get(posVal);
|
||||
if (pos == null) {
|
||||
pos = new TVPositionInfo(positions[i], storeOffsets);
|
||||
currentPositions.put(posVal, pos);
|
||||
}
|
||||
pos.addTerm(term, offsets != null ? offsets[i] : null);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback mechanism used by the TermVectorReader
|
||||
* @param field The field being read
|
||||
* @param numTerms The number of terms in the vector
|
||||
* @param storeOffsets Whether offsets are available
|
||||
* @param storePositions Whether positions are available
|
||||
*/
|
||||
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
|
||||
if (storePositions == false)
|
||||
{
|
||||
throw new RuntimeException("You must store positions in order to use this Mapper");
|
||||
}
|
||||
if (storeOffsets == true)
|
||||
{
|
||||
//ignoring offsets
|
||||
}
|
||||
fieldToTerms = new HashMap(numTerms);
|
||||
this.storeOffsets = storeOffsets;
|
||||
currentField = field;
|
||||
currentPositions = new HashMap();
|
||||
fieldToTerms.put(currentField, currentPositions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the mapping between fields and terms, sorted by the comparator
|
||||
*
|
||||
* @return A map between field names and a Map. The sub-Map key is the position as the integer, the value is {@link org.apache.lucene.index.PositionBasedTermVectorMapper.TVPositionInfo}.
|
||||
*/
|
||||
public Map getFieldToTerms() {
|
||||
return fieldToTerms;
|
||||
}
|
||||
|
||||
/**
|
||||
* Container for a term at a position
|
||||
*/
|
||||
public static class TVPositionInfo{
|
||||
private int position;
|
||||
//a list of Strings
|
||||
private List terms;
|
||||
//A list of TermVectorOffsetInfo
|
||||
private List offsets;
|
||||
|
||||
|
||||
public TVPositionInfo(int position, boolean storeOffsets) {
|
||||
this.position = position;
|
||||
terms = new ArrayList();
|
||||
if (storeOffsets) {
|
||||
offsets = new ArrayList();
|
||||
}
|
||||
}
|
||||
|
||||
void addTerm(String term, TermVectorOffsetInfo info)
|
||||
{
|
||||
terms.add(term);
|
||||
if (offsets != null) {
|
||||
offsets.add(info);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return The position of the term
|
||||
*/
|
||||
public int getPosition() {
|
||||
return position;
|
||||
}
|
||||
|
||||
/**
|
||||
* Note, there may be multiple terms at the same position
|
||||
* @return A List of Strings
|
||||
*/
|
||||
public List getTerms() {
|
||||
return terms;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parallel list (to {@link #getTerms()}) of TermVectorOffsetInfo objects. There may be multiple entries since there may be multiple terms at a position
|
||||
* @return A List of TermVectorOffsetInfo objects, if offsets are store.
|
||||
*/
|
||||
public List getOffsets() {
|
||||
return offsets;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
package org.apache.lucene.index;
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.BitSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
public class TestPositionBasedTermVectorMapper extends TestCase {
|
||||
protected String[] tokens;
|
||||
protected int[][] thePositions;
|
||||
protected TermVectorOffsetInfo[][] offsets;
|
||||
protected int numPositions;
|
||||
|
||||
|
||||
public TestPositionBasedTermVectorMapper(String s) {
|
||||
super(s);
|
||||
}
|
||||
|
||||
protected void setUp() {
|
||||
tokens = new String[]{"here", "is", "some", "text", "to", "test", "extra"};
|
||||
thePositions = new int[tokens.length][];
|
||||
offsets = new TermVectorOffsetInfo[tokens.length][];
|
||||
numPositions = 0;
|
||||
//save off the last one so we can add it with the same positions as some of the others, but in a predictable way
|
||||
for (int i = 0; i < tokens.length - 1; i++)
|
||||
{
|
||||
thePositions[i] = new int[2 * i + 1];//give 'em all some positions
|
||||
for (int j = 0; j < thePositions[i].length; j++)
|
||||
{
|
||||
thePositions[i][j] = numPositions++;
|
||||
}
|
||||
offsets[i] = new TermVectorOffsetInfo[thePositions[i].length];
|
||||
for (int j = 0; j < offsets[i].length; j++) {
|
||||
offsets[i][j] = new TermVectorOffsetInfo(j, j + 1);//the actual value here doesn't much matter
|
||||
}
|
||||
}
|
||||
thePositions[tokens.length - 1] = new int[1];
|
||||
thePositions[tokens.length - 1][0] = 0;//put this at the same position as "here"
|
||||
offsets[tokens.length - 1] = new TermVectorOffsetInfo[1];
|
||||
offsets[tokens.length - 1][0] = new TermVectorOffsetInfo(0, 1);
|
||||
}
|
||||
|
||||
protected void tearDown() {
|
||||
|
||||
}
|
||||
|
||||
public void test() throws IOException {
|
||||
PositionBasedTermVectorMapper mapper = new PositionBasedTermVectorMapper();
|
||||
|
||||
mapper.setExpectations("test", tokens.length, true, true);
|
||||
//Test single position
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
String token = tokens[i];
|
||||
mapper.map(token, 1, null, thePositions[i]);
|
||||
|
||||
}
|
||||
Map map = mapper.getFieldToTerms();
|
||||
assertTrue("map is null and it shouldn't be", map != null);
|
||||
assertTrue("map Size: " + map.size() + " is not: " + 1, map.size() == 1);
|
||||
Map positions = (Map) map.get("test");
|
||||
assertTrue("thePositions is null and it shouldn't be", positions != null);
|
||||
|
||||
assertTrue("thePositions Size: " + positions.size() + " is not: " + numPositions, positions.size() == numPositions);
|
||||
BitSet bits = new BitSet(numPositions);
|
||||
for (Iterator iterator = positions.entrySet().iterator(); iterator.hasNext();) {
|
||||
Map.Entry entry = (Map.Entry) iterator.next();
|
||||
PositionBasedTermVectorMapper.TVPositionInfo info = (PositionBasedTermVectorMapper.TVPositionInfo) entry.getValue();
|
||||
assertTrue("info is null and it shouldn't be", info != null);
|
||||
int pos = ((Integer) entry.getKey()).intValue();
|
||||
bits.set(pos);
|
||||
assertTrue(info.getPosition() + " does not equal: " + pos, info.getPosition() == pos);
|
||||
assertTrue("info.getOffsets() is null and it shouldn't be", info.getOffsets() != null);
|
||||
if (pos == 0)
|
||||
{
|
||||
assertTrue("info.getTerms() Size: " + info.getTerms().size() + " is not: " + 2, info.getTerms().size() == 2);//need a test for multiple terms at one pos
|
||||
assertTrue("info.getOffsets() Size: " + info.getOffsets().size() + " is not: " + 2, info.getOffsets().size() == 2);
|
||||
}
|
||||
else
|
||||
{
|
||||
assertTrue("info.getTerms() Size: " + info.getTerms().size() + " is not: " + 1, info.getTerms().size() == 1);//need a test for multiple terms at one pos
|
||||
assertTrue("info.getOffsets() Size: " + info.getOffsets().size() + " is not: " + 1, info.getOffsets().size() == 1);
|
||||
}
|
||||
}
|
||||
assertTrue("Bits are not all on", bits.cardinality() == numPositions);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
Loading…
Reference in New Issue