LUCENE-1257: Generified ShingleMatrixFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@821311 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Karl-Johan Wettin 2009-10-03 13:17:11 +00:00
parent 4b2a71b621
commit 4f878bdc93
2 changed files with 47 additions and 52 deletions

View File

@ -112,7 +112,7 @@ import org.apache.lucene.index.Payload;
* See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}. * See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}.
* <p/> * <p/>
* <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than * <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
* the ones located in org.apache.lucene.analysis.tokenattributes. * the ones located in org.apache.lucene.analysis.tokenattributes.
*/ */
public class ShingleMatrixFilter extends TokenStream { public class ShingleMatrixFilter extends TokenStream {
@ -206,7 +206,7 @@ public class ShingleMatrixFilter extends TokenStream {
private TypeAttribute in_typeAtt; private TypeAttribute in_typeAtt;
private FlagsAttribute in_flagsAtt; private FlagsAttribute in_flagsAtt;
/** /**
* Creates a shingle filter based on a user defined matrix. * Creates a shingle filter based on a user defined matrix.
* *
@ -237,7 +237,7 @@ public class ShingleMatrixFilter extends TokenStream {
// set the input to be an empty token stream, we already have the data. // set the input to be an empty token stream, we already have the data.
this.input = new EmptyTokenStream(); this.input = new EmptyTokenStream();
in_termAtt = input.addAttribute(TermAttribute.class); in_termAtt = input.addAttribute(TermAttribute.class);
in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class); in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
in_payloadAtt = input.addAttribute(PayloadAttribute.class); in_payloadAtt = input.addAttribute(PayloadAttribute.class);
@ -316,7 +316,7 @@ public class ShingleMatrixFilter extends TokenStream {
offsetAtt = addAttribute(OffsetAttribute.class); offsetAtt = addAttribute(OffsetAttribute.class);
typeAtt = addAttribute(TypeAttribute.class); typeAtt = addAttribute(TypeAttribute.class);
flagsAtt = addAttribute(FlagsAttribute.class); flagsAtt = addAttribute(FlagsAttribute.class);
in_termAtt = input.addAttribute(TermAttribute.class); in_termAtt = input.addAttribute(TermAttribute.class);
in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class); in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
in_payloadAtt = input.addAttribute(PayloadAttribute.class); in_payloadAtt = input.addAttribute(PayloadAttribute.class);
@ -328,12 +328,12 @@ public class ShingleMatrixFilter extends TokenStream {
// internal filter instance variables // internal filter instance variables
/** iterator over the current matrix row permutations */ /** iterator over the current matrix row permutations */
private Iterator permutations; private Iterator<Matrix.Column.Row[]> permutations;
/** the current permutation of tokens used to produce shingles */ /** the current permutation of tokens used to produce shingles */
private List currentPermuationTokens; private List<Token> currentPermuationTokens;
/** index to what row a token in currentShingleTokens represents*/ /** index to what row a token in currentShingleTokens represents*/
private List currentPermutationRows; private List<Matrix.Column.Row> currentPermutationRows;
private int currentPermutationTokensStartOffset; private int currentPermutationTokensStartOffset;
private int currentShingleLength; private int currentShingleLength;
@ -342,7 +342,7 @@ public class ShingleMatrixFilter extends TokenStream {
* a set containing shingles that have been the result of a call to next(Token), * a set containing shingles that have been the result of a call to next(Token),
* used to avoid producing the same shingle more than once. * used to avoid producing the same shingle more than once.
*/ */
private Set shinglesSeen = new HashSet(); private Set<List<Token>> shinglesSeen = new HashSet<List<Token>>();
public void reset() throws IOException { public void reset() throws IOException {
@ -352,9 +352,9 @@ public class ShingleMatrixFilter extends TokenStream {
} }
private Matrix matrix; private Matrix matrix;
private Token reusableToken = new Token(); private Token reusableToken = new Token();
public final boolean incrementToken() throws IOException { public final boolean incrementToken() throws IOException {
if (matrix == null) { if (matrix == null) {
matrix = new Matrix(); matrix = new Matrix();
@ -372,7 +372,7 @@ public class ShingleMatrixFilter extends TokenStream {
token = produceNextToken(reusableToken); token = produceNextToken(reusableToken);
} while (token == request_next_token); } while (token == request_next_token);
if (token == null) return false; if (token == null) return false;
termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength()); termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
posIncrAtt.setPositionIncrement(token.getPositionIncrement()); posIncrAtt.setPositionIncrement(token.getPositionIncrement());
flagsAtt.setFlags(token.getFlags()); flagsAtt.setFlags(token.getFlags());
@ -381,7 +381,7 @@ public class ShingleMatrixFilter extends TokenStream {
payloadAtt.setPayload(token.getPayload()); payloadAtt.setPayload(token.getPayload());
return true; return true;
} }
private Token getNextInputToken(Token token) throws IOException { private Token getNextInputToken(Token token) throws IOException {
if (!input.incrementToken()) return null; if (!input.incrementToken()) return null;
token.setTermBuffer(in_termAtt.termBuffer(), 0, in_termAtt.termLength()); token.setTermBuffer(in_termAtt.termBuffer(), 0, in_termAtt.termLength());
@ -404,7 +404,7 @@ public class ShingleMatrixFilter extends TokenStream {
public final Token next() throws java.io.IOException { public final Token next() throws java.io.IOException {
return super.next(); return super.next();
} }
private static final Token request_next_token = new Token(); private static final Token request_next_token = new Token();
/** /**
@ -428,16 +428,16 @@ public class ShingleMatrixFilter extends TokenStream {
if (ignoringSinglePrefixOrSuffixShingle if (ignoringSinglePrefixOrSuffixShingle
&& currentShingleLength == 1 && currentShingleLength == 1
&& (((Matrix.Column.Row) currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || ((Matrix.Column.Row) currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) { && ((currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || (currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) {
return next(reusableToken); return next(reusableToken);
} }
int termLength = 0; int termLength = 0;
List shingle = new ArrayList(); List<Token> shingle = new ArrayList<Token>(currentShingleLength);
for (int i = 0; i < currentShingleLength; i++) { for (int i = 0; i < currentShingleLength; i++) {
Token shingleToken = (Token) currentPermuationTokens.get(i + currentPermutationTokensStartOffset); Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset);
termLength += shingleToken.termLength(); termLength += shingleToken.termLength();
shingle.add(shingleToken); shingle.add(shingleToken);
} }
@ -452,8 +452,7 @@ public class ShingleMatrixFilter extends TokenStream {
// shingle token factory // shingle token factory
StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future. StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
for (Iterator iterator = shingle.iterator(); iterator.hasNext();) { for (Token shingleToken : shingle) {
Token shingleToken = (Token) iterator.next();
if (spacerCharacter != null && sb.length() > 0) { if (spacerCharacter != null && sb.length() > 0) {
sb.append(spacerCharacter); sb.append(spacerCharacter);
} }
@ -493,22 +492,19 @@ public class ShingleMatrixFilter extends TokenStream {
// get rid of resources // get rid of resources
// delete the first column in the matrix // delete the first column in the matrix
Matrix.Column deletedColumn = (Matrix.Column) matrix.columns.remove(0); Matrix.Column deletedColumn = matrix.columns.remove(0);
// remove all shingles seen that include any of the tokens from the deleted column. // remove all shingles seen that include any of the tokens from the deleted column.
List deletedColumnTokens = new ArrayList(); List<Token> deletedColumnTokens = new ArrayList<Token>();
for (Iterator iterator = deletedColumn.getRows().iterator(); iterator.hasNext();) { for (Matrix.Column.Row row : deletedColumn.getRows()) {
Matrix.Column.Row row = (Matrix.Column.Row) iterator.next(); for (Token token : row.getTokens()) {
for (Iterator rowIter = row.getTokens().iterator(); rowIter.hasNext();) { deletedColumnTokens.add(token);
Object o = rowIter.next();//Token
deletedColumnTokens.add(o);
} }
} }
for (Iterator shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) { for (Iterator<List<Token>> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) {
List shingle = (List) shinglesSeenIterator.next(); List<Token> shingle = shinglesSeenIterator.next();
for (Iterator deletedIter = deletedColumnTokens.iterator(); deletedIter.hasNext();) { for (Token deletedColumnToken : deletedColumnTokens) {
Token deletedColumnToken = (Token) deletedIter.next();
if (shingle.contains(deletedColumnToken)) { if (shingle.contains(deletedColumnToken)) {
shinglesSeenIterator.remove(); shinglesSeenIterator.remove();
break; break;
@ -552,14 +548,12 @@ public class ShingleMatrixFilter extends TokenStream {
* finally resets the current (next) shingle size and offset. * finally resets the current (next) shingle size and offset.
*/ */
private void nextTokensPermutation() { private void nextTokensPermutation() {
Matrix.Column.Row[] rowsPermutation; Matrix.Column.Row[] rowsPermutation = permutations.next();
rowsPermutation = (Matrix.Column.Row[]) permutations.next(); List<Matrix.Column.Row> currentPermutationRows = new ArrayList<Matrix.Column.Row>();
List currentPermutationRows = new ArrayList(); List<Token> currentPermuationTokens = new ArrayList<Token>();
List currentPermuationTokens = new ArrayList(); for (Matrix.Column.Row row : rowsPermutation) {
for (int i = 0; i < rowsPermutation.length; i++) { for (Token token : row.getTokens()) {
Matrix.Column.Row row = rowsPermutation[i]; currentPermuationTokens.add(token);
for (Iterator iterator = row.getTokens().iterator(); iterator.hasNext();) {
currentPermuationTokens.add(iterator.next());
currentPermutationRows.add(row); currentPermutationRows.add(row);
} }
} }
@ -627,8 +621,7 @@ public class ShingleMatrixFilter extends TokenStream {
double factor = 1d / Math.sqrt(total); double factor = 1d / Math.sqrt(total);
double weight = 0d; double weight = 0d;
for (int i = 0; i < weights.length; i++) { for (double partWeight : weights) {
double partWeight = weights[i];
weight += partWeight * factor; weight += partWeight * factor;
} }
@ -709,7 +702,7 @@ public class ShingleMatrixFilter extends TokenStream {
private boolean columnsHasBeenCreated = false; private boolean columnsHasBeenCreated = false;
private List columns = new ArrayList(); private List<Column> columns = new ArrayList<Column>();
public List getColumns() { public List getColumns() {
return columns; return columns;
@ -740,9 +733,9 @@ public class ShingleMatrixFilter extends TokenStream {
Matrix.this.columns.add(this); Matrix.this.columns.add(this);
} }
private List rows = new ArrayList(); private List<Row> rows = new ArrayList<Row>();
public List getRows() { public List<Row> getRows() {
return rows; return rows;
} }
@ -781,7 +774,7 @@ public class ShingleMatrixFilter extends TokenStream {
return Column.this; return Column.this;
} }
private List tokens = new LinkedList(); private List<Token> tokens = new LinkedList<Token>();
public Row() { public Row() {
Column.this.rows.add(this); Column.this.rows.add(this);
@ -791,11 +784,11 @@ public class ShingleMatrixFilter extends TokenStream {
return Column.this.rows.indexOf(this); return Column.this.rows.indexOf(this);
} }
public List getTokens() { public List<Token> getTokens() {
return tokens; return tokens;
} }
public void setTokens(List tokens) { public void setTokens(List<Token> tokens) {
this.tokens = tokens; this.tokens = tokens;
} }
@ -826,9 +819,9 @@ public class ShingleMatrixFilter extends TokenStream {
} }
public Iterator permutationIterator() { public Iterator<Column.Row[]> permutationIterator() {
return new Iterator() { return new Iterator<Column.Row[]>() {
private int[] columnRowCounters = new int[columns.size()]; private int[] columnRowCounters = new int[columns.size()];
@ -838,10 +831,10 @@ public class ShingleMatrixFilter extends TokenStream {
public boolean hasNext() { public boolean hasNext() {
int s = columnRowCounters.length; int s = columnRowCounters.length;
return s != 0 && columnRowCounters[s - 1] < ((Column) columns.get(s - 1)).getRows().size(); return s != 0 && columnRowCounters[s - 1] < (columns.get(s - 1)).getRows().size();
} }
public Object next() { public Column.Row[] next() {
if (!hasNext()) { if (!hasNext()) {
throw new NoSuchElementException("no more elements"); throw new NoSuchElementException("no more elements");
} }
@ -849,7 +842,7 @@ public class ShingleMatrixFilter extends TokenStream {
Column.Row[] rows = new Column.Row[columnRowCounters.length]; Column.Row[] rows = new Column.Row[columnRowCounters.length];
for (int i = 0; i < columnRowCounters.length; i++) { for (int i = 0; i < columnRowCounters.length; i++) {
rows[i] = (Matrix.Column.Row) ((Column) columns.get(i)).rows.get(columnRowCounters[i]); rows[i] = columns.get(i).rows.get(columnRowCounters[i]);
} }
incrementColumnRowCounters(); incrementColumnRowCounters();
@ -859,7 +852,7 @@ public class ShingleMatrixFilter extends TokenStream {
private void incrementColumnRowCounters() { private void incrementColumnRowCounters() {
for (int i = 0; i < columnRowCounters.length; i++) { for (int i = 0; i < columnRowCounters.length; i++) {
columnRowCounters[i]++; columnRowCounters[i]++;
if (columnRowCounters[i] == ((Column) columns.get(i)).rows.size() && if (columnRowCounters[i] == columns.get(i).rows.size() &&
i < columnRowCounters.length - 1) { i < columnRowCounters.length - 1) {
columnRowCounters[i] = 0; columnRowCounters[i] = 0;
} else { } else {

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.shingle;
*/ */
import java.io.IOException; import java.io.IOException;
import java.io.StringReader;
import java.util.Collection; import java.util.Collection;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
@ -28,6 +29,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream; import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter; import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream; import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
@ -44,7 +46,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
"testBehavingAsShingleFilter", "testMatrix" "testBehavingAsShingleFilter", "testMatrix"
}))); })));
} }
public void testBehavingAsShingleFilter() throws IOException { public void testBehavingAsShingleFilter() throws IOException {
ShingleMatrixFilter.defaultSettingsCodec = null; ShingleMatrixFilter.defaultSettingsCodec = null;