mirror of https://github.com/apache/lucene.git
LUCENE-1257: Generified ShingleMatrixFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@821311 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4b2a71b621
commit
4f878bdc93
|
@ -112,7 +112,7 @@ import org.apache.lucene.index.Payload;
|
||||||
* See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}.
|
* See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}.
|
||||||
* <p/>
|
* <p/>
|
||||||
* <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
|
* <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
|
||||||
* the ones located in org.apache.lucene.analysis.tokenattributes.
|
* the ones located in org.apache.lucene.analysis.tokenattributes.
|
||||||
*/
|
*/
|
||||||
public class ShingleMatrixFilter extends TokenStream {
|
public class ShingleMatrixFilter extends TokenStream {
|
||||||
|
|
||||||
|
@ -206,7 +206,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
private TypeAttribute in_typeAtt;
|
private TypeAttribute in_typeAtt;
|
||||||
private FlagsAttribute in_flagsAtt;
|
private FlagsAttribute in_flagsAtt;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a shingle filter based on a user defined matrix.
|
* Creates a shingle filter based on a user defined matrix.
|
||||||
*
|
*
|
||||||
|
@ -237,7 +237,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
|
|
||||||
// set the input to be an empty token stream, we already have the data.
|
// set the input to be an empty token stream, we already have the data.
|
||||||
this.input = new EmptyTokenStream();
|
this.input = new EmptyTokenStream();
|
||||||
|
|
||||||
in_termAtt = input.addAttribute(TermAttribute.class);
|
in_termAtt = input.addAttribute(TermAttribute.class);
|
||||||
in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
|
in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
|
||||||
in_payloadAtt = input.addAttribute(PayloadAttribute.class);
|
in_payloadAtt = input.addAttribute(PayloadAttribute.class);
|
||||||
|
@ -316,7 +316,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
typeAtt = addAttribute(TypeAttribute.class);
|
typeAtt = addAttribute(TypeAttribute.class);
|
||||||
flagsAtt = addAttribute(FlagsAttribute.class);
|
flagsAtt = addAttribute(FlagsAttribute.class);
|
||||||
|
|
||||||
in_termAtt = input.addAttribute(TermAttribute.class);
|
in_termAtt = input.addAttribute(TermAttribute.class);
|
||||||
in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
|
in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
|
||||||
in_payloadAtt = input.addAttribute(PayloadAttribute.class);
|
in_payloadAtt = input.addAttribute(PayloadAttribute.class);
|
||||||
|
@ -328,12 +328,12 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
// internal filter instance variables
|
// internal filter instance variables
|
||||||
|
|
||||||
/** iterator over the current matrix row permutations */
|
/** iterator over the current matrix row permutations */
|
||||||
private Iterator permutations;
|
private Iterator<Matrix.Column.Row[]> permutations;
|
||||||
|
|
||||||
/** the current permutation of tokens used to produce shingles */
|
/** the current permutation of tokens used to produce shingles */
|
||||||
private List currentPermuationTokens;
|
private List<Token> currentPermuationTokens;
|
||||||
/** index to what row a token in currentShingleTokens represents*/
|
/** index to what row a token in currentShingleTokens represents*/
|
||||||
private List currentPermutationRows;
|
private List<Matrix.Column.Row> currentPermutationRows;
|
||||||
|
|
||||||
private int currentPermutationTokensStartOffset;
|
private int currentPermutationTokensStartOffset;
|
||||||
private int currentShingleLength;
|
private int currentShingleLength;
|
||||||
|
@ -342,7 +342,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
* a set containing shingles that has been the result of a call to next(Token),
|
* a set containing shingles that has been the result of a call to next(Token),
|
||||||
* used to avoid producing the same shingle more than once.
|
* used to avoid producing the same shingle more than once.
|
||||||
*/
|
*/
|
||||||
private Set shinglesSeen = new HashSet();
|
private Set<List<Token>> shinglesSeen = new HashSet<List<Token>>();
|
||||||
|
|
||||||
|
|
||||||
public void reset() throws IOException {
|
public void reset() throws IOException {
|
||||||
|
@ -352,9 +352,9 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
}
|
}
|
||||||
|
|
||||||
private Matrix matrix;
|
private Matrix matrix;
|
||||||
|
|
||||||
private Token reusableToken = new Token();
|
private Token reusableToken = new Token();
|
||||||
|
|
||||||
public final boolean incrementToken() throws IOException {
|
public final boolean incrementToken() throws IOException {
|
||||||
if (matrix == null) {
|
if (matrix == null) {
|
||||||
matrix = new Matrix();
|
matrix = new Matrix();
|
||||||
|
@ -372,7 +372,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
token = produceNextToken(reusableToken);
|
token = produceNextToken(reusableToken);
|
||||||
} while (token == request_next_token);
|
} while (token == request_next_token);
|
||||||
if (token == null) return false;
|
if (token == null) return false;
|
||||||
|
|
||||||
termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
|
termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
|
||||||
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
|
posIncrAtt.setPositionIncrement(token.getPositionIncrement());
|
||||||
flagsAtt.setFlags(token.getFlags());
|
flagsAtt.setFlags(token.getFlags());
|
||||||
|
@ -381,7 +381,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
payloadAtt.setPayload(token.getPayload());
|
payloadAtt.setPayload(token.getPayload());
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Token getNextInputToken(Token token) throws IOException {
|
private Token getNextInputToken(Token token) throws IOException {
|
||||||
if (!input.incrementToken()) return null;
|
if (!input.incrementToken()) return null;
|
||||||
token.setTermBuffer(in_termAtt.termBuffer(), 0, in_termAtt.termLength());
|
token.setTermBuffer(in_termAtt.termBuffer(), 0, in_termAtt.termLength());
|
||||||
|
@ -404,7 +404,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
public final Token next() throws java.io.IOException {
|
public final Token next() throws java.io.IOException {
|
||||||
return super.next();
|
return super.next();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Token request_next_token = new Token();
|
private static final Token request_next_token = new Token();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -428,16 +428,16 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
|
|
||||||
if (ignoringSinglePrefixOrSuffixShingle
|
if (ignoringSinglePrefixOrSuffixShingle
|
||||||
&& currentShingleLength == 1
|
&& currentShingleLength == 1
|
||||||
&& (((Matrix.Column.Row) currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || ((Matrix.Column.Row) currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) {
|
&& ((currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || (currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) {
|
||||||
return next(reusableToken);
|
return next(reusableToken);
|
||||||
}
|
}
|
||||||
|
|
||||||
int termLength = 0;
|
int termLength = 0;
|
||||||
|
|
||||||
List shingle = new ArrayList();
|
List<Token> shingle = new ArrayList<Token>(currentShingleLength);
|
||||||
|
|
||||||
for (int i = 0; i < currentShingleLength; i++) {
|
for (int i = 0; i < currentShingleLength; i++) {
|
||||||
Token shingleToken = (Token) currentPermuationTokens.get(i + currentPermutationTokensStartOffset);
|
Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset);
|
||||||
termLength += shingleToken.termLength();
|
termLength += shingleToken.termLength();
|
||||||
shingle.add(shingleToken);
|
shingle.add(shingleToken);
|
||||||
}
|
}
|
||||||
|
@ -452,8 +452,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
|
|
||||||
// shingle token factory
|
// shingle token factory
|
||||||
StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
|
StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
|
||||||
for (Iterator iterator = shingle.iterator(); iterator.hasNext();) {
|
for (Token shingleToken : shingle) {
|
||||||
Token shingleToken = (Token) iterator.next();
|
|
||||||
if (spacerCharacter != null && sb.length() > 0) {
|
if (spacerCharacter != null && sb.length() > 0) {
|
||||||
sb.append(spacerCharacter);
|
sb.append(spacerCharacter);
|
||||||
}
|
}
|
||||||
|
@ -493,22 +492,19 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
// get rid of resources
|
// get rid of resources
|
||||||
|
|
||||||
// delete the first column in the matrix
|
// delete the first column in the matrix
|
||||||
Matrix.Column deletedColumn = (Matrix.Column) matrix.columns.remove(0);
|
Matrix.Column deletedColumn = matrix.columns.remove(0);
|
||||||
|
|
||||||
// remove all shingles seen that include any of the tokens from the deleted column.
|
// remove all shingles seen that include any of the tokens from the deleted column.
|
||||||
List deletedColumnTokens = new ArrayList();
|
List<Token> deletedColumnTokens = new ArrayList<Token>();
|
||||||
for (Iterator iterator = deletedColumn.getRows().iterator(); iterator.hasNext();) {
|
for (Matrix.Column.Row row : deletedColumn.getRows()) {
|
||||||
Matrix.Column.Row row = (Matrix.Column.Row) iterator.next();
|
for (Token token : row.getTokens()) {
|
||||||
for (Iterator rowIter = row.getTokens().iterator(); rowIter.hasNext();) {
|
deletedColumnTokens.add(token);
|
||||||
Object o = rowIter.next();//Token
|
|
||||||
deletedColumnTokens.add(o);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
for (Iterator shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) {
|
for (Iterator<List<Token>> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) {
|
||||||
List shingle = (List) shinglesSeenIterator.next();
|
List<Token> shingle = shinglesSeenIterator.next();
|
||||||
for (Iterator deletedIter = deletedColumnTokens.iterator(); deletedIter.hasNext();) {
|
for (Token deletedColumnToken : deletedColumnTokens) {
|
||||||
Token deletedColumnToken = (Token) deletedIter.next();
|
|
||||||
if (shingle.contains(deletedColumnToken)) {
|
if (shingle.contains(deletedColumnToken)) {
|
||||||
shinglesSeenIterator.remove();
|
shinglesSeenIterator.remove();
|
||||||
break;
|
break;
|
||||||
|
@ -552,14 +548,12 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
* finally resets the current (next) shingle size and offset.
|
* finally resets the current (next) shingle size and offset.
|
||||||
*/
|
*/
|
||||||
private void nextTokensPermutation() {
|
private void nextTokensPermutation() {
|
||||||
Matrix.Column.Row[] rowsPermutation;
|
Matrix.Column.Row[] rowsPermutation = permutations.next();
|
||||||
rowsPermutation = (Matrix.Column.Row[]) permutations.next();
|
List<Matrix.Column.Row> currentPermutationRows = new ArrayList<Matrix.Column.Row>();
|
||||||
List currentPermutationRows = new ArrayList();
|
List<Token> currentPermuationTokens = new ArrayList<Token>();
|
||||||
List currentPermuationTokens = new ArrayList();
|
for (Matrix.Column.Row row : rowsPermutation) {
|
||||||
for (int i = 0; i < rowsPermutation.length; i++) {
|
for (Token token : row.getTokens()) {
|
||||||
Matrix.Column.Row row = rowsPermutation[i];
|
currentPermuationTokens.add(token);
|
||||||
for (Iterator iterator = row.getTokens().iterator(); iterator.hasNext();) {
|
|
||||||
currentPermuationTokens.add(iterator.next());
|
|
||||||
currentPermutationRows.add(row);
|
currentPermutationRows.add(row);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -627,8 +621,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
double factor = 1d / Math.sqrt(total);
|
double factor = 1d / Math.sqrt(total);
|
||||||
|
|
||||||
double weight = 0d;
|
double weight = 0d;
|
||||||
for (int i = 0; i < weights.length; i++) {
|
for (double partWeight : weights) {
|
||||||
double partWeight = weights[i];
|
|
||||||
weight += partWeight * factor;
|
weight += partWeight * factor;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -709,7 +702,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
|
|
||||||
private boolean columnsHasBeenCreated = false;
|
private boolean columnsHasBeenCreated = false;
|
||||||
|
|
||||||
private List columns = new ArrayList();
|
private List<Column> columns = new ArrayList<Column>();
|
||||||
|
|
||||||
public List getColumns() {
|
public List getColumns() {
|
||||||
return columns;
|
return columns;
|
||||||
|
@ -740,9 +733,9 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
Matrix.this.columns.add(this);
|
Matrix.this.columns.add(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
private List rows = new ArrayList();
|
private List<Row> rows = new ArrayList<Row>();
|
||||||
|
|
||||||
public List getRows() {
|
public List<Row> getRows() {
|
||||||
return rows;
|
return rows;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -781,7 +774,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
return Column.this;
|
return Column.this;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List tokens = new LinkedList();
|
private List<Token> tokens = new LinkedList<Token>();
|
||||||
|
|
||||||
public Row() {
|
public Row() {
|
||||||
Column.this.rows.add(this);
|
Column.this.rows.add(this);
|
||||||
|
@ -791,11 +784,11 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
return Column.this.rows.indexOf(this);
|
return Column.this.rows.indexOf(this);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List getTokens() {
|
public List<Token> getTokens() {
|
||||||
return tokens;
|
return tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setTokens(List tokens) {
|
public void setTokens(List<Token> tokens) {
|
||||||
this.tokens = tokens;
|
this.tokens = tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -826,9 +819,9 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Iterator permutationIterator() {
|
public Iterator<Column.Row[]> permutationIterator() {
|
||||||
|
|
||||||
return new Iterator() {
|
return new Iterator<Column.Row[]>() {
|
||||||
|
|
||||||
private int[] columnRowCounters = new int[columns.size()];
|
private int[] columnRowCounters = new int[columns.size()];
|
||||||
|
|
||||||
|
@ -838,10 +831,10 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
|
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
int s = columnRowCounters.length;
|
int s = columnRowCounters.length;
|
||||||
return s != 0 && columnRowCounters[s - 1] < ((Column) columns.get(s - 1)).getRows().size();
|
return s != 0 && columnRowCounters[s - 1] < (columns.get(s - 1)).getRows().size();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object next() {
|
public Column.Row[] next() {
|
||||||
if (!hasNext()) {
|
if (!hasNext()) {
|
||||||
throw new NoSuchElementException("no more elements");
|
throw new NoSuchElementException("no more elements");
|
||||||
}
|
}
|
||||||
|
@ -849,7 +842,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
Column.Row[] rows = new Column.Row[columnRowCounters.length];
|
Column.Row[] rows = new Column.Row[columnRowCounters.length];
|
||||||
|
|
||||||
for (int i = 0; i < columnRowCounters.length; i++) {
|
for (int i = 0; i < columnRowCounters.length; i++) {
|
||||||
rows[i] = (Matrix.Column.Row) ((Column) columns.get(i)).rows.get(columnRowCounters[i]);
|
rows[i] = columns.get(i).rows.get(columnRowCounters[i]);
|
||||||
}
|
}
|
||||||
incrementColumnRowCounters();
|
incrementColumnRowCounters();
|
||||||
|
|
||||||
|
@ -859,7 +852,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
||||||
private void incrementColumnRowCounters() {
|
private void incrementColumnRowCounters() {
|
||||||
for (int i = 0; i < columnRowCounters.length; i++) {
|
for (int i = 0; i < columnRowCounters.length; i++) {
|
||||||
columnRowCounters[i]++;
|
columnRowCounters[i]++;
|
||||||
if (columnRowCounters[i] == ((Column) columns.get(i)).rows.size() &&
|
if (columnRowCounters[i] == columns.get(i).rows.size() &&
|
||||||
i < columnRowCounters.length - 1) {
|
i < columnRowCounters.length - 1) {
|
||||||
columnRowCounters[i] = 0;
|
columnRowCounters[i] = 0;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.shingle;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.StringReader;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
|
@ -28,6 +29,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.CachingTokenFilter;
|
import org.apache.lucene.analysis.CachingTokenFilter;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
|
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
|
||||||
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
|
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
|
||||||
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
|
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
|
||||||
|
@ -44,7 +46,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
||||||
"testBehavingAsShingleFilter", "testMatrix"
|
"testBehavingAsShingleFilter", "testMatrix"
|
||||||
})));
|
})));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBehavingAsShingleFilter() throws IOException {
|
public void testBehavingAsShingleFilter() throws IOException {
|
||||||
|
|
||||||
ShingleMatrixFilter.defaultSettingsCodec = null;
|
ShingleMatrixFilter.defaultSettingsCodec = null;
|
||||||
|
|
Loading…
Reference in New Issue