From 4f878bdc93efa9db20a24f1bcde257f8ca546764 Mon Sep 17 00:00:00 2001 From: Karl-Johan Wettin Date: Sat, 3 Oct 2009 13:17:11 +0000 Subject: [PATCH] LUCENE-1257: Generified ShingleMatrixFilter git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@821311 13f79535-47bb-0310-9956-ffa450edef68 --- .../analysis/shingle/ShingleMatrixFilter.java | 95 +++++++++---------- .../shingle/TestShingleMatrixFilter.java | 4 +- 2 files changed, 47 insertions(+), 52 deletions(-) diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java index 8f7f9974ebc..3cd126f677c 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java @@ -112,7 +112,7 @@ import org.apache.lucene.index.Payload; * See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}. *

* NOTE: This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than - * the ones located in org.apache.lucene.analysis.tokenattributes. + * the ones located in org.apache.lucene.analysis.tokenattributes. */ public class ShingleMatrixFilter extends TokenStream { @@ -206,7 +206,7 @@ public class ShingleMatrixFilter extends TokenStream { private TypeAttribute in_typeAtt; private FlagsAttribute in_flagsAtt; - + /** * Creates a shingle filter based on a user defined matrix. * @@ -237,7 +237,7 @@ public class ShingleMatrixFilter extends TokenStream { // set the input to be an empty token stream, we already have the data. this.input = new EmptyTokenStream(); - + in_termAtt = input.addAttribute(TermAttribute.class); in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class); in_payloadAtt = input.addAttribute(PayloadAttribute.class); @@ -316,7 +316,7 @@ public class ShingleMatrixFilter extends TokenStream { offsetAtt = addAttribute(OffsetAttribute.class); typeAtt = addAttribute(TypeAttribute.class); flagsAtt = addAttribute(FlagsAttribute.class); - + in_termAtt = input.addAttribute(TermAttribute.class); in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class); in_payloadAtt = input.addAttribute(PayloadAttribute.class); @@ -328,12 +328,12 @@ public class ShingleMatrixFilter extends TokenStream { // internal filter instance variables /** iterator over the current matrix row permutations */ - private Iterator permutations; + private Iterator permutations; /** the current permutation of tokens used to produce shingles */ - private List currentPermuationTokens; + private List currentPermuationTokens; /** index to what row a token in currentShingleTokens represents*/ - private List currentPermutationRows; + private List currentPermutationRows; private int currentPermutationTokensStartOffset; private int currentShingleLength; @@ -342,7 +342,7 @@ public class ShingleMatrixFilter extends TokenStream { * a set containing shingles that has been the result of a call to next(Token), * used to avoid producing the same shingle more than once. */ - private Set shinglesSeen = new HashSet(); + private Set> shinglesSeen = new HashSet>(); public void reset() throws IOException { @@ -352,9 +352,9 @@ public class ShingleMatrixFilter extends TokenStream { } private Matrix matrix; - + private Token reusableToken = new Token(); - + public final boolean incrementToken() throws IOException { if (matrix == null) { matrix = new Matrix(); @@ -372,7 +372,7 @@ public class ShingleMatrixFilter extends TokenStream { token = produceNextToken(reusableToken); } while (token == request_next_token); if (token == null) return false; - + termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength()); posIncrAtt.setPositionIncrement(token.getPositionIncrement()); flagsAtt.setFlags(token.getFlags()); @@ -381,7 +381,7 @@ public class ShingleMatrixFilter extends TokenStream { payloadAtt.setPayload(token.getPayload()); return true; } - + private Token getNextInputToken(Token token) throws IOException { if (!input.incrementToken()) return null; token.setTermBuffer(in_termAtt.termBuffer(), 0, in_termAtt.termLength()); @@ -404,7 +404,7 @@ public class ShingleMatrixFilter extends TokenStream { public final Token next() throws java.io.IOException { return super.next(); } - + private static final Token request_next_token = new Token(); /** @@ -428,16 +428,16 @@ public class ShingleMatrixFilter extends TokenStream { if (ignoringSinglePrefixOrSuffixShingle && currentShingleLength == 1 - && (((Matrix.Column.Row) currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || ((Matrix.Column.Row) currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) { + && ((currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || (currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) { return next(reusableToken); } int termLength = 0; - List shingle = new ArrayList(); + List shingle = new ArrayList(currentShingleLength); for (int i = 0; i < currentShingleLength; i++) { - Token shingleToken = (Token) currentPermuationTokens.get(i + currentPermutationTokensStartOffset); + Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset); termLength += shingleToken.termLength(); shingle.add(shingleToken); } @@ -452,8 +452,7 @@ public class ShingleMatrixFilter extends TokenStream { // shingle token factory StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future. - for (Iterator iterator = shingle.iterator(); iterator.hasNext();) { - Token shingleToken = (Token) iterator.next(); + for (Token shingleToken : shingle) { if (spacerCharacter != null && sb.length() > 0) { sb.append(spacerCharacter); } @@ -493,22 +492,19 @@ public class ShingleMatrixFilter extends TokenStream { // get rid of resources // delete the first column in the matrix - Matrix.Column deletedColumn = (Matrix.Column) matrix.columns.remove(0); + Matrix.Column deletedColumn = matrix.columns.remove(0); // remove all shingles seen that include any of the tokens from the deleted column. - List deletedColumnTokens = new ArrayList(); - for (Iterator iterator = deletedColumn.getRows().iterator(); iterator.hasNext();) { - Matrix.Column.Row row = (Matrix.Column.Row) iterator.next(); - for (Iterator rowIter = row.getTokens().iterator(); rowIter.hasNext();) { - Object o = rowIter.next();//Token - deletedColumnTokens.add(o); + List deletedColumnTokens = new ArrayList(); + for (Matrix.Column.Row row : deletedColumn.getRows()) { + for (Token token : row.getTokens()) { + deletedColumnTokens.add(token); } } - for (Iterator shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) { - List shingle = (List) shinglesSeenIterator.next(); - for (Iterator deletedIter = deletedColumnTokens.iterator(); deletedIter.hasNext();) { - Token deletedColumnToken = (Token) deletedIter.next(); + for (Iterator> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) { + List shingle = shinglesSeenIterator.next(); + for (Token deletedColumnToken : deletedColumnTokens) { if (shingle.contains(deletedColumnToken)) { shinglesSeenIterator.remove(); break; @@ -552,14 +548,12 @@ public class ShingleMatrixFilter extends TokenStream { * finally resets the current (next) shingle size and offset. */ private void nextTokensPermutation() { - Matrix.Column.Row[] rowsPermutation; - rowsPermutation = (Matrix.Column.Row[]) permutations.next(); - List currentPermutationRows = new ArrayList(); - List currentPermuationTokens = new ArrayList(); - for (int i = 0; i < rowsPermutation.length; i++) { - Matrix.Column.Row row = rowsPermutation[i]; - for (Iterator iterator = row.getTokens().iterator(); iterator.hasNext();) { - currentPermuationTokens.add(iterator.next()); + Matrix.Column.Row[] rowsPermutation = permutations.next(); + List currentPermutationRows = new ArrayList(); + List currentPermuationTokens = new ArrayList(); + for (Matrix.Column.Row row : rowsPermutation) { + for (Token token : row.getTokens()) { + currentPermuationTokens.add(token); currentPermutationRows.add(row); } } @@ -627,8 +621,7 @@ public class ShingleMatrixFilter extends TokenStream { double factor = 1d / Math.sqrt(total); double weight = 0d; - for (int i = 0; i < weights.length; i++) { - double partWeight = weights[i]; + for (double partWeight : weights) { weight += partWeight * factor; } @@ -709,7 +702,7 @@ public class ShingleMatrixFilter extends TokenStream { private boolean columnsHasBeenCreated = false; - private List columns = new ArrayList(); + private List columns = new ArrayList(); public List getColumns() { return columns; @@ -740,9 +733,9 @@ public class ShingleMatrixFilter extends TokenStream { Matrix.this.columns.add(this); } - private List rows = new ArrayList(); + private List rows = new ArrayList(); - public List getRows() { + public List getRows() { return rows; } @@ -781,7 +774,7 @@ public class ShingleMatrixFilter extends TokenStream { return Column.this; } - private List tokens = new LinkedList(); + private List tokens = new LinkedList(); public Row() { Column.this.rows.add(this); @@ -791,11 +784,11 @@ public class ShingleMatrixFilter extends TokenStream { return Column.this.rows.indexOf(this); } - public List getTokens() { + public List getTokens() { return tokens; } - public void setTokens(List tokens) { + public void setTokens(List tokens) { this.tokens = tokens; } @@ -826,9 +819,9 @@ public class ShingleMatrixFilter extends TokenStream { } - public Iterator permutationIterator() { + public Iterator permutationIterator() { - return new Iterator() { + return new Iterator() { private int[] columnRowCounters = new int[columns.size()]; @@ -838,10 +831,10 @@ public class ShingleMatrixFilter extends TokenStream { public boolean hasNext() { int s = columnRowCounters.length; - return s != 0 && columnRowCounters[s - 1] < ((Column) columns.get(s - 1)).getRows().size(); + return s != 0 && columnRowCounters[s - 1] < (columns.get(s - 1)).getRows().size(); } - public Object next() { + public Column.Row[] next() { if (!hasNext()) { throw new NoSuchElementException("no more elements"); } @@ -849,7 +842,7 @@ public class ShingleMatrixFilter extends TokenStream { Column.Row[] rows = new Column.Row[columnRowCounters.length]; for (int i = 0; i < columnRowCounters.length; i++) { - rows[i] = (Matrix.Column.Row) ((Column) columns.get(i)).rows.get(columnRowCounters[i]); + rows[i] = columns.get(i).rows.get(columnRowCounters[i]); } incrementColumnRowCounters(); @@ -859,7 +852,7 @@ public class ShingleMatrixFilter extends TokenStream { private void incrementColumnRowCounters() { for (int i = 0; i < columnRowCounters.length; i++) { columnRowCounters[i]++; - if (columnRowCounters[i] == ((Column) columns.get(i)).rows.size() && + if (columnRowCounters[i] == columns.get(i).rows.size() && i < columnRowCounters.length - 1) { columnRowCounters[i] = 0; } else { diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java index bcab4b8d1b2..bba73f77b00 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.shingle; */ import java.io.IOException; +import java.io.StringReader; import java.util.Collection; import java.util.Iterator; import java.util.LinkedList; @@ -28,6 +29,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream; import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter; import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream; @@ -44,7 +46,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase { "testBehavingAsShingleFilter", "testMatrix" }))); } - + public void testBehavingAsShingleFilter() throws IOException { ShingleMatrixFilter.defaultSettingsCodec = null;