diff --git a/contrib/analyzers/build.xml b/contrib/analyzers/build.xml index caf33d9e35d..b27cfc8b155 100644 --- a/contrib/analyzers/build.xml +++ b/contrib/analyzers/build.xml @@ -23,8 +23,8 @@ Additional Analyzers - - - + + + diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java index fc33e175819..c4ea8d3bdf4 100644 --- a/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java @@ -108,7 +108,7 @@ import org.apache.lucene.index.Payload; */ public class ShingleMatrixFilter extends TokenStream { - public static Character defaultSpacerCharacter = '_'; + public static Character defaultSpacerCharacter = new Character('_'); public static TokenSettingsCodec defaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec(); public static boolean ignoringSinglePrefixOrSuffixShingleByDefault = false; @@ -155,10 +155,10 @@ public class ShingleMatrixFilter extends TokenStream { * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#getTokenPositioner(org.apache.lucene.analysis.Token) * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#setTokenPositioner(org.apache.lucene.analysis.Token,org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenPositioner) */ - public static enum TokenPositioner { - newColumn(0), - newRow(1), - sameRow(2); + public static class TokenPositioner { + public static final TokenPositioner newColumn = new TokenPositioner(0); + public static final TokenPositioner newRow = new TokenPositioner(1); + public static final TokenPositioner sameRow = new TokenPositioner(2); private final int index; @@ -180,7 +180,7 @@ public class ShingleMatrixFilter extends TokenStream { private boolean ignoringSinglePrefixOrSuffixShingle = false; - private Character spacerCharacter = '_'; + private Character spacerCharacter = defaultSpacerCharacter; private TokenStream input; @@ -279,12 +279,12 @@ public class ShingleMatrixFilter extends TokenStream { // internal filter instance variables /** iterator over the current matrix row permutations */ - private Iterator permutations; + private Iterator permutations; /** the current permutation of tokens used to produce shingles */ - private List currentPermuationTokens; + private List currentPermuationTokens; /** index to what row a token in currentShingleTokens represents*/ - private List currentPermutationRows; + private List currentPermutationRows; private int currentPermutationTokensStartOffset; private int currentShingleLength; @@ -293,7 +293,7 @@ public class ShingleMatrixFilter extends TokenStream { * a set containing shingles that has been the result of a call to next(Token), * used to avoid producing the same shingle more than once. */ - private Set> shinglesSeen = new HashSet>(); + private Set shinglesSeen = new HashSet(); public void reset() throws IOException { @@ -324,16 +324,16 @@ public class ShingleMatrixFilter extends TokenStream { if (ignoringSinglePrefixOrSuffixShingle && currentShingleLength == 1 - && (currentPermutationRows.get(currentPermutationTokensStartOffset).getColumn().isFirst() || currentPermutationRows.get(currentPermutationTokensStartOffset).getColumn().isLast())) { + && (((Matrix.Column.Row) currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst() || ((Matrix.Column.Row) currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) { return next(reusableToken); } int termLength = 0; - List shingle = new ArrayList(); + List shingle = new ArrayList(); for (int i = 0; i < currentShingleLength; i++) { - Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset); + Token shingleToken = (Token) currentPermuationTokens.get(i + currentPermutationTokensStartOffset); termLength += shingleToken.termLength(); shingle.add(shingleToken); } @@ -347,8 +347,9 @@ public class ShingleMatrixFilter extends TokenStream { } // shingle token factory - StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future. - for (Token shingleToken : shingle) { + StringBuffer sb = new StringBuffer(termLength + 10); // paranormal ability to foresee the future. + for (Iterator iterator = shingle.iterator(); iterator.hasNext();) { + Token shingleToken = (Token) iterator.next(); if (spacerCharacter != null && sb.length() > 0) { sb.append(spacerCharacter); } @@ -388,18 +389,22 @@ public class ShingleMatrixFilter extends TokenStream { // get rith of resources // delete the first column in the matrix - Matrix.Column deletedColumn = matrix.columns.remove(0); + Matrix.Column deletedColumn = (Matrix.Column) matrix.columns.remove(0); // remove all shingles seen that include any of the tokens from the deleted column. - List deletedColumnTokens = new ArrayList(); - for (Matrix.Column.Row row : deletedColumn.getRows()) { - for (Token shingleToken : row.getTokens()) { - deletedColumnTokens.add(shingleToken); + List deletedColumnTokens = new ArrayList(); + for (Iterator iterator = deletedColumn.getRows().iterator(); iterator.hasNext();) { + Matrix.Column.Row row = (Matrix.Column.Row) iterator.next(); + for (Iterator rowIter = row.getTokens().iterator(); rowIter.hasNext();) { + Object o = rowIter.next();//Token + deletedColumnTokens.add(o); } + } - for (Iterator> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) { - List shingle = shinglesSeenIterator.next(); - for (Token deletedColumnToken : deletedColumnTokens) { + for (Iterator shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) { + List shingle = (List) shinglesSeenIterator.next(); + for (Iterator deletedIter = deletedColumnTokens.iterator(); deletedIter.hasNext();) { + Token deletedColumnToken = (Token) deletedIter.next(); if (shingle.contains(deletedColumnToken)) { shinglesSeenIterator.remove(); break; @@ -443,12 +448,14 @@ public class ShingleMatrixFilter extends TokenStream { * finally resets the current (next) shingle size and offset. */ private void nextTokensPermutation() { - Matrix.Column.Row[] rowsPermutation = permutations.next(); - List currentPermutationRows = new ArrayList(); - List currentPermuationTokens = new ArrayList(); - for (Matrix.Column.Row row : rowsPermutation) { - for (Token shingleToken : row.getTokens()) { - currentPermuationTokens.add(shingleToken); + Matrix.Column.Row[] rowsPermutation; + rowsPermutation = (Matrix.Column.Row[]) permutations.next(); + List currentPermutationRows = new ArrayList(); + List currentPermuationTokens = new ArrayList(); + for (int i = 0; i < rowsPermutation.length; i++) { + Matrix.Column.Row row = rowsPermutation[i]; + for (Iterator iterator = row.getTokens().iterator(); iterator.hasNext();) { + currentPermuationTokens.add(iterator.next()); currentPermutationRows.add(row); } } @@ -471,12 +478,12 @@ public class ShingleMatrixFilter extends TokenStream { * @param currentPermutationRows index to Matrix.Column.Row from the position of tokens in parameter currentPermutationTokens * @param currentPermuationTokens tokens of the current permutation of rows in the matrix. */ - public void updateToken(Token token, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { - token.setType(ShingleMatrixFilter.class.getSimpleName()); + public void updateToken(Token token, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { + token.setType(ShingleMatrixFilter.class.getName()); token.setFlags(0); token.setPositionIncrement(1); - token.setStartOffset(shingle.get(0).startOffset()); - token.setEndOffset(shingle.get(shingle.size() - 1).endOffset()); + token.setStartOffset(((Token) shingle.get(0)).startOffset()); + token.setEndOffset(((Token) shingle.get(shingle.size() - 1)).endOffset()); settingsCodec.setWeight(token, calculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens)); } @@ -496,7 +503,7 @@ public class ShingleMatrixFilter extends TokenStream { * @param currentPermuationTokens all tokens in the current row permutation of the matrix. A sub list (parameter offset, parameter shingle.size) equals parameter shingle. * @return weight to be set for parameter shingleToken */ - public float calculateShingleWeight(Token shingleToken, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { + public float calculateShingleWeight(Token shingleToken, List shingle, int currentPermutationStartOffset, List currentPermutationRows, List currentPermuationTokens) { double[] weights = new double[shingle.size()]; double total = 0f; @@ -504,7 +511,7 @@ public class ShingleMatrixFilter extends TokenStream { for (int i=0; i top) { @@ -516,7 +523,8 @@ public class ShingleMatrixFilter extends TokenStream { double factor = 1d / Math.sqrt(total); double weight = 0d; - for (double partWeight : weights) { + for (int i = 0; i < weights.length; i++) { + double partWeight = weights[i]; weight += partWeight * factor; } @@ -597,9 +605,9 @@ public class ShingleMatrixFilter extends TokenStream { private boolean columnsHasBeenCreated = false; - private List columns = new ArrayList(); + private List columns = new ArrayList(); - public List getColumns() { + public List getColumns() { return columns; } @@ -628,9 +636,9 @@ public class ShingleMatrixFilter extends TokenStream { Matrix.this.columns.add(this); } - private List rows = new ArrayList(); + private List rows = new ArrayList(); - public List getRows() { + public List getRows() { return rows; } @@ -669,7 +677,7 @@ public class ShingleMatrixFilter extends TokenStream { return Column.this; } - private List tokens = new LinkedList(); + private List tokens = new LinkedList(); public Row() { Column.this.rows.add(this); @@ -679,11 +687,11 @@ public class ShingleMatrixFilter extends TokenStream { return Column.this.rows.indexOf(this); } - public List getTokens() { + public List getTokens() { return tokens; } - public void setTokens(List tokens) { + public void setTokens(List tokens) { this.tokens = tokens; } @@ -706,7 +714,7 @@ public class ShingleMatrixFilter extends TokenStream { public String toString() { return "Row{" + "index=" + getIndex() + - ", tokens=" + (tokens == null ? null : Arrays.asList(tokens)) + + ", tokens=" + (tokens == null ? null : tokens) + '}'; } } @@ -714,9 +722,9 @@ public class ShingleMatrixFilter extends TokenStream { } - public Iterator permutationIterator() { + public Iterator permutationIterator() { - return new Iterator() { + return new Iterator() { private int[] columnRowCounters = new int[columns.size()]; @@ -726,10 +734,10 @@ public class ShingleMatrixFilter extends TokenStream { public boolean hasNext() { int s = columnRowCounters.length; - return s != 0 && columnRowCounters[s - 1] < columns.get(s - 1).getRows().size(); + return s != 0 && columnRowCounters[s - 1] < ((Column) columns.get(s - 1)).getRows().size(); } - public Column.Row[] next() { + public Object next() { if (!hasNext()) { throw new NoSuchElementException("no more elements"); } @@ -737,7 +745,7 @@ public class ShingleMatrixFilter extends TokenStream { Column.Row[] rows = new Column.Row[columnRowCounters.length]; for (int i = 0; i < columnRowCounters.length; i++) { - rows[i] = columns.get(i).rows.get(columnRowCounters[i]); + rows[i] = (Matrix.Column.Row) ((Column) columns.get(i)).rows.get(columnRowCounters[i]); } incrementColumnRowCounters(); @@ -747,7 +755,7 @@ public class ShingleMatrixFilter extends TokenStream { private void incrementColumnRowCounters() { for (int i = 0; i < columnRowCounters.length; i++) { columnRowCounters[i]++; - if (columnRowCounters[i] == columns.get(i).rows.size() && + if (columnRowCounters[i] == ((Column) columns.get(i)).rows.size() && i < columnRowCounters.length - 1) { columnRowCounters[i] = 0; } else { diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java index ee7547fc500..b5d7549fd70 100644 --- a/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java @@ -42,15 +42,15 @@ public class TestShingleMatrixFilter extends TestCase { TokenStream ts; - ts = new ShingleMatrixFilter(new EmptyTokenStream(), 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec()); + ts = new ShingleMatrixFilter(new EmptyTokenStream(), 1, 2, new Character(' '), false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec()); assertNull(ts.next(new Token())); TokenListStream tls; - LinkedList tokens; + LinkedList tokens; // test a plain old token stream with synonyms translated to rows. - tokens = new LinkedList(); + tokens = new LinkedList(); tokens.add(createToken("please", 0, 6)); tokens.add(createToken("divide", 7, 13)); tokens.add(createToken("this", 14, 18)); @@ -62,7 +62,7 @@ public class TestShingleMatrixFilter extends TestCase { // bi-grams - ts = new ShingleMatrixFilter(tls, 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec()); + ts = new ShingleMatrixFilter(tls, 1, 2, new Character(' '), false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec()); Token reusableToken = new Token(); @@ -93,11 +93,11 @@ public class TestShingleMatrixFilter extends TestCase { TokenStream ts; TokenListStream tls; - LinkedList tokens; + LinkedList tokens; // test a plain old token stream with synonyms tranlated to rows. - tokens = new LinkedList(); + tokens = new LinkedList(); tokens.add(tokenFactory("hello", 1, 0, 4)); tokens.add(tokenFactory("greetings", 0, 0, 4)); tokens.add(tokenFactory("world", 1, 5, 10)); @@ -108,7 +108,7 @@ public class TestShingleMatrixFilter extends TestCase { // bi-grams - ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec()); + ts = new ShingleMatrixFilter(tls, 2, 2, new Character('_'), false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec()); final Token reusableToken = new Token(); assertNext(ts, reusableToken, "hello_world"); @@ -138,7 +138,7 @@ public class TestShingleMatrixFilter extends TestCase { ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec(); - tokens = new LinkedList(); + tokens = new LinkedList(); tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn)); tokens.add(tokenFactory("greetings", 0, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow)); tokens.add(tokenFactory("world", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newColumn)); @@ -152,7 +152,7 @@ public class TestShingleMatrixFilter extends TestCase { // bi-grams, position incrememnt, weight, start offset, end offset - ts = new ShingleMatrixFilter(tls, 2, 2, '_', false); + ts = new ShingleMatrixFilter(tls, 2, 2, new Character('_'), false); // // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); @@ -174,7 +174,7 @@ public class TestShingleMatrixFilter extends TestCase { // test unlimited size and allow single boundary token as shingle tls.reset(); - ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', false); + ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, new Character('_'), false); // // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { @@ -224,7 +224,7 @@ public class TestShingleMatrixFilter extends TestCase { // test unlimited size but don't allow single boundary token as shingle tls.reset(); - ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', true); + ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, new Character('_'), true); // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); // token.clear(); @@ -279,7 +279,7 @@ public class TestShingleMatrixFilter extends TestCase { // - tokens = new LinkedList(); + tokens = new LinkedList(); tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn)); tokens.add(tokenFactory("greetings", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow)); tokens.add(tokenFactory("and", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.sameRow)); @@ -292,7 +292,7 @@ public class TestShingleMatrixFilter extends TestCase { // 2-3 grams - ts = new ShingleMatrixFilter(tls, 2, 3, '_', false); + ts = new ShingleMatrixFilter(tls, 2, 3, new Character('_'), false); // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); @@ -353,7 +353,7 @@ public class TestShingleMatrixFilter extends TestCase { matrix.new Column(tokenFactory("the", 1)); matrix.new Column(tokenFactory("croud", 1)); - TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec()); + TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, new Character('_'), true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec()); // for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) { // System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");"); @@ -457,7 +457,7 @@ public class TestShingleMatrixFilter extends TestCase { assertNotNull(nextToken); assertEquals(text, nextToken.term()); assertEquals(positionIncrement, nextToken.getPositionIncrement()); - assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData())); + assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()), 0); return nextToken; } @@ -466,7 +466,7 @@ public class TestShingleMatrixFilter extends TestCase { assertNotNull(nextToken); assertEquals(text, nextToken.term()); assertEquals(positionIncrement, nextToken.getPositionIncrement()); - assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData())); + assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()), 0); assertEquals(startOffset, nextToken.startOffset()); assertEquals(endOffset, nextToken.endOffset()); return nextToken; @@ -491,21 +491,21 @@ public class TestShingleMatrixFilter extends TestCase { public static class TokenListStream extends TokenStream { - private Collection tokens; + private Collection tokens; public TokenListStream(TokenStream ts) throws IOException { - tokens = new ArrayList(); + tokens = new ArrayList(); final Token reusableToken = new Token(); for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) { tokens.add((Token) nextToken.clone()); } } - public TokenListStream(Collection tokens) { + public TokenListStream(Collection tokens) { this.tokens = tokens; } - private Iterator iterator; + private Iterator iterator; public Token next(final Token reusableToken) throws IOException { assert reusableToken != null;