mirror of https://github.com/apache/lucene.git
LUCENE-400: Added ShingleFilter (token based ngram)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@642612 13f79535-47bb-0310-9956-ffa450edef68
parent 09b3a56ae2
commit cc955c9748
CHANGES.txt
@@ -139,6 +139,9 @@ New features

      Index store similar to MemoryIndex but allows for multiple documents
      in memory.  (Karl Wettin via Grant Ingersoll)

 12. LUCENE-400: Added word based n-gram filter (in contrib/analyzers) called ShingleFilter and an Analyzer wrapper
     that wraps another Analyzer's token stream with a ShingleFilter (Sebastian Kirsch, Steve Rowe via Grant Ingersoll)

Optimizations
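Not part of the commit: a minimal sketch of what the new wrapper emits, using the same 2.3-era TokenStream API as the tests below. The class name ShingleExample and the field name "content" are illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;

public class ShingleExample {
  public static void main(String[] args) throws Exception {
    // Wrap a plain analyzer so its token stream is shingled into bigrams.
    ShingleAnalyzerWrapper analyzer =
        new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    TokenStream ts = analyzer.tokenStream("content",
        new StringReader("please divide this sentence into shingles"));
    Token token;
    while ((token = ts.next()) != null) {
      // With outputUnigrams enabled (the default) this prints the unigrams
      // interleaved with the bigrams: please, "please divide", divide, ...
      System.out.println(new String(token.termBuffer(), 0, token.termLength()));
    }
  }
}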
ShingleAnalyzerWrapper.java (new file)
@@ -0,0 +1,96 @@
package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * A ShingleAnalyzerWrapper wraps a ShingleFilter around another analyzer.
 * A shingle is another term for a token based n-gram.
 */
public class ShingleAnalyzerWrapper extends Analyzer {

  protected Analyzer defaultAnalyzer;
  protected int maxShingleSize = 2;
  protected boolean outputUnigrams = true;

  public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
    super();
    this.defaultAnalyzer = defaultAnalyzer;
  }

  public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
    this(defaultAnalyzer);
    this.maxShingleSize = maxShingleSize;
  }

  public ShingleAnalyzerWrapper() {
    super();
    this.defaultAnalyzer = new StandardAnalyzer();
  }

  public ShingleAnalyzerWrapper(int nGramSize) {
    this();
    this.maxShingleSize = nGramSize;
  }

  /**
   * The max shingle (ngram) size
   *
   * @return The max shingle (ngram) size
   */
  public int getMaxShingleSize() {
    return maxShingleSize;
  }

  /**
   * Set the maximum size of output shingles (default: 2)
   *
   * @param maxShingleSize max shingle size
   */
  public void setMaxShingleSize(int maxShingleSize) {
    this.maxShingleSize = maxShingleSize;
  }

  public boolean isOutputUnigrams() {
    return outputUnigrams;
  }

  /**
   * Shall the filter pass the original tokens (the "unigrams") to the output
   * stream? (default: true)
   *
   * @param outputUnigrams Whether or not the filter shall pass the original
   *        tokens to the output stream
   */
  public void setOutputUnigrams(boolean outputUnigrams) {
    this.outputUnigrams = outputUnigrams;
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    ShingleFilter filter
        = new ShingleFilter(defaultAnalyzer.tokenStream(fieldName, reader));
    filter.setMaxShingleSize(maxShingleSize);
    filter.setOutputUnigrams(outputUnigrams);
    return filter;
  }
}
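Also not part of the commit: because the wrapper is a plain Analyzer, it can be passed straight to IndexWriter. A sketch mirroring the setup in ShingleAnalyzerWrapperTest below (RAMDirectory and the field name "content" come from that test):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class ShingleIndexingExample {
  public static void main(String[] args) throws Exception {
    // The shingled analyzer is used at index time like any other analyzer.
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    IndexWriter writer = new IndexWriter(new RAMDirectory(), analyzer, true);

    Document doc = new Document();
    doc.add(new Field("content", "please divide this sentence into shingles",
                      Field.Store.YES, Field.Index.TOKENIZED));
    writer.addDocument(doc);  // indexed as unigrams plus bigram shingles
    writer.close();
  }
}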
ShingleFilter.java (new file)
@@ -0,0 +1,274 @@
package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.LinkedList;
import java.util.Iterator;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;

/**
 * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream,
 * that is, combinations of tokens that are indexed as one token.
 *
 * <p>For example, the sentence "please divide this sentence into shingles"
 * would be tokenized into the shingles "please divide", "divide this",
 * "this sentence", "sentence into", and "into shingles".
 *
 * <p>This filter handles position increments > 1 by inserting filler tokens
 * (tokens with termtext "_"). It does not handle a position increment of 0.
 */
public class ShingleFilter extends TokenFilter {

  private LinkedList shingleBuf = new LinkedList();
  private LinkedList outputBuf = new LinkedList();
  private LinkedList tokenBuf = new LinkedList();
  private StringBuffer[] shingles;
  private String tokenType = "shingle";

  /**
   * filler token for when positionIncrement is more than 1
   */
  public static final String FILLER_TOKEN = "_";

  /**
   * default maximum shingle size is 2.
   */
  public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;

  /**
   * The string to use when joining adjacent tokens to form a shingle
   */
  public static final String TOKEN_SEPARATOR = " ";

  /**
   * By default, we output unigrams (individual tokens) as well as shingles
   * (token n-grams).
   */
  private boolean outputUnigrams = true;

  /**
   * maximum shingle size (number of tokens)
   */
  private int maxShingleSize;

  /**
   * Construct a ShingleFilter with the specified shingle size from the
   * TokenStream <code>input</code>.
   *
   * @param input input stream
   * @param maxShingleSize maximum shingle size produced by the filter.
   */
  public ShingleFilter(TokenStream input, int maxShingleSize) {
    super(input);
    setMaxShingleSize(maxShingleSize);
  }

  /**
   * Construct a ShingleFilter with default shingle size.
   *
   * @param input input stream
   */
  public ShingleFilter(TokenStream input) {
    this(input, DEFAULT_MAX_SHINGLE_SIZE);
  }

  /**
   * Construct a ShingleFilter with the specified token type for shingle tokens.
   *
   * @param input input stream
   * @param tokenType token type for shingle tokens
   */
  public ShingleFilter(TokenStream input, String tokenType) {
    this(input, DEFAULT_MAX_SHINGLE_SIZE);
    setTokenType(tokenType);
  }

  /**
   * Set the type of the shingle tokens produced by this filter.
   * (default: "shingle")
   *
   * @param tokenType token type of the shingle tokens
   */
  public void setTokenType(String tokenType) {
    this.tokenType = tokenType;
  }

  /**
   * Shall the output stream contain the input tokens (unigrams) as well as
   * shingles? (default: true.)
   *
   * @param outputUnigrams Whether or not the output stream shall contain
   *        the input tokens (unigrams)
   */
  public void setOutputUnigrams(boolean outputUnigrams) {
    this.outputUnigrams = outputUnigrams;
  }

  /**
   * Set the max shingle size (default: 2)
   *
   * @param maxShingleSize max size of output shingles
   */
  public void setMaxShingleSize(int maxShingleSize) {
    if (maxShingleSize < 2) {
      throw new IllegalArgumentException("Max shingle size must be >= 2");
    }
    shingles = new StringBuffer[maxShingleSize];
    for (int i = 0; i < shingles.length; i++) {
      shingles[i] = new StringBuffer();
    }
    this.maxShingleSize = maxShingleSize;
  }

  /**
   * Clear the StringBuffers that are used for storing the output shingles.
   */
  private void clearShingles() {
    for (int i = 0; i < shingles.length; i++) {
      shingles[i].setLength(0);
    }
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.analysis.TokenStream#next()
   */
  public Token next() throws IOException {
    if (outputBuf.isEmpty()) {
      fillOutputBuf();
    }
    Token nextToken = null;
    if ( ! outputBuf.isEmpty()) {
      nextToken = (Token) outputBuf.remove(0);
    }
    return nextToken;
  }

  /**
   * Get the next token from the input stream and push it on the token buffer.
   * If we encounter a token with position increment > 1, we put filler tokens
   * on the token buffer.
   * <p/>
   * Returns null when the end of the input stream is reached.
   *
   * @return the next token, or null if at end of input stream
   * @throws IOException if the input stream has a problem
   */
  private Token getNextToken() throws IOException {
    if (tokenBuf.isEmpty()) {
      Token lastToken = input.next();
      if (lastToken != null) {
        for (int i = 1; i < lastToken.getPositionIncrement(); i++) {
          tokenBuf.add(new Token(FILLER_TOKEN, lastToken.startOffset(),
                                 lastToken.startOffset()));
        }
        tokenBuf.add(lastToken);
        return getNextToken();
      } else {
        return null;
      }
    } else {
      return (Token) tokenBuf.remove(0);
    }
  }

  /**
   * Fill the output buffer with new shingles.
   *
   * @throws IOException if there's a problem getting the next token
   */
  private void fillOutputBuf() throws IOException {
    boolean addedToken = false;
    /*
     * Try to fill the shingle buffer.
     */
    do {
      Token token = getNextToken();
      if (token != null) {
        shingleBuf.add(token);
        if (shingleBuf.size() > maxShingleSize) {
          shingleBuf.remove(0);
        }
        addedToken = true;
      } else {
        break;
      }
    } while (shingleBuf.size() < maxShingleSize);

    /*
     * If no new token could be added to the shingle buffer, we have reached
     * the end of the input stream and have to discard the least recent token.
     */
    if (! addedToken) {
      if (shingleBuf.isEmpty()) {
        return;
      } else {
        shingleBuf.remove(0);
      }
    }

    clearShingles();

    int[] endOffsets = new int[shingleBuf.size()];
    for (int i = 0; i < endOffsets.length; i++) {
      endOffsets[i] = 0;
    }

    int i = 0;
    Token token = null;
    for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
      token = (Token) it.next();
      for (int j = i; j < shingles.length; j++) {
        if (shingles[j].length() != 0) {
          shingles[j].append(TOKEN_SEPARATOR);
        }
        shingles[j].append(token.termBuffer(), 0, token.termLength());
      }
      endOffsets[i] = token.endOffset();
      i++;
    }

    if ((! shingleBuf.isEmpty()) && outputUnigrams) {
      Token unigram = (Token) shingleBuf.getFirst();
      unigram.setPositionIncrement(1);
      outputBuf.add(unigram);
    }

    /*
     * Push new tokens to the output buffer.
     */
    for (int j = 1; j < shingleBuf.size(); j++) {
      Token shingle = new Token(shingles[j].toString(),
                                ((Token) shingleBuf.get(0)).startOffset(),
                                endOffsets[j],
                                tokenType);
      if ((! outputUnigrams) && j == 1) {
        shingle.setPositionIncrement(1);
      } else {
        shingle.setPositionIncrement(0);
      }
      outputBuf.add(shingle);
    }
  }
}
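A sketch (again, not part of the commit) exercising the filler-token behavior described in the class javadoc. The anonymous TokenStream stands in for a real tokenizer chain, and the stop-word hole before "sentence" is simulated with setPositionIncrement(2); the token data matches testTokenWithHoles in ShingleFilterTest below.

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;

public class FillerTokenDemo {
  public static void main(String[] args) throws IOException {
    // Simulate a hole in the stream: "sentence" arrives with increment 2.
    final Token[] tokens = {
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("sentence", 19, 27),
    };
    tokens[2].setPositionIncrement(2);

    TokenStream input = new TokenStream() {
      private int i = 0;
      public Token next() {
        return i < tokens.length ? tokens[i++] : null;
      }
    };

    TokenStream shingled = new ShingleFilter(input, 2);
    Token t;
    while ((t = shingled.next()) != null) {
      // The hole shows up as FILLER_TOKEN shingles:
      // please, "please divide", divide, "divide _", _, "_ sentence", sentence
      System.out.println(new String(t.termBuffer(), 0, t.termLength()));
    }
  }
}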
ShingleAnalyzerWrapperTest.java (new file)
@@ -0,0 +1,196 @@
package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import junit.framework.TestCase;

/**
 * A test class for ShingleAnalyzerWrapper as regards queries and scoring.
 */
public class ShingleAnalyzerWrapperTest extends TestCase {

  public IndexSearcher searcher;

  public static void main(String[] args) {
    junit.textui.TestRunner.run(ShingleAnalyzerWrapperTest.class);
  }

  /**
   * Set up a new index in RAM with three test phrases and the supplied Analyzer.
   *
   * @param analyzer the analyzer to use
   * @return an indexSearcher on the test index.
   * @throws Exception if an error occurs with index writer or searcher
   */
  public IndexSearcher setUpSearcher(Analyzer analyzer) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyzer, true);

    Document doc;
    doc = new Document();
    doc.add(new Field("content", "please divide this sentence into shingles",
                      Field.Store.YES, Field.Index.TOKENIZED));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new Field("content", "just another test sentence",
                      Field.Store.YES, Field.Index.TOKENIZED));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new Field("content", "a sentence which contains no test",
                      Field.Store.YES, Field.Index.TOKENIZED));
    writer.addDocument(doc);

    writer.close();

    return new IndexSearcher(dir);
  }

  protected Hits queryParsingTest(Analyzer analyzer, String qs) throws Exception {
    searcher = setUpSearcher(analyzer);

    QueryParser qp = new QueryParser("content", analyzer);

    Query q = qp.parse(qs);

    return searcher.search(q);
  }

  protected void compareRanks(Hits hits, int[] ranks) throws Exception {
    assertEquals(ranks.length, hits.length());
    for (int i = 0; i < ranks.length; i++) {
      assertEquals(ranks[i], hits.id(i));
    }
  }

  /*
   * Will not work on an index without unigrams, since QueryParser automatically
   * tokenizes on whitespace.
   */
  public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
    Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
                                     (new WhitespaceAnalyzer(), 2),
                                 "test sentence");
    int[] ranks = new int[] { 1, 2, 0 };
    compareRanks(hits, ranks);
  }

  /*
   * This one fails with an exception.
   */
  public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
    Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
                                     (new WhitespaceAnalyzer(), 2),
                                 "\"this sentence\"");
    int[] ranks = new int[] { 0 };
    compareRanks(hits, ranks);
  }

  /*
   * This one works, actually.
   */
  public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
    Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
                                     (new WhitespaceAnalyzer(), 2),
                                 "\"test sentence\"");
    int[] ranks = new int[] { 1 };
    compareRanks(hits, ranks);
  }

  /*
   * Same as above, but the query is tokenized on whitespace by the query
   * parser itself, without using the analyzer.
   */
  public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
    Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
                                     (new WhitespaceAnalyzer(), 2),
                                 "+test +sentence");
    int[] ranks = new int[] { 1, 2 };
    compareRanks(hits, ranks);
  }

  /*
   * This shows how to construct a phrase query containing shingles.
   */
  public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    searcher = setUpSearcher(analyzer);

    PhraseQuery q = new PhraseQuery();

    TokenStream ts = analyzer.tokenStream("content",
                                          new StringReader("this sentence"));
    Token token;
    int j = -1;
    while ((token = ts.next()) != null) {
      j += token.getPositionIncrement();
      String termText = new String(token.termBuffer(), 0, token.termLength());
      q.add(new Term("content", termText), j);
    }

    Hits hits = searcher.search(q);
    int[] ranks = new int[] { 0 };
    compareRanks(hits, ranks);
  }

  /*
   * How to construct a boolean query with shingles. A query like this
   * implicitly scores documents higher when they contain the words of the
   * query in the right order and adjacent to each other.
   */
  public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    searcher = setUpSearcher(analyzer);

    BooleanQuery q = new BooleanQuery();

    TokenStream ts = analyzer.tokenStream("content",
                                          new StringReader("test sentence"));
    Token token;
    while ((token = ts.next()) != null) {
      String termText = new String(token.termBuffer(), 0, token.termLength());
      q.add(new TermQuery(new Term("content", termText)),
            BooleanClause.Occur.SHOULD);
    }

    Hits hits = searcher.search(q);
    int[] ranks = new int[] { 1, 2, 0 };
    compareRanks(hits, ranks);
  }
}
ShingleFilterTest.java (new file)
@@ -0,0 +1,190 @@
package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class ShingleFilterTest extends TestCase {

  public class TestTokenStream extends TokenStream {

    protected int index = 0;
    protected Token[] testToken;

    public TestTokenStream(Token[] testToken) {
      super();
      this.testToken = testToken;
    }

    public Token next() throws IOException {
      if (index < testToken.length) {
        return testToken[index++];
      } else {
        return null;
      }
    }
  }

  public static void main(String[] args) {
    junit.textui.TestRunner.run(ShingleFilterTest.class);
  }

  public static final Token[] TEST_TOKEN = new Token[] {
      new Token("please", 0, 6),
      new Token("divide", 7, 13),
      new Token("this", 14, 18),
      new Token("sentence", 19, 27),
      new Token("into", 28, 32),
      new Token("shingles", 33, 39),
  };

  public static Token[] testTokenWithHoles;

  public static final Token[] BI_GRAM_TOKENS = new Token[] {
      new Token("please", 0, 6),
      new Token("please divide", 0, 13),
      new Token("divide", 7, 13),
      new Token("divide this", 7, 18),
      new Token("this", 14, 18),
      new Token("this sentence", 14, 27),
      new Token("sentence", 19, 27),
      new Token("sentence into", 19, 32),
      new Token("into", 28, 32),
      new Token("into shingles", 28, 39),
      new Token("shingles", 33, 39),
  };

  public static final int[] BI_GRAM_POSITION_INCREMENTS = new int[] {
      1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
  };

  public static final String[] BI_GRAM_TYPES = new String[] {
      "word", "shingle", "word", "shingle", "word", "shingle", "word",
      "shingle", "word", "shingle", "word"
  };

  public static final Token[] BI_GRAM_TOKENS_WITH_HOLES = new Token[] {
      new Token("please", 0, 6),
      new Token("please divide", 0, 13),
      new Token("divide", 7, 13),
      new Token("divide _", 7, 19),
      new Token("_", 19, 19),
      new Token("_ sentence", 19, 27),
      new Token("sentence", 19, 27),
      new Token("sentence _", 19, 33),
      new Token("_", 33, 33),
      new Token("_ shingles", 33, 39),
      new Token("shingles", 33, 39),
  };

  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
      1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
  };

  public static final Token[] TRI_GRAM_TOKENS = new Token[] {
      new Token("please", 0, 6),
      new Token("please divide", 0, 13),
      new Token("please divide this", 0, 18),
      new Token("divide", 7, 13),
      new Token("divide this", 7, 18),
      new Token("divide this sentence", 7, 27),
      new Token("this", 14, 18),
      new Token("this sentence", 14, 27),
      new Token("this sentence into", 14, 32),
      new Token("sentence", 19, 27),
      new Token("sentence into", 19, 32),
      new Token("sentence into shingles", 19, 39),
      new Token("into", 28, 32),
      new Token("into shingles", 28, 39),
      new Token("shingles", 33, 39)
  };

  public static final int[] TRI_GRAM_POSITION_INCREMENTS = new int[] {
      1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
  };

  public static final String[] TRI_GRAM_TYPES = new String[] {
      "word", "shingle", "shingle",
      "word", "shingle", "shingle",
      "word", "shingle", "shingle",
      "word", "shingle", "shingle",
      "word", "shingle",
      "word"
  };

  protected void setUp() throws Exception {
    super.setUp();
    testTokenWithHoles = new Token[] {
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("sentence", 19, 27),
        new Token("shingles", 33, 39),
    };

    testTokenWithHoles[2].setPositionIncrement(2);
    testTokenWithHoles[3].setPositionIncrement(2);
  }

  /*
   * Class under test for void ShingleFilter(TokenStream, int)
   */
  public void testBiGramFilter() throws IOException {
    this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
  }

  public void testBiGramFilterWithHoles() throws IOException {
    this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
  }

  public void testTriGramFilter() throws IOException {
    this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
                           TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
  }

  protected void shingleFilterTest(int n, Token[] testToken, Token[] tokens,
                                   int[] positionIncrements, String[] types)
      throws IOException {

    TokenStream filter = new ShingleFilter(new TestTokenStream(testToken), n);
    Token token;
    int i = 0;

    while ((token = filter.next()) != null) {
      String termText = new String(token.termBuffer(), 0, token.termLength());
      String goldText
          = new String(tokens[i].termBuffer(), 0, tokens[i].termLength());
      assertEquals("Wrong termText", goldText, termText);
      assertEquals("Wrong startOffset for token \"" + termText + "\"",
                   tokens[i].startOffset(), token.startOffset());
      assertEquals("Wrong endOffset for token \"" + termText + "\"",
                   tokens[i].endOffset(), token.endOffset());
      assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
                   positionIncrements[i], token.getPositionIncrement());
      assertEquals("Wrong type for token \"" + termText + "\"",
                   types[i], token.type());
      i++;
    }
  }
}