LUCENE-400: Added ShingleFilter (token based ngram)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@642612 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-03-29 21:11:33 +00:00
parent 09b3a56ae2
commit cc955c9748
5 changed files with 759 additions and 0 deletions

contrib/analyzers/CHANGES.txt

@@ -139,6 +139,9 @@ New features
    Index store similar to MemoryIndex but allows for multiple documents
    in memory. (Karl Wettin via Grant Ingersoll)
12. LUCENE-400: Added word based n-gram filter (in contrib/analyzers) called ShingleFilter and an Analyzer wrapper
that wraps another Analyzer's token stream with a ShingleFilter (Sebastian Kirsch, Steve Rowe via Grant Ingersoll)
Optimizations
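
Not part of the diff, but for orientation: a minimal sketch of driving the new wrapper through the 2.3-era TokenStream.next() API. The class name ShingleDemo, the field name "content", and the sample text are made up for illustration.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;

public class ShingleDemo {
  public static void main(String[] args) throws Exception {
    // Wrap a plain whitespace analyzer; its stream now carries bigrams too.
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    TokenStream ts = analyzer.tokenStream("content",
        new StringReader("please divide this sentence"));
    Token token;
    while ((token = ts.next()) != null) {
      // With outputUnigrams at its default (true), each unigram arrives with
      // position increment 1 and the shingle starting at it with increment 0:
      // please(+1), "please divide"(+0), divide(+1), "divide this"(+0), ...
      System.out.println(new String(token.termBuffer(), 0, token.termLength())
          + " (+" + token.getPositionIncrement() + ")");
    }
  }
}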

contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java

@@ -0,0 +1,96 @@
package org.apache.lucene.analysis.shingle;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
 * A ShingleAnalyzerWrapper wraps a ShingleFilter around another Analyzer.
 * A shingle is another term for a token-based n-gram.
 */
public class ShingleAnalyzerWrapper extends Analyzer {
protected Analyzer defaultAnalyzer;
protected int maxShingleSize = 2;
protected boolean outputUnigrams = true;
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
super();
this.defaultAnalyzer = defaultAnalyzer;
}
public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
this(defaultAnalyzer);
this.maxShingleSize = maxShingleSize;
}
public ShingleAnalyzerWrapper() {
super();
this.defaultAnalyzer = new StandardAnalyzer();
}
public ShingleAnalyzerWrapper(int nGramSize) {
this();
this.maxShingleSize = nGramSize;
}
/**
* The max shingle (ngram) size
* @return The max shingle (ngram) size
*/
public int getMaxShingleSize() {
return maxShingleSize;
}
/**
* Set the maximum size of output shingles (default: 2)
*
* @param maxShingleSize max shingle size
*/
public void setMaxShingleSize(int maxShingleSize) {
this.maxShingleSize = maxShingleSize;
}
public boolean isOutputUnigrams() {
return outputUnigrams;
}
/**
* Shall the filter pass the original tokens (the "unigrams") to the output
* stream? (default: true)
*
* @param outputUnigrams Whether or not the filter shall pass the original
* tokens to the output stream
*/
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
}
public TokenStream tokenStream(String fieldName, Reader reader) {
ShingleFilter filter
= new ShingleFilter(defaultAnalyzer.tokenStream(fieldName, reader));
filter.setMaxShingleSize(maxShingleSize);
filter.setOutputUnigrams(outputUnigrams);
return filter;
}
}

contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java

@@ -0,0 +1,274 @@
package org.apache.lucene.analysis.shingle;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.LinkedList;
import java.util.Iterator;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
/**
 * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream:
 * combinations of adjacent tokens that are indexed as single tokens.
 *
 * <p>For example, the sentence "please divide this sentence into shingles"
 * yields the shingles "please divide", "divide this", "this sentence",
 * "sentence into", and "into shingles" (plus, by default, the individual
 * input tokens as unigrams).
 *
 * <p>This filter handles position increments > 1 by inserting filler tokens
 * (tokens with term text "_"). It does not handle a position increment of 0.
 */
public class ShingleFilter extends TokenFilter {
private LinkedList shingleBuf = new LinkedList();
private LinkedList outputBuf = new LinkedList();
private LinkedList tokenBuf = new LinkedList();
private StringBuffer[] shingles;
private String tokenType = "shingle";
/**
* filler token for when positionIncrement is more than 1
*/
public static final String FILLER_TOKEN = "_";
/**
* default maximum shingle size is 2.
*/
public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
/**
* The string to use when joining adjacent tokens to form a shingle
*/
public static final String TOKEN_SEPARATOR = " ";
/**
* By default, we output unigrams (individual tokens) as well as shingles
* (token n-grams).
*/
private boolean outputUnigrams = true;
/**
* maximum shingle size (number of tokens)
*/
private int maxShingleSize;
/**
* Construct a ShingleFilter with the specified shingle size from the
* TokenStream <code>input</code>.
*
* @param input input stream
* @param maxShingleSize maximum shingle size produced by the filter.
*/
public ShingleFilter(TokenStream input, int maxShingleSize) {
super(input);
setMaxShingleSize(maxShingleSize);
}
/**
* Construct a ShingleFilter with default shingle size.
*
* @param input input stream
*/
public ShingleFilter(TokenStream input) {
this(input, DEFAULT_MAX_SHINGLE_SIZE);
}
/**
* Construct a ShingleFilter with the specified token type for shingle tokens.
*
* @param input input stream
* @param tokenType token type for shingle tokens
*/
public ShingleFilter(TokenStream input, String tokenType) {
this(input, DEFAULT_MAX_SHINGLE_SIZE);
setTokenType(tokenType);
}
/**
* Set the type of the shingle tokens produced by this filter.
* (default: "shingle")
*
* @param tokenType the type to set for shingle tokens
*/
public void setTokenType(String tokenType) {
this.tokenType = tokenType;
}
/**
* Shall the output stream contain the input tokens (unigrams) as well as
* shingles? (default: true.)
*
* @param outputUnigrams Whether or not the output stream shall contain
* the input tokens (unigrams)
*/
public void setOutputUnigrams(boolean outputUnigrams) {
this.outputUnigrams = outputUnigrams;
}
/**
* Set the max shingle size (default: 2)
*
* @param maxShingleSize max size of output shingles
*/
public void setMaxShingleSize(int maxShingleSize) {
if (maxShingleSize < 2) {
throw new IllegalArgumentException("Max shingle size must be >= 2");
}
shingles = new StringBuffer[maxShingleSize];
for (int i = 0; i < shingles.length; i++) {
shingles[i] = new StringBuffer();
}
this.maxShingleSize = maxShingleSize;
}
/**
* Clear the StringBuffers that are used for storing the output shingles.
*/
private void clearShingles() {
for (int i = 0; i < shingles.length; i++) {
shingles[i].setLength(0);
}
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next() throws IOException {
if (outputBuf.isEmpty()) {
fillOutputBuf();
}
Token nextToken = null;
if ( ! outputBuf.isEmpty())
{
nextToken = (Token)outputBuf.remove(0);
}
return nextToken;
}
/**
* Get the next token from the input stream and push it on the token buffer.
* If we encounter a token with position increment > 1, we put filler tokens
* on the token buffer.
* <p/>
* Returns null when the end of the input stream is reached.
* @return the next token, or null if at end of input stream
* @throws IOException if the input stream has a problem
*/
private Token getNextToken() throws IOException {
if (tokenBuf.isEmpty()) {
Token lastToken = input.next();
if (lastToken != null) {
for (int i = 1; i < lastToken.getPositionIncrement(); i++) {
tokenBuf.add(new Token(FILLER_TOKEN, lastToken.startOffset(),
lastToken.startOffset()));
}
tokenBuf.add(lastToken);
return getNextToken();
} else {
return null;
}
} else {
return (Token)tokenBuf.remove(0);
}
}
/**
* Fill the output buffer with new shingles.
*
* @throws IOException if there's a problem getting the next token
*/
private void fillOutputBuf() throws IOException {
boolean addedToken = false;
/*
* Try to fill the shingle buffer.
*/
do {
Token token = getNextToken();
if (token != null) {
shingleBuf.add(token);
if (shingleBuf.size() > maxShingleSize)
{
shingleBuf.remove(0);
}
addedToken = true;
} else {
break;
}
} while (shingleBuf.size() < maxShingleSize);
/*
* If no new token could be added to the shingle buffer, we have reached
* the end of the input stream and have to discard the least recent token.
*/
if (! addedToken) {
if (shingleBuf.isEmpty()) {
return;
} else {
shingleBuf.remove(0);
}
}
clearShingles();
int[] endOffsets = new int[shingleBuf.size()];
for (int i = 0; i < endOffsets.length; i++) {
endOffsets[i] = 0;
}
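/*
 * Build the shingles: append each token to every shingle buffer from its
 * own position onward, so that shingles[j] ends up holding the (j+1)-gram
 * that starts at the head of shingleBuf, while endOffsets[i] records where
 * the (i+1)-gram ends.
 */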
int i = 0;
Token token = null;
for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
token = (Token) it.next();
for (int j = i; j < shingles.length; j++) {
if (shingles[j].length() != 0) {
shingles[j].append(TOKEN_SEPARATOR);
}
shingles[j].append(token.termBuffer(), 0, token.termLength());
}
endOffsets[i] = token.endOffset();
i++;
}
if ((! shingleBuf.isEmpty()) && outputUnigrams) {
Token unigram = (Token) shingleBuf.getFirst();
unigram.setPositionIncrement(1);
outputBuf.add(unigram);
}
/*
* Push new tokens to the output buffer.
*/
for (int j = 1; j < shingleBuf.size(); j++) {
Token shingle = new Token(shingles[j].toString(),
((Token) shingleBuf.get(0)).startOffset(),
endOffsets[j],
tokenType);
if ((! outputUnigrams) && j == 1) {
shingle.setPositionIncrement(1);
} else {
shingle.setPositionIncrement(0);
}
outputBuf.add(shingle);
}
}
}
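
Not part of the commit, but the filler behaviour described in the ShingleFilter javadoc is easy to see in isolation. A sketch, assuming a throwaway two-token stream; the class name FillerDemo, token texts, and offsets are made up, and the anonymous stream mirrors the TestTokenStream helper in the test file further down.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;

public class FillerDemo {
  public static void main(String[] args) throws Exception {
    final Token[] toks = new Token[] {
        new Token("divide", 7, 13),
        new Token("sentence", 19, 27),
    };
    toks[1].setPositionIncrement(2); // one skipped position before "sentence"

    // A throwaway stream over the two tokens.
    TokenStream in = new TokenStream() {
      private int i = 0;
      public Token next() {
        return (i < toks.length) ? toks[i++] : null;
      }
    };

    TokenStream shingled = new ShingleFilter(in, 2);
    // Prints "divide", "divide _", "_", "_ sentence", "sentence": the hole
    // is bridged with FILLER_TOKEN "_" rather than forming the false bigram
    // "divide sentence".
    Token t;
    while ((t = shingled.next()) != null) {
      System.out.println(new String(t.termBuffer(), 0, t.termLength()));
    }
  }
}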

contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java

@@ -0,0 +1,196 @@
package org.apache.lucene.analysis.shingle;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import junit.framework.TestCase;
/**
* A test class for ShingleAnalyzerWrapper as regards queries and scoring.
*/
public class ShingleAnalyzerWrapperTest extends TestCase {
public IndexSearcher searcher;
public static void main(String[] args) {
junit.textui.TestRunner.run(ShingleAnalyzerWrapperTest.class);
}
/**
* Set up a new index in RAM with three test phrases and the supplied Analyzer.
*
* @param analyzer the analyzer to use
* @return an indexSearcher on the test index.
* @throws Exception if an error occurs with index writer or searcher
*/
public IndexSearcher setUpSearcher(Analyzer analyzer) throws Exception {
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, analyzer, true);
Document doc;
doc = new Document();
doc.add(new Field("content", "please divide this sentence into shingles",
Field.Store.YES,Field.Index.TOKENIZED));
writer.addDocument(doc);
doc = new Document();
doc.add(new Field("content", "just another test sentence",
Field.Store.YES,Field.Index.TOKENIZED));
writer.addDocument(doc);
doc = new Document();
doc.add(new Field("content", "a sentence which contains no test",
Field.Store.YES,Field.Index.TOKENIZED));
writer.addDocument(doc);
writer.close();
return new IndexSearcher(dir);
}
protected Hits queryParsingTest(Analyzer analyzer, String qs) throws Exception {
searcher = setUpSearcher(analyzer);
QueryParser qp = new QueryParser("content", analyzer);
Query q = qp.parse(qs);
return searcher.search(q);
}
protected void compareRanks(Hits hits, int[] ranks) throws Exception {
assertEquals(ranks.length, hits.length());
for (int i = 0; i < ranks.length; i++) {
assertEquals(ranks[i], hits.id(i));
}
}
/*
* Will not work on an index without unigrams, since QueryParser automatically
* tokenizes on whitespace.
*/
public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(), 2),
"test sentence");
int[] ranks = new int[] { 1, 2, 0 };
compareRanks(hits, ranks);
}
/*
* This one fails with an exception.
*/
public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(), 2),
"\"this sentence\"");
int[] ranks = new int[] { 0 };
compareRanks(hits, ranks);
}
/*
* This one works, actually.
*/
public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(), 2),
"\"test sentence\"");
int[] ranks = new int[] { 1 };
compareRanks(hits, ranks);
}
/*
 * Same as above; QueryParser splits the query on whitespace before it runs
 * the analyzer, so no shingles are formed from the required terms.
 */
public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
(new WhitespaceAnalyzer(), 2),
"+test +sentence");
int[] ranks = new int[] { 1, 2 };
compareRanks(hits, ranks);
}
/*
* This shows how to construct a phrase query containing shingles.
*/
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
searcher = setUpSearcher(analyzer);
PhraseQuery q = new PhraseQuery();
TokenStream ts = analyzer.tokenStream("content",
new StringReader("this sentence"));
Token token;
int j = -1;
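// Recover each token's position by summing increments: shingles carry a
// position increment of 0, so each lands in the same PhraseQuery slot as
// the unigram that starts it.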
while ((token = ts.next()) != null) {
j += token.getPositionIncrement();
String termText = new String(token.termBuffer(), 0, token.termLength());
q.add(new Term("content", termText), j);
}
Hits hits = searcher.search(q);
int[] ranks = new int[] { 0 };
compareRanks(hits, ranks);
}
/*
* How to construct a boolean query with shingles. A query like this will
* implicitly score those documents higher that contain the words in the query
* in the right order and adjacent to each other.
*/
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
searcher = setUpSearcher(analyzer);
BooleanQuery q = new BooleanQuery();
TokenStream ts = analyzer.tokenStream("content",
new StringReader("test sentence"));
Token token;
while ((token = ts.next()) != null) {
String termText = new String(token.termBuffer(), 0, token.termLength());
q.add(new TermQuery(new Term("content", termText)),
BooleanClause.Occur.SHOULD);
}
Hits hits = searcher.search(q);
int[] ranks = new int[] { 1, 2, 0 };
compareRanks(hits, ranks);
}
}

contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java

@@ -0,0 +1,190 @@
package org.apache.lucene.analysis.shingle;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
public class ShingleFilterTest extends TestCase {
public class TestTokenStream extends TokenStream {
protected int index = 0;
protected Token[] testToken;
public TestTokenStream(Token[] testToken) {
super();
this.testToken = testToken;
}
public Token next() throws IOException {
if (index < testToken.length) {
return testToken[index++];
} else {
return null;
}
}
}
public static void main(String[] args) {
junit.textui.TestRunner.run(ShingleFilterTest.class);
}
public static final Token[] TEST_TOKEN = new Token[] {
new Token("please", 0, 6),
new Token("divide", 7, 13),
new Token("this", 14, 18),
new Token("sentence", 19, 27),
new Token("into", 28, 32),
new Token("shingles", 33, 39),
};
public static Token[] testTokenWithHoles;
public static final Token[] BI_GRAM_TOKENS = new Token[] {
new Token("please", 0, 6),
new Token("please divide", 0, 13),
new Token("divide", 7, 13),
new Token("divide this", 7, 18),
new Token("this", 14, 18),
new Token("this sentence", 14, 27),
new Token("sentence", 19, 27),
new Token("sentence into", 19, 32),
new Token("into", 28, 32),
new Token("into shingles", 28, 39),
new Token("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS = new int[] {
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
};
public static final String[] BI_GRAM_TYPES = new String[] {
"word", "shingle", "word", "shingle", "word", "shingle", "word",
"shingle", "word", "shingle", "word"
};
public static final Token[] BI_GRAM_TOKENS_WITH_HOLES = new Token[] {
new Token("please", 0, 6),
new Token("please divide", 0, 13),
new Token("divide", 7, 13),
new Token("divide _", 7, 19),
new Token("_", 19, 19),
new Token("_ sentence", 19, 27),
new Token("sentence", 19, 27),
new Token("sentence _", 19, 33),
new Token("_", 33, 33),
new Token("_ shingles", 33, 39),
new Token("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
};
public static final Token[] TRI_GRAM_TOKENS = new Token[] {
new Token("please", 0, 6),
new Token("please divide", 0, 13),
new Token("please divide this", 0, 18),
new Token("divide", 7, 13),
new Token("divide this", 7, 18),
new Token("divide this sentence", 7, 27),
new Token("this", 14, 18),
new Token("this sentence", 14, 27),
new Token("this sentence into", 14, 32),
new Token("sentence", 19, 27),
new Token("sentence into", 19, 32),
new Token("sentence into shingles", 19, 39),
new Token("into", 28, 32),
new Token("into shingles", 28, 39),
new Token("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS = new int[] {
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
};
public static final String[] TRI_GRAM_TYPES = new String[] {
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle", "shingle",
"word", "shingle",
"word"
};
protected void setUp() throws Exception {
super.setUp();
testTokenWithHoles = new Token[] {
new Token("please", 0, 6),
new Token("divide", 7, 13),
new Token("sentence", 19, 27),
new Token("shingles", 33, 39),
};
testTokenWithHoles[2].setPositionIncrement(2);
testTokenWithHoles[3].setPositionIncrement(2);
}
/*
* Class under test for void ShingleFilter(TokenStream, int)
*/
public void testBiGramFilter() throws IOException {
this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
}
public void testBiGramFilterWithHoles() throws IOException {
this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
}
public void testTriGramFilter() throws IOException {
this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
}
protected void shingleFilterTest(int n, Token[] testToken, Token[] tokens,
int[] positionIncrements, String[] types)
throws IOException {
TokenStream filter = new ShingleFilter(new TestTokenStream(testToken), n);
Token token;
int i = 0;
while ((token = filter.next()) != null) {
String termText = new String(token.termBuffer(), 0, token.termLength());
String goldText
= new String(tokens[i].termBuffer(), 0, tokens[i].termLength());
assertEquals("Wrong termText", goldText, termText);
assertEquals("Wrong startOffset for token \"" + termText + "\"",
tokens[i].startOffset(), token.startOffset());
assertEquals("Wrong endOffset for token \"" + termText + "\"",
tokens[i].endOffset(), token.endOffset());
assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
positionIncrements[i], token.getPositionIncrement());
assertEquals("Wrong type for token \"" + termText + "\"",
types[i], token.type());
i++;
}
}
}