LUCENE-400: Added ShingleFilter (token based ngram)
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@642612 13f79535-47bb-0310-9956-ffa450edef68
parent 09b3a56ae2
commit cc955c9748
CHANGES.txt
@@ -139,6 +139,9 @@ New features

    Index store similar to MemoryIndex but allows for multiple documents
    in memory. (Karl Wettin via Grant Ingersoll)

12. LUCENE-400: Added word based n-gram filter (in contrib/analyzers) called ShingleFilter
    and an Analyzer wrapper that wraps another Analyzer's token stream with a ShingleFilter
    (Sebastian Kirsch, Steve Rowe via Grant Ingersoll)

Optimizations
ShingleAnalyzerWrapper.java
@@ -0,0 +1,96 @@
package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * A ShingleAnalyzerWrapper wraps a ShingleFilter around another analyzer.
 * A shingle is another term for a token based n-gram.
 */
public class ShingleAnalyzerWrapper extends Analyzer {

  protected Analyzer defaultAnalyzer;
  protected int maxShingleSize = 2;
  protected boolean outputUnigrams = true;

  public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
    super();
    this.defaultAnalyzer = defaultAnalyzer;
  }

  public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
    this(defaultAnalyzer);
    this.maxShingleSize = maxShingleSize;
  }

  public ShingleAnalyzerWrapper() {
    super();
    this.defaultAnalyzer = new StandardAnalyzer();
  }

  public ShingleAnalyzerWrapper(int nGramSize) {
    this();
    this.maxShingleSize = nGramSize;
  }

  /**
   * The max shingle (ngram) size
   * @return The max shingle (ngram) size
   */
  public int getMaxShingleSize() {
    return maxShingleSize;
  }

  /**
   * Set the maximum size of output shingles (default: 2)
   *
   * @param maxShingleSize max shingle size
   */
  public void setMaxShingleSize(int maxShingleSize) {
    this.maxShingleSize = maxShingleSize;
  }

  public boolean isOutputUnigrams() {
    return outputUnigrams;
  }

  /**
   * Shall the filter pass the original tokens (the "unigrams") to the output
   * stream? (default: true)
   *
   * @param outputUnigrams Whether or not the filter shall pass the original
   *                       tokens to the output stream
   */
  public void setOutputUnigrams(boolean outputUnigrams) {
    this.outputUnigrams = outputUnigrams;
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    ShingleFilter filter
      = new ShingleFilter(defaultAnalyzer.tokenStream(fieldName, reader));
    filter.setMaxShingleSize(maxShingleSize);
    filter.setOutputUnigrams(outputUnigrams);
    return filter;
  }
}
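For orientation, here is a small usage sketch (not part of the commit; it mirrors the enumeration pattern used in ShingleAnalyzerWrapperTest below, and the demo class name is illustrative): the wrapper drops in wherever an Analyzer is expected, and its token stream interleaves the unigrams with the shingles.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;

public class ShingleWrapperDemo {
  public static void main(String[] args) throws IOException {
    // Bigram shingles on top of plain whitespace tokenization.
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    TokenStream ts = analyzer.tokenStream("content",
                                          new StringReader("please divide this"));
    Token token;
    while ((token = ts.next()) != null) {
      System.out.println(new String(token.termBuffer(), 0, token.termLength())
          + " (posIncr=" + token.getPositionIncrement() + ")");
    }
    // Expected output, matching the BI_GRAM expectations in ShingleFilterTest:
    //   please (posIncr=1), please divide (posIncr=0), divide (posIncr=1),
    //   divide this (posIncr=0), this (posIncr=1)
  }
}

Shingles carry a position increment of 0, so they are stacked at the same position as the unigram that starts them; that is what makes the phrase and boolean query tricks in the tests below work.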
ShingleFilter.java
@@ -0,0 +1,274 @@
package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.LinkedList;
import java.util.Iterator;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;

/**
 * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream,
 * that is, combinations of tokens that are indexed as one token.
 *
 * <p>For example, the sentence "please divide this sentence into shingles"
 * would be tokenized into the tokens "please divide", "divide this",
 * "this sentence", "sentence into", and "into shingles".
 *
 * <p>This filter handles position increments > 1 by inserting filler tokens
 * (tokens with termtext "_"). It does not handle a position increment of 0.
 */
public class ShingleFilter extends TokenFilter {

  private LinkedList shingleBuf = new LinkedList();
  private LinkedList outputBuf = new LinkedList();
  private LinkedList tokenBuf = new LinkedList();
  private StringBuffer[] shingles;
  private String tokenType = "shingle";

  /**
   * filler token for when positionIncrement is more than 1
   */
  public static final String FILLER_TOKEN = "_";

  /**
   * default maximum shingle size is 2.
   */
  public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;

  /**
   * The string to use when joining adjacent tokens to form a shingle
   */
  public static final String TOKEN_SEPARATOR = " ";

  /**
   * By default, we output unigrams (individual tokens) as well as shingles
   * (token n-grams).
   */
  private boolean outputUnigrams = true;

  /**
   * maximum shingle size (number of tokens)
   */
  private int maxShingleSize;

  /**
   * Construct a ShingleFilter with the specified shingle size from the
   * TokenStream <code>input</code>
   *
   * @param input input stream
   * @param maxShingleSize maximum shingle size produced by the filter.
   */
  public ShingleFilter(TokenStream input, int maxShingleSize) {
    super(input);
    setMaxShingleSize(maxShingleSize);
  }

  /**
   * Construct a ShingleFilter with default shingle size.
   *
   * @param input input stream
   */
  public ShingleFilter(TokenStream input) {
    this(input, DEFAULT_MAX_SHINGLE_SIZE);
  }

  /**
   * Construct a ShingleFilter with the specified token type for shingle tokens.
   *
   * @param input input stream
   * @param tokenType token type for shingle tokens
   */
  public ShingleFilter(TokenStream input, String tokenType) {
    this(input, DEFAULT_MAX_SHINGLE_SIZE);
    setTokenType(tokenType);
  }

  /**
   * Set the type of the shingle tokens produced by this filter.
   * (default: "shingle")
   *
   * @param tokenType token type for shingle tokens
   */
  public void setTokenType(String tokenType) {
    this.tokenType = tokenType;
  }

  /**
   * Shall the output stream contain the input tokens (unigrams) as well as
   * shingles? (default: true.)
   *
   * @param outputUnigrams Whether or not the output stream shall contain
   *                       the input tokens (unigrams)
   */
  public void setOutputUnigrams(boolean outputUnigrams) {
    this.outputUnigrams = outputUnigrams;
  }

  /**
   * Set the max shingle size (default: 2)
   *
   * @param maxShingleSize max size of output shingles
   */
  public void setMaxShingleSize(int maxShingleSize) {
    if (maxShingleSize < 2) {
      throw new IllegalArgumentException("Max shingle size must be >= 2");
    }
    shingles = new StringBuffer[maxShingleSize];
    for (int i = 0; i < shingles.length; i++) {
      shingles[i] = new StringBuffer();
    }
    this.maxShingleSize = maxShingleSize;
  }

  /**
   * Clear the StringBuffers that are used for storing the output shingles.
   */
  private void clearShingles() {
    for (int i = 0; i < shingles.length; i++) {
      shingles[i].setLength(0);
    }
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.analysis.TokenStream#next()
   */
  public Token next() throws IOException {
    if (outputBuf.isEmpty()) {
      fillOutputBuf();
    }
    Token nextToken = null;
    if (!outputBuf.isEmpty()) {
      nextToken = (Token) outputBuf.remove(0);
    }
    return nextToken;
  }

  /**
   * Get the next token from the input stream and push it on the token buffer.
   * If we encounter a token with position increment > 1, we put filler tokens
   * on the token buffer.
   * <p/>
   * Returns null when the end of the input stream is reached.
   *
   * @return the next token, or null if at end of input stream
   * @throws IOException if the input stream has a problem
   */
  private Token getNextToken() throws IOException {
    if (tokenBuf.isEmpty()) {
      Token lastToken = input.next();
      if (lastToken != null) {
        for (int i = 1; i < lastToken.getPositionIncrement(); i++) {
          tokenBuf.add(new Token(FILLER_TOKEN, lastToken.startOffset(),
                                 lastToken.startOffset()));
        }
        tokenBuf.add(lastToken);
        return getNextToken();
      } else {
        return null;
      }
    } else {
      return (Token) tokenBuf.remove(0);
    }
  }

  /**
   * Fill the output buffer with new shingles.
   *
   * @throws IOException if there's a problem getting the next token
   */
  private void fillOutputBuf() throws IOException {
    boolean addedToken = false;
    /*
     * Try to fill the shingle buffer.
     */
    do {
      Token token = getNextToken();
      if (token != null) {
        shingleBuf.add(token);
        if (shingleBuf.size() > maxShingleSize) {
          shingleBuf.remove(0);
        }
        addedToken = true;
      } else {
        break;
      }
    } while (shingleBuf.size() < maxShingleSize);

    /*
     * If no new token could be added to the shingle buffer, we have reached
     * the end of the input stream and have to discard the least recent token.
     */
    if (!addedToken) {
      if (shingleBuf.isEmpty()) {
        return;
      } else {
        shingleBuf.remove(0);
      }
    }

    clearShingles();

    int[] endOffsets = new int[shingleBuf.size()];
    for (int i = 0; i < endOffsets.length; i++) {
      endOffsets[i] = 0;
    }

    int i = 0;
    Token token = null;
    for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
      token = (Token) it.next();
      for (int j = i; j < shingles.length; j++) {
        if (shingles[j].length() != 0) {
          shingles[j].append(TOKEN_SEPARATOR);
        }
        shingles[j].append(token.termBuffer(), 0, token.termLength());
      }
      endOffsets[i] = token.endOffset();
      i++;
    }

    if ((!shingleBuf.isEmpty()) && outputUnigrams) {
      Token unigram = (Token) shingleBuf.getFirst();
      unigram.setPositionIncrement(1);
      outputBuf.add(unigram);
    }

    /*
     * Push new tokens to the output buffer.
     */
    for (int j = 1; j < shingleBuf.size(); j++) {
      Token shingle = new Token(shingles[j].toString(),
                                ((Token) shingleBuf.get(0)).startOffset(),
                                endOffsets[j],
                                tokenType);
      if ((!outputUnigrams) && j == 1) {
        shingle.setPositionIncrement(1);
      } else {
        shingle.setPositionIncrement(0);
      }
      outputBuf.add(shingle);
    }
  }
}
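The filler-token behavior described in the class Javadoc is easiest to see with a hand-built stream, in the same style as the TestTokenStream helper in ShingleFilterTest below. The following driver is a sketch, not part of the commit (the demo class and the anonymous TokenStream are illustrative): two words have been removed upstream, so the surviving tokens carry a position increment of 2 and the filter plugs each hole with "_".

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;

public class ShingleHoleDemo {
  public static void main(String[] args) throws IOException {
    // "please divide this sentence into shingles" with "this" and "into"
    // removed; the token after each hole gets positionIncrement == 2.
    final Token[] toks = new Token[] {
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("sentence", 19, 27),
        new Token("shingles", 33, 39),
    };
    toks[2].setPositionIncrement(2);
    toks[3].setPositionIncrement(2);

    // Minimal stream that replays the array, like TestTokenStream below.
    TokenStream input = new TokenStream() {
      private int index = 0;
      public Token next() {
        return index < toks.length ? toks[index++] : null;
      }
    };

    TokenStream shingler = new ShingleFilter(input, 2);
    Token t;
    while ((t = shingler.next()) != null) {
      System.out.println(new String(t.termBuffer(), 0, t.termLength()));
    }
    // Prints: please, "please divide", divide, "divide _", _, "_ sentence",
    //         sentence, "sentence _", _, "_ shingles", shingles
    // i.e. the BI_GRAM_TOKENS_WITH_HOLES sequence asserted in ShingleFilterTest.
  }
}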
ShingleAnalyzerWrapperTest.java
@@ -0,0 +1,196 @@
package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import junit.framework.TestCase;

/**
 * A test class for ShingleAnalyzerWrapper as regards queries and scoring.
 */
public class ShingleAnalyzerWrapperTest extends TestCase {

  public IndexSearcher searcher;

  public static void main(String[] args) {
    junit.textui.TestRunner.run(ShingleAnalyzerWrapperTest.class);
  }

  /**
   * Set up a new index in RAM with three test phrases and the supplied Analyzer.
   *
   * @param analyzer the analyzer to use
   * @return an indexSearcher on the test index.
   * @throws Exception if an error occurs with index writer or searcher
   */
  public IndexSearcher setUpSearcher(Analyzer analyzer) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyzer, true);

    Document doc;
    doc = new Document();
    doc.add(new Field("content", "please divide this sentence into shingles",
                      Field.Store.YES, Field.Index.TOKENIZED));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new Field("content", "just another test sentence",
                      Field.Store.YES, Field.Index.TOKENIZED));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new Field("content", "a sentence which contains no test",
                      Field.Store.YES, Field.Index.TOKENIZED));
    writer.addDocument(doc);

    writer.close();

    return new IndexSearcher(dir);
  }

  protected Hits queryParsingTest(Analyzer analyzer, String qs) throws Exception {
    searcher = setUpSearcher(analyzer);

    QueryParser qp = new QueryParser("content", analyzer);

    Query q = qp.parse(qs);

    return searcher.search(q);
  }

  protected void compareRanks(Hits hits, int[] ranks) throws Exception {
    assertEquals(ranks.length, hits.length());
    for (int i = 0; i < ranks.length; i++) {
      assertEquals(ranks[i], hits.id(i));
    }
  }

  /*
   * Will not work on an index without unigrams, since QueryParser automatically
   * tokenizes on whitespace.
   */
  public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
    Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
                                   (new WhitespaceAnalyzer(), 2),
                                 "test sentence");
    int[] ranks = new int[] { 1, 2, 0 };
    compareRanks(hits, ranks);
  }

  /*
   * This one fails with an exception.
   */
  public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
    Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
                                   (new WhitespaceAnalyzer(), 2),
                                 "\"this sentence\"");
    int[] ranks = new int[] { 0 };
    compareRanks(hits, ranks);
  }

  /*
   * This one works, actually.
   */
  public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
    Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
                                   (new WhitespaceAnalyzer(), 2),
                                 "\"test sentence\"");
    int[] ranks = new int[] { 1 };
    compareRanks(hits, ranks);
  }

  /*
   * Same as above; the query is tokenized by the parser without using the analyzer.
   */
  public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
    Hits hits = queryParsingTest(new ShingleAnalyzerWrapper
                                   (new WhitespaceAnalyzer(), 2),
                                 "+test +sentence");
    int[] ranks = new int[] { 1, 2 };
    compareRanks(hits, ranks);
  }

  /*
   * This shows how to construct a phrase query containing shingles.
   */
  public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    searcher = setUpSearcher(analyzer);

    PhraseQuery q = new PhraseQuery();

    TokenStream ts = analyzer.tokenStream("content",
                                          new StringReader("this sentence"));
    Token token;
    int j = -1;
    while ((token = ts.next()) != null) {
      j += token.getPositionIncrement();
      String termText = new String(token.termBuffer(), 0, token.termLength());
      q.add(new Term("content", termText), j);
    }

    Hits hits = searcher.search(q);
    int[] ranks = new int[] { 0 };
    compareRanks(hits, ranks);
  }

  /*
   * How to construct a boolean query with shingles. A query like this will
   * implicitly score those documents higher that contain the words in the query
   * in the right order and adjacent to each other.
   */
  public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
    searcher = setUpSearcher(analyzer);

    BooleanQuery q = new BooleanQuery();

    TokenStream ts = analyzer.tokenStream("content",
                                          new StringReader("test sentence"));
    Token token;
    while ((token = ts.next()) != null) {
      String termText = new String(token.termBuffer(), 0, token.termLength());
      q.add(new TermQuery(new Term("content", termText)),
            BooleanClause.Occur.SHOULD);
    }

    Hits hits = searcher.search(q);
    int[] ranks = new int[] { 1, 2, 0 };
    compareRanks(hits, ranks);
  }
}
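To make the position arithmetic in testShingleAnalyzerWrapperPhraseQuery concrete, here is a worked trace (an annotation added for this write-up, not part of the commit):

// Query text "this sentence" through ShingleAnalyzerWrapper(WhitespaceAnalyzer, 2):
//
//   token             positionIncrement   j (position added to the PhraseQuery)
//   "this"            1                   0
//   "this sentence"   0                   0
//   "sentence"        1                   1
//
// The PhraseQuery therefore holds content:this and content:"this sentence"
// stacked at position 0 and content:sentence at position 1, which matches
// only document 0, "please divide this sentence into shingles".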
ShingleFilterTest.java
@@ -0,0 +1,190 @@
package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class ShingleFilterTest extends TestCase {

  public class TestTokenStream extends TokenStream {

    protected int index = 0;
    protected Token[] testToken;

    public TestTokenStream(Token[] testToken) {
      super();
      this.testToken = testToken;
    }

    public Token next() throws IOException {
      if (index < testToken.length) {
        return testToken[index++];
      } else {
        return null;
      }
    }
  }

  public static void main(String[] args) {
    junit.textui.TestRunner.run(ShingleFilterTest.class);
  }

  public static final Token[] TEST_TOKEN = new Token[] {
      new Token("please", 0, 6),
      new Token("divide", 7, 13),
      new Token("this", 14, 18),
      new Token("sentence", 19, 27),
      new Token("into", 28, 32),
      new Token("shingles", 33, 39),
  };

  public static Token[] testTokenWithHoles;

  public static final Token[] BI_GRAM_TOKENS = new Token[] {
      new Token("please", 0, 6),
      new Token("please divide", 0, 13),
      new Token("divide", 7, 13),
      new Token("divide this", 7, 18),
      new Token("this", 14, 18),
      new Token("this sentence", 14, 27),
      new Token("sentence", 19, 27),
      new Token("sentence into", 19, 32),
      new Token("into", 28, 32),
      new Token("into shingles", 28, 39),
      new Token("shingles", 33, 39),
  };

  public static final int[] BI_GRAM_POSITION_INCREMENTS = new int[] {
      1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
  };

  public static final String[] BI_GRAM_TYPES = new String[] {
      "word", "shingle", "word", "shingle", "word", "shingle", "word",
      "shingle", "word", "shingle", "word"
  };

  public static final Token[] BI_GRAM_TOKENS_WITH_HOLES = new Token[] {
      new Token("please", 0, 6),
      new Token("please divide", 0, 13),
      new Token("divide", 7, 13),
      new Token("divide _", 7, 19),
      new Token("_", 19, 19),
      new Token("_ sentence", 19, 27),
      new Token("sentence", 19, 27),
      new Token("sentence _", 19, 33),
      new Token("_", 33, 33),
      new Token("_ shingles", 33, 39),
      new Token("shingles", 33, 39),
  };

  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
      1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
  };

  public static final Token[] TRI_GRAM_TOKENS = new Token[] {
      new Token("please", 0, 6),
      new Token("please divide", 0, 13),
      new Token("please divide this", 0, 18),
      new Token("divide", 7, 13),
      new Token("divide this", 7, 18),
      new Token("divide this sentence", 7, 27),
      new Token("this", 14, 18),
      new Token("this sentence", 14, 27),
      new Token("this sentence into", 14, 32),
      new Token("sentence", 19, 27),
      new Token("sentence into", 19, 32),
      new Token("sentence into shingles", 19, 39),
      new Token("into", 28, 32),
      new Token("into shingles", 28, 39),
      new Token("shingles", 33, 39)
  };

  public static final int[] TRI_GRAM_POSITION_INCREMENTS = new int[] {
      1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
  };

  public static final String[] TRI_GRAM_TYPES = new String[] {
      "word", "shingle", "shingle",
      "word", "shingle", "shingle",
      "word", "shingle", "shingle",
      "word", "shingle", "shingle",
      "word", "shingle",
      "word"
  };

  protected void setUp() throws Exception {
    super.setUp();
    testTokenWithHoles = new Token[] {
        new Token("please", 0, 6),
        new Token("divide", 7, 13),
        new Token("sentence", 19, 27),
        new Token("shingles", 33, 39),
    };

    testTokenWithHoles[2].setPositionIncrement(2);
    testTokenWithHoles[3].setPositionIncrement(2);
  }

  /*
   * Class under test for void ShingleFilter(TokenStream, int)
   */
  public void testBiGramFilter() throws IOException {
    this.shingleFilterTest(2, TEST_TOKEN, BI_GRAM_TOKENS,
                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
  }

  public void testBiGramFilterWithHoles() throws IOException {
    this.shingleFilterTest(2, testTokenWithHoles, BI_GRAM_TOKENS_WITH_HOLES,
                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES);
  }

  public void testTriGramFilter() throws IOException {
    this.shingleFilterTest(3, TEST_TOKEN, TRI_GRAM_TOKENS,
                           TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES);
  }

  protected void shingleFilterTest(int n, Token[] testToken, Token[] tokens,
                                   int[] positionIncrements, String[] types)
    throws IOException {

    TokenStream filter = new ShingleFilter(new TestTokenStream(testToken), n);
    Token token;
    int i = 0;

    while ((token = filter.next()) != null) {
      String termText = new String(token.termBuffer(), 0, token.termLength());
      String goldText
        = new String(tokens[i].termBuffer(), 0, tokens[i].termLength());
      assertEquals("Wrong termText", goldText, termText);
      assertEquals("Wrong startOffset for token \"" + termText + "\"",
                   tokens[i].startOffset(), token.startOffset());
      assertEquals("Wrong endOffset for token \"" + termText + "\"",
                   tokens[i].endOffset(), token.endOffset());
      assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
                   positionIncrements[i], token.getPositionIncrement());
      assertEquals("Wrong type for token \"" + termText + "\"",
                   types[i], token.type());
      i++;
    }
  }
}