mirror of https://github.com/apache/lucene.git
LUCENE-4176: fix AnalyzingQueryParser to analyze range endpoints as bytes
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1355001 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
032cad944a
commit
93462313c3
|
@@ -16,6 +16,11 @@ API Changes
   has a different API (carries a list of tags instead of a compound tag). Upgrade
   of embedded morfologik dictionaries to version 1.9. (Dawid Weiss)

Bug Fixes

* LUCENE-4176: Fix AnalyzingQueryParser to analyze range endpoints as bytes,
  so that it works correctly with Analyzers that produce binary non-UTF-8 terms
  such as CollationAnalyzer. (Nattapong Sirilappanich via Robert Muir)

======================= Lucene 4.0.0-ALPHA =======================
|
|
@ -50,6 +50,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
|
|||
*/
|
||||
public AnalyzingQueryParser(Version matchVersion, String field, Analyzer analyzer) {
  super(matchVersion, field, analyzer);
  // LUCENE-4176: route range endpoints through the analyzer as well, so
  // analyzers that produce binary non-UTF-8 terms (e.g. collation keys)
  // yield correct range bounds.
  setAnalyzeRangeTerms(true);
}
|
||||
|
||||
/**
|
||||
|
@ -278,72 +279,4 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
|
|||
|
||||
return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Overrides super class, by passing terms through analyzer.
|
||||
* @exception ParseException
|
||||
*/
|
||||
@Override
|
||||
protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive)
|
||||
throws ParseException {
|
||||
// get Analyzer from superclass and tokenize the terms
|
||||
TokenStream source = null;
|
||||
CharTermAttribute termAtt = null;
|
||||
boolean multipleTokens = false;
|
||||
|
||||
if (part1 != null) {
|
||||
// part1
|
||||
try {
|
||||
source = getAnalyzer().tokenStream(field, new StringReader(part1));
|
||||
termAtt = source.addAttribute(CharTermAttribute.class);
|
||||
source.reset();
|
||||
multipleTokens = false;
|
||||
|
||||
|
||||
if (source.incrementToken()) {
|
||||
part1 = termAtt.toString();
|
||||
}
|
||||
multipleTokens = source.incrementToken();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
if (multipleTokens) {
|
||||
throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
|
||||
+ " - tokens were added to part1");
|
||||
}
|
||||
}
|
||||
|
||||
if (part2 != null) {
|
||||
try {
|
||||
// part2
|
||||
source = getAnalyzer().tokenStream(field, new StringReader(part2));
|
||||
termAtt = source.addAttribute(CharTermAttribute.class);
|
||||
source.reset();
|
||||
if (source.incrementToken()) {
|
||||
part2 = termAtt.toString();
|
||||
}
|
||||
multipleTokens = source.incrementToken();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
if (multipleTokens) {
|
||||
throw new ParseException("Cannot build RangeQuery with analyzer " + getAnalyzer().getClass()
|
||||
+ " - tokens were added to part2");
|
||||
}
|
||||
}
|
||||
return super.getRangeQuery(field, part1, part2, startInclusive, endInclusive);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -22,7 +22,16 @@ import java.io.Reader;
|
|||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.queryparser.classic.ParseException;
|
||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/**
|
||||
|
@ -138,5 +147,28 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
|
|||
Tokenizer result = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
|
||||
return new TokenStreamComponents(result, new FoldingFilter(result));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// LUCENE-4176
|
||||
public void testByteTerms() throws Exception {
|
||||
Directory ramDir = newDirectory();
|
||||
Analyzer analyzer = new MockBytesAnalyzer();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), ramDir, analyzer);
|
||||
Document doc = new Document();
|
||||
FieldType fieldType = new FieldType();
|
||||
fieldType.setIndexed(true);
|
||||
fieldType.setTokenized(true);
|
||||
fieldType.setStored(true);
|
||||
Field field = new Field("content","เข", fieldType);
|
||||
doc.add(field);
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
DirectoryReader ir = DirectoryReader.open(ramDir);
|
||||
IndexSearcher is = new IndexSearcher(ir);
|
||||
QueryParser qp = new AnalyzingQueryParser(TEST_VERSION_CURRENT, "content", analyzer);
|
||||
Query q = qp.parse("[เข TO เข]");
|
||||
assertEquals(1, is.search(q, 10).totalHits);
|
||||
ir.close();
|
||||
ramDir.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* Analyzer for testing that encodes terms as UTF-16 bytes.
|
||||
*/
|
||||
public class MockBytesAnalyzer extends Analyzer {
|
||||
private final MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer t = new MockTokenizer(factory, reader, MockTokenizer.KEYWORD, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
|
||||
return new TokenStreamComponents(t);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* Attribute factory that implements CharTermAttribute with
|
||||
* {@link MockUTF16TermAttributeImpl}
|
||||
*/
|
||||
public class MockBytesAttributeFactory extends AttributeSource.AttributeFactory {
|
||||
private final AttributeSource.AttributeFactory delegate =
|
||||
AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
|
||||
|
||||
@Override
|
||||
public AttributeImpl createAttributeInstance(
|
||||
Class<? extends Attribute> attClass) {
|
||||
return attClass.isAssignableFrom(MockUTF16TermAttributeImpl.class)
|
||||
? new MockUTF16TermAttributeImpl()
|
||||
: delegate.createAttributeInstance(attClass);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* Extension of {@link CharTermAttributeImpl} that encodes the term
|
||||
* text as UTF-16 bytes instead of as UTF-8 bytes.
|
||||
*/
|
||||
public class MockUTF16TermAttributeImpl extends CharTermAttributeImpl {
|
||||
static final Charset charset = Charset.forName("UTF-16LE");
|
||||
|
||||
@Override
|
||||
public int fillBytesRef() {
|
||||
BytesRef bytes = getBytesRef();
|
||||
byte[] utf16 = toString().getBytes(charset);
|
||||
bytes.bytes = utf16;
|
||||
bytes.offset = 0;
|
||||
bytes.length = utf16.length;
|
||||
return bytes.hashCode();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue