mirror of https://github.com/apache/lucene.git
SOLR-12775: Add deprecated versions of LowerCaseTokenizer and LowerCaseTokenizerFactory
This commit is contained in:
parent
f7fa25069e
commit
2459072286
|
@ -98,6 +98,9 @@ Other Changes
|
|||
|
||||
* SOLR-12620: Remove the Admin UI Cloud -> Graph (Radial) view (janhoy)
|
||||
|
||||
* SOLR-12775: LowerCaseTokenizer is deprecated, and should be replaced by LetterTokenizer and
|
||||
LowerCaseFilter (Alan Woodward)
|
||||
|
||||
================== 7.7.0 ==================
|
||||
|
||||
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
|
||||
|
|
|
@ -0,0 +1,156 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.CharacterUtils;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.LetterTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
|
||||
/**
|
||||
* LowerCaseTokenizer performs the function of LetterTokenizer
|
||||
* and LowerCaseFilter together. It divides text at non-letters and converts
|
||||
* them to lower case. While it is functionally equivalent to the combination
|
||||
* of LetterTokenizer and LowerCaseFilter, there is a performance advantage
|
||||
* to doing the two tasks at once, hence this (redundant) implementation.
|
||||
* <P>
|
||||
* Note: this does a decent job for most European languages, but does a terrible
|
||||
* job for some Asian languages, where words are not separated by spaces.
|
||||
* </p>
|
||||
*
|
||||
* @deprecated Use {@link LetterTokenizer} and {@link org.apache.lucene.analysis.LowerCaseFilter}
|
||||
*/
|
||||
@Deprecated
|
||||
public final class LowerCaseTokenizer extends Tokenizer {
|
||||
|
||||
/**
|
||||
* Construct a new LowerCaseTokenizer.
|
||||
*/
|
||||
public LowerCaseTokenizer() {
|
||||
this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new LowerCaseTokenizer using a given
|
||||
* {@link org.apache.lucene.util.AttributeFactory}.
|
||||
*
|
||||
* @param factory
|
||||
* the attribute factory to use for this {@link Tokenizer}
|
||||
*/
|
||||
public LowerCaseTokenizer(AttributeFactory factory) {
|
||||
this(factory, DEFAULT_MAX_WORD_LEN);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new LowerCaseTokenizer using a given
|
||||
* {@link org.apache.lucene.util.AttributeFactory}.
|
||||
*
|
||||
* @param factory the attribute factory to use for this {@link Tokenizer}
|
||||
* @param maxTokenLen maximum token length the tokenizer will emit.
|
||||
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
|
||||
* @throws IllegalArgumentException if maxTokenLen is invalid.
|
||||
*/
|
||||
public LowerCaseTokenizer(AttributeFactory factory, int maxTokenLen) {
|
||||
super(factory);
|
||||
this.maxTokenLen = maxTokenLen;
|
||||
}
|
||||
|
||||
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
|
||||
public static final int DEFAULT_MAX_WORD_LEN = 255;
|
||||
private static final int IO_BUFFER_SIZE = 4096;
|
||||
private final int maxTokenLen;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
int length = 0;
|
||||
int start = -1; // this variable is always initialized
|
||||
int end = -1;
|
||||
char[] buffer = termAtt.buffer();
|
||||
while (true) {
|
||||
if (bufferIndex >= dataLen) {
|
||||
offset += dataLen;
|
||||
CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
|
||||
if (ioBuffer.getLength() == 0) {
|
||||
dataLen = 0; // so next offset += dataLen won't decrement offset
|
||||
if (length > 0) {
|
||||
break;
|
||||
} else {
|
||||
finalOffset = correctOffset(offset);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
dataLen = ioBuffer.getLength();
|
||||
bufferIndex = 0;
|
||||
}
|
||||
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
|
||||
final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
|
||||
final int charCount = Character.charCount(c);
|
||||
bufferIndex += charCount;
|
||||
|
||||
if (Character.isLetter(c)) { // if it's a token char
|
||||
if (length == 0) { // start of token
|
||||
assert start == -1;
|
||||
start = offset + bufferIndex - charCount;
|
||||
end = start;
|
||||
} else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
|
||||
buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
|
||||
}
|
||||
end += charCount;
|
||||
length += Character.toChars(Character.toLowerCase(c), buffer, length); // buffer it, normalized
|
||||
if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test
|
||||
break;
|
||||
}
|
||||
} else if (length > 0) { // at non-Letter w/ chars
|
||||
break; // return 'em
|
||||
}
|
||||
}
|
||||
|
||||
termAtt.setLength(length);
|
||||
assert start != -1;
|
||||
offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public final void end() throws IOException {
|
||||
super.end();
|
||||
// set final offset
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
bufferIndex = 0;
|
||||
offset = 0;
|
||||
dataLen = 0;
|
||||
finalOffset = 0;
|
||||
ioBuffer.reset(); // make sure to reset the IO buffer!!
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,69 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
|
||||
import org.apache.lucene.analysis.util.CharTokenizer;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
|
||||
import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
|
||||
|
||||
/**
|
||||
* Factory for {@link LowerCaseTokenizer}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="256"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* <p>
|
||||
* Options:
|
||||
* <ul>
|
||||
* <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
|
||||
* It is rare to need to change this
|
||||
* else {@link CharTokenizer}::DEFAULT_MAX_WORD_LEN</li>
|
||||
* </ul>
|
||||
* @deprecated Use {@link org.apache.lucene.analysis.core.LetterTokenizerFactory} and {@link LowerCaseFilterFactory}
|
||||
*/
|
||||
@Deprecated
|
||||
public class LowerCaseTokenizerFactory extends TokenizerFactory {
|
||||
|
||||
private final int maxTokenLen;
|
||||
|
||||
/**
|
||||
* Creates a new LowerCaseTokenizerFactory
|
||||
*/
|
||||
public LowerCaseTokenizerFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
|
||||
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
|
||||
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
|
||||
}
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public LowerCaseTokenizer create(AttributeFactory factory) {
|
||||
return new LowerCaseTokenizer(factory, maxTokenLen);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- Minimal schema whose only analyzed field type is built on solr.LowerCaseTokenizerFactory. -->
<schema name="schema-deprecations" version="1.6">
|
||||
|
||||
<types>
|
||||
<!-- Text type whose analyzer consists solely of the LowerCaseTokenizerFactory tokenizer (default options). -->
<fieldType name="lowertok" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.LowerCaseTokenizerFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
<fieldType name="string" class="solr.StrField"/>
|
||||
<!-- The implementing class is taken from the solr.tests.LongFieldType property. -->
<fieldType name="long" class="${solr.tests.LongFieldType}"/>
|
||||
</types>
|
||||
|
||||
<fields>
|
||||
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
|
||||
<field name="_version_" type="long" indexed="false" stored="false" docValues="true"/>
|
||||
<!-- Indexed field that uses the "lowertok" type declared above. -->
<field name="lowertext" type="lowertok" indexed="true"/>
|
||||
</fields>
|
||||
|
||||
</schema>
|
|
@ -0,0 +1,36 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
public class TestDeprecatedFilters extends SolrTestCaseJ4 {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("solrconfig-master.xml","schema-deprecations.xml");
|
||||
}
|
||||
|
||||
public void testLowerCaseTokenizer() {
|
||||
assertU(adoc("id", "1", "lowertext", "THIS IS A TEST"));
|
||||
assertU(commit());
|
||||
assertQ(req("lowertext:test"), "//result[@numFound=1]");
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue