SOLR-12775: Add deprecated versions of LowerCaseTokenizer and LowerCaseTokenizerFactory

2018-11-22 13:54:01 +00:00 · 2018-11-22 13:54:01 +00:00 · 2459072286
parent f7fa25069e
commit 2459072286
5 changed files with 300 additions and 0 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -98,6 +98,9 @@ Other Changes

 * SOLR-12620: Remove the Admin UI Cloud -> Graph (Radial) view (janhoy)

+* SOLR-12775: LowerCaseTokenizer is deprecated, and should be replaced by LetterTokenizer and
+  LowerCaseFilter (Alan Woodward)
+
 ==================  7.7.0 ==================

 Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
--- a/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizer.java
+++ b/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizer.java
@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CharacterUtils;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.AttributeFactory;
+
+/**
+ * LowerCaseTokenizer performs the function of LetterTokenizer
+ * and LowerCaseFilter together.  It divides text at non-letters and converts
+ * the tokens to lower case.  While it is functionally equivalent to the combination
+ * of LetterTokenizer and LowerCaseFilter, there is a performance advantage
+ * to doing the two tasks at once, hence this (redundant) implementation.
+ * <p>
+ * Note: this does a decent job for most European languages, but does a terrible
+ * job for some Asian languages, where words are not separated by spaces.
+ * </p>
+ *
+ * @deprecated Use {@link LetterTokenizer} and {@link org.apache.lucene.analysis.LowerCaseFilter}
+ */
+@Deprecated
+public final class LowerCaseTokenizer extends Tokenizer {
+
+  /** Construct a new LowerCaseTokenizer with a maximum token length of {@link #DEFAULT_MAX_WORD_LEN}. */
+  public LowerCaseTokenizer() {
+    this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
+  }
+
+  /**
+   * Construct a new LowerCaseTokenizer using a given {@link AttributeFactory}.
+   *
+   * @param factory the attribute factory to use for this {@link Tokenizer}
+   */
+  public LowerCaseTokenizer(AttributeFactory factory) {
+    this(factory, DEFAULT_MAX_WORD_LEN);
+  }
+
+  /**
+   * Construct a new LowerCaseTokenizer using a given {@link AttributeFactory}.
+   *
+   * @param factory the attribute factory to use for this {@link Tokenizer}
+   * @param maxTokenLen maximum token length the tokenizer will emit.
+   *        Must be greater than 0 and no greater than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
+   * @throws IllegalArgumentException if maxTokenLen is invalid.
+   */
+  public LowerCaseTokenizer(AttributeFactory factory, int maxTokenLen) {
+    super(factory);
+    // Enforce the documented range instead of silently accepting an unusable limit.
+    if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
+      throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+    }
+    this.maxTokenLen = maxTokenLen;
+  }
+
+  // offset counts the chars consumed before the current ioBuffer fill; bufferIndex and dataLen track the read position within it
+  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
+  /** Maximum token length used when the caller does not supply one. */
+  public static final int DEFAULT_MAX_WORD_LEN = 255;
+  private static final int MAX_TOKEN_LENGTH_LIMIT = org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+  private static final int IO_BUFFER_SIZE = 4096;
+  private final int maxTokenLen;
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
+
+  /** Emits the next run of letters (cut at maxTokenLen), lower-cased, with its offsets; returns false at end of input. */
+  @Override
+  public final boolean incrementToken() throws IOException {
+    clearAttributes();
+    int length = 0;
+    int start = -1; // this variable is always initialized
+    int end = -1;
+    char[] buffer = termAtt.buffer();
+    while (true) {
+      if (bufferIndex >= dataLen) {
+        offset += dataLen;
+        CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
+        if (ioBuffer.getLength() == 0) {
+          dataLen = 0; // so next offset += dataLen won't decrement offset
+          if (length > 0) {
+            break;
+          } else {
+            finalOffset = correctOffset(offset);
+            return false;
+          }
+        }
+        dataLen = ioBuffer.getLength();
+        bufferIndex = 0;
+      }
+      // decode a whole code point so a surrogate pair is handled as a single character
+      final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
+      final int charCount = Character.charCount(c);
+      bufferIndex += charCount;
+
+      if (Character.isLetter(c)) {               // if it's a token char
+        if (length == 0) {                // start of token
+          assert start == -1;
+          start = offset + bufferIndex - charCount;
+          end = start;
+        } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
+          buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
+        }
+        end += charCount;
+        length += Character.toChars(Character.toLowerCase(c), buffer, length); // buffer it, normalized
+        if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test
+          break;
+        }
+      } else if (length > 0) {           // at non-Letter w/ chars
+        break;                           // return 'em
+      }
+    }
+
+    termAtt.setLength(length);
+    assert start != -1;
+    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(end));
+    return true;
+  }
+
+  @Override
+  public final void end() throws IOException {
+    super.end();
+    offsetAtt.setOffset(finalOffset, finalOffset); // set final offset
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    bufferIndex = 0;
+    offset = 0;
+    dataLen = 0;
+    finalOffset = 0;
+    ioBuffer.reset(); // make sure to reset the IO buffer!!
+  }
+
+}
--- a/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java
@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
+import org.apache.lucene.analysis.util.CharTokenizer;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeFactory;
+
+import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
+
+/**
+ * Factory for {@link LowerCaseTokenizer}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100"&gt;
+ * &lt;analyzer&gt;
+ * &lt;tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="256"/&gt;
+ * &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * <p>
+ * Options:
+ * <ul>
+ * <li>maxTokenLen: maximum token length; must be greater than 0 and no larger than
+ *     MAX_TOKEN_LENGTH_LIMIT (1024*1024). It is rarely necessary to change this; when it is
+ *     not set, {@link CharTokenizer#DEFAULT_MAX_WORD_LEN} is used.</li>
+ * </ul>
+ * @deprecated Use {@link org.apache.lucene.analysis.core.LetterTokenizerFactory} and {@link LowerCaseFilterFactory}
+ */
+@Deprecated
+public class LowerCaseTokenizerFactory extends TokenizerFactory {
+
+  private final int maxTokenLen; // handed to every tokenizer built by create()
+
+  /**
+   * Creates a new LowerCaseTokenizerFactory, reading the optional maxTokenLen argument.
+   * @throws IllegalArgumentException if maxTokenLen is out of range or args is non-empty once it has been read
+   */
+  public LowerCaseTokenizerFactory(Map<String, String> args) {
+    super(args);
+    maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
+    if (maxTokenLen <= 0 || maxTokenLen > MAX_TOKEN_LENGTH_LIMIT) {
+      throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
+    }
+    if (args.isEmpty() == false) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public LowerCaseTokenizer create(AttributeFactory factory) {
+    return new LowerCaseTokenizer(factory, maxTokenLen);
+  }
+}
--- a/solr/core/src/test-files/solr/collection1/conf/schema-deprecations.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-deprecations.xml
@ -0,0 +1,36 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<schema name="schema-deprecations" version="1.6">
+
+  <types>
+    <fieldType name="lowertok" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.LowerCaseTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+    <fieldType name="string" class="solr.StrField"/>
+    <fieldType name="long" class="${solr.tests.LongFieldType}"/>
+  </types>
+
+  <fields>
+    <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
+    <field name="_version_" type="long" indexed="false" stored="false" docValues="true"/>
+    <field name="lowertext" type="lowertok" indexed="true"/>
+  </fields>
+
+</schema>
--- a/solr/core/src/test/org/apache/solr/analysis/TestDeprecatedFilters.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestDeprecatedFilters.java
@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+public class TestDeprecatedFilters extends SolrTestCaseJ4 {
+  // Exercises the deprecated analysis components declared in schema-deprecations.xml through a test core.
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-master.xml","schema-deprecations.xml");
+  }
+
+  public void testLowerCaseTokenizer() {
+    assertU(adoc("id", "1", "lowertext", "THIS IS A TEST")); // upper-case input for the "lowertext" field
+    assertU(commit());
+    assertQ(req("lowertext:test"), "//result[@numFound=1]"); // lower-case term must match the one document
+  }
+
+}