mirror of https://github.com/apache/lucene.git
LUCENE-2157: DelimitedPayloadTokenFilter no longer copies the buffer over itself; instead it sets the term length to the offset of the delimiter. Also optimizes the filter logic and changes IdentityEncoder to use NIO.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@890791 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 86039cdb36
commit dad7e60253
CHANGES.txt:

@@ -65,6 +65,12 @@ Build
 into core, and moved the ICU-based collation support into contrib/icu.
 (Robert Muir)
 
+Optimizations
+
+* LUCENE-2157: DelimitedPayloadTokenFilter no longer copies the buffer
+  over itself. Instead it sets only the length. This patch also optimizes
+  the logic of the filter and uses NIO for IdentityEncoder. (Uwe Schindler)
+
 Test Cases
 
 * LUCENE-2115: Cutover contrib tests to use Java5 generics. (Kay Kay
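For context, a minimal sketch of what the filter does, since the entry above assumes familiarity with it: DelimitedPayloadTokenFilter splits each token at a delimiter character, keeps the prefix as the term, and hands the suffix to a PayloadEncoder. The wiring below (WhitespaceTokenizer on a StringReader, the (input, delimiter, encoder) constructor, and the 3.x-era TermAttribute/PayloadAttribute API) is an illustrative assumption matching this commit's vintage, not part of the commit itself:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.IdentityEncoder;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Payload;

public class DelimitedPayloadExample {
  public static void main(String[] args) throws Exception {
    // Tokenize on whitespace, then split each token at '|'.
    TokenStream ts = new DelimitedPayloadTokenFilter(
        new WhitespaceTokenizer(new StringReader("hello|world plain")),
        '|', new IdentityEncoder());
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    PayloadAttribute payAtt = ts.addAttribute(PayloadAttribute.class);
    while (ts.incrementToken()) {
      Payload p = payAtt.getPayload();
      // "hello" -> payload is the UTF-8 bytes of "world";
      // "plain" -> no delimiter, so the payload is null.
      System.out.println(termAtt.term() + " -> "
          + (p == null ? "no payload" : p.length() + " payload bytes"));
    }
  }
}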
DelimitedPayloadTokenFilter.java:

@@ -61,26 +61,19 @@ public final class DelimitedPayloadTokenFilter extends TokenFilter {
 
   @Override
   public boolean incrementToken() throws IOException {
-    boolean result = false;
     if (input.incrementToken()) {
       final char[] buffer = termAtt.termBuffer();
       final int length = termAtt.termLength();
-      //look for the delimiter
-      boolean seen = false;
       for (int i = 0; i < length; i++) {
         if (buffer[i] == delimiter) {
-          termAtt.setTermBuffer(buffer, 0, i);
           payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1))));
-          seen = true;
-          break;//at this point, we know the whole piece, so we can exit. If we don't see the delimiter, then the termAtt is the same
+          termAtt.setTermLength(i); // simply set a new length
+          return true;
         }
       }
-      if (seen == false) {
-        //no delimiter
-        payAtt.setPayload(null);
-      }
-      result = true;
-    }
-    return result;
+      // we have not seen the delimiter
+      payAtt.setPayload(null);
+      return true;
+    } else return false;
   }
 }
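The gain comes from avoiding a self-copy: termBuffer() returns the attribute's own internal array, so the old setTermBuffer(buffer, 0, i) call arraycopied that array onto itself, while setTermLength(i) only records the shorter length; the characters up to the delimiter are already in place. A simplified, hypothetical model of the two calls (field and method names mimic the real API, but this is not the Lucene class):

// Hypothetical, stripped-down stand-in for the term attribute.
class SimpleTermAttribute {
  private char[] termBuffer = new char[64];
  private int termLength;

  public char[] termBuffer() {
    return termBuffer; // callers get the internal array itself
  }

  // Before LUCENE-2157 the filter called this with the array returned by
  // termBuffer(), so the arraycopy copied the buffer onto itself: O(n)
  // work for no change in content.
  public void setTermBuffer(char[] buffer, int offset, int length) {
    System.arraycopy(buffer, offset, termBuffer, 0, length);
    termLength = length;
  }

  // After LUCENE-2157 the filter only truncates: the first i chars are
  // already in place, so recording the new length is enough.
  public void setTermLength(int length) {
    termLength = length;
  }

  public int termLength() {
    return termLength;
  }
}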
IdentityEncoder.java:

@@ -18,9 +18,9 @@ package org.apache.lucene.analysis.payloads;
 
 import org.apache.lucene.index.Payload;
 
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
 import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.io.UnsupportedEncodingException;
 
 
 /**
@@ -30,28 +30,30 @@ import java.io.UnsupportedEncodingException;
 public class IdentityEncoder extends AbstractEncoder implements PayloadEncoder{
 
   protected Charset charset = Charset.forName("UTF-8");
-  protected String charsetName = "UTF-8"; //argh, stupid 1.4
+
+  /** @deprecated This field is no longer used. Use {@link #charset} instead. */
+  @Deprecated
+  protected String charsetName = charset.name();
 
   public IdentityEncoder() {
   }
 
   public IdentityEncoder(Charset charset) {
     this.charset = charset;
+    // @deprecated, remove this in 4.0:
     charsetName = charset.name();
   }
 
 
   public Payload encode(char[] buffer, int offset, int length) {
-    //what's the most efficient way to get a byte [] from a char[] array
-    //Do we have to go through String?
-    String tmp = new String(buffer, offset, length);
-    Payload result = null;//Can we avoid allocating by knowing where using the new API?
-    try {
-      result = new Payload(tmp.getBytes(charsetName));
-    } catch (UnsupportedEncodingException e) {
-      //should never hit this, since we get the name from the Charset
+    final ByteBuffer bb = charset.encode(CharBuffer.wrap(buffer, offset, length));
+    if (bb.hasArray()) {
+      return new Payload(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
+    } else {
+      // normally it should always have an array, but who knows?
+      final byte[] b = new byte[bb.remaining()];
+      bb.get(b);
+      return new Payload(b);
     }
-
-    return result;
   }
 }
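On the encoder side, the NIO path drops the per-token intermediate String and the checked UnsupportedEncodingException of String.getBytes(String): Charset.encode works on a CharBuffer view of the existing char[] slice. A self-contained sketch of the same technique using only JDK classes, mirroring the hasArray() branching above (the sample input and class name are illustrative):

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;

public class NioEncodeSketch {
  public static void main(String[] args) {
    final char[] buffer = "hello|world".toCharArray();
    final int offset = 6, length = 5; // the "world" slice after the delimiter

    // Encode a slice of a char[] directly, without allocating a String.
    final ByteBuffer bb = Charset.forName("UTF-8")
        .encode(CharBuffer.wrap(buffer, offset, length));

    if (bb.hasArray()) {
      // Zero-copy: pass the backing array plus offset/length downstream,
      // exactly as the patched encode() hands them to Payload.
      System.out.println(new String(bb.array(),
          bb.arrayOffset() + bb.position(), bb.remaining())); // prints "world"
    } else {
      // Direct buffers expose no backing array; fall back to a bulk get.
      final byte[] b = new byte[bb.remaining()];
      bb.get(b);
      System.out.println(new String(b)); // prints "world"
    }
  }
}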