LUCENE-2157: DelimitedPayloadTokenFilter no longer copies the buffer over itself; instead it sets the length to the offset of the delimiter. Also optimizes logic and IdentityEncoder to use NIO.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@890791 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2009-12-15 13:27:27 +00:00
parent 86039cdb36
commit dad7e60253
3 changed files with 27 additions and 26 deletions

View File

@ -65,6 +65,12 @@ Build
into core, and moved the ICU-based collation support into contrib/icu.
(Robert Muir)
Optimizations
* LUCENE-2157: DelimitedPayloadTokenFilter no longer copies the buffer
over itself. Instead it sets only the length. This patch also optimizes
the logic of the filter and uses NIO for IdentityEncoder. (Uwe Schindler)
Test Cases
* LUCENE-2115: Cutover contrib tests to use Java5 generics. (Kay Kay

View File

@ -61,26 +61,19 @@ public final class DelimitedPayloadTokenFilter extends TokenFilter {
// NOTE(review): this hunk is shown without +/- markers, so lines from the
// pre-patch and post-patch versions of the method appear interleaved (the hunk
// header says 26 old lines became 19 new lines). It is not compilable as shown.
/**
 * Advances the wrapped input by one token and splits its term at the first
 * occurrence of the delimiter character. The characters after the delimiter
 * are handed to the encoder and stored as the token's payload; the term is cut
 * so that it ends just before the delimiter. If the term contains no delimiter,
 * the payload is set to null and the term is left as it is.
 *
 * @return true if the input produced a token, false if the input is exhausted
 * @throws IOException declared by the method; presumably propagated from the wrapped input
 */
@Override
public boolean incrementToken() throws IOException {
// presumably pre-patch: result flag returned at the end of the old body
boolean result = false;
if (input.incrementToken()) {
final char[] buffer = termAtt.termBuffer();
final int length = termAtt.termLength();
//look for the delimiter
// presumably pre-patch: flag recording whether the delimiter was found
boolean seen = false;
for (int i = 0; i < length; i++) {
if (buffer[i] == delimiter) {
// presumably pre-patch: re-sets the term from its own buffer (the copy the commit message says was removed)
termAtt.setTermBuffer(buffer, 0, i);
// payload is encoded from everything after the delimiter: offset i + 1, length (length - (i + 1))
payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1))));
seen = true;
break;//at this point, we know the whole piece, so we can exit. If we don't see the delimiter, then the termAtt is the same
// presumably post-patch: replaces the setTermBuffer/seen/break lines above
termAtt.setTermLength(i); // simply set a new length
return true;
}
}
// presumably pre-patch guard; in the post-patch flow this point is only reached when no delimiter was found
if (seen == false) {
//no delimiter
// we have not seen the delimiter
payAtt.setPayload(null);
}
result = true;
}
// presumably pre-patch return, followed by the post-patch return/else pair
return result;
return true;
} else return false;
}
}

View File

@ -18,9 +18,9 @@ package org.apache.lucene.analysis.payloads;
import org.apache.lucene.index.Payload;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.io.UnsupportedEncodingException;
/**
@ -30,28 +30,30 @@ import java.io.UnsupportedEncodingException;
public class IdentityEncoder extends AbstractEncoder implements PayloadEncoder{
// NOTE(review): this hunk is shown without +/- markers, so lines from the
// pre-patch and post-patch versions of the class appear interleaved (for
// example, charsetName is declared twice). It is not compilable as shown.
//
// Purpose as visible here: encode(...) turns a slice of a char[] into a
// Payload holding the bytes of those characters in the configured charset.

// charset used by encode(...); UTF-8 unless replaced through the constructor
protected Charset charset = Charset.forName("UTF-8");
// presumably the pre-patch declaration of charsetName
protected String charsetName = "UTF-8"; //argh, stupid 1.4
/** @deprecated This field is no longer used. Use {@link #charset} instead. */
@Deprecated
protected String charsetName = charset.name();
// no-arg constructor: keeps the field defaults above
public IdentityEncoder() {
}
// stores the given charset and keeps the deprecated charsetName field in sync with it
public IdentityEncoder(Charset charset) {
this.charset = charset;
// @deprecated, remove this in 4.0:
charsetName = charset.name();
}
// Encodes buffer[offset .. offset + length) into a Payload.
public Payload encode(char[] buffer, int offset, int length) {
// presumably pre-patch path: goes through a temporary String and getBytes(charsetName)
//what's the most efficient way to get a byte [] from a char[] array
//Do we have to go through String?
String tmp = new String(buffer, offset, length);
Payload result = null;//Can we avoid allocating by knowing where using the new API?
try {
result = new Payload(tmp.getBytes(charsetName));
} catch (UnsupportedEncodingException e) {
//should never hit this, since we get the name from the Charset
// if it does happen, result stays null and null is returned
}
return result;
// presumably post-patch path: encodes the char slice directly with the Charset, without the temporary String
final ByteBuffer bb = charset.encode(CharBuffer.wrap(buffer, offset, length));
if (bb.hasArray()) {
// uses the backing array directly: start at arrayOffset() + position(), length remaining()
return new Payload(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
} else {
// normally it should always have an array, but who knows?
// fallback: copy the remaining bytes into a fresh array
final byte[] b = new byte[bb.remaining()];
bb.get(b);
return new Payload(b);
}
}
}