mirror of https://github.com/apache/lucene.git
LUCENE-2157: DelimitedPayloadTokenFilter no longer copies the buffer over itself; instead it sets the term length to the offset of the delimiter. Also optimizes the filter logic and changes IdentityEncoder to use NIO.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@890791 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 86039cdb36
commit dad7e60253
CHANGES.txt:

@@ -65,6 +65,12 @@ Build
 into core, and moved the ICU-based collation support into contrib/icu.
 (Robert Muir)
 
+Optimizations
+
+* LUCENE-2157: DelimitedPayloadTokenFilter no longer copies the buffer
+  over itself. Instead it sets only the length. This patch also optimizes
+  the logic of the filter and uses NIO for IdentityEncoder. (Uwe Schindler)
+
 Test Cases
 
 * LUCENE-2115: Cutover contrib tests to use Java5 generics. (Kay Kay
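For context, a minimal sketch of what the filter does, since the entry above assumes familiarity with it: DelimitedPayloadTokenFilter splits each token at a delimiter character, keeps the prefix as the term, and hands the suffix to a PayloadEncoder. The wiring below (WhitespaceTokenizer on a StringReader, the (input, delimiter, encoder) constructor, and the 3.x-era TermAttribute/PayloadAttribute API) is an illustrative assumption matching this commit's vintage, not part of the commit itself:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.IdentityEncoder;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Payload;

public class DelimitedPayloadExample {
  public static void main(String[] args) throws Exception {
    // Tokenize on whitespace, then split each token at '|'.
    TokenStream ts = new DelimitedPayloadTokenFilter(
        new WhitespaceTokenizer(new StringReader("hello|world plain")),
        '|', new IdentityEncoder());
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    PayloadAttribute payAtt = ts.addAttribute(PayloadAttribute.class);
    while (ts.incrementToken()) {
      Payload p = payAtt.getPayload();
      // "hello" -> payload is the UTF-8 bytes of "world";
      // "plain" -> no delimiter, so the payload is null.
      System.out.println(termAtt.term() + " -> "
          + (p == null ? "no payload" : p.length() + " payload bytes"));
    }
  }
}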
DelimitedPayloadTokenFilter.java:

@@ -61,26 +61,19 @@ public final class DelimitedPayloadTokenFilter extends TokenFilter {
 
   @Override
   public boolean incrementToken() throws IOException {
-    boolean result = false;
     if (input.incrementToken()) {
       final char[] buffer = termAtt.termBuffer();
       final int length = termAtt.termLength();
-      //look for the delimiter
-      boolean seen = false;
       for (int i = 0; i < length; i++) {
         if (buffer[i] == delimiter) {
-          termAtt.setTermBuffer(buffer, 0, i);
           payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1))));
-          seen = true;
-          break;//at this point, we know the whole piece, so we can exit. If we don't see the delimiter, then the termAtt is the same
+          termAtt.setTermLength(i); // simply set a new length
+          return true;
         }
       }
-      if (seen == false) {
-        //no delimiter
-        payAtt.setPayload(null);
-      }
-      result = true;
-    }
-    return result;
+      // we have not seen the delimiter
+      payAtt.setPayload(null);
+      return true;
+    } else return false;
   }
 }
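The gain comes from avoiding a self-copy: termBuffer() returns the attribute's own internal array, so the old setTermBuffer(buffer, 0, i) call arraycopied that array onto itself, while setTermLength(i) only records the shorter length; the characters up to the delimiter are already in place. A simplified, hypothetical model of the two calls (field and method names mimic the real API, but this is not the Lucene class):

// Hypothetical, stripped-down stand-in for the term attribute.
class SimpleTermAttribute {
  private char[] termBuffer = new char[64];
  private int termLength;

  public char[] termBuffer() {
    return termBuffer; // callers get the internal array itself
  }

  // Before LUCENE-2157 the filter called this with the array returned by
  // termBuffer(), so the arraycopy copied the buffer onto itself: O(n)
  // work for no change in content.
  public void setTermBuffer(char[] buffer, int offset, int length) {
    System.arraycopy(buffer, offset, termBuffer, 0, length);
    termLength = length;
  }

  // After LUCENE-2157 the filter only truncates: the first i chars are
  // already in place, so recording the new length is enough.
  public void setTermLength(int length) {
    termLength = length;
  }

  public int termLength() {
    return termLength;
  }
}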
IdentityEncoder.java:

@@ -18,9 +18,9 @@ package org.apache.lucene.analysis.payloads;
 
 import org.apache.lucene.index.Payload;
 
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
 import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.io.UnsupportedEncodingException;
 
 
 /**
@@ -30,28 +30,30 @@ import java.io.UnsupportedEncodingException;
 public class IdentityEncoder extends AbstractEncoder implements PayloadEncoder{
 
   protected Charset charset = Charset.forName("UTF-8");
-  protected String charsetName = "UTF-8"; //argh, stupid 1.4
+
+  /** @deprecated This field is no longer used. Use {@link #charset} instead. */
+  @Deprecated
+  protected String charsetName = charset.name();
 
   public IdentityEncoder() {
   }
 
   public IdentityEncoder(Charset charset) {
     this.charset = charset;
+    // @deprecated, remove this in 4.0:
     charsetName = charset.name();
   }
 
 
   public Payload encode(char[] buffer, int offset, int length) {
-    //what's the most efficient way to get a byte [] from a char[] array
-    //Do we have to go through String?
-    String tmp = new String(buffer, offset, length);
-    Payload result = null;//Can we avoid allocating by knowing where using the new API?
-    try {
-      result = new Payload(tmp.getBytes(charsetName));
-    } catch (UnsupportedEncodingException e) {
-      //should never hit this, since we get the name from the Charset
+    final ByteBuffer bb = charset.encode(CharBuffer.wrap(buffer, offset, length));
+    if (bb.hasArray()) {
+      return new Payload(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
+    } else {
+      // normally it should always have an array, but who knows?
+      final byte[] b = new byte[bb.remaining()];
+      bb.get(b);
+      return new Payload(b);
     }
-
-    return result;
   }
 }
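On the encoder side, the NIO path drops the per-token intermediate String and the checked UnsupportedEncodingException of String.getBytes(String): Charset.encode works on a CharBuffer view of the existing char[] slice. A self-contained sketch of the same technique using only JDK classes, mirroring the hasArray() branching above (the sample input and class name are illustrative):

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;

public class NioEncodeSketch {
  public static void main(String[] args) {
    final char[] buffer = "hello|world".toCharArray();
    final int offset = 6, length = 5; // the "world" slice after the delimiter

    // Encode a slice of a char[] directly, without allocating a String.
    final ByteBuffer bb = Charset.forName("UTF-8")
        .encode(CharBuffer.wrap(buffer, offset, length));

    if (bb.hasArray()) {
      // Zero-copy: pass the backing array plus offset/length downstream,
      // exactly as the patched encode() hands them to Payload.
      System.out.println(new String(bb.array(),
          bb.arrayOffset() + bb.position(), bb.remaining())); // prints "world"
    } else {
      // Direct buffers expose no backing array; fall back to a bulk get.
      final byte[] b = new byte[bb.remaining()];
      bb.get(b);
      System.out.println(new String(b)); // prints "world"
    }
  }
}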