mirror of https://github.com/apache/lucene.git
LUCENE-2295: Added a LimitTokenCountAnalyzer / LimitTokenCountFilter to wrap any other Analyzer and provide the same functionality as MaxFieldLength provided on IndexWriter. This patch also fixes a bug in the offset calculation in CharTokenizer.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@949445 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent b457760585
commit 98b252ed7f
CHANGES.txt
@@ -508,6 +508,11 @@ New features
   can be used to prevent commits from ever getting deleted from the index.
   (Shai Erera)

+* LUCENE-2295: Added a LimitTokenCountAnalyzer / LimitTokenCountFilter
+  to wrap any other Analyzer and provide the same functionality as
+  MaxFieldLength provided on IndexWriter. This patch also fixes a bug
+  in the offset calculation in CharTokenizer. (Uwe Schindler, Shai Erera)
+
 Optimizations

 * LUCENE-2075: Terms dict cache is now shared across threads instead
CharTokenizer.java
@@ -151,7 +151,7 @@ public abstract class CharTokenizer extends Tokenizer {
     this(Version.LUCENE_30, factory, input);
   }

-  private int offset = 0, bufferIndex = 0, dataLen = 0;
+  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;

@@ -265,17 +265,19 @@ public abstract class CharTokenizer extends Tokenizer {
     if(useOldAPI) // TODO remove this in LUCENE 4.0
       return incrementTokenOld();
     int length = 0;
-    int start = bufferIndex;
+    int start = -1; // this variable is always initialized
     char[] buffer = termAtt.buffer();
     while (true) {
       if (bufferIndex >= dataLen) {
         offset += dataLen;
         if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
           dataLen = 0; // so next offset += dataLen won't decrement offset
-          if (length > 0)
+          if (length > 0) {
             break;
-          else
+          } else {
+            finalOffset = correctOffset(offset);
             return false;
+          }
         }
         dataLen = ioBuffer.getLength();
         bufferIndex = 0;
@@ -285,10 +287,12 @@ public abstract class CharTokenizer extends Tokenizer {
       bufferIndex += Character.charCount(c);

       if (isTokenChar(c)) {               // if it's a token char
-        if (length == 0)                  // start of token
+        if (length == 0) {                // start of token
+          assert start == -1;
           start = offset + bufferIndex - 1;
-        else if (length >= buffer.length-1) // check if a supplementary could run out of bounds
+        } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
           buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
+        }
         length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
         if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
           break;
@@ -297,7 +301,8 @@ public abstract class CharTokenizer extends Tokenizer {
     }

     termAtt.setLength(length);
-    offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
+    assert start != -1;
+    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start+length));
     return true;

   }
@@ -310,7 +315,7 @@ public abstract class CharTokenizer extends Tokenizer {
   @Deprecated
   private boolean incrementTokenOld() throws IOException {
     int length = 0;
-    int start = bufferIndex;
+    int start = -1; // this variable is always initialized
     char[] buffer = termAtt.buffer();
     final char[] oldIoBuffer = ioBuffer.getBuffer();
     while (true) {
@@ -320,10 +325,12 @@ public abstract class CharTokenizer extends Tokenizer {
         dataLen = input.read(oldIoBuffer);
         if (dataLen == -1) {
           dataLen = 0; // so next offset += dataLen won't decrement offset
-          if (length > 0)
+          if (length > 0) {
             break;
-          else
+          } else {
+            finalOffset = correctOffset(offset);
             return false;
+          }
         }
         bufferIndex = 0;
       }
@@ -332,10 +339,12 @@ public abstract class CharTokenizer extends Tokenizer {

       if (isTokenChar(c)) {               // if it's a token char

-        if (length == 0)                  // start of token
+        if (length == 0) {                // start of token
+          assert start == -1;
           start = offset + bufferIndex - 1;
-        else if (length == buffer.length)
+        } else if (length == buffer.length) {
           buffer = termAtt.resizeBuffer(1+length);
+        }

         buffer[length++] = normalize(c); // buffer it, normalized

@@ -347,6 +356,7 @@ public abstract class CharTokenizer extends Tokenizer {
     }

     termAtt.setLength(length);
+    assert start != -1;
     offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
     return true;
   }
@@ -356,7 +366,6 @@ public abstract class CharTokenizer extends Tokenizer {
   @Override
   public final void end() {
     // set final offset
-    final int finalOffset = correctOffset(offset);
     offsetAtt.setOffset(finalOffset, finalOffset);
   }

@@ -366,6 +375,7 @@ public abstract class CharTokenizer extends Tokenizer {
     bufferIndex = 0;
     offset = 0;
     dataLen = 0;
+    finalOffset = 0;
     ioBuffer.reset(); // make sure to reset the IO buffer!!
   }

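The net effect of the CharTokenizer changes: the token start is only ever assigned from offset + bufferIndex when a token actually begins (guarded by an assert) instead of being pre-initialized to bufferIndex, and end() now reports a finalOffset tracked during tokenization rather than recomputing it from offset. A minimal consumer sketch of the contract this establishes; the tokenizer class, package locations, and version constant are era-appropriate assumptions, not part of this commit:

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class FinalOffsetDemo {
  // Exhaust a CharTokenizer subclass, then read the final offset set by end().
  static int finalOffsetOf(Reader reader) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // per-token offsets have already been passed through correctOffset()
    }
    ts.end(); // with this patch, exposes the tracked finalOffset
    int end = offsetAtt.endOffset();
    ts.close();
    return end;
  }
}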
LimitTokenCountAnalyzer.java (new file)
@@ -0,0 +1,71 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.Reader;
+import java.io.IOException;
+
+/**
+ * This Analyzer limits the number of tokens while indexing. It is
+ * a replacement for the maximum field length setting inside {@link org.apache.lucene.index.IndexWriter}.
+ */
+public final class LimitTokenCountAnalyzer extends Analyzer {
+  private final Analyzer delegate;
+  private final int maxTokenCount;
+
+  /**
+   * Build an analyzer that limits the maximum number of tokens per field.
+   */
+  public LimitTokenCountAnalyzer(Analyzer delegate, int maxTokenCount) {
+    this.delegate = delegate;
+    this.maxTokenCount = maxTokenCount;
+  }
+
+  @Override
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    return new LimitTokenCountFilter(
+      delegate.tokenStream(fieldName, reader), maxTokenCount
+    );
+  }
+
+  @Override
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    return new LimitTokenCountFilter(
+      delegate.reusableTokenStream(fieldName, reader), maxTokenCount
+    );
+  }
+
+  @Override
+  public int getPositionIncrementGap(String fieldName) {
+    return delegate.getPositionIncrementGap(fieldName);
+  }
+
+  @Override
+  public int getOffsetGap(Fieldable field) {
+    return delegate.getOffsetGap(field);
+  }
+
+  @Override
+  public String toString() {
+    return "LimitTokenCountAnalyzer(" + delegate.toString() + ", maxTokenCount=" + maxTokenCount + ")";
+  }
+}
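As a usage sketch (not part of this commit): wiring the analyzer into an IndexWriter in place of the old MaxFieldLength limit. The Directory parameter and the IndexWriterConfig-based setup are era-appropriate assumptions:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

public class LimitedWriterDemo {
  // Cap indexing at 10,000 tokens per field, the same limit
  // IndexWriter.MaxFieldLength.LIMITED imposed in earlier releases.
  static IndexWriter openWriter(Directory dir) throws IOException {
    Analyzer limited = new LimitTokenCountAnalyzer(
        new StandardAnalyzer(Version.LUCENE_CURRENT), 10000);
    return new IndexWriter(dir,
        new IndexWriterConfig(Version.LUCENE_CURRENT, limited));
  }
}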
LimitTokenCountFilter.java (new file)
@@ -0,0 +1,56 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+
+/**
+ * This TokenFilter limits the number of tokens while indexing. It is
+ * a replacement for the maximum field length setting inside {@link org.apache.lucene.index.IndexWriter}.
+ */
+public final class LimitTokenCountFilter extends TokenFilter {
+
+  private final int maxTokenCount;
+  private int tokenCount = 0;
+
+  /**
+   * Build a filter that only accepts tokens up to a maximum number.
+   */
+  public LimitTokenCountFilter(TokenStream in, int maxTokenCount) {
+    super(in);
+    this.maxTokenCount = maxTokenCount;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (tokenCount < maxTokenCount && input.incrementToken()) {
+      tokenCount++;
+      return true;
+    }
+    return false;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    tokenCount = 0;
+  }
+}
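The filter can also wrap any single TokenStream directly, independent of the analyzer wrapper; a small sketch (the tokenizer choice and reader are illustrative assumptions):

import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.util.Version;

public class LimitedStreamDemo {
  // Emits at most the first two tokens of the underlying stream,
  // then reports exhaustion to the consumer.
  static TokenStream firstTwoTokens(Reader reader) {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
    return new LimitTokenCountFilter(ts, 2);
  }
}

Note that reset() zeroes tokenCount, which is what makes the filter safe to hand out from reusableTokenStream in the analyzer above.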
TestLimitTokenCountAnalyzer.java (new file; the whitespace in the first test string is reconstructed from the asserted offsets, which the rendered page had collapsed)
@@ -0,0 +1,47 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
+
+public class TestLimitTokenCountAnalyzer extends BaseTokenStreamTestCase {
+
+  public TestLimitTokenCountAnalyzer(String name) {
+    super(name);
+  }
+
+  public void testLimitTokenCountAnalyzer() throws IOException {
+    Analyzer a = new LimitTokenCountAnalyzer(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
+    // don't use assertAnalyzesTo here, as the end offset is not the end of the string!
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1  2     3  4  5")), new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, 4);
+    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
+
+    a = new LimitTokenCountAnalyzer(new StandardAnalyzer(TEST_VERSION_CURRENT), 2);
+    // don't use assertAnalyzesTo here, as the end offset is not the end of the string!
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
+    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")), new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
+  }
+
+}