LUCENE-2295: Added a LimitTokenCountAnalyzer / LimitTokenCountFilter to wrap any other Analyzer and provide the same functionality as MaxFieldLength provided on IndexWriter. This patch also fixes a bug in the offset calculation in CharTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@949445 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2010-05-29 23:14:18 +00:00
parent b457760585
commit 98b252ed7f
5 changed files with 202 additions and 13 deletions

CHANGES.txt

@@ -508,6 +508,11 @@ New features
   can be used to prevent commits from ever getting deleted from the index.
   (Shai Erera)
 
+* LUCENE-2295: Added a LimitTokenCountAnalyzer / LimitTokenCountFilter
+  to wrap any other Analyzer and provide the same functionality as
+  MaxFieldLength provided on IndexWriter. This patch also fixes a bug
+  in the offset calculation in CharTokenizer. (Uwe Schindler, Shai Erera)
+
 Optimizations
 
 * LUCENE-2075: Terms dict cache is now shared across threads instead
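
A minimal migration sketch of what this entry describes (the Directory, the wrapped StandardAnalyzer, and the limit of 10000 are illustrative assumptions, not part of the commit):

    // before: truncation was a property of the writer itself
    // IndexWriter writer = new IndexWriter(dir, analyzer, new IndexWriter.MaxFieldLength(10000));

    // after: the limit moves into the analysis chain, so any Analyzer can be capped
    Analyzer limited = new LimitTokenCountAnalyzer(new StandardAnalyzer(Version.LUCENE_31), 10000);
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_31, limited));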

CharTokenizer.java

@@ -151,7 +151,7 @@ public abstract class CharTokenizer extends Tokenizer {
     this(Version.LUCENE_30, factory, input);
   }
 
-  private int offset = 0, bufferIndex = 0, dataLen = 0;
+  private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;
@@ -265,17 +265,19 @@ public abstract class CharTokenizer extends Tokenizer {
     if(useOldAPI) // TODO remove this in LUCENE 4.0
       return incrementTokenOld();
     int length = 0;
-    int start = bufferIndex;
+    int start = -1; // this variable is always initialized
     char[] buffer = termAtt.buffer();
     while (true) {
       if (bufferIndex >= dataLen) {
         offset += dataLen;
         if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
           dataLen = 0; // so next offset += dataLen won't decrement offset
-          if (length > 0)
+          if (length > 0) {
             break;
-          else
+          } else {
+            finalOffset = correctOffset(offset);
             return false;
+          }
         }
         dataLen = ioBuffer.getLength();
         bufferIndex = 0;
@@ -285,10 +287,12 @@ public abstract class CharTokenizer extends Tokenizer {
       bufferIndex += Character.charCount(c);
 
       if (isTokenChar(c)) {               // if it's a token char
-        if (length == 0)                  // start of token
+        if (length == 0) {                // start of token
+          assert start == -1;
           start = offset + bufferIndex - 1;
-        else if (length >= buffer.length-1) // check if a supplementary could run out of bounds
+        } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds
           buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer
+        }
         length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
         if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test
           break;
@@ -297,7 +301,8 @@ public abstract class CharTokenizer extends Tokenizer {
     }
 
     termAtt.setLength(length);
-    offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
+    assert start != -1;
+    offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start+length));
     return true;
   }
@@ -310,7 +315,7 @@ public abstract class CharTokenizer extends Tokenizer {
   @Deprecated
   private boolean incrementTokenOld() throws IOException {
     int length = 0;
-    int start = bufferIndex;
+    int start = -1; // this variable is always initialized
     char[] buffer = termAtt.buffer();
     final char[] oldIoBuffer = ioBuffer.getBuffer();
     while (true) {
@@ -320,10 +325,12 @@ public abstract class CharTokenizer extends Tokenizer {
         dataLen = input.read(oldIoBuffer);
         if (dataLen == -1) {
           dataLen = 0; // so next offset += dataLen won't decrement offset
-          if (length > 0)
+          if (length > 0) {
             break;
-          else
+          } else {
+            finalOffset = correctOffset(offset);
             return false;
+          }
         }
         bufferIndex = 0;
       }
@@ -332,10 +339,12 @@ public abstract class CharTokenizer extends Tokenizer {
       if (isTokenChar(c)) {               // if it's a token char
-        if (length == 0)                  // start of token
+        if (length == 0) {                // start of token
+          assert start == -1;
           start = offset + bufferIndex - 1;
-        else if (length == buffer.length)
+        } else if (length == buffer.length) {
           buffer = termAtt.resizeBuffer(1+length);
+        }
         buffer[length++] = normalize(c); // buffer it, normalized
@@ -347,6 +356,7 @@ public abstract class CharTokenizer extends Tokenizer {
     }
 
     termAtt.setLength(length);
+    assert start != -1;
     offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
     return true;
   }
@@ -356,7 +366,6 @@ public abstract class CharTokenizer extends Tokenizer {
   @Override
   public final void end() {
     // set final offset
-    final int finalOffset = correctOffset(offset);
     offsetAtt.setOffset(finalOffset, finalOffset);
   }
@@ -366,6 +375,7 @@ public abstract class CharTokenizer extends Tokenizer {
     bufferIndex = 0;
     offset = 0;
     dataLen = 0;
+    finalOffset = 0;
     ioBuffer.reset(); // make sure to reset the IO buffer!!
   }
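
The finalOffset bookkeeping above is what makes early termination safe: a consumer that stops before the stream is exhausted (as the new LimitTokenCountFilter does) previously got a final offset derived from the tokenizer's internal read position in end(). A hedged sketch of the observable difference (the tokenizer, input literal, and Version constant are illustrative assumptions):

    // consume only the first two tokens, then ask for the final offset
    TokenStream ts = new LimitTokenCountFilter(
        new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("1  2 3 4 5")), 2);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) { /* "1", then "2" */ }
    ts.end(); // TokenFilter.end() delegates to the tokenizer's end()
    // with this patch, endOffset() is 4 (the end of "2"); before, it reflected
    // however much of the input the tokenizer had already buffered
    assert offsetAtt.endOffset() == 4;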

LimitTokenCountAnalyzer.java

@@ -0,0 +1,71 @@
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

import java.io.Reader;
import java.io.IOException;

/**
 * This Analyzer limits the number of tokens while indexing. It is
 * a replacement for the maximum field length setting inside {@link org.apache.lucene.index.IndexWriter}.
 */
public final class LimitTokenCountAnalyzer extends Analyzer {
  private final Analyzer delegate;
  private final int maxTokenCount;

  /**
   * Build an analyzer that limits the maximum number of tokens per field.
   */
  public LimitTokenCountAnalyzer(Analyzer delegate, int maxTokenCount) {
    this.delegate = delegate;
    this.maxTokenCount = maxTokenCount;
  }

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new LimitTokenCountFilter(
      delegate.tokenStream(fieldName, reader), maxTokenCount
    );
  }

  @Override
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    return new LimitTokenCountFilter(
      delegate.reusableTokenStream(fieldName, reader), maxTokenCount
    );
  }

  @Override
  public int getPositionIncrementGap(String fieldName) {
    return delegate.getPositionIncrementGap(fieldName);
  }

  @Override
  public int getOffsetGap(Fieldable field) {
    return delegate.getOffsetGap(field);
  }

  @Override
  public String toString() {
    return "LimitTokenCountAnalyzer(" + delegate.toString() + ", maxTokenCount=" + maxTokenCount + ")";
  }
}
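
A short consumption sketch for the analyzer above, in the spirit of the test at the end of this commit (the field name, input text, and Version constant are illustrative assumptions):

    Analyzer a = new LimitTokenCountAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_31), 2);
    TokenStream ts = a.tokenStream("body", new StringReader("one two three four"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term); // prints "one", then "two"; the rest is never produced
    }
    ts.end();
    ts.close();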

LimitTokenCountFilter.java

@@ -0,0 +1,56 @@
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.IOException;

/**
 * This TokenFilter limits the number of tokens while indexing. It is
 * a replacement for the maximum field length setting inside {@link org.apache.lucene.index.IndexWriter}.
 */
public final class LimitTokenCountFilter extends TokenFilter {
  private final int maxTokenCount;
  private int tokenCount = 0;

  /**
   * Build a filter that only accepts tokens up to a maximum number.
   */
  public LimitTokenCountFilter(TokenStream in, int maxTokenCount) {
    super(in);
    this.maxTokenCount = maxTokenCount;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (tokenCount < maxTokenCount && input.incrementToken()) {
      tokenCount++;
      return true;
    }
    return false;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    tokenCount = 0;
  }
}
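
Design note: once maxTokenCount is reached, incrementToken() returns false without pulling any further tokens from the wrapped stream, so the tail of a very large field is never analyzed at all; reset() clears the counter, which is what keeps the filter correct under reusableTokenStream. It also composes with any TokenStream, not only a whole Analyzer (tokenizer choice below is an illustrative assumption):

    // cap any existing chain at its first 100 tokens
    TokenStream ts = new LimitTokenCountFilter(new WhitespaceTokenizer(Version.LUCENE_31, reader), 100);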

TestLimitTokenCountAnalyzer.java

@@ -0,0 +1,47 @@
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

public class TestLimitTokenCountAnalyzer extends BaseTokenStreamTestCase {

  public TestLimitTokenCountAnalyzer(String name) {
    super(name);
  }

  public void testLimitTokenCountAnalyzer() throws IOException {
    Analyzer a = new LimitTokenCountAnalyzer(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
    // don't use assertAnalyzesTo here, as the end offset is not the end of the string!
    assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1  2 3 4 5")), // note: two spaces after "1"
        new String[] { "1", "2" }, new int[] { 0, 3 }, new int[] { 1, 4 }, 4);
    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")),
        new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);

    a = new LimitTokenCountAnalyzer(new StandardAnalyzer(TEST_VERSION_CURRENT), 2);
    // don't use assertAnalyzesTo here, as the end offset is not the end of the string!
    assertTokenStreamContents(a.tokenStream("dummy", new StringReader("1 2 3 4 5")),
        new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
    assertTokenStreamContents(a.reusableTokenStream("dummy", new StringReader("1 2 3 4 5")),
        new String[] { "1", "2" }, new int[] { 0, 2 }, new int[] { 1, 3 }, 3);
  }
}
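
Worked offsets for the assertions above: in the first whitespace case the input begins with "1", two spaces, then "2", so the two kept tokens span [0,1] and [3,4] and end() reports a final offset of 4; in the single-spaced inputs the tokens span [0,1] and [2,3], giving a final offset of 3. The two spaces after "1" in the first input are what make its expected offsets differ from the otherwise identical reusable case. In every case the final offset is the end of the last kept token rather than the length of the string (9), because the filter stops pulling after two tokens and, with the CharTokenizer fix above, end() no longer reports the tokenizer's raw read position.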