mirror of https://github.com/apache/lucene.git
LUCENE-7465: add SimplePatternTokenizer and SimplePatternSplitTokenizer, for tokenization using Lucene's regexp/automaton implementation
This commit is contained in:
parent
7dcf9de41f
commit
93fa72f77b
|
@ -107,6 +107,11 @@ New Features
|
|||
SortedNumericSelector.Type can give a ValueSource view of a
|
||||
SortedNumericDocValues field. (Tomás Fernández Löbbe)
|
||||
|
||||
* LUCENE-7465: Add SimplePatternTokenizer and
|
||||
SimplePatternSplitTokenizer, using Lucene's regexp/automaton
|
||||
implementation for analysis/tokenization (Clinton Gormley, Mike
|
||||
McCandless)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
|
||||
|
|
|
@ -0,0 +1,258 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
/**
|
||||
* This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton}, to locate tokens.
|
||||
* The regexp syntax is more limited than {@link PatternTokenizer}, but the tokenization is quite a bit faster. This is just
|
||||
* like {@link SimplePatternTokenizer} except that the pattern shold make valid token separator characters, like
|
||||
* {@code String.split}. Empty string tokens are never produced.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
public final class SimplePatternSplitTokenizer extends Tokenizer {
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private final CharacterRunAutomaton runDFA;
|
||||
|
||||
// TODO: this is copied from SimplePatternTokenizer, but there are subtle differences e.g. we track sepUpto an tokenUpto;
|
||||
// find a clean way to share it:
|
||||
|
||||
// TODO: we could likely use a single rolling buffer instead of two separate char buffers here. We could also use PushBackReader but I
|
||||
// suspect it's slowish:
|
||||
|
||||
private char[] pendingChars = new char[8];
|
||||
private int tokenUpto;
|
||||
private int pendingLimit;
|
||||
private int pendingUpto;
|
||||
private int offset;
|
||||
private int sepUpto;
|
||||
private final char[] buffer = new char[1024];
|
||||
private int bufferLimit;
|
||||
private int bufferNextRead;
|
||||
|
||||
/** See {@link RegExp} for the accepted syntax. */
|
||||
public SimplePatternSplitTokenizer(String regexp) {
|
||||
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/** Runs a pre-built automaton. */
|
||||
public SimplePatternSplitTokenizer(Automaton dfa) {
|
||||
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa);
|
||||
}
|
||||
|
||||
/** See {@link RegExp} for the accepted syntax. */
|
||||
public SimplePatternSplitTokenizer(AttributeFactory factory, String regexp, int maxDeterminizedStates) {
|
||||
this(factory, new RegExp(regexp).toAutomaton());
|
||||
}
|
||||
|
||||
/** Runs a pre-built automaton. */
|
||||
public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) {
|
||||
super(factory);
|
||||
|
||||
// we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
|
||||
// realizing this ctor is otherwise trappy
|
||||
if (dfa.isDeterministic() == false) {
|
||||
throw new IllegalArgumentException("please determinize the incoming automaton first");
|
||||
}
|
||||
|
||||
runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
private void fillToken(int offsetStart) {
|
||||
termAtt.setLength(tokenUpto);
|
||||
offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+tokenUpto));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
|
||||
int offsetStart = offset;
|
||||
|
||||
clearAttributes();
|
||||
|
||||
tokenUpto = 0;
|
||||
|
||||
while (true) {
|
||||
sepUpto = 0;
|
||||
|
||||
// The runDFA operates in Unicode space, not UTF16 (java's char):
|
||||
int ch = nextCodePoint();
|
||||
if (ch == -1) {
|
||||
if (tokenUpto > 0) {
|
||||
fillToken(offsetStart);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
int state = runDFA.step(0, ch);
|
||||
|
||||
if (state != -1) {
|
||||
// a token separator just possibly started; keep scanning to see if the token is accepted:
|
||||
int lastAcceptLength = -1;
|
||||
do {
|
||||
|
||||
if (runDFA.isAccept(state)) {
|
||||
// record that the token separator matches here, but keep scanning in case a longer match also works (greedy):
|
||||
lastAcceptLength = sepUpto;
|
||||
}
|
||||
|
||||
ch = nextCodePoint();
|
||||
if (ch == -1) {
|
||||
break;
|
||||
}
|
||||
state = runDFA.step(state, ch);
|
||||
} while (state != -1);
|
||||
|
||||
if (lastAcceptLength != -1) {
|
||||
// strip the trailing separater we just matched from the token:
|
||||
tokenUpto -= lastAcceptLength;
|
||||
// we found a token separator
|
||||
int extra = sepUpto - lastAcceptLength;
|
||||
if (extra != 0) {
|
||||
pushBack(extra);
|
||||
}
|
||||
if (tokenUpto > 0) {
|
||||
fillToken(offsetStart);
|
||||
return true;
|
||||
} else {
|
||||
// we matched one token separator immediately after another
|
||||
offsetStart = offset;
|
||||
}
|
||||
} else if (ch == -1) {
|
||||
if (tokenUpto > 0) {
|
||||
fillToken(offsetStart);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// false alarm: there was no token separator here; push back all but the first character we scanned
|
||||
pushBack(sepUpto-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
super.end();
|
||||
final int ofs = correctOffset(offset + pendingLimit - pendingUpto);
|
||||
offsetAtt.setOffset(ofs, ofs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
offset = 0;
|
||||
pendingUpto = 0;
|
||||
pendingLimit = 0;
|
||||
sepUpto = 0;
|
||||
bufferNextRead = 0;
|
||||
bufferLimit = 0;
|
||||
}
|
||||
|
||||
/** Pushes back the last {@code count} characters in current token's buffer. */
|
||||
private void pushBack(int count) {
|
||||
tokenUpto -= count;
|
||||
assert tokenUpto >= 0;
|
||||
if (pendingLimit == 0) {
|
||||
if (bufferNextRead >= count) {
|
||||
// optimize common case when the chars we are pushing back are still in the buffer
|
||||
bufferNextRead -= count;
|
||||
} else {
|
||||
if (count > pendingChars.length) {
|
||||
pendingChars = ArrayUtil.grow(pendingChars, count);
|
||||
}
|
||||
System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
|
||||
pendingLimit = count;
|
||||
}
|
||||
} else {
|
||||
// we are pushing back what is already in our pending buffer
|
||||
pendingUpto -= count;
|
||||
assert pendingUpto >= 0;
|
||||
}
|
||||
offset -= count;
|
||||
}
|
||||
|
||||
private void appendToToken(char ch) {
|
||||
char[] buffer = termAtt.buffer();
|
||||
if (tokenUpto == buffer.length) {
|
||||
buffer = termAtt.resizeBuffer(tokenUpto + 1);
|
||||
}
|
||||
buffer[tokenUpto++] = ch;
|
||||
sepUpto++;
|
||||
}
|
||||
|
||||
private int nextCodeUnit() throws IOException {
|
||||
int result;
|
||||
if (pendingUpto < pendingLimit) {
|
||||
result = pendingChars[pendingUpto++];
|
||||
if (pendingUpto == pendingLimit) {
|
||||
// We used up the pending buffer
|
||||
pendingUpto = 0;
|
||||
pendingLimit = 0;
|
||||
}
|
||||
appendToToken((char) result);
|
||||
offset++;
|
||||
} else if (bufferLimit == -1) {
|
||||
return -1;
|
||||
} else {
|
||||
assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit;
|
||||
if (bufferNextRead == bufferLimit) {
|
||||
bufferLimit = input.read(buffer, 0, buffer.length);
|
||||
if (bufferLimit == -1) {
|
||||
return -1;
|
||||
}
|
||||
bufferNextRead = 0;
|
||||
}
|
||||
result = buffer[bufferNextRead++];
|
||||
offset++;
|
||||
appendToToken((char) result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private int nextCodePoint() throws IOException {
|
||||
|
||||
int ch = nextCodeUnit();
|
||||
if (ch == -1) {
|
||||
return ch;
|
||||
}
|
||||
if (Character.isHighSurrogate((char) ch)) {
|
||||
return Character.toCodePoint((char) ch, (char) nextCodeUnit());
|
||||
} else {
|
||||
return ch;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
/**
|
||||
* Factory for {@link SimplePatternSplitTokenizer}, for producing tokens by splitting according to the provided regexp.
|
||||
*
|
||||
* <p>This tokenizer uses Lucene {@link RegExp} pattern matching to construct distinct tokens
|
||||
* for the input stream. The syntax is more limited than {@link PatternTokenizer}, but the
|
||||
* tokenization is quite a bit faster. It takes two arguments:
|
||||
* <br>
|
||||
* <ul>
|
||||
* <li>"pattern" (required) is the regular expression, according to the syntax described at {@link RegExp}</li>
|
||||
* <li>"maxDeterminizedStates" (optional, default 10000) the limit on total state count for the determined automaton computed from the regexp</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* The pattern matches the characters that should split tokens, like {@code String.split}, and the
|
||||
* matching is greedy such that the longest token separator matching at a given point is matched. Empty
|
||||
* tokens are never created.
|
||||
*
|
||||
* <p>For example, to match tokens delimited by simple whitespace characters:
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.SimplePatternSplitTokenizerFactory" pattern="[ \t\r\n]+"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*
|
||||
* @see SimplePatternSplitTokenizer
|
||||
*/
|
||||
public class SimplePatternSplitTokenizerFactory extends TokenizerFactory {
|
||||
public static final String PATTERN = "pattern";
|
||||
private final Automaton dfa;
|
||||
private final int maxDeterminizedStates;
|
||||
|
||||
/** Creates a new SimpleSplitPatternTokenizerFactory */
|
||||
public SimplePatternSplitTokenizerFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
|
||||
if (args.isEmpty() == false) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimplePatternSplitTokenizer create(final AttributeFactory factory) {
|
||||
return new SimplePatternSplitTokenizer(factory, dfa);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,242 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
/**
|
||||
* This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton}, to locate tokens.
|
||||
* The regexp syntax is more limited than {@link PatternTokenizer}, but the tokenization is quite a bit faster. The provided
|
||||
* regex should match valid token characters (not token separator characters, like {@code String.split}). The matching is greedy:
|
||||
* the longest match at a given start point will be the next token. Empty string tokens are never produced.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
// TODO: the matcher here is naive and does have N^2 adversarial cases that are unlikely to arise in practice, e.g. if the pattern is
|
||||
// aaaaaaaaaab and the input is aaaaaaaaaaa, the work we do here is N^2 where N is the number of a's. This is because on failing to match
|
||||
// a token, we skip one character forward and try again. A better approach would be to compile something like this regexp
|
||||
// instead: .* | <pattern>, because that automaton would not "forget" all the as it had already seen, and would be a single pass
|
||||
// through the input. I think this is the same thing as Aho/Corasick's algorithm (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm).
|
||||
// But we cannot implement this (I think?) until/unless Lucene regexps support sub-group capture, so we could know
|
||||
// which specific characters the pattern matched. SynonymFilter has this same limitation.
|
||||
|
||||
public final class SimplePatternTokenizer extends Tokenizer {
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private final CharacterRunAutomaton runDFA;
|
||||
|
||||
// TODO: we could likely use a single rolling buffer instead of two separate char buffers here. We could also use PushBackReader but I
|
||||
// suspect it's slowish:
|
||||
|
||||
private char[] pendingChars = new char[8];
|
||||
private int pendingLimit;
|
||||
private int pendingUpto;
|
||||
private int offset;
|
||||
private int tokenUpto;
|
||||
private final char[] buffer = new char[1024];
|
||||
private int bufferLimit;
|
||||
private int bufferNextRead;
|
||||
|
||||
/** See {@link RegExp} for the accepted syntax. */
|
||||
public SimplePatternTokenizer(String regexp) {
|
||||
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/** Runs a pre-built automaton. */
|
||||
public SimplePatternTokenizer(Automaton dfa) {
|
||||
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa);
|
||||
}
|
||||
|
||||
/** See {@link RegExp} for the accepted syntax. */
|
||||
public SimplePatternTokenizer(AttributeFactory factory, String regexp, int maxDeterminizedStates) {
|
||||
this(factory, new RegExp(regexp).toAutomaton());
|
||||
}
|
||||
|
||||
/** Runs a pre-built automaton. */
|
||||
public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
|
||||
super(factory);
|
||||
|
||||
// we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
|
||||
// realizing this ctor is otherwise trappy
|
||||
if (dfa.isDeterministic() == false) {
|
||||
throw new IllegalArgumentException("please determinize the incoming automaton first");
|
||||
}
|
||||
|
||||
runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
|
||||
clearAttributes();
|
||||
tokenUpto = 0;
|
||||
|
||||
while (true) {
|
||||
|
||||
int offsetStart = offset;
|
||||
|
||||
// The runDFA operates in Unicode space, not UTF16 (java's char):
|
||||
|
||||
int ch = nextCodePoint();
|
||||
if (ch == -1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int state = runDFA.step(0, ch);
|
||||
|
||||
if (state != -1) {
|
||||
// a token just possibly started; keep scanning to see if the token is accepted:
|
||||
int lastAcceptLength = -1;
|
||||
do {
|
||||
|
||||
if (runDFA.isAccept(state)) {
|
||||
// record that the token matches here, but keep scanning in case a longer match also works (greedy):
|
||||
lastAcceptLength = tokenUpto;
|
||||
}
|
||||
|
||||
ch = nextCodePoint();
|
||||
if (ch == -1) {
|
||||
break;
|
||||
}
|
||||
state = runDFA.step(state, ch);
|
||||
} while (state != -1);
|
||||
|
||||
if (lastAcceptLength != -1) {
|
||||
// we found a token
|
||||
int extra = tokenUpto - lastAcceptLength;
|
||||
if (extra != 0) {
|
||||
pushBack(extra);
|
||||
}
|
||||
termAtt.setLength(lastAcceptLength);
|
||||
offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+lastAcceptLength));
|
||||
return true;
|
||||
} else if (ch == -1) {
|
||||
return false;
|
||||
} else {
|
||||
// false alarm: there was no token here; push back all but the first character we scanned
|
||||
pushBack(tokenUpto-1);
|
||||
tokenUpto = 0;
|
||||
}
|
||||
} else {
|
||||
tokenUpto = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
super.end();
|
||||
final int ofs = correctOffset(offset + pendingLimit - pendingUpto);
|
||||
offsetAtt.setOffset(ofs, ofs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
offset = 0;
|
||||
pendingUpto = 0;
|
||||
pendingLimit = 0;
|
||||
tokenUpto = 0;
|
||||
bufferNextRead = 0;
|
||||
bufferLimit = 0;
|
||||
}
|
||||
|
||||
/** Pushes back the last {@code count} characters in current token's buffer. */
|
||||
private void pushBack(int count) {
|
||||
|
||||
if (pendingLimit == 0) {
|
||||
if (bufferNextRead >= count) {
|
||||
// optimize common case when the chars we are pushing back are still in the buffer
|
||||
bufferNextRead -= count;
|
||||
} else {
|
||||
if (count > pendingChars.length) {
|
||||
pendingChars = ArrayUtil.grow(pendingChars, count);
|
||||
}
|
||||
System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
|
||||
pendingLimit = count;
|
||||
}
|
||||
} else {
|
||||
// we are pushing back what is already in our pending buffer
|
||||
pendingUpto -= count;
|
||||
assert pendingUpto >= 0;
|
||||
}
|
||||
offset -= count;
|
||||
}
|
||||
|
||||
private void appendToToken(char ch) {
|
||||
char[] buffer = termAtt.buffer();
|
||||
if (tokenUpto == buffer.length) {
|
||||
buffer = termAtt.resizeBuffer(tokenUpto + 1);
|
||||
}
|
||||
buffer[tokenUpto++] = ch;
|
||||
}
|
||||
|
||||
private int nextCodeUnit() throws IOException {
|
||||
int result;
|
||||
if (pendingUpto < pendingLimit) {
|
||||
result = pendingChars[pendingUpto++];
|
||||
if (pendingUpto == pendingLimit) {
|
||||
// We used up the pending buffer
|
||||
pendingUpto = 0;
|
||||
pendingLimit = 0;
|
||||
}
|
||||
appendToToken((char) result);
|
||||
offset++;
|
||||
} else if (bufferLimit == -1) {
|
||||
return -1;
|
||||
} else {
|
||||
assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit;
|
||||
if (bufferNextRead == bufferLimit) {
|
||||
bufferLimit = input.read(buffer, 0, buffer.length);
|
||||
if (bufferLimit == -1) {
|
||||
return -1;
|
||||
}
|
||||
bufferNextRead = 0;
|
||||
}
|
||||
result = buffer[bufferNextRead++];
|
||||
offset++;
|
||||
appendToToken((char) result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private int nextCodePoint() throws IOException {
|
||||
|
||||
int ch = nextCodeUnit();
|
||||
if (ch == -1) {
|
||||
return ch;
|
||||
}
|
||||
if (Character.isHighSurrogate((char) ch)) {
|
||||
return Character.toCodePoint((char) ch, (char) nextCodeUnit());
|
||||
} else {
|
||||
return ch;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
/**
|
||||
* Factory for {@link SimplePatternTokenizer}, for matching tokens based on the provided regexp.
|
||||
*
|
||||
* <p>This tokenizer uses Lucene {@link RegExp} pattern matching to construct distinct tokens
|
||||
* for the input stream. The syntax is more limited than {@link PatternTokenizer}, but the
|
||||
* tokenization is quite a bit faster. It takes two arguments:
|
||||
* <br>
|
||||
* <ul>
|
||||
* <li>"pattern" (required) is the regular expression, according to the syntax described at {@link RegExp}</li>
|
||||
* <li>"maxDeterminizedStates" (optional, default 10000) the limit on total state count for the determined automaton computed from the regexp</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* The pattern matches the characters to include in a token (not the split characters), and the
|
||||
* matching is greedy such that the longest token matching at a given point is created. Empty
|
||||
* tokens are never created.
|
||||
*
|
||||
* <p>For example, to match tokens delimited by simple whitespace characters:
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.SimplePatternTokenizerFactory" pattern="[^ \t\r\n]+"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*
|
||||
* @see SimplePatternTokenizer
|
||||
*/
|
||||
public class SimplePatternTokenizerFactory extends TokenizerFactory {
|
||||
public static final String PATTERN = "pattern";
|
||||
private final Automaton dfa;
|
||||
private final int maxDeterminizedStates;
|
||||
|
||||
/** Creates a new SimplePatternTokenizerFactory */
|
||||
public SimplePatternTokenizerFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
|
||||
if (args.isEmpty() == false) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimplePatternTokenizer create(final AttributeFactory factory) {
|
||||
return new SimplePatternTokenizer(factory, dfa);
|
||||
}
|
||||
}
|
|
@ -21,6 +21,8 @@ org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory
|
|||
org.apache.lucene.analysis.ngram.NGramTokenizerFactory
|
||||
org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory
|
||||
org.apache.lucene.analysis.pattern.PatternTokenizerFactory
|
||||
org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizerFactory
|
||||
org.apache.lucene.analysis.pattern.SimplePatternTokenizerFactory
|
||||
org.apache.lucene.analysis.standard.ClassicTokenizerFactory
|
||||
org.apache.lucene.analysis.standard.StandardTokenizerFactory
|
||||
org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory
|
||||
|
|
|
@ -96,7 +96,11 @@ import org.apache.lucene.util.CharsRef;
|
|||
import org.apache.lucene.util.Rethrow;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.AutomatonTestUtil;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
|
@ -494,6 +498,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
if (random.nextBoolean()) return null;
|
||||
return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random));
|
||||
});
|
||||
put(Automaton.class, random -> {
|
||||
return Operations.determinize(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
});
|
||||
}};
|
||||
|
||||
static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
|
||||
|
@ -503,6 +510,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
allowedTokenizerArgs.add(Reader.class);
|
||||
allowedTokenizerArgs.add(AttributeFactory.class);
|
||||
allowedTokenizerArgs.add(AttributeSource.class);
|
||||
allowedTokenizerArgs.add(Automaton.class);
|
||||
|
||||
allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
|
||||
allowedTokenFilterArgs.addAll(argProducers.keySet());
|
||||
|
|
|
@ -0,0 +1,273 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
|
||||
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
|
||||
public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testGreedy() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer("(foo)+");
|
||||
t.setReader(new StringReader("bar foofoo baz"));
|
||||
assertTokenStreamContents(t,
|
||||
new String[] {"bar ", " baz"},
|
||||
new int[] {0, 10},
|
||||
new int[] {4, 14});
|
||||
}
|
||||
|
||||
public void testBackToBack() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer("foo");
|
||||
t.setReader(new StringReader("bar foofoo baz"));
|
||||
assertTokenStreamContents(t,
|
||||
new String[] {"bar ", " baz"},
|
||||
new int[] {0, 10},
|
||||
new int[] {4, 14});
|
||||
}
|
||||
|
||||
public void testBigLookahead() throws Exception {
|
||||
StringBuilder b = new StringBuilder();
|
||||
for(int i=0;i<100;i++) {
|
||||
b.append('a');
|
||||
}
|
||||
b.append('b');
|
||||
Tokenizer t = new SimplePatternSplitTokenizer(b.toString());
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
|
||||
b = new StringBuilder();
|
||||
for(int i=0;i<200;i++) {
|
||||
b.append('a');
|
||||
}
|
||||
t.setReader(new StringReader(b.toString()));
|
||||
t.reset();
|
||||
assertTrue(t.incrementToken());
|
||||
assertEquals(b.toString(), termAtt.toString());
|
||||
assertFalse(t.incrementToken());
|
||||
}
|
||||
|
||||
public void testNoTokens() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer(".*");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
String s;
|
||||
while (true) {
|
||||
s = TestUtil.randomUnicodeString(random());
|
||||
if (s.length() > 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
t.setReader(new StringReader(s));
|
||||
t.reset();
|
||||
assertFalse(t.incrementToken());
|
||||
}
|
||||
|
||||
public void testEmptyStringPatternNoMatch() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer("a*");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
t.setReader(new StringReader("bbb"));
|
||||
t.reset();
|
||||
assertTrue(t.incrementToken());
|
||||
assertEquals("bbb", termAtt.toString());
|
||||
assertFalse(t.incrementToken());
|
||||
}
|
||||
|
||||
public void testSplitSingleCharWhitespace() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
t.setReader(new StringReader("a \tb c"));
|
||||
assertTokenStreamContents(t,
|
||||
new String[] {"a", "b", "c"},
|
||||
new int[] {0, 3, 7},
|
||||
new int[] {1, 4, 8});
|
||||
}
|
||||
|
||||
public void testSplitMultiCharWhitespace() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
t.setReader(new StringReader("a \tb c"));
|
||||
assertTokenStreamContents(t,
|
||||
new String[] {"a", "b", "c"},
|
||||
new int[] {0, 3, 7},
|
||||
new int[] {1, 4, 8});
|
||||
}
|
||||
|
||||
public void testLeadingNonToken() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
t.setReader(new StringReader(" a c"));
|
||||
assertTokenStreamContents(t,
|
||||
new String[] {"a", "c"},
|
||||
new int[] {4, 6},
|
||||
new int[] {5, 7});
|
||||
}
|
||||
|
||||
public void testTrailingNonToken() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
t.setReader(new StringReader("a c "));
|
||||
assertTokenStreamContents(t,
|
||||
new String[] {"a", "c"},
|
||||
new int[] {0, 2},
|
||||
new int[] {1, 3});
|
||||
}
|
||||
|
||||
public void testEmptyStringPatternOneMatch() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer("a*");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
t.setReader(new StringReader("bbab"));
|
||||
assertTokenStreamContents(t,
|
||||
new String[] {"bb", "b"},
|
||||
new int[] {0, 3},
|
||||
new int[] {2, 4});
|
||||
}
|
||||
|
||||
public void testEndOffset() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer("a+");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
|
||||
t.setReader(new StringReader("aaabbb"));
|
||||
t.reset();
|
||||
assertTrue(t.incrementToken());
|
||||
assertEquals("bbb", termAtt.toString());
|
||||
assertFalse(t.incrementToken());
|
||||
t.end();
|
||||
assertEquals(6, offsetAtt.endOffset());
|
||||
}
|
||||
|
||||
public void testFixedToken() throws Exception {
|
||||
Tokenizer t = new SimplePatternSplitTokenizer("aaaa");
|
||||
|
||||
t.setReader(new StringReader("aaaaaaaaaaaaaaa"));
|
||||
assertTokenStreamContents(t,
|
||||
new String[] {"aaa"},
|
||||
new int[] {12},
|
||||
new int[] {15});
|
||||
}
|
||||
|
||||
public void testBasic() throws Exception
|
||||
{
|
||||
String[][] tests = {
|
||||
// pattern input output
|
||||
{ "--", "aaa--bbb--ccc", "aaa bbb ccc" },
|
||||
{ ":", "aaa:bbb:ccc", "aaa bbb ccc" },
|
||||
{ ":", "boo:and:foo", "boo and foo" },
|
||||
{ "o", "boo:and:foo", "b :and:f" },
|
||||
};
|
||||
|
||||
for(String[] test : tests) {
|
||||
TokenStream stream = new SimplePatternSplitTokenizer(test[0]);
|
||||
((Tokenizer)stream).setReader(new StringReader(test[1]));
|
||||
String out = tsToString(stream);
|
||||
assertEquals("pattern: "+test[0]+" with input: "+test[1], test[2], out);
|
||||
}
|
||||
}
|
||||
|
||||
public void testNotDeterminized() throws Exception {
|
||||
Automaton a = new Automaton();
|
||||
int start = a.createState();
|
||||
int mid1 = a.createState();
|
||||
int mid2 = a.createState();
|
||||
int end = a.createState();
|
||||
a.setAccept(end, true);
|
||||
a.addTransition(start, mid1, 'a', 'z');
|
||||
a.addTransition(start, mid2, 'a', 'z');
|
||||
a.addTransition(mid1, end, 'b');
|
||||
a.addTransition(mid2, end, 'b');
|
||||
expectThrows(IllegalArgumentException.class, () -> {new SimplePatternSplitTokenizer(a);});
|
||||
}
|
||||
|
||||
public void testOffsetCorrection() throws Exception {
|
||||
final String INPUT = "Günther Günther is here";
|
||||
|
||||
// create MappingCharFilter
|
||||
List<String> mappingRules = new ArrayList<>();
|
||||
mappingRules.add( "\"ü\" => \"ü\"" );
|
||||
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
|
||||
builder.add("ü", "ü");
|
||||
NormalizeCharMap normMap = builder.build();
|
||||
CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));
|
||||
|
||||
// create SimplePatternSplitTokenizer
|
||||
Tokenizer stream = new SimplePatternSplitTokenizer("Günther");
|
||||
stream.setReader(charStream);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { " ", " is here" },
|
||||
new int[] { 12, 25 },
|
||||
new int[] { 13, 33 },
|
||||
INPUT.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO: rewrite tests not to use string comparison.
|
||||
*/
|
||||
private static String tsToString(TokenStream in) throws IOException {
|
||||
StringBuilder out = new StringBuilder();
|
||||
CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
|
||||
// extra safety to enforce, that the state is not preserved and also
|
||||
// assign bogus values
|
||||
in.clearAttributes();
|
||||
termAtt.setEmpty().append("bogusTerm");
|
||||
in.reset();
|
||||
while (in.incrementToken()) {
|
||||
if (out.length() > 0) {
|
||||
out.append(' ');
|
||||
}
|
||||
out.append(termAtt.toString());
|
||||
in.clearAttributes();
|
||||
termAtt.setEmpty().append("bogusTerm");
|
||||
}
|
||||
|
||||
in.close();
|
||||
return out.toString();
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new SimplePatternSplitTokenizer("a");
|
||||
return new TokenStreamComponents(tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
||||
a.close();
|
||||
|
||||
Analyzer b = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new SimplePatternSplitTokenizer("a");
|
||||
return new TokenStreamComponents(tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
|
||||
b.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,218 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
|
||||
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
|
||||
public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testGreedy() throws Exception {
|
||||
Tokenizer t = new SimplePatternTokenizer("(foo)+");
|
||||
t.setReader(new StringReader("bar foofoo baz"));
|
||||
assertTokenStreamContents(t,
|
||||
new String[] {"foofoo"},
|
||||
new int[] {4},
|
||||
new int[] {10});
|
||||
}
|
||||
|
||||
public void testBigLookahead() throws Exception {
|
||||
StringBuilder b = new StringBuilder();
|
||||
for(int i=0;i<100;i++) {
|
||||
b.append('a');
|
||||
}
|
||||
b.append('b');
|
||||
Tokenizer t = new SimplePatternTokenizer(b.toString());
|
||||
|
||||
b = new StringBuilder();
|
||||
for(int i=0;i<200;i++) {
|
||||
b.append('a');
|
||||
}
|
||||
t.setReader(new StringReader(b.toString()));
|
||||
t.reset();
|
||||
assertFalse(t.incrementToken());
|
||||
}
|
||||
|
||||
public void testOneToken() throws Exception {
|
||||
Tokenizer t = new SimplePatternTokenizer(".*");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
String s;
|
||||
while (true) {
|
||||
s = TestUtil.randomUnicodeString(random());
|
||||
if (s.length() > 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
t.setReader(new StringReader(s));
|
||||
t.reset();
|
||||
assertTrue(t.incrementToken());
|
||||
assertEquals(s, termAtt.toString());
|
||||
}
|
||||
|
||||
public void testEmptyStringPatternNoMatch() throws Exception {
|
||||
Tokenizer t = new SimplePatternTokenizer("a*");
|
||||
t.setReader(new StringReader("bbb"));
|
||||
t.reset();
|
||||
assertFalse(t.incrementToken());
|
||||
}
|
||||
|
||||
public void testEmptyStringPatternOneMatch() throws Exception {
|
||||
Tokenizer t = new SimplePatternTokenizer("a*");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
t.setReader(new StringReader("bbab"));
|
||||
t.reset();
|
||||
assertTrue(t.incrementToken());
|
||||
assertEquals("a", termAtt.toString());
|
||||
assertFalse(t.incrementToken());
|
||||
}
|
||||
|
||||
public void testEndOffset() throws Exception {
|
||||
Tokenizer t = new SimplePatternTokenizer("a+");
|
||||
CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
|
||||
t.setReader(new StringReader("aaabbb"));
|
||||
t.reset();
|
||||
assertTrue(t.incrementToken());
|
||||
assertEquals("aaa", termAtt.toString());
|
||||
assertFalse(t.incrementToken());
|
||||
t.end();
|
||||
assertEquals(6, offsetAtt.endOffset());
|
||||
}
|
||||
|
||||
public void testFixedToken() throws Exception {
|
||||
Tokenizer t = new SimplePatternTokenizer("aaaa");
|
||||
|
||||
t.setReader(new StringReader("aaaaaaaaaaaaaaa"));
|
||||
assertTokenStreamContents(t,
|
||||
new String[] {"aaaa", "aaaa", "aaaa"},
|
||||
new int[] {0, 4, 8},
|
||||
new int[] {4, 8, 12});
|
||||
}
|
||||
|
||||
public void testBasic() throws Exception {
|
||||
String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
|
||||
String[][] tests = {
|
||||
// pattern input output
|
||||
{ ":", "boo:and:foo", ": :" },
|
||||
{ qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
|
||||
};
|
||||
|
||||
for(String[] test : tests) {
|
||||
TokenStream stream = new SimplePatternTokenizer(test[0]);
|
||||
((Tokenizer)stream).setReader(new StringReader(test[1]));
|
||||
String out = tsToString(stream);
|
||||
|
||||
assertEquals("pattern: "+test[0]+" with input: "+test[1], test[2], out);
|
||||
}
|
||||
}
|
||||
|
||||
public void testNotDeterminized() throws Exception {
|
||||
Automaton a = new Automaton();
|
||||
int start = a.createState();
|
||||
int mid1 = a.createState();
|
||||
int mid2 = a.createState();
|
||||
int end = a.createState();
|
||||
a.setAccept(end, true);
|
||||
a.addTransition(start, mid1, 'a', 'z');
|
||||
a.addTransition(start, mid2, 'a', 'z');
|
||||
a.addTransition(mid1, end, 'b');
|
||||
a.addTransition(mid2, end, 'b');
|
||||
expectThrows(IllegalArgumentException.class, () -> {new SimplePatternTokenizer(a);});
|
||||
}
|
||||
|
||||
public void testOffsetCorrection() throws Exception {
|
||||
final String INPUT = "Günther Günther is here";
|
||||
|
||||
// create MappingCharFilter
|
||||
List<String> mappingRules = new ArrayList<>();
|
||||
mappingRules.add( "\"ü\" => \"ü\"" );
|
||||
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
|
||||
builder.add("ü", "ü");
|
||||
NormalizeCharMap normMap = builder.build();
|
||||
CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));
|
||||
|
||||
// create SimplePatternTokenizer
|
||||
Tokenizer stream = new SimplePatternTokenizer("Günther");
|
||||
stream.setReader(charStream);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "Günther", "Günther" },
|
||||
new int[] { 0, 13 },
|
||||
new int[] { 12, 25 },
|
||||
INPUT.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO: rewrite tests not to use string comparison.
|
||||
*/
|
||||
private static String tsToString(TokenStream in) throws IOException {
|
||||
StringBuilder out = new StringBuilder();
|
||||
CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
|
||||
// extra safety to enforce, that the state is not preserved and also
|
||||
// assign bogus values
|
||||
in.clearAttributes();
|
||||
termAtt.setEmpty().append("bogusTerm");
|
||||
in.reset();
|
||||
while (in.incrementToken()) {
|
||||
if (out.length() > 0) {
|
||||
out.append(' ');
|
||||
}
|
||||
out.append(termAtt.toString());
|
||||
in.clearAttributes();
|
||||
termAtt.setEmpty().append("bogusTerm");
|
||||
}
|
||||
|
||||
in.close();
|
||||
return out.toString();
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new SimplePatternTokenizer("a");
|
||||
return new TokenStreamComponents(tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
||||
a.close();
|
||||
|
||||
Analyzer b = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new SimplePatternTokenizer("a");
|
||||
return new TokenStreamComponents(tokenizer);
|
||||
}
|
||||
};
|
||||
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
|
||||
b.close();
|
||||
}
|
||||
}
|
|
@ -27,9 +27,9 @@ public class ByteRunAutomaton extends RunAutomaton {
|
|||
this(a, false, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/** expert: if utf8 is true, the input is already byte-based */
|
||||
/** expert: if isBinary is true, the input is already byte-based */
|
||||
public ByteRunAutomaton(Automaton a, boolean isBinary, int maxDeterminizedStates) {
|
||||
super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates);
|
||||
super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, maxDeterminizedStates);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -36,7 +36,7 @@ public class CharacterRunAutomaton extends RunAutomaton {
|
|||
* it then a TooComplexToDeterminizeException is thrown.
|
||||
*/
|
||||
public CharacterRunAutomaton(Automaton a, int maxDeterminizedStates) {
|
||||
super(a, Character.MAX_CODE_POINT, false, maxDeterminizedStates);
|
||||
super(a, Character.MAX_CODE_POINT+1, maxDeterminizedStates);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -29,24 +29,24 @@
|
|||
|
||||
package org.apache.lucene.util.automaton;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
|
||||
/**
|
||||
* Automata operations.
|
||||
*
|
||||
|
@ -335,7 +335,7 @@ final public class Operations {
|
|||
Transition[][] transitions2 = a2.getSortedTransitions();
|
||||
Automaton c = new Automaton();
|
||||
c.createState();
|
||||
LinkedList<StatePair> worklist = new LinkedList<>();
|
||||
ArrayDeque<StatePair> worklist = new ArrayDeque<>();
|
||||
HashMap<StatePair,StatePair> newstates = new HashMap<>();
|
||||
StatePair p = new StatePair(0, 0, 0);
|
||||
worklist.add(p);
|
||||
|
@ -435,7 +435,7 @@ final public class Operations {
|
|||
// TODO: cutover to iterators instead
|
||||
Transition[][] transitions1 = a1.getSortedTransitions();
|
||||
Transition[][] transitions2 = a2.getSortedTransitions();
|
||||
LinkedList<StatePair> worklist = new LinkedList<>();
|
||||
ArrayDeque<StatePair> worklist = new ArrayDeque<>();
|
||||
HashSet<StatePair> visited = new HashSet<>();
|
||||
StatePair p = new StatePair(0, 0);
|
||||
worklist.add(p);
|
||||
|
@ -682,7 +682,7 @@ final public class Operations {
|
|||
// Create state 0:
|
||||
b.createState();
|
||||
|
||||
LinkedList<SortedIntSet.FrozenIntSet> worklist = new LinkedList<>();
|
||||
ArrayDeque<SortedIntSet.FrozenIntSet> worklist = new ArrayDeque<>();
|
||||
Map<SortedIntSet.FrozenIntSet,Integer> newstate = new HashMap<>();
|
||||
|
||||
worklist.add(initialset);
|
||||
|
@ -804,7 +804,7 @@ final public class Operations {
|
|||
return false;
|
||||
}
|
||||
|
||||
LinkedList<Integer> workList = new LinkedList<>();
|
||||
ArrayDeque<Integer> workList = new ArrayDeque<>();
|
||||
BitSet seen = new BitSet(a.getNumStates());
|
||||
workList.add(0);
|
||||
seen.set(0);
|
||||
|
@ -907,7 +907,7 @@ final public class Operations {
|
|||
if (numStates == 0) {
|
||||
return live;
|
||||
}
|
||||
LinkedList<Integer> workList = new LinkedList<>();
|
||||
ArrayDeque<Integer> workList = new ArrayDeque<>();
|
||||
live.set(0);
|
||||
workList.add(0);
|
||||
|
||||
|
@ -946,7 +946,7 @@ final public class Operations {
|
|||
}
|
||||
Automaton a2 = builder.finish();
|
||||
|
||||
LinkedList<Integer> workList = new LinkedList<>();
|
||||
ArrayDeque<Integer> workList = new ArrayDeque<>();
|
||||
BitSet live = new BitSet(numStates);
|
||||
BitSet acceptBits = a.getAcceptStates();
|
||||
int s = 0;
|
||||
|
@ -1010,22 +1010,6 @@ final public class Operations {
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the largest entry whose value is less than or equal to c, or 0 if
|
||||
* there is no such entry.
|
||||
*/
|
||||
static int findIndex(int c, int[] points) {
|
||||
int a = 0;
|
||||
int b = points.length;
|
||||
while (b - a > 1) {
|
||||
int d = (a + b) >>> 1;
|
||||
if (points[d] > c) b = d;
|
||||
else if (points[d] < c) a = d;
|
||||
else return d;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the language of this automaton is finite. The
|
||||
* automaton must not have any dead states.
|
||||
|
|
|
@ -38,13 +38,62 @@ import java.util.Arrays;
|
|||
*/
|
||||
public abstract class RunAutomaton {
|
||||
final Automaton automaton;
|
||||
final int maxInterval;
|
||||
final int alphabetSize;
|
||||
final int size;
|
||||
final boolean[] accept;
|
||||
final int[] transitions; // delta(state,c) = transitions[state*points.length +
|
||||
// getCharClass(c)]
|
||||
final int[] points; // char interval start points
|
||||
final int[] classmap; // map from char number to class class
|
||||
final int[] classmap; // map from char number to class
|
||||
|
||||
/**
 * Builds a <code>RunAutomaton</code> for <code>a</code>, using
 * {@link Operations#DEFAULT_MAX_DETERMINIZED_STATES} as the limit on
 * states created during determinization.
 *
 * @param a an automaton
 * @param alphabetSize passed through unchanged to the three-argument constructor
 */
protected RunAutomaton(Automaton a, int alphabetSize) {
  this(a, alphabetSize, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
|
||||
|
||||
/**
|
||||
* Constructs a new <code>RunAutomaton</code> from a deterministic
|
||||
* <code>Automaton</code>.
|
||||
*
|
||||
* @param a an automaton
|
||||
* @param maxDeterminizedStates maximum number of states that can be created
|
||||
* while determinizing a
|
||||
*/
|
||||
protected RunAutomaton(Automaton a, int alphabetSize, int maxDeterminizedStates) {
|
||||
this.alphabetSize = alphabetSize;
|
||||
a = Operations.determinize(a, maxDeterminizedStates);
|
||||
this.automaton = a;
|
||||
points = a.getStartPoints();
|
||||
size = Math.max(1,a.getNumStates());
|
||||
accept = new boolean[size];
|
||||
transitions = new int[size * points.length];
|
||||
Arrays.fill(transitions, -1);
|
||||
for (int n=0;n<size;n++) {
|
||||
accept[n] = a.isAccept(n);
|
||||
for (int c = 0; c < points.length; c++) {
|
||||
int dest = a.step(n, points[c]);
|
||||
assert dest == -1 || dest < size;
|
||||
transitions[n * points.length + c] = dest;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Set alphabet table for optimal run performance.
|
||||
*/
|
||||
classmap = new int[Math.min(256, alphabetSize)];
|
||||
int i = 0;
|
||||
for (int j = 0; j < classmap.length; j++) {
|
||||
if (i + 1 < points.length && j == points[i + 1]) {
|
||||
i++;
|
||||
}
|
||||
classmap[j] = i;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string representation of this automaton.
|
||||
|
@ -63,7 +112,7 @@ public abstract class RunAutomaton {
|
|||
int min = points[j];
|
||||
int max;
|
||||
if (j + 1 < points.length) max = (points[j + 1] - 1);
|
||||
else max = maxInterval;
|
||||
else max = alphabetSize;
|
||||
b.append(" ");
|
||||
Automaton.appendCharString(min, b);
|
||||
if (min != max) {
|
||||
|
@ -103,61 +152,17 @@ public abstract class RunAutomaton {
|
|||
* Gets character class of given codepoint
|
||||
*/
|
||||
final int getCharClass(int c) {
|
||||
return Operations.findIndex(c, points);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new <code>RunAutomaton</code> from a deterministic
|
||||
* <code>Automaton</code>.
|
||||
*
|
||||
* @param a an automaton
|
||||
*/
|
||||
public RunAutomaton(Automaton a, int maxInterval, boolean tableize) {
|
||||
this(a, maxInterval, tableize, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new <code>RunAutomaton</code> from a deterministic
|
||||
* <code>Automaton</code>.
|
||||
*
|
||||
* @param a an automaton
|
||||
* @param maxDeterminizedStates maximum number of states that can be created
|
||||
* while determinizing a
|
||||
*/
|
||||
public RunAutomaton(Automaton a, int maxInterval, boolean tableize,
|
||||
int maxDeterminizedStates) {
|
||||
this.maxInterval = maxInterval;
|
||||
a = Operations.determinize(a, maxDeterminizedStates);
|
||||
this.automaton = a;
|
||||
points = a.getStartPoints();
|
||||
size = Math.max(1,a.getNumStates());
|
||||
accept = new boolean[size];
|
||||
transitions = new int[size * points.length];
|
||||
Arrays.fill(transitions, -1);
|
||||
for (int n=0;n<size;n++) {
|
||||
accept[n] = a.isAccept(n);
|
||||
for (int c = 0; c < points.length; c++) {
|
||||
int dest = a.step(n, points[c]);
|
||||
assert dest == -1 || dest < size;
|
||||
transitions[n * points.length + c] = dest;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Set alphabet table for optimal run performance.
|
||||
*/
|
||||
if (tableize) {
|
||||
classmap = new int[maxInterval + 1];
|
||||
int i = 0;
|
||||
for (int j = 0; j <= maxInterval; j++) {
|
||||
if (i + 1 < points.length && j == points[i + 1]) {
|
||||
i++;
|
||||
}
|
||||
classmap[j] = i;
|
||||
}
|
||||
} else {
|
||||
classmap = null;
|
||||
// binary search
|
||||
int a = 0;
|
||||
int b = points.length;
|
||||
while (b - a > 1) {
|
||||
int d = (a + b) >>> 1;
|
||||
if (points[d] > c) b = d;
|
||||
else if (points[d] < c) a = d;
|
||||
else return d;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -168,7 +173,8 @@ public abstract class RunAutomaton {
|
|||
* transition function.)
|
||||
*/
|
||||
public final int step(int state, int c) {
|
||||
if (classmap == null) {
|
||||
assert c < alphabetSize;
|
||||
if (c >= classmap.length) {
|
||||
return transitions[state * points.length + getCharClass(c)];
|
||||
} else {
|
||||
return transitions[state * points.length + classmap[c]];
|
||||
|
@ -179,7 +185,7 @@ public abstract class RunAutomaton {
|
|||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + maxInterval;
|
||||
result = prime * result + alphabetSize;
|
||||
result = prime * result + points.length;
|
||||
result = prime * result + size;
|
||||
return result;
|
||||
|
@ -191,7 +197,7 @@ public abstract class RunAutomaton {
|
|||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
RunAutomaton other = (RunAutomaton) obj;
|
||||
if (maxInterval != other.maxInterval) return false;
|
||||
if (alphabetSize != other.alphabetSize) return false;
|
||||
if (size != other.size) return false;
|
||||
if (!Arrays.equals(points, other.points)) return false;
|
||||
if (!Arrays.equals(accept, other.accept)) return false;
|
||||
|
|
|
@ -367,7 +367,7 @@ class TermAutomatonScorer extends Scorer {
|
|||
|
||||
static class TermRunAutomaton extends RunAutomaton {
|
||||
public TermRunAutomaton(Automaton a, int termCount) {
|
||||
super(a, termCount, true);
|
||||
super(a, termCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue