Fix repeating token sentence boundary bug (#11734)

Signed-off-by: lkotzaniewsk <lkotzaniewsk@bloomberg.net>
Co-authored-by: Dawid Weiss <dawid.weiss@gmail.com>
Luke Kot-Zaniewski 2022-09-23 06:59:46 -04:00 committed by GitHub
parent 5b24a233bd
commit 3a04aa44c2
13 changed files with 471 additions and 119 deletions

View File

@@ -110,6 +110,12 @@ Bug Fixes
trying to apply a dictionary whose size is greater than the maximum supported
window size for LZ4. (Adrien Grand)
* GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream.
(Luke Kot-Zaniewski)
* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
(Luke Kot-Zaniewski)
Other
---------------------
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)

View File

@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IgnoreRandomChains;
@@ -36,76 +36,65 @@ import org.apache.lucene.util.IgnoreRandomChains;
*/
@IgnoreRandomChains(reason = "other filters must precede this one (see docs)")
public final class OpenNLPChunkerFilter extends TokenFilter {
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
private int tokenNum = 0;
private boolean moreTokensAvailable = true;
private String[] sentenceTerms = null;
private String[] sentenceTermPOSTags = null;
private final NLPChunkerOp chunkerOp;
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final SentenceAttributeExtractor sentenceAttributeExtractor;
public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
super(input);
this.chunkerOp = chunkerOp;
sentenceAttributeExtractor =
new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
}
@Override
public final boolean incrementToken() throws IOException {
if (!moreTokensAvailable) {
clear();
return false;
}
if (tokenNum == sentenceTokenAttrs.size()) {
nextSentence();
if (sentenceTerms == null) {
clear();
public boolean incrementToken() throws IOException {
List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
if (isEndOfCurrentSentence) {
boolean noSentencesLeft =
sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
if (noSentencesLeft) {
return false;
}
assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
tokenNum = 0;
}
clearAttributes();
sentenceTokenAttrs.get(tokenNum++).copyTo(this);
return true;
}
private void nextSentence() throws IOException {
private List<AttributeSource> nextSentence() throws IOException {
tokenNum = 0;
List<String> termList = new ArrayList<>();
List<String> posTagList = new ArrayList<>();
sentenceTokenAttrs.clear();
boolean endOfSentence = false;
while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
termList.add(termAtt.toString());
posTagList.add(typeAtt.type());
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
sentenceTokenAttrs.add(input.cloneAttributes());
for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
posTagList.add(attributeSource.getAttribute(TypeAttribute.class).type());
}
sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
sentenceTermPOSTags =
posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
String[] sentenceTerms = termList.toArray(new String[0]);
String[] sentenceTermPOSTags = posTagList.toArray(new String[0]);
assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
return sentenceAttributeExtractor.getSentenceAttributes();
}
private void assignTokenTypes(String[] tags) {
for (int i = 0; i < tags.length; ++i) {
sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
sentenceAttributeExtractor
.getSentenceAttributes()
.get(i)
.getAttribute(TypeAttribute.class)
.setType(tags[i]);
}
}
@Override
public void reset() throws IOException {
super.reset();
moreTokensAvailable = true;
sentenceAttributeExtractor.reset();
clear();
}
private void clear() {
sentenceTokenAttrs.clear();
sentenceTerms = null;
sentenceTermPOSTags = null;
tokenNum = 0;
}
}

View File

@@ -24,10 +24,7 @@ import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IgnoreRandomChains;
@@ -46,37 +43,28 @@ import org.apache.lucene.util.IgnoreRandomChains;
public class OpenNLPLemmatizerFilter extends TokenFilter {
private final NLPLemmatizerOp lemmatizerOp;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
private boolean moreTokensAvailable = true;
private String[] sentenceTokens = null; // non-keyword tokens
private String[] sentenceTokenTypes = null; // types for non-keyword tokens
private String[] lemmas = null; // lemmas for non-keyword tokens
private final SentenceAttributeExtractor sentenceAttributeExtractor;
private String[] lemmas = new String[0]; // lemmas for non-keyword tokens
private int lemmaNum = 0; // lemma counter
public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
super(input);
this.lemmatizerOp = lemmatizerOp;
sentenceAttributeExtractor =
new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
}
@Override
public final boolean incrementToken() throws IOException {
if (!moreTokensAvailable) {
clear();
return false;
}
if (sentenceTokenAttrsIter == null || !sentenceTokenAttrsIter.hasNext()) {
nextSentence();
if (sentenceTokens == null) { // zero non-keyword tokens
clear();
boolean isEndOfCurrentSentence = lemmaNum >= lemmas.length;
if (isEndOfCurrentSentence) {
boolean noSentencesLeft =
sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
if (noSentencesLeft) {
return false;
}
lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
lemmaNum = 0;
sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
}
clearAttributes();
sentenceTokenAttrsIter.next().copyTo(this);
@@ -86,36 +74,35 @@ public class OpenNLPLemmatizerFilter extends TokenFilter {
return true;
}
private void nextSentence() throws IOException {
private List<AttributeSource> nextSentence() throws IOException {
lemmaNum = 0;
List<String> tokenList = new ArrayList<>();
List<String> typeList = new ArrayList<>();
sentenceTokenAttrs.clear();
boolean endOfSentence = false;
while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
if (!keywordAtt.isKeyword()) {
tokenList.add(termAtt.toString());
typeList.add(typeAtt.type());
List<AttributeSource> sentenceAttributes =
sentenceAttributeExtractor.extractSentenceAttributes();
for (AttributeSource attributeSource : sentenceAttributes) {
if (!attributeSource.getAttribute(KeywordAttribute.class).isKeyword()) {
tokenList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
typeList.add(attributeSource.getAttribute(TypeAttribute.class).type());
}
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
sentenceTokenAttrs.add(input.cloneAttributes());
}
sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
String[] sentenceTokens = tokenList.toArray(new String[0]);
String[] sentenceTokenTypes = typeList.toArray(new String[0]);
lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
sentenceTokenAttrsIter = sentenceAttributes.iterator();
return sentenceAttributeExtractor.getSentenceAttributes();
}
@Override
public void reset() throws IOException {
super.reset();
moreTokensAvailable = true;
sentenceAttributeExtractor.reset();
clear();
}
private void clear() {
sentenceTokenAttrs.clear();
sentenceTokenAttrsIter = null;
sentenceTokens = null;
sentenceTokenTypes = null;
lemmas = null;
lemmas = new String[0];
lemmaNum = 0;
}
}

View File

@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IgnoreRandomChains;
@@ -33,65 +33,62 @@ import org.apache.lucene.util.IgnoreRandomChains;
@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
public final class OpenNLPPOSFilter extends TokenFilter {
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
String[] tags = null;
private int tokenNum = 0;
private boolean moreTokensAvailable = true;
private final NLPPOSTaggerOp posTaggerOp;
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final SentenceAttributeExtractor sentenceAttributeExtractor;
public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
super(input);
this.posTaggerOp = posTaggerOp;
sentenceAttributeExtractor =
new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
}
@Override
public final boolean incrementToken() throws IOException {
if (!moreTokensAvailable) {
clear();
return false;
}
if (tokenNum
== sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
String[] sentenceTokens = nextSentence();
if (sentenceTokens == null) {
clear();
public boolean incrementToken() throws IOException {
List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
if (isEndOfCurrentSentence) {
boolean noSentencesLeft =
sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
if (noSentencesLeft) {
return false;
}
tags = posTaggerOp.getPOSTags(sentenceTokens);
tokenNum = 0;
}
clearAttributes();
sentenceTokenAttrs.get(tokenNum).copyTo(this);
typeAtt.setType(tags[tokenNum++]);
sentenceTokenAttrs.get(tokenNum++).copyTo(this);
return true;
}
private String[] nextSentence() throws IOException {
private List<AttributeSource> nextSentence() throws IOException {
tokenNum = 0;
List<String> termList = new ArrayList<>();
sentenceTokenAttrs.clear();
boolean endOfSentence = false;
while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
termList.add(termAtt.toString());
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
sentenceTokenAttrs.add(input.cloneAttributes());
for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
}
String[] sentenceTerms = termList.toArray(new String[0]);
assignTokenTypes(posTaggerOp.getPOSTags(sentenceTerms));
return sentenceAttributeExtractor.getSentenceAttributes();
}
private void assignTokenTypes(String[] tags) {
for (int i = 0; i < tags.length; ++i) {
sentenceAttributeExtractor
.getSentenceAttributes()
.get(i)
.getAttribute(TypeAttribute.class)
.setType(tags[i]);
}
return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
}
@Override
public void reset() throws IOException {
super.reset();
moreTokensAvailable = true;
sentenceAttributeExtractor.reset();
clear();
}
private void clear() {
sentenceTokenAttrs.clear();
tags = null;
tokenNum = 0;
}
}

View File

@@ -22,28 +22,27 @@ import opennlp.tools.util.Span;
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IgnoreRandomChains;
/**
* Run OpenNLP SentenceDetector and Tokenizer. The last token in each sentence is marked by setting
* the {@link #EOS_FLAG_BIT} in the FlagsAttribute; following filters can use this information to
* apply operations to tokens one sentence at a time.
* Run OpenNLP SentenceDetector and Tokenizer. The index of each sentence is stored in
* SentenceAttribute.
*/
@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
public static int EOS_FLAG_BIT = 1;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final SentenceAttribute sentenceAtt = addAttribute(SentenceAttribute.class);
private Span[] termSpans = null;
private int termNum = 0;
private int sentenceStart = 0;
private int sentenceIndex = -1;
private NLPTokenizerOp tokenizerOp = null;
@@ -71,6 +70,7 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
termSpans = tokenizerOp.getTerms(sentenceText);
termNum = 0;
sentenceIndex++;
}
@Override
@@ -84,11 +84,7 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
offsetAtt.setOffset(
correctOffset(offset + sentenceStart + term.getStart()),
correctOffset(offset + sentenceStart + term.getEnd()));
if (termNum == termSpans.length - 1) {
flagsAtt.setFlags(
flagsAtt.getFlags()
| EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
}
sentenceAtt.setSentenceIndex(sentenceIndex);
++termNum;
return true;
}
@@ -98,5 +94,6 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
super.reset();
termSpans = null;
termNum = sentenceStart = 0;
sentenceIndex = -1;
}
}
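
To make the attribute switch concrete, here is a minimal sketch (hypothetical filter, not part of this commit) of how a downstream consumer can detect sentence boundaries now that the EOS flag is gone: a boundary only becomes visible when the sentence index changes, i.e. one token into the next sentence, which is exactly the look-ahead that SentenceAttributeExtractor keeps in prevAttributeSource.

// Hypothetical sketch, not from this commit: observing sentence boundaries
// via SentenceAttribute instead of the removed EOS_FLAG_BIT.
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;

public final class SentenceBoundaryWatcher extends TokenFilter {
  private final SentenceAttribute sentenceAtt = addAttribute(SentenceAttribute.class);
  private int prevSentenceIndex = -1;

  public SentenceBoundaryWatcher(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false; // stream exhausted; the last sentence ends here
    }
    // Unlike the old EOS flag on the final token of a sentence, the boundary
    // only becomes visible on the first token of the *next* sentence.
    if (sentenceAtt.getSentenceIndex() != prevSentenceIndex) {
      prevSentenceIndex = sentenceAtt.getSentenceIndex();
      // per-sentence work would start here
    }
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    prevSentenceIndex = -1;
  }
}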

View File

@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
import org.apache.lucene.util.AttributeSource;
/**
* Iterate through sentence tokens and cache their attributes. Could consider moving this to a more
* central location to be used by other sentence-aware components.
*
* <p>May want to consider making this its own Filter so that extracted sentence token attributes
* can be shared by downstream sentence-aware filters.
*/
public class SentenceAttributeExtractor {
private final TokenStream input;
private final SentenceAttribute sentenceAtt;
private final List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
private AttributeSource prevAttributeSource;
private int currSentence = 0;
private boolean hasNextToken = true;
public SentenceAttributeExtractor(TokenStream input, SentenceAttribute sentenceAtt) {
this.input = input;
this.sentenceAtt = sentenceAtt;
}
// If this class were a stand-alone filter it could conceivably extract the attributes once
// and cache a reference to those attributes in SentenceAttribute. That way downstream filters
// could read the full sentence without having to independently extract it.
public List<AttributeSource> extractSentenceAttributes() throws IOException {
sentenceTokenAttrs.clear();
boolean hasNext;
do {
hasNextToken = input.incrementToken();
int currSentenceTmp = sentenceAtt.getSentenceIndex();
hasNext = (currSentence == currSentenceTmp && hasNextToken);
currSentence = currSentenceTmp;
if (prevAttributeSource != null) {
sentenceTokenAttrs.add(prevAttributeSource);
}
prevAttributeSource = input.cloneAttributes();
} while (hasNext);
return sentenceTokenAttrs;
}
public List<AttributeSource> getSentenceAttributes() {
return sentenceTokenAttrs;
}
public boolean allSentencesProcessed() {
return !hasNextToken;
}
public void reset() {
hasNextToken = true;
sentenceTokenAttrs.clear();
currSentence = 0;
prevAttributeSource = null;
}
}
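
As a usage note, the consumption pattern shared by the three filters above reduces to the following sketch (the helper name is illustrative, not from this commit; imports are the same as in those filters):

// Illustrative helper, not from this commit: gather the terms of the next
// sentence the way OpenNLPPOSFilter and friends consume the extractor.
private static List<String> nextSentenceTerms(SentenceAttributeExtractor extractor)
    throws IOException {
  List<String> terms = new ArrayList<>();
  for (AttributeSource token : extractor.extractSentenceAttributes()) {
    // Each element is a cloned snapshot of one token's attributes.
    terms.add(token.getAttribute(CharTermAttribute.class).toString());
  }
  // An empty result with allSentencesProcessed() == true means end of stream.
  return terms;
}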

View File

@@ -0,0 +1,23 @@
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
x
Quick brown fox jumped over the lazy dog.
x
Quick brown fox jumped over the lazy dog.
x
Quick brown fox jumped over the lazy dog.
x
Quick brown fox jumped over the lazy dog.
x
Quick brown fox jumped over the lazy dog.
x
This should hopefully get analyzed.
x
And so should this.

View File

@@ -0,0 +1,32 @@
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
x x
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
x x
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
x x
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
x x
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
x x
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
x x
This This should should hopefully hopefully get get analyzed analyzed . .
x x
And And so so should should this this . .

View File

@@ -114,4 +114,16 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
true,
toPayloads(SENTENCES_chunks));
}
public void testEmptyField() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
.addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
.build();
assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
}
}

View File

@@ -17,6 +17,11 @@
package org.apache.lucene.analysis.opennlp;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
@@ -108,6 +113,10 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
"IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."
};
private static final String NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD = "period";
private static final String[] NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms = {"period", "period"};
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
@@ -290,4 +299,77 @@
null,
true);
}
public void testNoBreakWithRepeatKeywordFilter() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
.build();
assertAnalyzesTo(
analyzer,
NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD,
NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms,
null,
null,
null,
null,
null,
true);
}
// checks for bug described in https://github.com/apache/lucene/issues/11771
public void testPreventEarlyExit() throws IOException {
InputStream earlyExitInput = null;
InputStream earlyExitOutput = null;
try {
ClasspathResourceLoader loader = new ClasspathResourceLoader(getClass());
earlyExitInput = loader.openResource("data/early-exit-bug-input.txt");
String earlyExitInputText = new String(earlyExitInput.readAllBytes(), StandardCharsets.UTF_8);
earlyExitOutput = loader.openResource("data/early-exit-bug-output.txt");
String earlyExitOutputText =
new String(earlyExitOutput.readAllBytes(), StandardCharsets.UTF_8);
String[] earlyExitOutputTexts =
Arrays.stream(earlyExitOutputText.split("\\s"))
.filter(text -> text != "")
.collect(Collectors.joining(" "))
.split(" ");
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp",
"tokenizerModel",
tokenizerModelFile,
"sentenceModel",
sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
.build();
assertAnalyzesTo(
analyzer, earlyExitInputText, earlyExitOutputTexts, null, null, null, null, null, true);
} finally {
if (earlyExitInput != null) {
earlyExitInput.close();
}
if (earlyExitOutput != null) {
earlyExitOutput.close();
}
}
}
public void testEmptyField() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
.build();
assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
}
}

View File

@@ -21,6 +21,7 @@ import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.ClasspathResourceLoader;
@@ -66,6 +67,7 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
private static final String[] NO_BREAK_terms = {"No", "period"};
private static final int[] NO_BREAK_startOffsets = {0, 3};
private static final int[] NO_BREAK_endOffsets = {2, 9};
private static final String[] NO_BREAK_KEYWORD_REPEAT_terms = {"No", "No", "period", "period"};
private static final String sentenceModelFile = "en-test-sent.bin";
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
@@ -144,4 +146,26 @@
null,
true);
}
public void testNoBreakWithRepeatKeywordFilter() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter(KeywordRepeatFilterFactory.class)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.build();
assertAnalyzesTo(
analyzer, NO_BREAK, NO_BREAK_KEYWORD_REPEAT_terms, null, null, null, null, null, true);
}
public void testEmptyField() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
.withTokenizer(
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
.build();
assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
}
}

View File

@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.tokenattributes;
import org.apache.lucene.util.Attribute;
/**
* This attribute tracks which sentence a given token belongs to, and may in the future carry
* other sentence-specific information.
*/
public interface SentenceAttribute extends Attribute {
/**
* Get the sentence index for the current token
*
* @return The index of the sentence
* @see #setSentenceIndex(int)
*/
int getSentenceIndex();
/**
* Set the sentence index of the current token
*
* @see #getSentenceIndex()
*/
void setSentenceIndex(int sentenceIndex);
}

View File

@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.tokenattributes;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
/**
* Default implementation of {@link SentenceAttribute}.
*
* <p>The current implementation is coincidentally identical to {@link FlagsAttributeImpl} It was
* decided to keep it separate because this attribute will NOT be an implied bitmap. Also, this
* class may hold other sentence specific data in the future.
*/
public class SentenceAttributeImpl extends AttributeImpl implements SentenceAttribute {
private int index = 0;
/** Initialize this attribute to default */
public SentenceAttributeImpl() {}
@Override
public void clear() {
index = 0;
}
@Override
public boolean equals(Object other) {
if (this == other) {
return true;
}
if (other instanceof SentenceAttributeImpl) {
return ((SentenceAttributeImpl) other).index == index;
}
return false;
}
@Override
public int hashCode() {
return index;
}
@Override
public void copyTo(AttributeImpl target) {
SentenceAttribute t = (SentenceAttribute) target;
t.setSentenceIndex(index);
}
@Override
public void reflectWith(AttributeReflector reflector) {
reflector.reflect(SentenceAttribute.class, "sentences", index);
}
@Override
public int getSentenceIndex() {
return index;
}
@Override
public void setSentenceIndex(int sentence) {
this.index = sentence;
}
}
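
For reference, no registration step is needed for this implementation: Lucene's default AttributeFactory resolves the Impl class by the "<Interface>Impl" naming convention, so the attribute can be exercised in isolation (a standalone sketch, not from this commit):

// Standalone sketch, not from this commit: the default AttributeFactory maps
// SentenceAttribute to SentenceAttributeImpl by the "<Interface>Impl" convention.
AttributeSource source = new AttributeSource();
SentenceAttribute sentenceAtt = source.addAttribute(SentenceAttribute.class);
sentenceAtt.setSentenceIndex(3);
assert sentenceAtt.getSentenceIndex() == 3;
source.clearAttributes(); // clear() resets the index to 0, as defined above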