mirror of https://github.com/apache/lucene.git
Fix repeating token sentence boundary bug (#11734)
Signed-off-by: lkotzaniewsk <lkotzaniewsk@bloomberg.net> Co-authored-by: Dawid Weiss <dawid.weiss@gmail.com>
This commit is contained in:
parent
5b24a233bd
commit
3a04aa44c2
|
@ -110,6 +110,12 @@ Bug Fixes
|
|||
trying to apply a dictionary whose size is greater than the maximum supported
|
||||
window size for LZ4. (Adrien Grand)
|
||||
|
||||
* GITHUB#11735: KeywordRepeatFilter + OpenNLPLemmatizer always drops last token of a stream.
|
||||
(Luke Kot-Zaniewski)
|
||||
|
||||
* GITHUB#11771: KeywordRepeatFilter + OpenNLPLemmatizer sometimes arbitrarily exits token stream.
|
||||
(Luke Kot-Zaniewski)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
* LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
|
||||
|
|
|
@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.IgnoreRandomChains;
|
||||
|
@ -36,76 +36,65 @@ import org.apache.lucene.util.IgnoreRandomChains;
|
|||
*/
|
||||
@IgnoreRandomChains(reason = "other filters must precede this one (see docs)")
|
||||
public final class OpenNLPChunkerFilter extends TokenFilter {
|
||||
|
||||
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
|
||||
private int tokenNum = 0;
|
||||
private boolean moreTokensAvailable = true;
|
||||
private String[] sentenceTerms = null;
|
||||
private String[] sentenceTermPOSTags = null;
|
||||
|
||||
private final NLPChunkerOp chunkerOp;
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final SentenceAttributeExtractor sentenceAttributeExtractor;
|
||||
|
||||
public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
|
||||
super(input);
|
||||
this.chunkerOp = chunkerOp;
|
||||
sentenceAttributeExtractor =
|
||||
new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (!moreTokensAvailable) {
|
||||
clear();
|
||||
return false;
|
||||
}
|
||||
if (tokenNum == sentenceTokenAttrs.size()) {
|
||||
nextSentence();
|
||||
if (sentenceTerms == null) {
|
||||
clear();
|
||||
public boolean incrementToken() throws IOException {
|
||||
List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
|
||||
boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
|
||||
if (isEndOfCurrentSentence) {
|
||||
boolean noSentencesLeft =
|
||||
sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
|
||||
if (noSentencesLeft) {
|
||||
return false;
|
||||
}
|
||||
assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
|
||||
tokenNum = 0;
|
||||
}
|
||||
clearAttributes();
|
||||
sentenceTokenAttrs.get(tokenNum++).copyTo(this);
|
||||
return true;
|
||||
}
|
||||
|
||||
private void nextSentence() throws IOException {
|
||||
private List<AttributeSource> nextSentence() throws IOException {
|
||||
tokenNum = 0;
|
||||
List<String> termList = new ArrayList<>();
|
||||
List<String> posTagList = new ArrayList<>();
|
||||
sentenceTokenAttrs.clear();
|
||||
boolean endOfSentence = false;
|
||||
while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
|
||||
termList.add(termAtt.toString());
|
||||
posTagList.add(typeAtt.type());
|
||||
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
|
||||
sentenceTokenAttrs.add(input.cloneAttributes());
|
||||
for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
|
||||
termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
|
||||
posTagList.add(attributeSource.getAttribute(TypeAttribute.class).type());
|
||||
}
|
||||
sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
|
||||
sentenceTermPOSTags =
|
||||
posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
|
||||
String[] sentenceTerms = termList.toArray(new String[0]);
|
||||
String[] sentenceTermPOSTags = posTagList.toArray(new String[0]);
|
||||
assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
|
||||
return sentenceAttributeExtractor.getSentenceAttributes();
|
||||
}
|
||||
|
||||
private void assignTokenTypes(String[] tags) {
|
||||
for (int i = 0; i < tags.length; ++i) {
|
||||
sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
|
||||
sentenceAttributeExtractor
|
||||
.getSentenceAttributes()
|
||||
.get(i)
|
||||
.getAttribute(TypeAttribute.class)
|
||||
.setType(tags[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
moreTokensAvailable = true;
|
||||
sentenceAttributeExtractor.reset();
|
||||
clear();
|
||||
}
|
||||
|
||||
private void clear() {
|
||||
sentenceTokenAttrs.clear();
|
||||
sentenceTerms = null;
|
||||
sentenceTermPOSTags = null;
|
||||
tokenNum = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,10 +24,7 @@ import java.util.List;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.*;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.IgnoreRandomChains;
|
||||
|
||||
|
@ -46,37 +43,28 @@ import org.apache.lucene.util.IgnoreRandomChains;
|
|||
public class OpenNLPLemmatizerFilter extends TokenFilter {
|
||||
private final NLPLemmatizerOp lemmatizerOp;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
|
||||
private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
|
||||
private boolean moreTokensAvailable = true;
|
||||
private String[] sentenceTokens = null; // non-keyword tokens
|
||||
private String[] sentenceTokenTypes = null; // types for non-keyword tokens
|
||||
private String[] lemmas = null; // lemmas for non-keyword tokens
|
||||
private final SentenceAttributeExtractor sentenceAttributeExtractor;
|
||||
private String[] lemmas = new String[0]; // lemmas for non-keyword tokens
|
||||
private int lemmaNum = 0; // lemma counter
|
||||
|
||||
public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
|
||||
super(input);
|
||||
this.lemmatizerOp = lemmatizerOp;
|
||||
sentenceAttributeExtractor =
|
||||
new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (!moreTokensAvailable) {
|
||||
clear();
|
||||
return false;
|
||||
}
|
||||
if (sentenceTokenAttrsIter == null || !sentenceTokenAttrsIter.hasNext()) {
|
||||
nextSentence();
|
||||
if (sentenceTokens == null) { // zero non-keyword tokens
|
||||
clear();
|
||||
boolean isEndOfCurrentSentence = lemmaNum >= lemmas.length;
|
||||
if (isEndOfCurrentSentence) {
|
||||
boolean noSentencesLeft =
|
||||
sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
|
||||
if (noSentencesLeft) {
|
||||
return false;
|
||||
}
|
||||
lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
|
||||
lemmaNum = 0;
|
||||
sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
|
||||
}
|
||||
clearAttributes();
|
||||
sentenceTokenAttrsIter.next().copyTo(this);
|
||||
|
@ -86,36 +74,35 @@ public class OpenNLPLemmatizerFilter extends TokenFilter {
|
|||
return true;
|
||||
}
|
||||
|
||||
private void nextSentence() throws IOException {
|
||||
private List<AttributeSource> nextSentence() throws IOException {
|
||||
lemmaNum = 0;
|
||||
List<String> tokenList = new ArrayList<>();
|
||||
List<String> typeList = new ArrayList<>();
|
||||
sentenceTokenAttrs.clear();
|
||||
boolean endOfSentence = false;
|
||||
while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
|
||||
if (!keywordAtt.isKeyword()) {
|
||||
tokenList.add(termAtt.toString());
|
||||
typeList.add(typeAtt.type());
|
||||
List<AttributeSource> sentenceAttributes =
|
||||
sentenceAttributeExtractor.extractSentenceAttributes();
|
||||
for (AttributeSource attributeSource : sentenceAttributes) {
|
||||
if (!attributeSource.getAttribute(KeywordAttribute.class).isKeyword()) {
|
||||
tokenList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
|
||||
typeList.add(attributeSource.getAttribute(TypeAttribute.class).type());
|
||||
}
|
||||
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
|
||||
sentenceTokenAttrs.add(input.cloneAttributes());
|
||||
}
|
||||
sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
|
||||
sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
|
||||
String[] sentenceTokens = tokenList.toArray(new String[0]);
|
||||
String[] sentenceTokenTypes = typeList.toArray(new String[0]);
|
||||
lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
|
||||
sentenceTokenAttrsIter = sentenceAttributes.iterator();
|
||||
return sentenceAttributeExtractor.getSentenceAttributes();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
moreTokensAvailable = true;
|
||||
sentenceAttributeExtractor.reset();
|
||||
clear();
|
||||
}
|
||||
|
||||
private void clear() {
|
||||
sentenceTokenAttrs.clear();
|
||||
sentenceTokenAttrsIter = null;
|
||||
sentenceTokens = null;
|
||||
sentenceTokenTypes = null;
|
||||
lemmas = null;
|
||||
lemmas = new String[0];
|
||||
lemmaNum = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.IgnoreRandomChains;
|
||||
|
@ -33,65 +33,62 @@ import org.apache.lucene.util.IgnoreRandomChains;
|
|||
@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
|
||||
public final class OpenNLPPOSFilter extends TokenFilter {
|
||||
|
||||
private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
|
||||
String[] tags = null;
|
||||
private int tokenNum = 0;
|
||||
private boolean moreTokensAvailable = true;
|
||||
|
||||
private final NLPPOSTaggerOp posTaggerOp;
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final SentenceAttributeExtractor sentenceAttributeExtractor;
|
||||
|
||||
public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
|
||||
super(input);
|
||||
this.posTaggerOp = posTaggerOp;
|
||||
sentenceAttributeExtractor =
|
||||
new SentenceAttributeExtractor(input, addAttribute(SentenceAttribute.class));
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (!moreTokensAvailable) {
|
||||
clear();
|
||||
return false;
|
||||
}
|
||||
if (tokenNum
|
||||
== sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
|
||||
String[] sentenceTokens = nextSentence();
|
||||
if (sentenceTokens == null) {
|
||||
clear();
|
||||
public boolean incrementToken() throws IOException {
|
||||
List<AttributeSource> sentenceTokenAttrs = sentenceAttributeExtractor.getSentenceAttributes();
|
||||
boolean isEndOfCurrentSentence = tokenNum >= sentenceTokenAttrs.size();
|
||||
if (isEndOfCurrentSentence) {
|
||||
boolean noSentencesLeft =
|
||||
sentenceAttributeExtractor.allSentencesProcessed() || nextSentence().isEmpty();
|
||||
if (noSentencesLeft) {
|
||||
return false;
|
||||
}
|
||||
tags = posTaggerOp.getPOSTags(sentenceTokens);
|
||||
tokenNum = 0;
|
||||
}
|
||||
clearAttributes();
|
||||
sentenceTokenAttrs.get(tokenNum).copyTo(this);
|
||||
typeAtt.setType(tags[tokenNum++]);
|
||||
sentenceTokenAttrs.get(tokenNum++).copyTo(this);
|
||||
return true;
|
||||
}
|
||||
|
||||
private String[] nextSentence() throws IOException {
|
||||
private List<AttributeSource> nextSentence() throws IOException {
|
||||
tokenNum = 0;
|
||||
List<String> termList = new ArrayList<>();
|
||||
sentenceTokenAttrs.clear();
|
||||
boolean endOfSentence = false;
|
||||
while (!endOfSentence && (moreTokensAvailable = input.incrementToken())) {
|
||||
termList.add(termAtt.toString());
|
||||
endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
|
||||
sentenceTokenAttrs.add(input.cloneAttributes());
|
||||
for (AttributeSource attributeSource : sentenceAttributeExtractor.extractSentenceAttributes()) {
|
||||
termList.add(attributeSource.getAttribute(CharTermAttribute.class).toString());
|
||||
}
|
||||
String[] sentenceTerms = termList.toArray(new String[0]);
|
||||
assignTokenTypes(posTaggerOp.getPOSTags(sentenceTerms));
|
||||
return sentenceAttributeExtractor.getSentenceAttributes();
|
||||
}
|
||||
|
||||
private void assignTokenTypes(String[] tags) {
|
||||
for (int i = 0; i < tags.length; ++i) {
|
||||
sentenceAttributeExtractor
|
||||
.getSentenceAttributes()
|
||||
.get(i)
|
||||
.getAttribute(TypeAttribute.class)
|
||||
.setType(tags[i]);
|
||||
}
|
||||
return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
moreTokensAvailable = true;
|
||||
sentenceAttributeExtractor.reset();
|
||||
clear();
|
||||
}
|
||||
|
||||
private void clear() {
|
||||
sentenceTokenAttrs.clear();
|
||||
tags = null;
|
||||
tokenNum = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,28 +22,27 @@ import opennlp.tools.util.Span;
|
|||
import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
|
||||
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
|
||||
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
import org.apache.lucene.util.IgnoreRandomChains;
|
||||
|
||||
/**
|
||||
* Run OpenNLP SentenceDetector and Tokenizer. The last token in each sentence is marked by setting
|
||||
* the {@link #EOS_FLAG_BIT} in the FlagsAttribute; following filters can use this information to
|
||||
* apply operations to tokens one sentence at a time.
|
||||
* Run OpenNLP SentenceDetector and Tokenizer. The index of each sentence is stored in
|
||||
* SentenceAttribute.
|
||||
*/
|
||||
@IgnoreRandomChains(reason = "LUCENE-10352: add argument providers for this one")
|
||||
public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
|
||||
public static int EOS_FLAG_BIT = 1;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final SentenceAttribute sentenceAtt = addAttribute(SentenceAttribute.class);
|
||||
|
||||
private Span[] termSpans = null;
|
||||
private int termNum = 0;
|
||||
private int sentenceStart = 0;
|
||||
private int sentenceIndex = -1;
|
||||
|
||||
private NLPTokenizerOp tokenizerOp = null;
|
||||
|
||||
|
@ -71,6 +70,7 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
|
|||
String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
|
||||
termSpans = tokenizerOp.getTerms(sentenceText);
|
||||
termNum = 0;
|
||||
sentenceIndex++;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -84,11 +84,7 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
|
|||
offsetAtt.setOffset(
|
||||
correctOffset(offset + sentenceStart + term.getStart()),
|
||||
correctOffset(offset + sentenceStart + term.getEnd()));
|
||||
if (termNum == termSpans.length - 1) {
|
||||
flagsAtt.setFlags(
|
||||
flagsAtt.getFlags()
|
||||
| EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
|
||||
}
|
||||
sentenceAtt.setSentenceIndex(sentenceIndex);
|
||||
++termNum;
|
||||
return true;
|
||||
}
|
||||
|
@ -98,5 +94,6 @@ public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
|
|||
super.reset();
|
||||
termSpans = null;
|
||||
termNum = sentenceStart = 0;
|
||||
sentenceIndex = -1;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.SentenceAttribute;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
||||
/**
|
||||
* Iterate through sentence tokens and cache their attributes. Could consider moving this to a more
|
||||
* central location to be used by other sentence-aware components.
|
||||
*
|
||||
* <p>May want to consider making this its own Filter so that extracted sentence token attributes
|
||||
* can be shared by downstream sentence-aware filters.
|
||||
*/
|
||||
public class SentenceAttributeExtractor {
|
||||
|
||||
private final TokenStream input;
|
||||
private final SentenceAttribute sentenceAtt;
|
||||
private final List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
|
||||
private AttributeSource prevAttributeSource;
|
||||
private int currSentence = 0;
|
||||
private boolean hasNextToken = true;
|
||||
|
||||
public SentenceAttributeExtractor(TokenStream input, SentenceAttribute sentenceAtt) {
|
||||
this.input = input;
|
||||
this.sentenceAtt = sentenceAtt;
|
||||
}
|
||||
|
||||
// If this class were a stand-alone filter it could conceivably extract the attributes once
|
||||
// and cache a reference to those attributes in SentenceAttribute. That way downstream filters
|
||||
// could read the full sentence without having to independently extract it.
|
||||
public List<AttributeSource> extractSentenceAttributes() throws IOException {
|
||||
sentenceTokenAttrs.clear();
|
||||
boolean hasNext;
|
||||
do {
|
||||
hasNextToken = input.incrementToken();
|
||||
int currSentenceTmp = sentenceAtt.getSentenceIndex();
|
||||
hasNext = (currSentence == currSentenceTmp && hasNextToken);
|
||||
currSentence = currSentenceTmp;
|
||||
if (prevAttributeSource != null) {
|
||||
sentenceTokenAttrs.add(prevAttributeSource);
|
||||
}
|
||||
prevAttributeSource = input.cloneAttributes();
|
||||
} while (hasNext);
|
||||
return sentenceTokenAttrs;
|
||||
}
|
||||
|
||||
public List<AttributeSource> getSentenceAttributes() {
|
||||
return sentenceTokenAttrs;
|
||||
}
|
||||
|
||||
public boolean allSentencesProcessed() {
|
||||
return !hasNextToken;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
hasNextToken = true;
|
||||
sentenceTokenAttrs.clear();
|
||||
currSentence = 0;
|
||||
prevAttributeSource = null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
Quick brown fox jumped over the lazy dog. Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
Quick brown fox jumped over the lazy dog.
|
||||
x
|
||||
This should hopefully get analyzed.
|
||||
x
|
||||
And so should this.
|
|
@ -0,0 +1,32 @@
|
|||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
Quick Quick brown brown fox fox jumped jumped over over the the lazy lazy dog dog . .
|
||||
x x
|
||||
This This should should hopefully hopefully get get analyzed analyzed . .
|
||||
x x
|
||||
And And so so should should this this . .
|
|
@ -114,4 +114,16 @@ public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
|
|||
true,
|
||||
toPayloads(SENTENCES_chunks));
|
||||
}
|
||||
|
||||
public void testEmptyField() throws Exception {
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
|
||||
.addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,11 @@
|
|||
|
||||
package org.apache.lucene.analysis.opennlp;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.analysis.custom.CustomAnalyzer;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
|
||||
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
|
||||
|
@ -108,6 +113,10 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
|
|||
"IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."
|
||||
};
|
||||
|
||||
private static final String NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD = "period";
|
||||
|
||||
private static final String[] NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms = {"period", "period"};
|
||||
|
||||
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
|
||||
private static final String sentenceModelFile = "en-test-sent.bin";
|
||||
private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
|
||||
|
@ -290,4 +299,77 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
|
|||
null,
|
||||
true);
|
||||
}
|
||||
|
||||
public void testNoBreakWithRepeatKeywordFilter() throws Exception {
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
|
||||
.build();
|
||||
assertAnalyzesTo(
|
||||
analyzer,
|
||||
NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD,
|
||||
NO_BREAK_SINGLE_TOKEN_REPEAT_KEYWORD_terms,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
true);
|
||||
}
|
||||
|
||||
// checks for bug described in https://github.com/apache/lucene/issues/11771
|
||||
public void testPreventEarlyExit() throws IOException {
|
||||
InputStream earlyExitInput = null;
|
||||
InputStream earlyExitOutput = null;
|
||||
try {
|
||||
ClasspathResourceLoader loader = new ClasspathResourceLoader(getClass());
|
||||
earlyExitInput = loader.openResource("data/early-exit-bug-input.txt");
|
||||
String earlyExitInputText = new String(earlyExitInput.readAllBytes(), StandardCharsets.UTF_8);
|
||||
earlyExitOutput = loader.openResource("data/early-exit-bug-output.txt");
|
||||
String earlyExitOutputText =
|
||||
new String(earlyExitOutput.readAllBytes(), StandardCharsets.UTF_8);
|
||||
String[] earlyExitOutputTexts =
|
||||
Arrays.stream(earlyExitOutputText.split("\\s"))
|
||||
.filter(text -> text != "")
|
||||
.collect(Collectors.joining(" "))
|
||||
.split(" ");
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp",
|
||||
"tokenizerModel",
|
||||
tokenizerModelFile,
|
||||
"sentenceModel",
|
||||
sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
|
||||
.build();
|
||||
assertAnalyzesTo(
|
||||
analyzer, earlyExitInputText, earlyExitOutputTexts, null, null, null, null, null, true);
|
||||
} finally {
|
||||
if (earlyExitInput != null) {
|
||||
earlyExitInput.close();
|
||||
}
|
||||
if (earlyExitOutput != null) {
|
||||
earlyExitOutput.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testEmptyField() throws Exception {
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import org.apache.lucene.analysis.custom.CustomAnalyzer;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
|
||||
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
|
||||
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.util.ClasspathResourceLoader;
|
||||
|
@ -66,6 +67,7 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
|
|||
private static final String[] NO_BREAK_terms = {"No", "period"};
|
||||
private static final int[] NO_BREAK_startOffsets = {0, 3};
|
||||
private static final int[] NO_BREAK_endOffsets = {2, 9};
|
||||
private static final String[] NO_BREAK_KEYWORD_REPEAT_terms = {"No", "No", "period", "period"};
|
||||
|
||||
private static final String sentenceModelFile = "en-test-sent.bin";
|
||||
private static final String tokenizerModelFile = "en-test-tokenizer.bin";
|
||||
|
@ -144,4 +146,26 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
|
|||
null,
|
||||
true);
|
||||
}
|
||||
|
||||
public void testNoBreakWithRepeatKeywordFilter() throws Exception {
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter(KeywordRepeatFilterFactory.class)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(
|
||||
analyzer, NO_BREAK, NO_BREAK_KEYWORD_REPEAT_terms, null, null, null, null, null, true);
|
||||
}
|
||||
|
||||
public void testEmptyField() throws Exception {
|
||||
CustomAnalyzer analyzer =
|
||||
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
|
||||
.withTokenizer(
|
||||
"opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
|
||||
.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
|
||||
.build();
|
||||
assertAnalyzesTo(analyzer, "", new String[0], null, null, null, null, null, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
/**
 * This attribute tracks what sentence a given token belongs to as well as potentially other
 * sentence specific attributes.
 */
public interface SentenceAttribute extends Attribute {

  /**
   * Get the sentence index for the current token
   *
   * @return The index of the sentence
   * @see #setSentenceIndex(int)
   */
  int getSentenceIndex();

  /**
   * Set the sentence of the current token
   *
   * @param sentenceIndex the index of the sentence the current token belongs to
   * @see #getSentenceIndex()
   */
  void setSentenceIndex(int sentenceIndex);
}
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.tokenattributes;
|
||||
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
import org.apache.lucene.util.AttributeReflector;
|
||||
|
||||
/**
|
||||
* Default implementation of {@link SentenceAttribute}.
|
||||
*
|
||||
* <p>The current implementation is coincidentally identical to {@link FlagsAttributeImpl} It was
|
||||
* decided to keep it separate because this attribute will NOT be an implied bitmap. Also, this
|
||||
* class may hold other sentence specific data in the future.
|
||||
*/
|
||||
public class SentenceAttributeImpl extends AttributeImpl implements SentenceAttribute {
|
||||
|
||||
private int index = 0;
|
||||
|
||||
/** Initialize this attribute to default */
|
||||
public SentenceAttributeImpl() {}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
index = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof SentenceAttributeImpl) {
|
||||
return ((SentenceAttributeImpl) other).index == index;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return index;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
SentenceAttribute t = (SentenceAttribute) target;
|
||||
t.setSentenceIndex(index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reflectWith(AttributeReflector reflector) {
|
||||
reflector.reflect(SentenceAttribute.class, "sentences", index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getSentenceIndex() {
|
||||
return index;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setSentenceIndex(int sentence) {
|
||||
this.index = sentence;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue