mirror of https://github.com/apache/lucene.git
LUCENE-5259: convert analysis consumers to try-with-resources
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1529770 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9c98f9d958
commit
64a795b6e3
|
@ -307,30 +307,30 @@ public class SynonymMap {
|
|||
* separates by {@link SynonymMap#WORD_SEPARATOR}.
|
||||
* reuse and its chars must not be null. */
|
||||
public CharsRef analyze(String text, CharsRef reuse) throws IOException {
|
||||
TokenStream ts = analyzer.tokenStream("", text);
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
ts.reset();
|
||||
reuse.length = 0;
|
||||
while (ts.incrementToken()) {
|
||||
int length = termAtt.length();
|
||||
if (length == 0) {
|
||||
throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
|
||||
try (TokenStream ts = analyzer.tokenStream("", text)) {
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
ts.reset();
|
||||
reuse.length = 0;
|
||||
while (ts.incrementToken()) {
|
||||
int length = termAtt.length();
|
||||
if (length == 0) {
|
||||
throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
|
||||
}
|
||||
if (posIncAtt.getPositionIncrement() != 1) {
|
||||
throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
|
||||
}
|
||||
reuse.grow(reuse.length + length + 1); /* current + word + separator */
|
||||
int end = reuse.offset + reuse.length;
|
||||
if (reuse.length > 0) {
|
||||
reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
|
||||
reuse.length++;
|
||||
}
|
||||
System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
|
||||
reuse.length += length;
|
||||
}
|
||||
if (posIncAtt.getPositionIncrement() != 1) {
|
||||
throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
|
||||
}
|
||||
reuse.grow(reuse.length + length + 1); /* current + word + separator */
|
||||
int end = reuse.offset + reuse.length;
|
||||
if (reuse.length > 0) {
|
||||
reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
|
||||
reuse.length++;
|
||||
}
|
||||
System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
|
||||
reuse.length += length;
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
if (reuse.length == 0) {
|
||||
throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
|
||||
}
|
||||
|
|
|
@ -117,12 +117,15 @@ public class TestKeywordAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
// LUCENE-1441
|
||||
public void testOffsets() throws Exception {
|
||||
TokenStream stream = new KeywordAnalyzer().tokenStream("field", new StringReader("abcd"));
|
||||
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
|
||||
stream.reset();
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals(0, offsetAtt.startOffset());
|
||||
assertEquals(4, offsetAtt.endOffset());
|
||||
try (TokenStream stream = new KeywordAnalyzer().tokenStream("field", new StringReader("abcd"))) {
|
||||
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
|
||||
stream.reset();
|
||||
assertTrue(stream.incrementToken());
|
||||
assertEquals(0, offsetAtt.startOffset());
|
||||
assertEquals(4, offsetAtt.endOffset());
|
||||
assertFalse(stream.incrementToken());
|
||||
stream.end();
|
||||
}
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
|
|
|
@ -46,27 +46,31 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testDefaults() throws IOException {
|
||||
assertTrue(stop != null);
|
||||
TokenStream stream = stop.tokenStream("test", "This is a test of the english stop analyzer");
|
||||
assertTrue(stream != null);
|
||||
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
|
||||
stream.reset();
|
||||
try (TokenStream stream = stop.tokenStream("test", "This is a test of the english stop analyzer")) {
|
||||
assertTrue(stream != null);
|
||||
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
|
||||
stream.reset();
|
||||
|
||||
while (stream.incrementToken()) {
|
||||
assertFalse(inValidTokens.contains(termAtt.toString()));
|
||||
while (stream.incrementToken()) {
|
||||
assertFalse(inValidTokens.contains(termAtt.toString()));
|
||||
}
|
||||
stream.end();
|
||||
}
|
||||
}
|
||||
|
||||
public void testStopList() throws IOException {
|
||||
CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
|
||||
StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
|
||||
TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer");
|
||||
assertNotNull(stream);
|
||||
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
|
||||
try (TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
|
||||
assertNotNull(stream);
|
||||
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
|
||||
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
String text = termAtt.toString();
|
||||
assertFalse(stopWordsSet.contains(text));
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
String text = termAtt.toString();
|
||||
assertFalse(stopWordsSet.contains(text));
|
||||
}
|
||||
stream.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -75,17 +79,19 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
|
|||
StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet);
|
||||
String s = "This is a good test of the english stop analyzer with positions";
|
||||
int expectedIncr[] = { 1, 1, 1, 3, 1, 1, 1, 2, 1};
|
||||
TokenStream stream = newStop.tokenStream("test", s);
|
||||
assertNotNull(stream);
|
||||
int i = 0;
|
||||
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
|
||||
try (TokenStream stream = newStop.tokenStream("test", s)) {
|
||||
assertNotNull(stream);
|
||||
int i = 0;
|
||||
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
String text = termAtt.toString();
|
||||
assertFalse(stopWordsSet.contains(text));
|
||||
assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
String text = termAtt.toString();
|
||||
assertFalse(stopWordsSet.contains(text));
|
||||
assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
|
||||
}
|
||||
stream.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -37,23 +37,29 @@ public class TestPerFieldAnalyzerWrapper extends BaseTokenStreamTestCase {
|
|||
PerFieldAnalyzerWrapper analyzer =
|
||||
new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), analyzerPerField);
|
||||
|
||||
TokenStream tokenStream = analyzer.tokenStream("field", text);
|
||||
CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
tokenStream.reset();
|
||||
try (TokenStream tokenStream = analyzer.tokenStream("field", text)) {
|
||||
CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
tokenStream.reset();
|
||||
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("WhitespaceAnalyzer does not lowercase",
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("WhitespaceAnalyzer does not lowercase",
|
||||
"Qwerty",
|
||||
termAtt.toString());
|
||||
assertFalse(tokenStream.incrementToken());
|
||||
tokenStream.end();
|
||||
}
|
||||
|
||||
tokenStream = analyzer.tokenStream("special", text);
|
||||
termAtt = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
tokenStream.reset();
|
||||
try (TokenStream tokenStream = analyzer.tokenStream("special", text)) {
|
||||
CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
|
||||
tokenStream.reset();
|
||||
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("SimpleAnalyzer lowercases",
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("SimpleAnalyzer lowercases",
|
||||
"qwerty",
|
||||
termAtt.toString());
|
||||
assertFalse(tokenStream.incrementToken());
|
||||
tokenStream.end();
|
||||
}
|
||||
}
|
||||
|
||||
public void testCharFilters() throws Exception {
|
||||
|
|
|
@ -95,17 +95,19 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
|
||||
PhraseQuery q = new PhraseQuery();
|
||||
|
||||
TokenStream ts = analyzer.tokenStream("content", "this sentence");
|
||||
int j = -1;
|
||||
try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) {
|
||||
int j = -1;
|
||||
|
||||
PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
j += posIncrAtt.getPositionIncrement();
|
||||
String termText = termAtt.toString();
|
||||
q.add(new Term("content", termText), j);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
j += posIncrAtt.getPositionIncrement();
|
||||
String termText = termAtt.toString();
|
||||
q.add(new Term("content", termText), j);
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
|
||||
ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
|
||||
|
@ -121,16 +123,16 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
|
|||
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
|
||||
BooleanQuery q = new BooleanQuery();
|
||||
|
||||
TokenStream ts = analyzer.tokenStream("content", "test sentence");
|
||||
try (TokenStream ts = analyzer.tokenStream("content", "test sentence")) {
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
|
||||
ts.reset();
|
||||
|
||||
while (ts.incrementToken()) {
|
||||
String termText = termAtt.toString();
|
||||
q.add(new TermQuery(new Term("content", termText)),
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
String termText = termAtt.toString();
|
||||
q.add(new TermQuery(new Term("content", termText)),
|
||||
BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
|
||||
ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
|
||||
|
|
|
@ -123,18 +123,18 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
|
|||
int num = 1000 * RANDOM_MULTIPLIER;
|
||||
for (int i = 0; i < num; i++) {
|
||||
String s = _TestUtil.randomUnicodeString(random());
|
||||
TokenStream ts = analyzer.tokenStream("foo", s);
|
||||
ts.reset();
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
while (ts.incrementToken()) {
|
||||
String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
|
||||
for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
|
||||
cp = highlightedText.codePointAt(j);
|
||||
assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
|
||||
try (TokenStream ts = analyzer.tokenStream("foo", s)) {
|
||||
ts.reset();
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
while (ts.incrementToken()) {
|
||||
String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
|
||||
for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
|
||||
cp = highlightedText.codePointAt(j);
|
||||
assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
// just for fun
|
||||
checkRandomData(random(), analyzer, num);
|
||||
|
@ -161,18 +161,18 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
|
|||
int num = 1000 * RANDOM_MULTIPLIER;
|
||||
for (int i = 0; i < num; i++) {
|
||||
String s = _TestUtil.randomUnicodeString(random());
|
||||
TokenStream ts = analyzer.tokenStream("foo", s);
|
||||
ts.reset();
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
while (ts.incrementToken()) {
|
||||
String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
|
||||
for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
|
||||
cp = highlightedText.codePointAt(j);
|
||||
assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
|
||||
try (TokenStream ts = analyzer.tokenStream("foo", s)) {
|
||||
ts.reset();
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
while (ts.incrementToken()) {
|
||||
String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
|
||||
for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
|
||||
cp = highlightedText.codePointAt(j);
|
||||
assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
// just for fun
|
||||
checkRandomData(random(), analyzer, num);
|
||||
|
|
|
@ -249,16 +249,16 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testTokenAttributes() throws Exception {
|
||||
TokenStream ts = a.tokenStream("dummy", "This is a test");
|
||||
ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
assertEquals(UScript.LATIN, scriptAtt.getCode());
|
||||
assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
|
||||
assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
|
||||
assertTrue(ts.reflectAsString(false).contains("script=Latin"));
|
||||
try (TokenStream ts = a.tokenStream("dummy", "This is a test")) {
|
||||
ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
assertEquals(UScript.LATIN, scriptAtt.getCode());
|
||||
assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
|
||||
assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
|
||||
assertTrue(ts.reflectAsString(false).contains("script=Latin"));
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,14 +53,14 @@ public class TestExtendedMode extends BaseTokenStreamTestCase {
|
|||
int numIterations = atLeast(1000);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
String s = _TestUtil.randomUnicodeString(random(), 100);
|
||||
TokenStream ts = analyzer.tokenStream("foo", s);
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
assertTrue(UnicodeUtil.validUTF16String(termAtt));
|
||||
try (TokenStream ts = analyzer.tokenStream("foo", s)) {
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
assertTrue(UnicodeUtil.validUTF16String(termAtt));
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -141,13 +141,13 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
* ideally the test would actually fail instead of hanging...
|
||||
*/
|
||||
public void testDecomposition5() throws Exception {
|
||||
TokenStream ts = analyzer.tokenStream("bogus", "くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ");
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
try (TokenStream ts = analyzer.tokenStream("bogus", "くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ")) {
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -213,12 +213,12 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
public void testLargeDocReliability() throws Exception {
|
||||
for (int i = 0; i < 100; i++) {
|
||||
String s = _TestUtil.randomUnicodeString(random(), 10000);
|
||||
TokenStream ts = analyzer.tokenStream("foo", s);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
try (TokenStream ts = analyzer.tokenStream("foo", s)) {
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -236,29 +236,31 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
System.out.println("\nTEST: iter=" + i);
|
||||
}
|
||||
String s = _TestUtil.randomUnicodeString(random(), 100);
|
||||
TokenStream ts = analyzer.tokenStream("foo", s);
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
assertTrue(UnicodeUtil.validUTF16String(termAtt));
|
||||
try (TokenStream ts = analyzer.tokenStream("foo", s)) {
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
assertTrue(UnicodeUtil.validUTF16String(termAtt));
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testOnlyPunctuation() throws IOException {
|
||||
TokenStream ts = analyzerNoPunct.tokenStream("foo", "。、。。");
|
||||
ts.reset();
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
try (TokenStream ts = analyzerNoPunct.tokenStream("foo", "。、。。")) {
|
||||
ts.reset();
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
}
|
||||
|
||||
public void testOnlyPunctuationExtended() throws IOException {
|
||||
TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", "......");
|
||||
ts.reset();
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
try (TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", "......")) {
|
||||
ts.reset();
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
}
|
||||
|
||||
// note: test is kinda silly since kuromoji emits punctuation tokens.
|
||||
|
@ -369,75 +371,81 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
private void assertReadings(String input, String... readings) throws IOException {
|
||||
TokenStream ts = analyzer.tokenStream("ignored", input);
|
||||
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
|
||||
ts.reset();
|
||||
for(String reading : readings) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(reading, readingAtt.getReading());
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
|
||||
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
|
||||
ts.reset();
|
||||
for(String reading : readings) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(reading, readingAtt.getReading());
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
|
||||
private void assertPronunciations(String input, String... pronunciations) throws IOException {
|
||||
TokenStream ts = analyzer.tokenStream("ignored", input);
|
||||
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
|
||||
ts.reset();
|
||||
for(String pronunciation : pronunciations) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(pronunciation, readingAtt.getPronunciation());
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
|
||||
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
|
||||
ts.reset();
|
||||
for(String pronunciation : pronunciations) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(pronunciation, readingAtt.getPronunciation());
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
|
||||
private void assertBaseForms(String input, String... baseForms) throws IOException {
|
||||
TokenStream ts = analyzer.tokenStream("ignored", input);
|
||||
BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
|
||||
ts.reset();
|
||||
for(String baseForm : baseForms) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(baseForm, baseFormAtt.getBaseForm());
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
|
||||
BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
|
||||
ts.reset();
|
||||
for(String baseForm : baseForms) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(baseForm, baseFormAtt.getBaseForm());
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
|
||||
private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
|
||||
TokenStream ts = analyzer.tokenStream("ignored", input);
|
||||
InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
|
||||
ts.reset();
|
||||
for(String inflectionType : inflectionTypes) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(inflectionType, inflectionAtt.getInflectionType());
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
|
||||
InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
|
||||
ts.reset();
|
||||
for(String inflectionType : inflectionTypes) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(inflectionType, inflectionAtt.getInflectionType());
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
|
||||
private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
|
||||
TokenStream ts = analyzer.tokenStream("ignored", input);
|
||||
InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
|
||||
ts.reset();
|
||||
for(String inflectionForm : inflectionForms) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(inflectionForm, inflectionAtt.getInflectionForm());
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
|
||||
InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
|
||||
ts.reset();
|
||||
for(String inflectionForm : inflectionForms) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(inflectionForm, inflectionAtt.getInflectionForm());
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
|
||||
private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
|
||||
TokenStream ts = analyzer.tokenStream("ignored", input);
|
||||
PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
|
||||
ts.reset();
|
||||
for(String partOfSpeech : partsOfSpeech) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(partOfSpeech, partOfSpeechAtt.getPartOfSpeech());
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
|
||||
PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
|
||||
ts.reset();
|
||||
for(String partOfSpeech : partsOfSpeech) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(partOfSpeech, partOfSpeechAtt.getPartOfSpeech());
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
|
||||
public void testReadings() throws Exception {
|
||||
|
@ -631,11 +639,11 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
|
||||
long totalStart = System.currentTimeMillis();
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
final TokenStream ts = analyzer.tokenStream("ignored", line);
|
||||
ts.reset();
|
||||
while(ts.incrementToken());
|
||||
ts.end();
|
||||
ts.close();
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", line)) {
|
||||
ts.reset();
|
||||
while(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
}
|
||||
String[] sentences = line.split("、|。");
|
||||
if (VERBOSE) {
|
||||
|
@ -645,11 +653,11 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
totalStart = System.currentTimeMillis();
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
for (String sentence: sentences) {
|
||||
final TokenStream ts = analyzer.tokenStream("ignored", sentence);
|
||||
ts.reset();
|
||||
while(ts.incrementToken());
|
||||
ts.end();
|
||||
ts.close();
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", sentence)) {
|
||||
ts.reset();
|
||||
while(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (VERBOSE) {
|
||||
|
|
|
@ -72,34 +72,36 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
@SuppressWarnings("unused")
|
||||
private void dumpTokens(String input) throws IOException {
|
||||
TokenStream ts = getTestAnalyzer().tokenStream("dummy", input);
|
||||
ts.reset();
|
||||
try (TokenStream ts = getTestAnalyzer().tokenStream("dummy", input)) {
|
||||
ts.reset();
|
||||
|
||||
MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class);
|
||||
CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class);
|
||||
while (ts.incrementToken()) {
|
||||
System.out.println(charTerm.toString() + " => " + attribute.getTags());
|
||||
MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class);
|
||||
CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class);
|
||||
while (ts.incrementToken()) {
|
||||
System.out.println(charTerm.toString() + " => " + attribute.getTags());
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
}
|
||||
|
||||
/** Test reuse of MorfologikFilter with leftover stems. */
|
||||
public final void testLeftoverStems() throws IOException {
|
||||
Analyzer a = getTestAnalyzer();
|
||||
TokenStream ts_1 = a.tokenStream("dummy", "liście");
|
||||
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
|
||||
ts_1.reset();
|
||||
ts_1.incrementToken();
|
||||
assertEquals("first stream", "liście", termAtt_1.toString());
|
||||
ts_1.end();
|
||||
ts_1.close();
|
||||
try (TokenStream ts_1 = a.tokenStream("dummy", "liście")) {
|
||||
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
|
||||
ts_1.reset();
|
||||
ts_1.incrementToken();
|
||||
assertEquals("first stream", "liście", termAtt_1.toString());
|
||||
ts_1.end();
|
||||
}
|
||||
|
||||
TokenStream ts_2 = a.tokenStream("dummy", "danych");
|
||||
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
|
||||
ts_2.reset();
|
||||
ts_2.incrementToken();
|
||||
assertEquals("second stream", "dany", termAtt_2.toString());
|
||||
ts_2.end();
|
||||
ts_2.close();
|
||||
try (TokenStream ts_2 = a.tokenStream("dummy", "danych")) {
|
||||
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
|
||||
ts_2.reset();
|
||||
ts_2.incrementToken();
|
||||
assertEquals("second stream", "dany", termAtt_2.toString());
|
||||
ts_2.end();
|
||||
}
|
||||
}
|
||||
|
||||
/** Test stemming of mixed-case tokens. */
|
||||
|
@ -140,28 +142,27 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
|
|||
|
||||
/** Test morphosyntactic annotations. */
|
||||
public final void testPOSAttribute() throws IOException {
|
||||
TokenStream ts = getTestAnalyzer().tokenStream("dummy", "liście");
|
||||
|
||||
ts.reset();
|
||||
assertPOSToken(ts, "liście",
|
||||
try (TokenStream ts = getTestAnalyzer().tokenStream("dummy", "liście")) {
|
||||
ts.reset();
|
||||
assertPOSToken(ts, "liście",
|
||||
"subst:sg:acc:n2",
|
||||
"subst:sg:nom:n2",
|
||||
"subst:sg:voc:n2");
|
||||
|
||||
assertPOSToken(ts, "liść",
|
||||
assertPOSToken(ts, "liść",
|
||||
"subst:pl:acc:m3",
|
||||
"subst:pl:nom:m3",
|
||||
"subst:pl:voc:m3");
|
||||
|
||||
assertPOSToken(ts, "list",
|
||||
assertPOSToken(ts, "list",
|
||||
"subst:sg:loc:m3",
|
||||
"subst:sg:voc:m3");
|
||||
|
||||
assertPOSToken(ts, "lista",
|
||||
assertPOSToken(ts, "lista",
|
||||
"subst:sg:dat:f",
|
||||
"subst:sg:loc:f");
|
||||
ts.end();
|
||||
ts.close();
|
||||
ts.end();
|
||||
}
|
||||
}
|
||||
|
||||
/** */
|
||||
|
|
|
@ -184,9 +184,11 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
|||
sb.append("我购买了道具和服装。");
|
||||
}
|
||||
Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
|
||||
TokenStream stream = analyzer.tokenStream("", sb.toString());
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
try (TokenStream stream = analyzer.tokenStream("", sb.toString())) {
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
}
|
||||
stream.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -197,9 +199,11 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
|
|||
sb.append("我购买了道具和服装");
|
||||
}
|
||||
Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
|
||||
TokenStream stream = analyzer.tokenStream("", sb.toString());
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
try (TokenStream stream = analyzer.tokenStream("", sb.toString())) {
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
}
|
||||
stream.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -91,20 +91,19 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
|
|||
throw new IOException("You must first call Classifier#train");
|
||||
}
|
||||
Long output = 0l;
|
||||
TokenStream tokenStream = analyzer.tokenStream(textFieldName,
|
||||
new StringReader(text));
|
||||
CharTermAttribute charTermAttribute = tokenStream
|
||||
try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
|
||||
CharTermAttribute charTermAttribute = tokenStream
|
||||
.addAttribute(CharTermAttribute.class);
|
||||
tokenStream.reset();
|
||||
while (tokenStream.incrementToken()) {
|
||||
String s = charTermAttribute.toString();
|
||||
Long d = Util.get(fst, new BytesRef(s));
|
||||
if (d != null) {
|
||||
output += d;
|
||||
tokenStream.reset();
|
||||
while (tokenStream.incrementToken()) {
|
||||
String s = charTermAttribute.toString();
|
||||
Long d = Util.get(fst, new BytesRef(s));
|
||||
if (d != null) {
|
||||
output += d;
|
||||
}
|
||||
}
|
||||
tokenStream.end();
|
||||
}
|
||||
tokenStream.end();
|
||||
tokenStream.close();
|
||||
|
||||
return new ClassificationResult<>(output >= threshold, output.doubleValue());
|
||||
}
|
||||
|
|
|
@ -85,14 +85,14 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
|
|||
|
||||
private String[] tokenizeDoc(String doc) throws IOException {
|
||||
Collection<String> result = new LinkedList<String>();
|
||||
TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc);
|
||||
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
|
||||
tokenStream.reset();
|
||||
while (tokenStream.incrementToken()) {
|
||||
result.add(charTermAttribute.toString());
|
||||
try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc)) {
|
||||
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
|
||||
tokenStream.reset();
|
||||
while (tokenStream.incrementToken()) {
|
||||
result.add(charTermAttribute.toString());
|
||||
}
|
||||
tokenStream.end();
|
||||
}
|
||||
tokenStream.end();
|
||||
tokenStream.close();
|
||||
return result.toArray(new String[result.size()]);
|
||||
}
|
||||
|
||||
|
|
|
@ -92,13 +92,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
|
|||
fieldState.position += analyzed ? docState.analyzer.getPositionIncrementGap(fieldInfo.name) : 0;
|
||||
}
|
||||
|
||||
final TokenStream stream = field.tokenStream(docState.analyzer);
|
||||
// reset the TokenStream to the first token
|
||||
stream.reset();
|
||||
|
||||
boolean success2 = false;
|
||||
|
||||
try {
|
||||
try (TokenStream stream = field.tokenStream(docState.analyzer)) {
|
||||
// reset the TokenStream to the first token
|
||||
stream.reset();
|
||||
boolean hasMoreTokens = stream.incrementToken();
|
||||
|
||||
fieldState.attributeSource = stream;
|
||||
|
@ -179,13 +175,6 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
|
|||
// when we come back around to the field...
|
||||
fieldState.position += posIncrAttribute.getPositionIncrement();
|
||||
fieldState.offset += offsetAttribute.endOffset();
|
||||
success2 = true;
|
||||
} finally {
|
||||
if (!success2) {
|
||||
IOUtils.closeWhileHandlingException(stream);
|
||||
} else {
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
|
||||
fieldState.offset += analyzed ? docState.analyzer.getOffsetGap(fieldInfo.name) : 0;
|
||||
|
|
|
@ -98,13 +98,13 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
|
|||
String testString = "t";
|
||||
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
TokenStream stream = analyzer.tokenStream("dummy", testString);
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
// consume
|
||||
try (TokenStream stream = analyzer.tokenStream("dummy", testString)) {
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
// consume
|
||||
}
|
||||
stream.end();
|
||||
}
|
||||
stream.end();
|
||||
stream.close();
|
||||
|
||||
assertAnalyzesTo(analyzer, testString, new String[] { "t" });
|
||||
}
|
||||
|
@ -121,13 +121,13 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
|
|||
StringReader reader = new StringReader(s);
|
||||
MockCharFilter charfilter = new MockCharFilter(reader, 2);
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
TokenStream ts = analyzer.tokenStream("bogus", charfilter);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
;
|
||||
try (TokenStream ts = analyzer.tokenStream("bogus", charfilter)) {
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
;
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -47,30 +47,29 @@ public class TestLongPostings extends LuceneTestCase {
|
|||
if (other != null && s.equals(other)) {
|
||||
continue;
|
||||
}
|
||||
final TokenStream ts = a.tokenStream("foo", s);
|
||||
final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
|
||||
final BytesRef termBytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
try (TokenStream ts = a.tokenStream("foo", s)) {
|
||||
final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
|
||||
final BytesRef termBytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
|
||||
int count = 0;
|
||||
boolean changed = false;
|
||||
int count = 0;
|
||||
boolean changed = false;
|
||||
|
||||
while(ts.incrementToken()) {
|
||||
termAtt.fillBytesRef();
|
||||
if (count == 0 && !termBytes.utf8ToString().equals(s)) {
|
||||
// The value was changed during analysis. Keep iterating so the
|
||||
// tokenStream is exhausted.
|
||||
changed = true;
|
||||
while(ts.incrementToken()) {
|
||||
termAtt.fillBytesRef();
|
||||
if (count == 0 && !termBytes.utf8ToString().equals(s)) {
|
||||
// The value was changed during analysis. Keep iterating so the
|
||||
// tokenStream is exhausted.
|
||||
changed = true;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
|
||||
ts.end();
|
||||
ts.close();
|
||||
|
||||
// Did we iterate just once and the value was unchanged?
|
||||
if (!changed && count == 1) {
|
||||
return s;
|
||||
ts.end();
|
||||
// Did we iterate just once and the value was unchanged?
|
||||
if (!changed && count == 1) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -174,17 +174,18 @@ public class TestTermVectorsWriter extends LuceneTestCase {
|
|||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer));
|
||||
Document doc = new Document();
|
||||
TokenStream stream = analyzer.tokenStream("field", "abcd ");
|
||||
stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
|
||||
stream = new CachingTokenFilter(stream);
|
||||
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||
customType.setStoreTermVectors(true);
|
||||
customType.setStoreTermVectorPositions(true);
|
||||
customType.setStoreTermVectorOffsets(true);
|
||||
Field f = new Field("field", stream, customType);
|
||||
doc.add(f);
|
||||
doc.add(f);
|
||||
w.addDocument(doc);
|
||||
try (TokenStream stream = analyzer.tokenStream("field", "abcd ")) {
|
||||
stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
|
||||
TokenStream cachedStream = new CachingTokenFilter(stream);
|
||||
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
|
||||
customType.setStoreTermVectors(true);
|
||||
customType.setStoreTermVectorPositions(true);
|
||||
customType.setStoreTermVectorOffsets(true);
|
||||
Field f = new Field("field", cachedStream, customType);
|
||||
doc.add(f);
|
||||
doc.add(f);
|
||||
w.addDocument(doc);
|
||||
}
|
||||
w.close();
|
||||
|
||||
IndexReader r = DirectoryReader.open(dir);
|
||||
|
|
|
@ -617,16 +617,16 @@ public class TestPhraseQuery extends LuceneTestCase {
|
|||
break;
|
||||
}
|
||||
}
|
||||
TokenStream ts = analyzer.tokenStream("ignore", term);
|
||||
CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while(ts.incrementToken()) {
|
||||
String text = termAttr.toString();
|
||||
doc.add(text);
|
||||
sb.append(text).append(' ');
|
||||
try (TokenStream ts = analyzer.tokenStream("ignore", term)) {
|
||||
CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while(ts.incrementToken()) {
|
||||
String text = termAttr.toString();
|
||||
doc.add(text);
|
||||
sb.append(text).append(' ');
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
} else {
|
||||
// pick existing sub-phrase
|
||||
List<String> lastDoc = docs.get(r.nextInt(docs.size()));
|
||||
|
|
|
@ -170,21 +170,21 @@ public abstract class AbstractTestCase extends LuceneTestCase {
|
|||
protected List<BytesRef> analyze(String text, String field, Analyzer analyzer) throws IOException {
|
||||
List<BytesRef> bytesRefs = new ArrayList<BytesRef>();
|
||||
|
||||
TokenStream tokenStream = analyzer.tokenStream(field, text);
|
||||
TermToBytesRefAttribute termAttribute = tokenStream.getAttribute(TermToBytesRefAttribute.class);
|
||||
try (TokenStream tokenStream = analyzer.tokenStream(field, text)) {
|
||||
TermToBytesRefAttribute termAttribute = tokenStream.getAttribute(TermToBytesRefAttribute.class);
|
||||
|
||||
BytesRef bytesRef = termAttribute.getBytesRef();
|
||||
BytesRef bytesRef = termAttribute.getBytesRef();
|
||||
|
||||
tokenStream.reset();
|
||||
tokenStream.reset();
|
||||
|
||||
while (tokenStream.incrementToken()) {
|
||||
termAttribute.fillBytesRef();
|
||||
bytesRefs.add(BytesRef.deepCopyOf(bytesRef));
|
||||
while (tokenStream.incrementToken()) {
|
||||
termAttribute.fillBytesRef();
|
||||
bytesRefs.add(BytesRef.deepCopyOf(bytesRef));
|
||||
}
|
||||
|
||||
tokenStream.end();
|
||||
}
|
||||
|
||||
tokenStream.end();
|
||||
tokenStream.close();
|
||||
|
||||
return bytesRefs;
|
||||
}
|
||||
|
||||
|
|
|
@ -777,31 +777,31 @@ public final class MoreLikeThis {
|
|||
throw new UnsupportedOperationException("To use MoreLikeThis without " +
|
||||
"term vectors, you must provide an Analyzer");
|
||||
}
|
||||
TokenStream ts = analyzer.tokenStream(fieldName, r);
|
||||
int tokenCount = 0;
|
||||
// for every token
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
String word = termAtt.toString();
|
||||
tokenCount++;
|
||||
if (tokenCount > maxNumTokensParsed) {
|
||||
break;
|
||||
}
|
||||
if (isNoiseWord(word)) {
|
||||
continue;
|
||||
}
|
||||
try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
|
||||
int tokenCount = 0;
|
||||
// for every token
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
String word = termAtt.toString();
|
||||
tokenCount++;
|
||||
if (tokenCount > maxNumTokensParsed) {
|
||||
break;
|
||||
}
|
||||
if (isNoiseWord(word)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// increment frequency
|
||||
Int cnt = termFreqMap.get(word);
|
||||
if (cnt == null) {
|
||||
termFreqMap.put(word, new Int());
|
||||
} else {
|
||||
cnt.x++;
|
||||
// increment frequency
|
||||
Int cnt = termFreqMap.get(word);
|
||||
if (cnt == null) {
|
||||
termFreqMap.put(word, new Int());
|
||||
} else {
|
||||
cnt.x++;
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -162,9 +162,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
|
|||
*/
|
||||
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException{
|
||||
String analyzed = null;
|
||||
TokenStream stream = null;
|
||||
try{
|
||||
stream = getAnalyzer().tokenStream(field, chunk);
|
||||
try (TokenStream stream = getAnalyzer().tokenStream(field, chunk)) {
|
||||
stream.reset();
|
||||
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
|
||||
// get first and hopefully only output token
|
||||
|
@ -186,7 +184,6 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
|
|||
multipleOutputs.append('"');
|
||||
}
|
||||
stream.end();
|
||||
stream.close();
|
||||
if (null != multipleOutputs) {
|
||||
throw new ParseException(
|
||||
String.format(getLocale(),
|
||||
|
@ -196,7 +193,6 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
|
|||
// nothing returned by analyzer. Was it a stop word and the user accidentally
|
||||
// used an analyzer with stop words?
|
||||
stream.end();
|
||||
stream.close();
|
||||
throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
|
||||
}
|
||||
} catch (IOException e){
|
||||
|
|
|
@ -497,64 +497,52 @@ public abstract class QueryParserBase implements CommonQueryParserConfiguration
|
|||
protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted) throws ParseException {
|
||||
// Use the analyzer to get all the tokens, and then build a TermQuery,
|
||||
// PhraseQuery, or nothing based on the term count
|
||||
|
||||
TokenStream source;
|
||||
try {
|
||||
source = analyzer.tokenStream(field, queryText);
|
||||
source.reset();
|
||||
} catch (IOException e) {
|
||||
ParseException p = new ParseException("Unable to initialize TokenStream to analyze query text");
|
||||
p.initCause(e);
|
||||
throw p;
|
||||
}
|
||||
CachingTokenFilter buffer = new CachingTokenFilter(source);
|
||||
CachingTokenFilter buffer = null;
|
||||
TermToBytesRefAttribute termAtt = null;
|
||||
PositionIncrementAttribute posIncrAtt = null;
|
||||
int numTokens = 0;
|
||||
|
||||
buffer.reset();
|
||||
|
||||
if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
|
||||
termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
|
||||
}
|
||||
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
|
||||
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
int positionCount = 0;
|
||||
boolean severalTokensAtSamePosition = false;
|
||||
|
||||
boolean hasMoreTokens = false;
|
||||
if (termAtt != null) {
|
||||
try {
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
while (hasMoreTokens) {
|
||||
numTokens++;
|
||||
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
|
||||
if (positionIncrement != 0) {
|
||||
positionCount += positionIncrement;
|
||||
} else {
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
try {
|
||||
// rewind the buffer stream
|
||||
|
||||
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
|
||||
source.reset();
|
||||
buffer = new CachingTokenFilter(source);
|
||||
buffer.reset();
|
||||
|
||||
// close original stream - all tokens buffered
|
||||
source.close();
|
||||
}
|
||||
catch (IOException e) {
|
||||
ParseException p = new ParseException("Cannot close TokenStream analyzing query text");
|
||||
if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
|
||||
termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
|
||||
}
|
||||
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
|
||||
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
if (termAtt != null) {
|
||||
try {
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
while (hasMoreTokens) {
|
||||
numTokens++;
|
||||
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
|
||||
if (positionIncrement != 0) {
|
||||
positionCount += positionIncrement;
|
||||
} else {
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
ParseException p = new ParseException("Eror analyzing query text");
|
||||
p.initCause(e);
|
||||
throw p;
|
||||
}
|
||||
|
||||
// rewind the buffer stream
|
||||
buffer.reset();
|
||||
|
||||
BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
|
||||
|
||||
if (numTokens == 0)
|
||||
|
@ -839,38 +827,24 @@ public abstract class QueryParserBase implements CommonQueryParserConfiguration
|
|||
}
|
||||
|
||||
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
|
||||
TokenStream source;
|
||||
|
||||
if (analyzerIn == null) analyzerIn = analyzer;
|
||||
|
||||
try {
|
||||
source = analyzerIn.tokenStream(field, part);
|
||||
try (TokenStream source = analyzerIn.tokenStream(field, part)) {
|
||||
source.reset();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
|
||||
}
|
||||
|
||||
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
|
||||
try {
|
||||
if (!source.incrementToken())
|
||||
throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
|
||||
termAtt.fillBytesRef();
|
||||
if (source.incrementToken())
|
||||
throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("error analyzing range part: " + part, e);
|
||||
}
|
||||
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
return BytesRef.deepCopyOf(bytes);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
|
||||
throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
|
||||
}
|
||||
|
||||
return BytesRef.deepCopyOf(bytes);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -113,52 +113,44 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
|
|||
String text = fieldNode.getTextAsString();
|
||||
String field = fieldNode.getFieldAsString();
|
||||
|
||||
TokenStream source;
|
||||
try {
|
||||
source = this.analyzer.tokenStream(field, text);
|
||||
source.reset();
|
||||
} catch (IOException e1) {
|
||||
throw new RuntimeException(e1);
|
||||
}
|
||||
CachingTokenFilter buffer = new CachingTokenFilter(source);
|
||||
|
||||
CachingTokenFilter buffer = null;
|
||||
PositionIncrementAttribute posIncrAtt = null;
|
||||
int numTokens = 0;
|
||||
int positionCount = 0;
|
||||
boolean severalTokensAtSamePosition = false;
|
||||
|
||||
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
|
||||
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
try {
|
||||
|
||||
while (buffer.incrementToken()) {
|
||||
numTokens++;
|
||||
int positionIncrement = (posIncrAtt != null) ? posIncrAtt
|
||||
.getPositionIncrement() : 1;
|
||||
if (positionIncrement != 0) {
|
||||
positionCount += positionIncrement;
|
||||
|
||||
} else {
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
try (TokenStream source = this.analyzer.tokenStream(field, text)) {
|
||||
source.reset();
|
||||
buffer = new CachingTokenFilter(source);
|
||||
|
||||
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
|
||||
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
try {
|
||||
|
||||
while (buffer.incrementToken()) {
|
||||
numTokens++;
|
||||
int positionIncrement = (posIncrAtt != null) ? posIncrAtt
|
||||
.getPositionIncrement() : 1;
|
||||
if (positionIncrement != 0) {
|
||||
positionCount += positionIncrement;
|
||||
|
||||
} else {
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
try {
|
||||
// rewind the buffer stream
|
||||
buffer.reset();
|
||||
|
||||
// close original stream - all tokens buffered
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
// rewind the buffer stream
|
||||
buffer.reset();
|
||||
|
||||
if (!buffer.hasAttribute(CharTermAttribute.class)) {
|
||||
return new NoTokenFoundQueryNode();
|
||||
|
|
|
@ -73,8 +73,7 @@ public class LikeThisQueryBuilder implements QueryBuilder {
|
|||
if ((stopWords != null) && (fields != null)) {
|
||||
stopWordsSet = new HashSet<String>();
|
||||
for (String field : fields) {
|
||||
try {
|
||||
TokenStream ts = analyzer.tokenStream(field, stopWords);
|
||||
try (TokenStream ts = analyzer.tokenStream(field, stopWords)) {
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
|
|
|
@ -49,9 +49,9 @@ public class SpanOrTermsBuilder extends SpanBuilderBase {
|
|||
String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
|
||||
String value = DOMUtils.getNonBlankTextOrFail(e);
|
||||
|
||||
try {
|
||||
List<SpanQuery> clausesList = new ArrayList<SpanQuery>();
|
||||
TokenStream ts = analyzer.tokenStream(fieldName, value);
|
||||
List<SpanQuery> clausesList = new ArrayList<SpanQuery>();
|
||||
|
||||
try (TokenStream ts = analyzer.tokenStream(fieldName, value)) {
|
||||
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
|
@ -61,7 +61,6 @@ public class SpanOrTermsBuilder extends SpanBuilderBase {
|
|||
clausesList.add(stq);
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
SpanOrQuery soq = new SpanOrQuery(clausesList.toArray(new SpanQuery[clausesList.size()]));
|
||||
soq.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
|
||||
return soq;
|
||||
|
|
|
@ -54,8 +54,7 @@ public class TermsFilterBuilder implements FilterBuilder {
|
|||
String text = DOMUtils.getNonBlankTextOrFail(e);
|
||||
String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
|
||||
|
||||
try {
|
||||
TokenStream ts = analyzer.tokenStream(fieldName, text);
|
||||
try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
|
||||
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
|
@ -64,7 +63,6 @@ public class TermsFilterBuilder implements FilterBuilder {
|
|||
terms.add(BytesRef.deepCopyOf(bytes));
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new RuntimeException("Error constructing terms from index:" + ioe);
|
||||
|
|
|
@ -51,8 +51,7 @@ public class TermsQueryBuilder implements QueryBuilder {
|
|||
|
||||
BooleanQuery bq = new BooleanQuery(DOMUtils.getAttribute(e, "disableCoord", false));
|
||||
bq.setMinimumNumberShouldMatch(DOMUtils.getAttribute(e, "minimumNumberShouldMatch", 0));
|
||||
try {
|
||||
TokenStream ts = analyzer.tokenStream(fieldName, text);
|
||||
try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
|
||||
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
Term term = null;
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
|
@ -63,7 +62,6 @@ public class TermsQueryBuilder implements QueryBuilder {
|
|||
bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new RuntimeException("Error constructing terms from index:" + ioe);
|
||||
|
|
|
@ -193,67 +193,67 @@ public class FuzzyLikeThisQuery extends Query
|
|||
|
||||
private void addTerms(IndexReader reader, FieldVals f) throws IOException {
|
||||
if (f.queryString == null) return;
|
||||
TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString);
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
|
||||
int corpusNumDocs = reader.numDocs();
|
||||
HashSet<String> processedTerms = new HashSet<String>();
|
||||
ts.reset();
|
||||
final Terms terms = MultiFields.getTerms(reader, f.fieldName);
|
||||
if (terms == null) {
|
||||
return;
|
||||
}
|
||||
while (ts.incrementToken()) {
|
||||
String term = termAtt.toString();
|
||||
if (!processedTerms.contains(term)) {
|
||||
processedTerms.add(term);
|
||||
ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
|
||||
float minScore = 0;
|
||||
Term startTerm = new Term(f.fieldName, term);
|
||||
AttributeSource atts = new AttributeSource();
|
||||
MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||
try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
|
||||
int corpusNumDocs = reader.numDocs();
|
||||
HashSet<String> processedTerms = new HashSet<String>();
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
String term = termAtt.toString();
|
||||
if (!processedTerms.contains(term)) {
|
||||
processedTerms.add(term);
|
||||
ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
|
||||
float minScore = 0;
|
||||
Term startTerm = new Term(f.fieldName, term);
|
||||
AttributeSource atts = new AttributeSource();
|
||||
MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
||||
SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
|
||||
//store the df so all variants use same idf
|
||||
int df = reader.docFreq(startTerm);
|
||||
int numVariants = 0;
|
||||
int totalVariantDocFreqs = 0;
|
||||
BytesRef possibleMatch;
|
||||
BoostAttribute boostAtt =
|
||||
SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
|
||||
//store the df so all variants use same idf
|
||||
int df = reader.docFreq(startTerm);
|
||||
int numVariants = 0;
|
||||
int totalVariantDocFreqs = 0;
|
||||
BytesRef possibleMatch;
|
||||
BoostAttribute boostAtt =
|
||||
fe.attributes().addAttribute(BoostAttribute.class);
|
||||
while ((possibleMatch = fe.next()) != null) {
|
||||
numVariants++;
|
||||
totalVariantDocFreqs += fe.docFreq();
|
||||
float score = boostAtt.getBoost();
|
||||
if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
|
||||
ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
|
||||
variantsQ.insertWithOverflow(st);
|
||||
minScore = variantsQ.top().score; // maintain minScore
|
||||
}
|
||||
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
if (numVariants > 0) {
|
||||
int avgDf = totalVariantDocFreqs / numVariants;
|
||||
if (df == 0)//no direct match we can use as df for all variants
|
||||
{
|
||||
df = avgDf; //use avg df of all variants
|
||||
while ((possibleMatch = fe.next()) != null) {
|
||||
numVariants++;
|
||||
totalVariantDocFreqs += fe.docFreq();
|
||||
float score = boostAtt.getBoost();
|
||||
if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
|
||||
ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
|
||||
variantsQ.insertWithOverflow(st);
|
||||
minScore = variantsQ.top().score; // maintain minScore
|
||||
}
|
||||
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// take the top variants (scored by edit distance) and reset the score
|
||||
// to include an IDF factor then add to the global queue for ranking
|
||||
// overall top query terms
|
||||
int size = variantsQ.size();
|
||||
for (int i = 0; i < size; i++) {
|
||||
ScoreTerm st = variantsQ.pop();
|
||||
st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
|
||||
q.insertWithOverflow(st);
|
||||
if (numVariants > 0) {
|
||||
int avgDf = totalVariantDocFreqs / numVariants;
|
||||
if (df == 0)//no direct match we can use as df for all variants
|
||||
{
|
||||
df = avgDf; //use avg df of all variants
|
||||
}
|
||||
|
||||
// take the top variants (scored by edit distance) and reset the score
|
||||
// to include an IDF factor then add to the global queue for ranking
|
||||
// overall top query terms
|
||||
int size = variantsQ.size();
|
||||
for (int i = 0; i < size; i++) {
|
||||
ScoreTerm st = variantsQ.pop();
|
||||
st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
|
||||
q.insertWithOverflow(st);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -352,9 +352,8 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
|||
occur = BooleanClause.Occur.SHOULD;
|
||||
}
|
||||
|
||||
try {
|
||||
try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
|
||||
//long t0 = System.currentTimeMillis();
|
||||
TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
|
||||
ts.reset();
|
||||
final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
|
@ -464,40 +463,39 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
|||
* result is set on each {@link
|
||||
* LookupResult#highlightKey} member. */
|
||||
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
|
||||
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
ts.reset();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int upto = 0;
|
||||
while (ts.incrementToken()) {
|
||||
String token = termAtt.toString();
|
||||
int startOffset = offsetAtt.startOffset();
|
||||
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
ts.reset();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int upto = 0;
|
||||
while (ts.incrementToken()) {
|
||||
String token = termAtt.toString();
|
||||
int startOffset = offsetAtt.startOffset();
|
||||
int endOffset = offsetAtt.endOffset();
|
||||
if (upto < startOffset) {
|
||||
addNonMatch(sb, text.substring(upto, startOffset));
|
||||
upto = startOffset;
|
||||
} else if (upto > startOffset) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (matchedTokens.contains(token)) {
|
||||
// Token matches.
|
||||
addWholeMatch(sb, text.substring(startOffset, endOffset), token);
|
||||
upto = endOffset;
|
||||
} else if (prefixToken != null && token.startsWith(prefixToken)) {
|
||||
addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
|
||||
upto = endOffset;
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
int endOffset = offsetAtt.endOffset();
|
||||
if (upto < startOffset) {
|
||||
addNonMatch(sb, text.substring(upto, startOffset));
|
||||
upto = startOffset;
|
||||
} else if (upto > startOffset) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (matchedTokens.contains(token)) {
|
||||
// Token matches.
|
||||
addWholeMatch(sb, text.substring(startOffset, endOffset), token);
|
||||
upto = endOffset;
|
||||
} else if (prefixToken != null && token.startsWith(prefixToken)) {
|
||||
addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
|
||||
upto = endOffset;
|
||||
if (upto < endOffset) {
|
||||
addNonMatch(sb, text.substring(upto));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
ts.end();
|
||||
int endOffset = offsetAtt.endOffset();
|
||||
if (upto < endOffset) {
|
||||
addNonMatch(sb, text.substring(upto));
|
||||
}
|
||||
ts.close();
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/** Called while highlighting a single result, to append a
|
||||
|
|
|
@ -827,14 +827,15 @@ public class AnalyzingSuggester extends Lookup {
|
|||
}
|
||||
|
||||
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
|
||||
// Analyze surface form:
|
||||
TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
|
||||
// Analyze surface form:
|
||||
Automaton automaton = null;
|
||||
try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
|
||||
|
||||
// Create corresponding automaton: labels are bytes
|
||||
// from each analyzed token, with byte 0 used as
|
||||
// separator between tokens:
|
||||
Automaton automaton = ts2a.toAutomaton(ts);
|
||||
ts.close();
|
||||
// Create corresponding automaton: labels are bytes
|
||||
// from each analyzed token, with byte 0 used as
|
||||
// separator between tokens:
|
||||
automaton = ts2a.toAutomaton(ts);
|
||||
}
|
||||
|
||||
replaceSep(automaton);
|
||||
automaton = convertAutomaton(automaton);
|
||||
|
@ -854,9 +855,10 @@ public class AnalyzingSuggester extends Lookup {
|
|||
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
|
||||
// TODO: is there a Reader from a CharSequence?
|
||||
// Turn tokenstream into automaton:
|
||||
TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
|
||||
Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
|
||||
ts.close();
|
||||
Automaton automaton = null;
|
||||
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
|
||||
automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
|
||||
}
|
||||
|
||||
// TODO: we could use the end offset to "guess"
|
||||
// whether the final token was a partial token; this
|
||||
|
|
|
@ -449,252 +449,251 @@ public class FreeTextSuggester extends Lookup {
|
|||
|
||||
/** Retrieve suggestions. */
|
||||
public List<LookupResult> lookup(final CharSequence key, int num) throws IOException {
|
||||
TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
|
||||
TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
ts.reset();
|
||||
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
|
||||
TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
ts.reset();
|
||||
|
||||
BytesRef[] lastTokens = new BytesRef[grams];
|
||||
//System.out.println("lookup: key='" + key + "'");
|
||||
BytesRef[] lastTokens = new BytesRef[grams];
|
||||
//System.out.println("lookup: key='" + key + "'");
|
||||
|
||||
// Run full analysis, but save only the
|
||||
// last 1gram, last 2gram, etc.:
|
||||
BytesRef tokenBytes = termBytesAtt.getBytesRef();
|
||||
int maxEndOffset = -1;
|
||||
boolean sawRealToken = false;
|
||||
while(ts.incrementToken()) {
|
||||
termBytesAtt.fillBytesRef();
|
||||
sawRealToken |= tokenBytes.length > 0;
|
||||
// TODO: this is somewhat iffy; today, ShingleFilter
|
||||
// sets posLen to the gram count; maybe we should make
|
||||
// a separate dedicated att for this?
|
||||
int gramCount = posLenAtt.getPositionLength();
|
||||
// Run full analysis, but save only the
|
||||
// last 1gram, last 2gram, etc.:
|
||||
BytesRef tokenBytes = termBytesAtt.getBytesRef();
|
||||
int maxEndOffset = -1;
|
||||
boolean sawRealToken = false;
|
||||
while(ts.incrementToken()) {
|
||||
termBytesAtt.fillBytesRef();
|
||||
sawRealToken |= tokenBytes.length > 0;
|
||||
// TODO: this is somewhat iffy; today, ShingleFilter
|
||||
// sets posLen to the gram count; maybe we should make
|
||||
// a separate dedicated att for this?
|
||||
int gramCount = posLenAtt.getPositionLength();
|
||||
|
||||
assert gramCount <= grams;
|
||||
assert gramCount <= grams;
|
||||
|
||||
// Safety: make sure the recalculated count "agrees":
|
||||
if (countGrams(tokenBytes) != gramCount) {
|
||||
throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
|
||||
// Safety: make sure the recalculated count "agrees":
|
||||
if (countGrams(tokenBytes) != gramCount) {
|
||||
throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
|
||||
}
|
||||
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
|
||||
lastTokens[gramCount-1] = BytesRef.deepCopyOf(tokenBytes);
|
||||
}
|
||||
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
|
||||
lastTokens[gramCount-1] = BytesRef.deepCopyOf(tokenBytes);
|
||||
}
|
||||
ts.end();
|
||||
ts.end();
|
||||
|
||||
if (!sawRealToken) {
|
||||
throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
|
||||
}
|
||||
if (!sawRealToken) {
|
||||
throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
|
||||
}
|
||||
|
||||
// Carefully fill last tokens with _ tokens;
|
||||
// ShingleFilter appraently won't emit "only hole"
|
||||
// tokens:
|
||||
int endPosInc = posIncAtt.getPositionIncrement();
|
||||
// Carefully fill last tokens with _ tokens;
|
||||
// ShingleFilter appraently won't emit "only hole"
|
||||
// tokens:
|
||||
int endPosInc = posIncAtt.getPositionIncrement();
|
||||
|
||||
// Note this will also be true if input is the empty
|
||||
// string (in which case we saw no tokens and
|
||||
// maxEndOffset is still -1), which in fact works out OK
|
||||
// because we fill the unigram with an empty BytesRef
|
||||
// below:
|
||||
boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
|
||||
ts.close();
|
||||
//System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());
|
||||
// Note this will also be true if input is the empty
|
||||
// string (in which case we saw no tokens and
|
||||
// maxEndOffset is still -1), which in fact works out OK
|
||||
// because we fill the unigram with an empty BytesRef
|
||||
// below:
|
||||
boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
|
||||
//System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());
|
||||
|
||||
if (lastTokenEnded) {
|
||||
//System.out.println(" lastTokenEnded");
|
||||
// If user hit space after the last token, then
|
||||
// "upgrade" all tokens. This way "foo " will suggest
|
||||
// all bigrams starting w/ foo, and not any unigrams
|
||||
// starting with "foo":
|
||||
for(int i=grams-1;i>0;i--) {
|
||||
BytesRef token = lastTokens[i-1];
|
||||
if (token == null) {
|
||||
if (lastTokenEnded) {
|
||||
//System.out.println(" lastTokenEnded");
|
||||
// If user hit space after the last token, then
|
||||
// "upgrade" all tokens. This way "foo " will suggest
|
||||
// all bigrams starting w/ foo, and not any unigrams
|
||||
// starting with "foo":
|
||||
for(int i=grams-1;i>0;i--) {
|
||||
BytesRef token = lastTokens[i-1];
|
||||
if (token == null) {
|
||||
continue;
|
||||
}
|
||||
token.grow(token.length+1);
|
||||
token.bytes[token.length] = separator;
|
||||
token.length++;
|
||||
lastTokens[i] = token;
|
||||
}
|
||||
lastTokens[0] = new BytesRef();
|
||||
}
|
||||
|
||||
Arc<Long> arc = new Arc<Long>();
|
||||
|
||||
BytesReader bytesReader = fst.getBytesReader();
|
||||
|
||||
// Try highest order models first, and if they return
|
||||
// results, return that; else, fallback:
|
||||
double backoff = 1.0;
|
||||
|
||||
List<LookupResult> results = new ArrayList<LookupResult>(num);
|
||||
|
||||
// We only add a given suffix once, from the highest
|
||||
// order model that saw it; for subsequent lower order
|
||||
// models we skip it:
|
||||
final Set<BytesRef> seen = new HashSet<BytesRef>();
|
||||
|
||||
for(int gram=grams-1;gram>=0;gram--) {
|
||||
BytesRef token = lastTokens[gram];
|
||||
// Don't make unigram predictions from empty string:
|
||||
if (token == null || (token.length == 0 && key.length() > 0)) {
|
||||
// Input didn't have enough tokens:
|
||||
//System.out.println(" gram=" + gram + ": skip: not enough input");
|
||||
continue;
|
||||
}
|
||||
token.grow(token.length+1);
|
||||
token.bytes[token.length] = separator;
|
||||
token.length++;
|
||||
lastTokens[i] = token;
|
||||
}
|
||||
lastTokens[0] = new BytesRef();
|
||||
}
|
||||
|
||||
Arc<Long> arc = new Arc<Long>();
|
||||
|
||||
BytesReader bytesReader = fst.getBytesReader();
|
||||
|
||||
// Try highest order models first, and if they return
|
||||
// results, return that; else, fallback:
|
||||
double backoff = 1.0;
|
||||
|
||||
List<LookupResult> results = new ArrayList<LookupResult>(num);
|
||||
|
||||
// We only add a given suffix once, from the highest
|
||||
// order model that saw it; for subsequent lower order
|
||||
// models we skip it:
|
||||
final Set<BytesRef> seen = new HashSet<BytesRef>();
|
||||
|
||||
for(int gram=grams-1;gram>=0;gram--) {
|
||||
BytesRef token = lastTokens[gram];
|
||||
// Don't make unigram predictions from empty string:
|
||||
if (token == null || (token.length == 0 && key.length() > 0)) {
|
||||
// Input didn't have enough tokens:
|
||||
//System.out.println(" gram=" + gram + ": skip: not enough input");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (endPosInc > 0 && gram <= endPosInc) {
|
||||
// Skip hole-only predictions; in theory we
|
||||
// shouldn't have to do this, but we'd need to fix
|
||||
// ShingleFilter to produce only-hole tokens:
|
||||
//System.out.println(" break: only holes now");
|
||||
break;
|
||||
}
|
||||
|
||||
//System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
|
||||
|
||||
// TODO: we could add fuzziness here
|
||||
// match the prefix portion exactly
|
||||
//Pair<Long,BytesRef> prefixOutput = null;
|
||||
Long prefixOutput = null;
|
||||
try {
|
||||
prefixOutput = lookupPrefix(fst, bytesReader, token, arc);
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
//System.out.println(" prefixOutput=" + prefixOutput);
|
||||
|
||||
if (prefixOutput == null) {
|
||||
// This model never saw this prefix, e.g. the
|
||||
// trigram model never saw context "purple mushroom"
|
||||
backoff *= ALPHA;
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: we could do this division at build time, and
|
||||
// bake it into the FST?
|
||||
|
||||
// Denominator for computing scores from current
|
||||
// model's predictions:
|
||||
long contextCount = totTokens;
|
||||
|
||||
BytesRef lastTokenFragment = null;
|
||||
|
||||
for(int i=token.length-1;i>=0;i--) {
|
||||
if (token.bytes[token.offset+i] == separator) {
|
||||
BytesRef context = new BytesRef(token.bytes, token.offset, i);
|
||||
Long output = Util.get(fst, Util.toIntsRef(context, new IntsRef()));
|
||||
assert output != null;
|
||||
contextCount = decodeWeight(output);
|
||||
lastTokenFragment = new BytesRef(token.bytes, token.offset + i + 1, token.length - i - 1);
|
||||
if (endPosInc > 0 && gram <= endPosInc) {
|
||||
// Skip hole-only predictions; in theory we
|
||||
// shouldn't have to do this, but we'd need to fix
|
||||
// ShingleFilter to produce only-hole tokens:
|
||||
//System.out.println(" break: only holes now");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
final BytesRef finalLastToken;
|
||||
//System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
|
||||
|
||||
if (lastTokenFragment == null) {
|
||||
finalLastToken = BytesRef.deepCopyOf(token);
|
||||
} else {
|
||||
finalLastToken = BytesRef.deepCopyOf(lastTokenFragment);
|
||||
}
|
||||
assert finalLastToken.offset == 0;
|
||||
// TODO: we could add fuzziness here
|
||||
// match the prefix portion exactly
|
||||
//Pair<Long,BytesRef> prefixOutput = null;
|
||||
Long prefixOutput = null;
|
||||
try {
|
||||
prefixOutput = lookupPrefix(fst, bytesReader, token, arc);
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
//System.out.println(" prefixOutput=" + prefixOutput);
|
||||
|
||||
CharsRef spare = new CharsRef();
|
||||
if (prefixOutput == null) {
|
||||
// This model never saw this prefix, e.g. the
|
||||
// trigram model never saw context "purple mushroom"
|
||||
backoff *= ALPHA;
|
||||
continue;
|
||||
}
|
||||
|
||||
// complete top-N
|
||||
MinResult<Long> completions[] = null;
|
||||
try {
|
||||
// TODO: we could do this division at build time, and
|
||||
// bake it into the FST?
|
||||
|
||||
// Because we store multiple models in one FST
|
||||
// (1gram, 2gram, 3gram), we must restrict the
|
||||
// search so that it only considers the current
|
||||
// model. For highest order model, this is not
|
||||
// necessary since all completions in the FST
|
||||
// must be from this model, but for lower order
|
||||
// models we have to filter out the higher order
|
||||
// ones:
|
||||
// Denominator for computing scores from current
|
||||
// model's predictions:
|
||||
long contextCount = totTokens;
|
||||
|
||||
// Must do num+seen.size() for queue depth because we may
|
||||
// reject up to seen.size() paths in acceptResult():
|
||||
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num+seen.size(), weightComparator) {
|
||||
BytesRef lastTokenFragment = null;
|
||||
|
||||
BytesRef scratchBytes = new BytesRef();
|
||||
|
||||
@Override
|
||||
protected void addIfCompetitive(Util.FSTPath<Long> path) {
|
||||
if (path.arc.label != separator) {
|
||||
//System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
|
||||
super.addIfCompetitive(path);
|
||||
} else {
|
||||
//System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean acceptResult(IntsRef input, Long output) {
|
||||
Util.toBytesRef(input, scratchBytes);
|
||||
finalLastToken.grow(finalLastToken.length + scratchBytes.length);
|
||||
int lenSav = finalLastToken.length;
|
||||
finalLastToken.append(scratchBytes);
|
||||
//System.out.println(" accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
|
||||
boolean ret = seen.contains(finalLastToken) == false;
|
||||
|
||||
finalLastToken.length = lenSav;
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
// since this search is initialized with a single start node
|
||||
// it is okay to start with an empty input path here
|
||||
searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());
|
||||
|
||||
completions = searcher.search();
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
|
||||
int prefixLength = token.length;
|
||||
|
||||
BytesRef suffix = new BytesRef(8);
|
||||
//System.out.println(" " + completions.length + " completions");
|
||||
|
||||
nextCompletion:
|
||||
for (MinResult<Long> completion : completions) {
|
||||
token.length = prefixLength;
|
||||
// append suffix
|
||||
Util.toBytesRef(completion.input, suffix);
|
||||
token.append(suffix);
|
||||
|
||||
//System.out.println(" completion " + token.utf8ToString());
|
||||
|
||||
// Skip this path if a higher-order model already
|
||||
// saw/predicted its last token:
|
||||
BytesRef lastToken = token;
|
||||
for(int i=token.length-1;i>=0;i--) {
|
||||
if (token.bytes[token.offset+i] == separator) {
|
||||
assert token.length-i-1 > 0;
|
||||
lastToken = new BytesRef(token.bytes, token.offset+i+1, token.length-i-1);
|
||||
BytesRef context = new BytesRef(token.bytes, token.offset, i);
|
||||
Long output = Util.get(fst, Util.toIntsRef(context, new IntsRef()));
|
||||
assert output != null;
|
||||
contextCount = decodeWeight(output);
|
||||
lastTokenFragment = new BytesRef(token.bytes, token.offset + i + 1, token.length - i - 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (seen.contains(lastToken)) {
|
||||
//System.out.println(" skip dup " + lastToken.utf8ToString());
|
||||
continue nextCompletion;
|
||||
}
|
||||
seen.add(BytesRef.deepCopyOf(lastToken));
|
||||
spare.grow(token.length);
|
||||
UnicodeUtil.UTF8toUTF16(token, spare);
|
||||
LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
|
||||
results.add(result);
|
||||
assert results.size() == seen.size();
|
||||
//System.out.println(" add result=" + result);
|
||||
}
|
||||
backoff *= ALPHA;
|
||||
}
|
||||
|
||||
Collections.sort(results, new Comparator<LookupResult>() {
|
||||
final BytesRef finalLastToken;
|
||||
|
||||
if (lastTokenFragment == null) {
|
||||
finalLastToken = BytesRef.deepCopyOf(token);
|
||||
} else {
|
||||
finalLastToken = BytesRef.deepCopyOf(lastTokenFragment);
|
||||
}
|
||||
assert finalLastToken.offset == 0;
|
||||
|
||||
CharsRef spare = new CharsRef();
|
||||
|
||||
// complete top-N
|
||||
MinResult<Long> completions[] = null;
|
||||
try {
|
||||
|
||||
// Because we store multiple models in one FST
|
||||
// (1gram, 2gram, 3gram), we must restrict the
|
||||
// search so that it only considers the current
|
||||
// model. For highest order model, this is not
|
||||
// necessary since all completions in the FST
|
||||
// must be from this model, but for lower order
|
||||
// models we have to filter out the higher order
|
||||
// ones:
|
||||
|
||||
// Must do num+seen.size() for queue depth because we may
|
||||
// reject up to seen.size() paths in acceptResult():
|
||||
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num+seen.size(), weightComparator) {
|
||||
|
||||
BytesRef scratchBytes = new BytesRef();
|
||||
|
||||
@Override
|
||||
protected void addIfCompetitive(Util.FSTPath<Long> path) {
|
||||
if (path.arc.label != separator) {
|
||||
//System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
|
||||
super.addIfCompetitive(path);
|
||||
} else {
|
||||
//System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean acceptResult(IntsRef input, Long output) {
|
||||
Util.toBytesRef(input, scratchBytes);
|
||||
finalLastToken.grow(finalLastToken.length + scratchBytes.length);
|
||||
int lenSav = finalLastToken.length;
|
||||
finalLastToken.append(scratchBytes);
|
||||
//System.out.println(" accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
|
||||
boolean ret = seen.contains(finalLastToken) == false;
|
||||
|
||||
finalLastToken.length = lenSav;
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
// since this search is initialized with a single start node
|
||||
// it is okay to start with an empty input path here
|
||||
searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());
|
||||
|
||||
completions = searcher.search();
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
|
||||
int prefixLength = token.length;
|
||||
|
||||
BytesRef suffix = new BytesRef(8);
|
||||
//System.out.println(" " + completions.length + " completions");
|
||||
|
||||
nextCompletion:
|
||||
for (MinResult<Long> completion : completions) {
|
||||
token.length = prefixLength;
|
||||
// append suffix
|
||||
Util.toBytesRef(completion.input, suffix);
|
||||
token.append(suffix);
|
||||
|
||||
//System.out.println(" completion " + token.utf8ToString());
|
||||
|
||||
// Skip this path if a higher-order model already
|
||||
// saw/predicted its last token:
|
||||
BytesRef lastToken = token;
|
||||
for(int i=token.length-1;i>=0;i--) {
|
||||
if (token.bytes[token.offset+i] == separator) {
|
||||
assert token.length-i-1 > 0;
|
||||
lastToken = new BytesRef(token.bytes, token.offset+i+1, token.length-i-1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (seen.contains(lastToken)) {
|
||||
//System.out.println(" skip dup " + lastToken.utf8ToString());
|
||||
continue nextCompletion;
|
||||
}
|
||||
seen.add(BytesRef.deepCopyOf(lastToken));
|
||||
spare.grow(token.length);
|
||||
UnicodeUtil.UTF8toUTF16(token, spare);
|
||||
LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
|
||||
results.add(result);
|
||||
assert results.size() == seen.size();
|
||||
//System.out.println(" add result=" + result);
|
||||
}
|
||||
backoff *= ALPHA;
|
||||
}
|
||||
|
||||
Collections.sort(results, new Comparator<LookupResult>() {
|
||||
@Override
|
||||
public int compare(LookupResult a, LookupResult b) {
|
||||
if (a.value > b.value) {
|
||||
|
@ -708,11 +707,12 @@ public class FreeTextSuggester extends Lookup {
|
|||
}
|
||||
});
|
||||
|
||||
if (results.size() > num) {
|
||||
results.subList(num, results.size()).clear();
|
||||
}
|
||||
if (results.size() > num) {
|
||||
results.subList(num, results.size()).clear();
|
||||
}
|
||||
|
||||
return results;
|
||||
return results;
|
||||
}
|
||||
}
|
||||
|
||||
/** weight -> cost */
|
||||
|
|
|
@ -165,43 +165,43 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
|
|||
|
||||
@Override
|
||||
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
|
||||
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
ts.reset();
|
||||
List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
|
||||
int upto = 0;
|
||||
while (ts.incrementToken()) {
|
||||
String token = termAtt.toString();
|
||||
int startOffset = offsetAtt.startOffset();
|
||||
int endOffset = offsetAtt.endOffset();
|
||||
if (upto < startOffset) {
|
||||
fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
|
||||
upto = startOffset;
|
||||
} else if (upto > startOffset) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (matchedTokens.contains(token)) {
|
||||
// Token matches.
|
||||
fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
|
||||
upto = endOffset;
|
||||
} else if (prefixToken != null && token.startsWith(prefixToken)) {
|
||||
fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true));
|
||||
if (prefixToken.length() < token.length()) {
|
||||
fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false));
|
||||
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
ts.reset();
|
||||
List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
|
||||
int upto = 0;
|
||||
while (ts.incrementToken()) {
|
||||
String token = termAtt.toString();
|
||||
int startOffset = offsetAtt.startOffset();
|
||||
int endOffset = offsetAtt.endOffset();
|
||||
if (upto < startOffset) {
|
||||
fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
|
||||
upto = startOffset;
|
||||
} else if (upto > startOffset) {
|
||||
continue;
|
||||
}
|
||||
upto = endOffset;
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
int endOffset = offsetAtt.endOffset();
|
||||
if (upto < endOffset) {
|
||||
fragments.add(new LookupHighlightFragment(text.substring(upto), false));
|
||||
}
|
||||
ts.close();
|
||||
|
||||
return fragments;
|
||||
if (matchedTokens.contains(token)) {
|
||||
// Token matches.
|
||||
fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
|
||||
upto = endOffset;
|
||||
} else if (prefixToken != null && token.startsWith(prefixToken)) {
|
||||
fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true));
|
||||
if (prefixToken.length() < token.length()) {
|
||||
fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false));
|
||||
}
|
||||
upto = endOffset;
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
int endOffset = offsetAtt.endOffset();
|
||||
if (upto < endOffset) {
|
||||
fragments.add(new LookupHighlightFragment(text.substring(upto), false));
|
||||
}
|
||||
|
||||
return fragments;
|
||||
}
|
||||
}
|
||||
};
|
||||
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
||||
|
|
|
@ -258,17 +258,17 @@ public abstract class CollationTestBase extends LuceneTestCase {
|
|||
|
||||
for (int i = 0; i < numTestPoints; i++) {
|
||||
String term = _TestUtil.randomSimpleString(random());
|
||||
TokenStream ts = analyzer.tokenStream("fake", term);
|
||||
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
assertTrue(ts.incrementToken());
|
||||
termAtt.fillBytesRef();
|
||||
// ensure we make a copy of the actual bytes too
|
||||
map.put(term, BytesRef.deepCopyOf(bytes));
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
ts.close();
|
||||
try (TokenStream ts = analyzer.tokenStream("fake", term)) {
|
||||
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
assertTrue(ts.incrementToken());
|
||||
termAtt.fillBytesRef();
|
||||
// ensure we make a copy of the actual bytes too
|
||||
map.put(term, BytesRef.deepCopyOf(bytes));
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
}
|
||||
|
||||
Thread threads[] = new Thread[numThreads];
|
||||
|
@ -280,16 +280,16 @@ public abstract class CollationTestBase extends LuceneTestCase {
|
|||
for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
|
||||
String term = mapping.getKey();
|
||||
BytesRef expected = mapping.getValue();
|
||||
TokenStream ts = analyzer.tokenStream("fake", term);
|
||||
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
assertTrue(ts.incrementToken());
|
||||
termAtt.fillBytesRef();
|
||||
assertEquals(expected, bytes);
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
ts.close();
|
||||
try (TokenStream ts = analyzer.tokenStream("fake", term)) {
|
||||
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
assertTrue(ts.incrementToken());
|
||||
termAtt.fillBytesRef();
|
||||
assertEquals(expected, bytes);
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
|
|
|
@ -234,36 +234,23 @@ public class ICUCollationField extends FieldType {
|
|||
* simple (we already have a threadlocal clone in the reused TS)
|
||||
*/
|
||||
private BytesRef analyzeRangePart(String field, String part) {
|
||||
TokenStream source;
|
||||
|
||||
try {
|
||||
source = analyzer.tokenStream(field, part);
|
||||
try (TokenStream source = analyzer.tokenStream(field, part)) {
|
||||
source.reset();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to initialize TokenStream to analyze range part: " + part, e);
|
||||
}
|
||||
|
||||
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
|
||||
// we control the analyzer here: most errors are impossible
|
||||
try {
|
||||
// we control the analyzer here: most errors are impossible
|
||||
if (!source.incrementToken())
|
||||
throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
|
||||
termAtt.fillBytesRef();
|
||||
assert !source.incrementToken();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("error analyzing range part: " + part, e);
|
||||
}
|
||||
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
return BytesRef.deepCopyOf(bytes);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to end & close TokenStream after analyzing range part: " + part, e);
|
||||
throw new RuntimeException("Unable analyze range part: " + part, e);
|
||||
}
|
||||
|
||||
return BytesRef.deepCopyOf(bytes);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -85,15 +85,13 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
|
|||
|
||||
if (!TokenizerChain.class.isInstance(analyzer)) {
|
||||
|
||||
TokenStream tokenStream = null;
|
||||
try {
|
||||
tokenStream = analyzer.tokenStream(context.getFieldName(), value);
|
||||
try (TokenStream tokenStream = analyzer.tokenStream(context.getFieldName(), value)) {
|
||||
NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();
|
||||
namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
|
||||
return namedList;
|
||||
} catch (IOException e) {
|
||||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
|
||||
}
|
||||
NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();
|
||||
namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
|
||||
return namedList;
|
||||
}
|
||||
|
||||
TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
|
||||
|
@ -139,10 +137,8 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
|
|||
* @param analyzer The analyzer to use.
|
||||
*/
|
||||
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
|
||||
TokenStream tokenStream = null;
|
||||
try {
|
||||
try (TokenStream tokenStream = analyzer.tokenStream("", query)){
|
||||
final Set<BytesRef> tokens = new HashSet<BytesRef>();
|
||||
tokenStream = analyzer.tokenStream("", query);
|
||||
final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
|
||||
final BytesRef bytes = bytesAtt.getBytesRef();
|
||||
|
||||
|
@ -157,8 +153,6 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
|
|||
return tokens;
|
||||
} catch (IOException ioe) {
|
||||
throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
|
||||
} finally {
|
||||
IOUtils.closeWhileHandlingException(tokenStream);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -344,16 +344,16 @@ public class QueryElevationComponent extends SearchComponent implements SolrCore
|
|||
return query;
|
||||
}
|
||||
StringBuilder norm = new StringBuilder();
|
||||
TokenStream tokens = analyzer.tokenStream("", query);
|
||||
tokens.reset();
|
||||
try (TokenStream tokens = analyzer.tokenStream("", query)) {
|
||||
tokens.reset();
|
||||
|
||||
CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
|
||||
while (tokens.incrementToken()) {
|
||||
norm.append(termAtt.buffer(), 0, termAtt.length());
|
||||
CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
|
||||
while (tokens.incrementToken()) {
|
||||
norm.append(termAtt.buffer(), 0, termAtt.length());
|
||||
}
|
||||
tokens.end();
|
||||
return norm.toString();
|
||||
}
|
||||
tokens.end();
|
||||
tokens.close();
|
||||
return norm.toString();
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------
|
||||
|
|
|
@ -463,29 +463,29 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
|
||||
Collection<Token> result = new ArrayList<Token>();
|
||||
assert analyzer != null;
|
||||
TokenStream ts = analyzer.tokenStream("", q);
|
||||
ts.reset();
|
||||
// TODO: support custom attributes
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
|
||||
FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
|
||||
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
try (TokenStream ts = analyzer.tokenStream("", q)) {
|
||||
ts.reset();
|
||||
// TODO: support custom attributes
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
|
||||
FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
|
||||
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
while (ts.incrementToken()){
|
||||
Token token = new Token();
|
||||
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
|
||||
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
|
||||
token.setType(typeAtt.type());
|
||||
token.setFlags(flagsAtt.getFlags());
|
||||
token.setPayload(payloadAtt.getPayload());
|
||||
token.setPositionIncrement(posIncAtt.getPositionIncrement());
|
||||
result.add(token);
|
||||
while (ts.incrementToken()){
|
||||
Token token = new Token();
|
||||
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
|
||||
token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
|
||||
token.setType(typeAtt.type());
|
||||
token.setFlags(flagsAtt.getFlags());
|
||||
token.setPayload(payloadAtt.getPayload());
|
||||
token.setPositionIncrement(posIncAtt.getPositionIncrement());
|
||||
result.add(token);
|
||||
}
|
||||
ts.end();
|
||||
return result;
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
return result;
|
||||
}
|
||||
|
||||
protected SolrSpellChecker getSpellChecker(SolrParams params) {
|
||||
|
|
|
@ -403,59 +403,50 @@ public abstract class SolrQueryParserBase {
|
|||
// Use the analyzer to get all the tokens, and then build a TermQuery,
|
||||
// PhraseQuery, or nothing based on the term count
|
||||
|
||||
TokenStream source;
|
||||
try {
|
||||
source = analyzer.tokenStream(field, queryText);
|
||||
source.reset();
|
||||
} catch (IOException e) {
|
||||
throw new SyntaxError("Unable to initialize TokenStream to analyze query text", e);
|
||||
}
|
||||
CachingTokenFilter buffer = new CachingTokenFilter(source);
|
||||
CachingTokenFilter buffer = null;
|
||||
TermToBytesRefAttribute termAtt = null;
|
||||
PositionIncrementAttribute posIncrAtt = null;
|
||||
int numTokens = 0;
|
||||
|
||||
buffer.reset();
|
||||
|
||||
if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
|
||||
termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
|
||||
}
|
||||
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
|
||||
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
int positionCount = 0;
|
||||
boolean severalTokensAtSamePosition = false;
|
||||
|
||||
boolean hasMoreTokens = false;
|
||||
if (termAtt != null) {
|
||||
try {
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
while (hasMoreTokens) {
|
||||
numTokens++;
|
||||
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
|
||||
if (positionIncrement != 0) {
|
||||
positionCount += positionIncrement;
|
||||
} else {
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
try {
|
||||
// rewind the buffer stream
|
||||
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
|
||||
source.reset();
|
||||
buffer = new CachingTokenFilter(source);
|
||||
buffer.reset();
|
||||
|
||||
// close original stream - all tokens buffered
|
||||
source.close();
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new SyntaxError("Cannot close TokenStream analyzing query text", e);
|
||||
if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
|
||||
termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
|
||||
}
|
||||
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
|
||||
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
boolean hasMoreTokens = false;
|
||||
if (termAtt != null) {
|
||||
try {
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
while (hasMoreTokens) {
|
||||
numTokens++;
|
||||
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
|
||||
if (positionIncrement != 0) {
|
||||
positionCount += positionIncrement;
|
||||
} else {
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new SyntaxError("Error analyzing query text", e);
|
||||
}
|
||||
|
||||
// rewind the buffer stream
|
||||
buffer.reset();
|
||||
|
||||
BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
|
||||
|
||||
if (numTokens == 0)
|
||||
|
|
|
@ -210,36 +210,22 @@ public class CollationField extends FieldType {
|
|||
* simple (we already have a threadlocal clone in the reused TS)
|
||||
*/
|
||||
private BytesRef analyzeRangePart(String field, String part) {
|
||||
TokenStream source;
|
||||
|
||||
try {
|
||||
source = analyzer.tokenStream(field, part);
|
||||
try (TokenStream source = analyzer.tokenStream(field, part)) {
|
||||
source.reset();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to initialize TokenStream to analyze range part: " + part, e);
|
||||
}
|
||||
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
|
||||
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
|
||||
// we control the analyzer here: most errors are impossible
|
||||
try {
|
||||
// we control the analyzer here: most errors are impossible
|
||||
if (!source.incrementToken())
|
||||
throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
|
||||
termAtt.fillBytesRef();
|
||||
assert !source.incrementToken();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("error analyzing range part: " + part, e);
|
||||
}
|
||||
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
return BytesRef.deepCopyOf(bytes);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to end & close TokenStream after analyzing range part: " + part, e);
|
||||
throw new RuntimeException("Unable to analyze range part: " + part, e);
|
||||
}
|
||||
|
||||
return BytesRef.deepCopyOf(bytes);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -138,35 +138,23 @@ public class TextField extends FieldType {
|
|||
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
|
||||
if (part == null || analyzerIn == null) return null;
|
||||
|
||||
TokenStream source;
|
||||
try {
|
||||
source = analyzerIn.tokenStream(field, part);
|
||||
try (TokenStream source = analyzerIn.tokenStream(field, part)){
|
||||
source.reset();
|
||||
} catch (IOException e) {
|
||||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
|
||||
}
|
||||
|
||||
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
|
||||
try {
|
||||
if (!source.incrementToken())
|
||||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned no terms for multiTerm term: " + part);
|
||||
termAtt.fillBytesRef();
|
||||
if (source.incrementToken())
|
||||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part);
|
||||
|
||||
source.end();
|
||||
return BytesRef.deepCopyOf(bytes);
|
||||
} catch (IOException e) {
|
||||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"error analyzing range part: " + part, e);
|
||||
}
|
||||
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
|
||||
}
|
||||
|
||||
return BytesRef.deepCopyOf(bytes);
|
||||
}
|
||||
|
||||
|
||||
|
@ -178,59 +166,51 @@ public class TextField extends FieldType {
|
|||
// Use the analyzer to get all the tokens, and then build a TermQuery,
|
||||
// PhraseQuery, or nothing based on the term count
|
||||
|
||||
TokenStream source;
|
||||
try {
|
||||
source = analyzer.tokenStream(field, queryText);
|
||||
source.reset();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to initialize TokenStream to analyze query text", e);
|
||||
}
|
||||
CachingTokenFilter buffer = new CachingTokenFilter(source);
|
||||
CachingTokenFilter buffer = null;
|
||||
CharTermAttribute termAtt = null;
|
||||
PositionIncrementAttribute posIncrAtt = null;
|
||||
int numTokens = 0;
|
||||
|
||||
buffer.reset();
|
||||
|
||||
if (buffer.hasAttribute(CharTermAttribute.class)) {
|
||||
termAtt = buffer.getAttribute(CharTermAttribute.class);
|
||||
}
|
||||
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
|
||||
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
int positionCount = 0;
|
||||
boolean severalTokensAtSamePosition = false;
|
||||
|
||||
boolean hasMoreTokens = false;
|
||||
if (termAtt != null) {
|
||||
try {
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
while (hasMoreTokens) {
|
||||
numTokens++;
|
||||
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
|
||||
if (positionIncrement != 0) {
|
||||
positionCount += positionIncrement;
|
||||
} else {
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
try {
|
||||
// rewind the buffer stream
|
||||
try (TokenStream source = analyzer.tokenStream(field, queryText)) {
|
||||
source.reset();
|
||||
buffer = new CachingTokenFilter(source);
|
||||
|
||||
buffer.reset();
|
||||
|
||||
// close original stream - all tokens buffered
|
||||
source.close();
|
||||
}
|
||||
catch (IOException e) {
|
||||
// ignore
|
||||
if (buffer.hasAttribute(CharTermAttribute.class)) {
|
||||
termAtt = buffer.getAttribute(CharTermAttribute.class);
|
||||
}
|
||||
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
|
||||
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
boolean hasMoreTokens = false;
|
||||
if (termAtt != null) {
|
||||
try {
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
while (hasMoreTokens) {
|
||||
numTokens++;
|
||||
int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
|
||||
if (positionIncrement != 0) {
|
||||
positionCount += positionIncrement;
|
||||
} else {
|
||||
severalTokensAtSamePosition = true;
|
||||
}
|
||||
hasMoreTokens = buffer.incrementToken();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
// rewind the buffer stream
|
||||
buffer.reset();
|
||||
|
||||
if (numTokens == 0)
|
||||
return null;
|
||||
else if (numTokens == 1) {
|
||||
|
|
|
@ -40,10 +40,10 @@ class SimpleQueryConverter extends SpellingQueryConverter {
|
|||
|
||||
@Override
|
||||
public Collection<Token> convert(String origQuery) {
|
||||
try {
|
||||
Collection<Token> result = new HashSet<Token>();
|
||||
WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
|
||||
TokenStream ts = analyzer.tokenStream("", origQuery);
|
||||
Collection<Token> result = new HashSet<Token>();
|
||||
WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
|
||||
|
||||
try (TokenStream ts = analyzer.tokenStream("", origQuery)) {
|
||||
// TODO: support custom attributes
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||
|
@ -65,8 +65,6 @@ class SimpleQueryConverter extends SpellingQueryConverter {
|
|||
result.add(tok);
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
|
||||
return result;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
|
|
Loading…
Reference in New Issue