Mirror of https://github.com/apache/lucene.git

LUCENE-5259: convert analysis consumers to try-with-resources

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1529770 13f79535-47bb-0310-9956-ffa450edef68

commit 64a795b6e3
parent 9c98f9d958
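The change applies one mechanical pattern to every analysis consumer: open the TokenStream in a try-with-resources block so it is closed on every path, keep end() inside the block, and drop the hand-written close(). Below is a minimal before/after sketch of that pattern, for orientation only — the ConsumeExample class, the "body" field name, and the println are made-up placeholders, not code from this commit; the actual changes follow in the diff.

    // Illustrative sketch (assumed names), not part of the commit.
    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    class ConsumeExample {
      // Before: close() is only reached if nothing above it throws, so an
      // exception while consuming leaks the stream.
      static void consumeOld(Analyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.tokenStream("body", text);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(termAtt.toString());
        }
        ts.end();
        ts.close();
      }

      // After: try-with-resources closes the stream on every path; end()
      // still runs inside the block, before the implicit close().
      static void consumeNew(Analyzer analyzer, String text) throws IOException {
        try (TokenStream ts = analyzer.tokenStream("body", text)) {
          CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            System.out.println(termAtt.toString());
          }
          ts.end();
        }
      }
    }

This works because TokenStream implements Closeable, so the try block supplies the close() the old code had to call by hand; reset(), incrementToken(), and end() keep their usual contract.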
@@ -307,30 +307,30 @@ public class SynonymMap {
    * separates by {@link SynonymMap#WORD_SEPARATOR}.
    * reuse and its chars must not be null. */
   public CharsRef analyze(String text, CharsRef reuse) throws IOException {
-    TokenStream ts = analyzer.tokenStream("", text);
+    try (TokenStream ts = analyzer.tokenStream("", text)) {
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
       PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
       ts.reset();
       reuse.length = 0;
       while (ts.incrementToken()) {
         int length = termAtt.length();
         if (length == 0) {
           throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
         }
         if (posIncAtt.getPositionIncrement() != 1) {
           throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
         }
         reuse.grow(reuse.length + length + 1); /* current + word + separator */
         int end = reuse.offset + reuse.length;
         if (reuse.length > 0) {
           reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
           reuse.length++;
         }
         System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
         reuse.length += length;
       }
       ts.end();
-    ts.close();
+    }
     if (reuse.length == 0) {
       throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
     }
@@ -117,12 +117,15 @@ public class TestKeywordAnalyzer extends BaseTokenStreamTestCase {
 
   // LUCENE-1441
   public void testOffsets() throws Exception {
-    TokenStream stream = new KeywordAnalyzer().tokenStream("field", new StringReader("abcd"));
+    try (TokenStream stream = new KeywordAnalyzer().tokenStream("field", new StringReader("abcd"))) {
       OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
       stream.reset();
       assertTrue(stream.incrementToken());
       assertEquals(0, offsetAtt.startOffset());
       assertEquals(4, offsetAtt.endOffset());
+      assertFalse(stream.incrementToken());
+      stream.end();
+    }
   }
 
   /** blast some random strings through the analyzer */
@@ -46,27 +46,31 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
 
   public void testDefaults() throws IOException {
     assertTrue(stop != null);
-    TokenStream stream = stop.tokenStream("test", "This is a test of the english stop analyzer");
+    try (TokenStream stream = stop.tokenStream("test", "This is a test of the english stop analyzer")) {
       assertTrue(stream != null);
       CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
       stream.reset();
 
       while (stream.incrementToken()) {
         assertFalse(inValidTokens.contains(termAtt.toString()));
       }
+      stream.end();
+    }
   }
 
   public void testStopList() throws IOException {
     CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
     StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
-    TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer");
+    try (TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
       assertNotNull(stream);
       CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
 
       stream.reset();
       while (stream.incrementToken()) {
         String text = termAtt.toString();
         assertFalse(stopWordsSet.contains(text));
       }
+      stream.end();
+    }
   }
 
@@ -75,17 +79,19 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
     StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet);
     String s = "This is a good test of the english stop analyzer with positions";
     int expectedIncr[] = { 1, 1, 1, 3, 1, 1, 1, 2, 1};
-    TokenStream stream = newStop.tokenStream("test", s);
+    try (TokenStream stream = newStop.tokenStream("test", s)) {
       assertNotNull(stream);
       int i = 0;
       CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
       PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
 
       stream.reset();
       while (stream.incrementToken()) {
         String text = termAtt.toString();
         assertFalse(stopWordsSet.contains(text));
         assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
       }
+      stream.end();
+    }
   }
 
@@ -37,23 +37,29 @@ public class TestPerFieldAnalyzerWrapper extends BaseTokenStreamTestCase {
     PerFieldAnalyzerWrapper analyzer =
       new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), analyzerPerField);
 
-    TokenStream tokenStream = analyzer.tokenStream("field", text);
+    try (TokenStream tokenStream = analyzer.tokenStream("field", text)) {
       CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
       tokenStream.reset();
 
       assertTrue(tokenStream.incrementToken());
       assertEquals("WhitespaceAnalyzer does not lowercase",
           "Qwerty",
           termAtt.toString());
+      assertFalse(tokenStream.incrementToken());
+      tokenStream.end();
+    }
 
-    tokenStream = analyzer.tokenStream("special", text);
-    termAtt = tokenStream.getAttribute(CharTermAttribute.class);
+    try (TokenStream tokenStream = analyzer.tokenStream("special", text)) {
+      CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
       tokenStream.reset();
 
       assertTrue(tokenStream.incrementToken());
       assertEquals("SimpleAnalyzer lowercases",
           "qwerty",
           termAtt.toString());
+      assertFalse(tokenStream.incrementToken());
+      tokenStream.end();
+    }
   }
 
   public void testCharFilters() throws Exception {
@@ -95,17 +95,19 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
   public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
     PhraseQuery q = new PhraseQuery();
 
-    TokenStream ts = analyzer.tokenStream("content", "this sentence");
+    try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) {
       int j = -1;
 
       PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
 
       ts.reset();
       while (ts.incrementToken()) {
         j += posIncrAtt.getPositionIncrement();
         String termText = termAtt.toString();
         q.add(new Term("content", termText), j);
       }
+      ts.end();
+    }
 
     ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
@@ -121,16 +123,16 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
   public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
     BooleanQuery q = new BooleanQuery();
 
-    TokenStream ts = analyzer.tokenStream("content", "test sentence");
-
-    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
-
-    ts.reset();
-
-    while (ts.incrementToken()) {
-      String termText = termAtt.toString();
-      q.add(new TermQuery(new Term("content", termText)),
+    try (TokenStream ts = analyzer.tokenStream("content", "test sentence")) {
+      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+
+      ts.reset();
+      while (ts.incrementToken()) {
+        String termText = termAtt.toString();
+        q.add(new TermQuery(new Term("content", termText)),
           BooleanClause.Occur.SHOULD);
+      }
+      ts.end();
     }
 
     ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
@@ -123,18 +123,18 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
     int num = 1000 * RANDOM_MULTIPLIER;
     for (int i = 0; i < num; i++) {
       String s = _TestUtil.randomUnicodeString(random());
-      TokenStream ts = analyzer.tokenStream("foo", s);
+      try (TokenStream ts = analyzer.tokenStream("foo", s)) {
         ts.reset();
         OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
         while (ts.incrementToken()) {
           String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
           for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
             cp = highlightedText.codePointAt(j);
             assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
           }
         }
         ts.end();
-      ts.close();
+      }
     }
     // just for fun
     checkRandomData(random(), analyzer, num);
@@ -161,18 +161,18 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
     int num = 1000 * RANDOM_MULTIPLIER;
     for (int i = 0; i < num; i++) {
       String s = _TestUtil.randomUnicodeString(random());
-      TokenStream ts = analyzer.tokenStream("foo", s);
+      try (TokenStream ts = analyzer.tokenStream("foo", s)) {
         ts.reset();
         OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
         while (ts.incrementToken()) {
           String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
           for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
             cp = highlightedText.codePointAt(j);
             assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
           }
         }
         ts.end();
-      ts.close();
+      }
     }
     // just for fun
     checkRandomData(random(), analyzer, num);
@@ -249,16 +249,16 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   }
 
   public void testTokenAttributes() throws Exception {
-    TokenStream ts = a.tokenStream("dummy", "This is a test");
+    try (TokenStream ts = a.tokenStream("dummy", "This is a test")) {
       ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
       ts.reset();
       while (ts.incrementToken()) {
         assertEquals(UScript.LATIN, scriptAtt.getCode());
         assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
         assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
         assertTrue(ts.reflectAsString(false).contains("script=Latin"));
       }
       ts.end();
-    ts.close();
+    }
   }
 }
@@ -53,14 +53,14 @@ public class TestExtendedMode extends BaseTokenStreamTestCase {
     int numIterations = atLeast(1000);
     for (int i = 0; i < numIterations; i++) {
       String s = _TestUtil.randomUnicodeString(random(), 100);
-      TokenStream ts = analyzer.tokenStream("foo", s);
+      try (TokenStream ts = analyzer.tokenStream("foo", s)) {
         CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
         ts.reset();
         while (ts.incrementToken()) {
           assertTrue(UnicodeUtil.validUTF16String(termAtt));
         }
         ts.end();
-      ts.close();
+      }
     }
   }
 
@@ -141,13 +141,13 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
    * ideally the test would actually fail instead of hanging...
    */
   public void testDecomposition5() throws Exception {
-    TokenStream ts = analyzer.tokenStream("bogus", "くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ");
+    try (TokenStream ts = analyzer.tokenStream("bogus", "くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ")) {
       ts.reset();
       while (ts.incrementToken()) {
 
       }
       ts.end();
-    ts.close();
+    }
   }
 
   /*
@@ -213,12 +213,12 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
   public void testLargeDocReliability() throws Exception {
     for (int i = 0; i < 100; i++) {
       String s = _TestUtil.randomUnicodeString(random(), 10000);
-      TokenStream ts = analyzer.tokenStream("foo", s);
+      try (TokenStream ts = analyzer.tokenStream("foo", s)) {
         ts.reset();
         while (ts.incrementToken()) {
         }
         ts.end();
-      ts.close();
+      }
     }
   }
 
@@ -236,29 +236,31 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
         System.out.println("\nTEST: iter=" + i);
       }
       String s = _TestUtil.randomUnicodeString(random(), 100);
-      TokenStream ts = analyzer.tokenStream("foo", s);
+      try (TokenStream ts = analyzer.tokenStream("foo", s)) {
         CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
         ts.reset();
         while (ts.incrementToken()) {
           assertTrue(UnicodeUtil.validUTF16String(termAtt));
         }
         ts.end();
-      ts.close();
+      }
     }
   }
 
   public void testOnlyPunctuation() throws IOException {
-    TokenStream ts = analyzerNoPunct.tokenStream("foo", "。、。。");
+    try (TokenStream ts = analyzerNoPunct.tokenStream("foo", "。、。。")) {
       ts.reset();
       assertFalse(ts.incrementToken());
       ts.end();
+    }
   }
 
   public void testOnlyPunctuationExtended() throws IOException {
-    TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", "......");
+    try (TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", "......")) {
       ts.reset();
       assertFalse(ts.incrementToken());
       ts.end();
+    }
   }
 
   // note: test is kinda silly since kuromoji emits punctuation tokens.
@@ -369,75 +371,81 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
   }
 
   private void assertReadings(String input, String... readings) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", input);
+    try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
       ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
       ts.reset();
       for(String reading : readings) {
         assertTrue(ts.incrementToken());
         assertEquals(reading, readingAtt.getReading());
       }
       assertFalse(ts.incrementToken());
       ts.end();
+    }
   }
 
   private void assertPronunciations(String input, String... pronunciations) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", input);
+    try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
       ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
       ts.reset();
       for(String pronunciation : pronunciations) {
         assertTrue(ts.incrementToken());
         assertEquals(pronunciation, readingAtt.getPronunciation());
       }
       assertFalse(ts.incrementToken());
       ts.end();
+    }
   }
 
   private void assertBaseForms(String input, String... baseForms) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", input);
+    try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
       BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
       ts.reset();
       for(String baseForm : baseForms) {
         assertTrue(ts.incrementToken());
         assertEquals(baseForm, baseFormAtt.getBaseForm());
       }
       assertFalse(ts.incrementToken());
       ts.end();
+    }
   }
 
   private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", input);
+    try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
       InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
       ts.reset();
       for(String inflectionType : inflectionTypes) {
         assertTrue(ts.incrementToken());
         assertEquals(inflectionType, inflectionAtt.getInflectionType());
       }
       assertFalse(ts.incrementToken());
       ts.end();
+    }
   }
 
   private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", input);
+    try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
       InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
       ts.reset();
       for(String inflectionForm : inflectionForms) {
         assertTrue(ts.incrementToken());
         assertEquals(inflectionForm, inflectionAtt.getInflectionForm());
       }
       assertFalse(ts.incrementToken());
       ts.end();
+    }
   }
 
   private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
-    TokenStream ts = analyzer.tokenStream("ignored", input);
+    try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
       PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
       ts.reset();
       for(String partOfSpeech : partsOfSpeech) {
         assertTrue(ts.incrementToken());
         assertEquals(partOfSpeech, partOfSpeechAtt.getPartOfSpeech());
       }
       assertFalse(ts.incrementToken());
       ts.end();
+    }
   }
 
   public void testReadings() throws Exception {
@@ -631,11 +639,11 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
 
     long totalStart = System.currentTimeMillis();
     for (int i = 0; i < numIterations; i++) {
-      final TokenStream ts = analyzer.tokenStream("ignored", line);
+      try (TokenStream ts = analyzer.tokenStream("ignored", line)) {
         ts.reset();
         while(ts.incrementToken());
         ts.end();
-      ts.close();
+      }
     }
     String[] sentences = line.split("、|。");
     if (VERBOSE) {
@@ -645,11 +653,11 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
     totalStart = System.currentTimeMillis();
     for (int i = 0; i < numIterations; i++) {
       for (String sentence: sentences) {
-        final TokenStream ts = analyzer.tokenStream("ignored", sentence);
+        try (TokenStream ts = analyzer.tokenStream("ignored", sentence)) {
           ts.reset();
           while(ts.incrementToken());
           ts.end();
-        ts.close();
+        }
       }
     }
     if (VERBOSE) {
@@ -72,34 +72,36 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
 
   @SuppressWarnings("unused")
   private void dumpTokens(String input) throws IOException {
-    TokenStream ts = getTestAnalyzer().tokenStream("dummy", input);
+    try (TokenStream ts = getTestAnalyzer().tokenStream("dummy", input)) {
       ts.reset();
 
       MorphosyntacticTagsAttribute attribute = ts.getAttribute(MorphosyntacticTagsAttribute.class);
       CharTermAttribute charTerm = ts.getAttribute(CharTermAttribute.class);
       while (ts.incrementToken()) {
         System.out.println(charTerm.toString() + " => " + attribute.getTags());
       }
+      ts.end();
+    }
   }
 
   /** Test reuse of MorfologikFilter with leftover stems. */
   public final void testLeftoverStems() throws IOException {
     Analyzer a = getTestAnalyzer();
-    TokenStream ts_1 = a.tokenStream("dummy", "liście");
+    try (TokenStream ts_1 = a.tokenStream("dummy", "liście")) {
       CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
       ts_1.reset();
       ts_1.incrementToken();
       assertEquals("first stream", "liście", termAtt_1.toString());
       ts_1.end();
-    ts_1.close();
+    }
 
-    TokenStream ts_2 = a.tokenStream("dummy", "danych");
+    try (TokenStream ts_2 = a.tokenStream("dummy", "danych")) {
       CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
       ts_2.reset();
       ts_2.incrementToken();
       assertEquals("second stream", "dany", termAtt_2.toString());
       ts_2.end();
-    ts_2.close();
+    }
   }
 
   /** Test stemming of mixed-case tokens. */
@@ -140,28 +142,27 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase {
 
   /** Test morphosyntactic annotations. */
   public final void testPOSAttribute() throws IOException {
-    TokenStream ts = getTestAnalyzer().tokenStream("dummy", "liście");
-
-    ts.reset();
-    assertPOSToken(ts, "liście",
+    try (TokenStream ts = getTestAnalyzer().tokenStream("dummy", "liście")) {
+      ts.reset();
+      assertPOSToken(ts, "liście",
         "subst:sg:acc:n2",
         "subst:sg:nom:n2",
         "subst:sg:voc:n2");
 
       assertPOSToken(ts, "liść",
         "subst:pl:acc:m3",
         "subst:pl:nom:m3",
         "subst:pl:voc:m3");
 
       assertPOSToken(ts, "list",
         "subst:sg:loc:m3",
         "subst:sg:voc:m3");
 
       assertPOSToken(ts, "lista",
         "subst:sg:dat:f",
         "subst:sg:loc:f");
       ts.end();
-    ts.close();
+    }
   }
 
   /** */
@@ -184,9 +184,11 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
       sb.append("我购买了道具和服装。");
     }
     Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
-    TokenStream stream = analyzer.tokenStream("", sb.toString());
+    try (TokenStream stream = analyzer.tokenStream("", sb.toString())) {
       stream.reset();
       while (stream.incrementToken()) {
       }
+      stream.end();
+    }
   }
 
@@ -197,9 +199,11 @@ public class TestSmartChineseAnalyzer extends BaseTokenStreamTestCase {
       sb.append("我购买了道具和服装");
     }
     Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
-    TokenStream stream = analyzer.tokenStream("", sb.toString());
+    try (TokenStream stream = analyzer.tokenStream("", sb.toString())) {
       stream.reset();
       while (stream.incrementToken()) {
       }
+      stream.end();
+    }
   }
 
@@ -91,20 +91,19 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
       throw new IOException("You must first call Classifier#train");
     }
     Long output = 0l;
-    TokenStream tokenStream = analyzer.tokenStream(textFieldName,
-        new StringReader(text));
+    try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
       CharTermAttribute charTermAttribute = tokenStream
           .addAttribute(CharTermAttribute.class);
       tokenStream.reset();
       while (tokenStream.incrementToken()) {
         String s = charTermAttribute.toString();
         Long d = Util.get(fst, new BytesRef(s));
         if (d != null) {
           output += d;
         }
       }
+      tokenStream.end();
+    }
-    tokenStream.end();
-    tokenStream.close();
 
     return new ClassificationResult<>(output >= threshold, output.doubleValue());
   }
@@ -85,14 +85,14 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
 
   private String[] tokenizeDoc(String doc) throws IOException {
     Collection<String> result = new LinkedList<String>();
-    TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc);
+    try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, doc)) {
       CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
       tokenStream.reset();
       while (tokenStream.incrementToken()) {
         result.add(charTermAttribute.toString());
       }
+      tokenStream.end();
+    }
-    tokenStream.end();
-    tokenStream.close();
     return result.toArray(new String[result.size()]);
   }
 
@@ -92,13 +92,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
         fieldState.position += analyzed ? docState.analyzer.getPositionIncrementGap(fieldInfo.name) : 0;
       }
 
-      final TokenStream stream = field.tokenStream(docState.analyzer);
-      // reset the TokenStream to the first token
-      stream.reset();
-
-      boolean success2 = false;
-
-      try {
+      try (TokenStream stream = field.tokenStream(docState.analyzer)) {
+        // reset the TokenStream to the first token
+        stream.reset();
         boolean hasMoreTokens = stream.incrementToken();
 
         fieldState.attributeSource = stream;
@@ -179,13 +175,6 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
         // when we come back around to the field...
         fieldState.position += posIncrAttribute.getPositionIncrement();
         fieldState.offset += offsetAttribute.endOffset();
-        success2 = true;
-      } finally {
-        if (!success2) {
-          IOUtils.closeWhileHandlingException(stream);
-        } else {
-          stream.close();
-        }
       }
 
       fieldState.offset += analyzed ? docState.analyzer.getOffsetGap(fieldInfo.name) : 0;
@@ -98,13 +98,13 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
     String testString = "t";
 
     Analyzer analyzer = new MockAnalyzer(random());
-    TokenStream stream = analyzer.tokenStream("dummy", testString);
+    try (TokenStream stream = analyzer.tokenStream("dummy", testString)) {
       stream.reset();
       while (stream.incrementToken()) {
         // consume
       }
+      stream.end();
+    }
-    stream.end();
-    stream.close();
 
     assertAnalyzesTo(analyzer, testString, new String[] { "t" });
   }
@@ -121,13 +121,13 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
     StringReader reader = new StringReader(s);
     MockCharFilter charfilter = new MockCharFilter(reader, 2);
     MockAnalyzer analyzer = new MockAnalyzer(random());
-    TokenStream ts = analyzer.tokenStream("bogus", charfilter);
+    try (TokenStream ts = analyzer.tokenStream("bogus", charfilter)) {
       ts.reset();
       while (ts.incrementToken()) {
         ;
       }
+      ts.end();
+    }
-    ts.end();
-    ts.close();
     }
   }
 
@@ -47,30 +47,29 @@ public class TestLongPostings extends LuceneTestCase {
       if (other != null && s.equals(other)) {
         continue;
       }
-      final TokenStream ts = a.tokenStream("foo", s);
+      try (TokenStream ts = a.tokenStream("foo", s)) {
         final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
         final BytesRef termBytes = termAtt.getBytesRef();
         ts.reset();
 
         int count = 0;
         boolean changed = false;
 
         while(ts.incrementToken()) {
           termAtt.fillBytesRef();
           if (count == 0 && !termBytes.utf8ToString().equals(s)) {
             // The value was changed during analysis. Keep iterating so the
             // tokenStream is exhausted.
             changed = true;
           }
           count++;
         }
 
         ts.end();
-      ts.close();
         // Did we iterate just once and the value was unchanged?
         if (!changed && count == 1) {
           return s;
         }
+      }
     }
   }
 
@@ -174,17 +174,18 @@ public class TestTermVectorsWriter extends LuceneTestCase {
     Analyzer analyzer = new MockAnalyzer(random());
     IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer));
     Document doc = new Document();
-    TokenStream stream = analyzer.tokenStream("field", "abcd ");
+    try (TokenStream stream = analyzer.tokenStream("field", "abcd ")) {
       stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
-      stream = new CachingTokenFilter(stream);
+      TokenStream cachedStream = new CachingTokenFilter(stream);
       FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
       customType.setStoreTermVectors(true);
       customType.setStoreTermVectorPositions(true);
       customType.setStoreTermVectorOffsets(true);
-      Field f = new Field("field", stream, customType);
+      Field f = new Field("field", cachedStream, customType);
       doc.add(f);
       doc.add(f);
       w.addDocument(doc);
+    }
     w.close();
 
     IndexReader r = DirectoryReader.open(dir);
@@ -617,16 +617,16 @@ public class TestPhraseQuery extends LuceneTestCase {
             break;
           }
         }
-        TokenStream ts = analyzer.tokenStream("ignore", term);
+        try (TokenStream ts = analyzer.tokenStream("ignore", term)) {
           CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
           ts.reset();
           while(ts.incrementToken()) {
             String text = termAttr.toString();
             doc.add(text);
             sb.append(text).append(' ');
           }
+          ts.end();
+        }
-        ts.end();
-        ts.close();
       } else {
         // pick existing sub-phrase
         List<String> lastDoc = docs.get(r.nextInt(docs.size()));
@@ -170,20 +170,20 @@ public abstract class AbstractTestCase extends LuceneTestCase {
   protected List<BytesRef> analyze(String text, String field, Analyzer analyzer) throws IOException {
     List<BytesRef> bytesRefs = new ArrayList<BytesRef>();
 
-    TokenStream tokenStream = analyzer.tokenStream(field, text);
+    try (TokenStream tokenStream = analyzer.tokenStream(field, text)) {
       TermToBytesRefAttribute termAttribute = tokenStream.getAttribute(TermToBytesRefAttribute.class);
 
       BytesRef bytesRef = termAttribute.getBytesRef();
 
       tokenStream.reset();
 
       while (tokenStream.incrementToken()) {
         termAttribute.fillBytesRef();
         bytesRefs.add(BytesRef.deepCopyOf(bytesRef));
       }
 
       tokenStream.end();
-    tokenStream.close();
+    }
 
     return bytesRefs;
   }
@@ -777,31 +777,31 @@ public final class MoreLikeThis {
       throw new UnsupportedOperationException("To use MoreLikeThis without " +
           "term vectors, you must provide an Analyzer");
     }
-    TokenStream ts = analyzer.tokenStream(fieldName, r);
+    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
       int tokenCount = 0;
       // for every token
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
       ts.reset();
       while (ts.incrementToken()) {
         String word = termAtt.toString();
         tokenCount++;
         if (tokenCount > maxNumTokensParsed) {
           break;
         }
         if (isNoiseWord(word)) {
           continue;
         }
 
         // increment frequency
         Int cnt = termFreqMap.get(word);
         if (cnt == null) {
           termFreqMap.put(word, new Int());
         } else {
           cnt.x++;
         }
       }
+      ts.end();
+    }
-    ts.end();
-    ts.close();
   }
 
 
@@ -162,9 +162,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
    */
   protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException{
     String analyzed = null;
-    TokenStream stream = null;
-    try{
-      stream = getAnalyzer().tokenStream(field, chunk);
+    try (TokenStream stream = getAnalyzer().tokenStream(field, chunk)) {
       stream.reset();
       CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
       // get first and hopefully only output token
@@ -186,7 +184,6 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
           multipleOutputs.append('"');
         }
         stream.end();
-        stream.close();
         if (null != multipleOutputs) {
           throw new ParseException(
               String.format(getLocale(),
@@ -196,7 +193,6 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
         // nothing returned by analyzer. Was it a stop word and the user accidentally
         // used an analyzer with stop words?
         stream.end();
-        stream.close();
         throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
       }
     } catch (IOException e){
@@ -497,63 +497,51 @@ public abstract class QueryParserBase implements CommonQueryParserConfiguration
   protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted) throws ParseException {
     // Use the analyzer to get all the tokens, and then build a TermQuery,
     // PhraseQuery, or nothing based on the term count
-    TokenStream source;
-    try {
-      source = analyzer.tokenStream(field, queryText);
-      source.reset();
-    } catch (IOException e) {
-      ParseException p = new ParseException("Unable to initialize TokenStream to analyze query text");
-      p.initCause(e);
-      throw p;
-    }
-    CachingTokenFilter buffer = new CachingTokenFilter(source);
+    CachingTokenFilter buffer = null;
     TermToBytesRefAttribute termAtt = null;
     PositionIncrementAttribute posIncrAtt = null;
     int numTokens = 0;
-
-    buffer.reset();
-
-    if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
-      termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
-    }
-    if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
-      posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
-    }
-
     int positionCount = 0;
     boolean severalTokensAtSamePosition = false;
     boolean hasMoreTokens = false;
 
-    if (termAtt != null) {
-      try {
-        hasMoreTokens = buffer.incrementToken();
-        while (hasMoreTokens) {
-          numTokens++;
-          int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
-          if (positionIncrement != 0) {
-            positionCount += positionIncrement;
-          } else {
-            severalTokensAtSamePosition = true;
-          }
-          hasMoreTokens = buffer.incrementToken();
-        }
-      } catch (IOException e) {
-        // ignore
-      }
-    }
-    try {
-      // rewind the buffer stream
-      buffer.reset();
-
-      // close original stream - all tokens buffered
-      source.close();
-    }
-    catch (IOException e) {
-      ParseException p = new ParseException("Cannot close TokenStream analyzing query text");
+    try (TokenStream source = analyzer.tokenStream(field, queryText)) {
+      source.reset();
+      buffer = new CachingTokenFilter(source);
+      buffer.reset();
+
+      if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
+        termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
+      }
+      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
+        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
+      }
+
+      if (termAtt != null) {
+        try {
+          hasMoreTokens = buffer.incrementToken();
+          while (hasMoreTokens) {
+            numTokens++;
+            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
+            if (positionIncrement != 0) {
+              positionCount += positionIncrement;
+            } else {
+              severalTokensAtSamePosition = true;
+            }
+            hasMoreTokens = buffer.incrementToken();
+          }
+        } catch (IOException e) {
+          // ignore
+        }
+      }
+    } catch (IOException e) {
+      ParseException p = new ParseException("Eror analyzing query text");
       p.initCause(e);
       throw p;
     }
 
+    // rewind the buffer stream
+    buffer.reset();
+
     BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
 
@@ -839,38 +827,24 @@ public abstract class QueryParserBase implements CommonQueryParserConfiguration
   }
 
   protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
-    TokenStream source;
-
     if (analyzerIn == null) analyzerIn = analyzer;
 
-    try {
-      source = analyzerIn.tokenStream(field, part);
+    try (TokenStream source = analyzerIn.tokenStream(field, part)) {
       source.reset();
-    } catch (IOException e) {
-      throw new RuntimeException("Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
-    }
 
       TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
       BytesRef bytes = termAtt.getBytesRef();
 
-    try {
       if (!source.incrementToken())
         throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
       termAtt.fillBytesRef();
       if (source.incrementToken())
         throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
-    } catch (IOException e) {
-      throw new RuntimeException("error analyzing range part: " + part, e);
-    }
-
-    try {
       source.end();
-      source.close();
+      return BytesRef.deepCopyOf(bytes);
     } catch (IOException e) {
-      throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
+      throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
     }
-
-    return BytesRef.deepCopyOf(bytes);
   }
 
   /**
@ -113,52 +113,44 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
String text = fieldNode.getTextAsString();
String text = fieldNode.getTextAsString();
String field = fieldNode.getFieldAsString();
String field = fieldNode.getFieldAsString();
TokenStream source;
CachingTokenFilter buffer = null;
try {
source = this.analyzer.tokenStream(field, text);
source.reset();
} catch (IOException e1) {
throw new RuntimeException(e1);
}
CachingTokenFilter buffer = new CachingTokenFilter(source);
PositionIncrementAttribute posIncrAtt = null;
PositionIncrementAttribute posIncrAtt = null;
int numTokens = 0;
int numTokens = 0;
int positionCount = 0;
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
boolean severalTokensAtSamePosition = false;
try (TokenStream source = this.analyzer.tokenStream(field, text)) {
source.reset();
buffer = new CachingTokenFilter(source);
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
}
try {
while (buffer.incrementToken()) {
numTokens++;
int positionIncrement = (posIncrAtt != null) ? posIncrAtt
.getPositionIncrement() : 1;
if (positionIncrement != 0) {
positionCount += positionIncrement;
} else {
severalTokensAtSamePosition = true;
}
}
}
} catch (IOException e) {
try {
// ignore
}
try {
while (buffer.incrementToken()) {
// rewind the buffer stream
numTokens++;
buffer.reset();
int positionIncrement = (posIncrAtt != null) ? posIncrAtt
.getPositionIncrement() : 1;
if (positionIncrement != 0) {
positionCount += positionIncrement;
// close original stream - all tokens buffered
} else {
source.close();
severalTokensAtSamePosition = true;
}
}
} catch (IOException e) {
// ignore
}
} catch (IOException e) {
} catch (IOException e) {
// ignore
throw new RuntimeException(e);
}
}
// rewind the buffer stream
buffer.reset();
if (!buffer.hasAttribute(CharTermAttribute.class)) {
if (!buffer.hasAttribute(CharTermAttribute.class)) {
return new NoTokenFoundQueryNode();
return new NoTokenFoundQueryNode();
@ -73,8 +73,7 @@ public class LikeThisQueryBuilder implements QueryBuilder {
if ((stopWords != null) && (fields != null)) {
if ((stopWords != null) && (fields != null)) {
stopWordsSet = new HashSet<String>();
stopWordsSet = new HashSet<String>();
for (String field : fields) {
for (String field : fields) {
try {
try (TokenStream ts = analyzer.tokenStream(field, stopWords)) {
TokenStream ts = analyzer.tokenStream(field, stopWords);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
ts.reset();
while (ts.incrementToken()) {
while (ts.incrementToken()) {
@ -49,9 +49,9 @@ public class SpanOrTermsBuilder extends SpanBuilderBase {
String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
String value = DOMUtils.getNonBlankTextOrFail(e);
String value = DOMUtils.getNonBlankTextOrFail(e);
try {
List<SpanQuery> clausesList = new ArrayList<SpanQuery>();
List<SpanQuery> clausesList = new ArrayList<SpanQuery>();
TokenStream ts = analyzer.tokenStream(fieldName, value);
try (TokenStream ts = analyzer.tokenStream(fieldName, value)) {
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
BytesRef bytes = termAtt.getBytesRef();
ts.reset();
ts.reset();
@ -61,7 +61,6 @@ public class SpanOrTermsBuilder extends SpanBuilderBase {
clausesList.add(stq);
clausesList.add(stq);
}
}
ts.end();
ts.end();
ts.close();
SpanOrQuery soq = new SpanOrQuery(clausesList.toArray(new SpanQuery[clausesList.size()]));
SpanOrQuery soq = new SpanOrQuery(clausesList.toArray(new SpanQuery[clausesList.size()]));
soq.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
soq.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
return soq;
return soq;
@ -54,8 +54,7 @@ public class TermsFilterBuilder implements FilterBuilder {
String text = DOMUtils.getNonBlankTextOrFail(e);
String text = DOMUtils.getNonBlankTextOrFail(e);
String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
try {
try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
TokenStream ts = analyzer.tokenStream(fieldName, text);
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
BytesRef bytes = termAtt.getBytesRef();
ts.reset();
ts.reset();
@ -64,7 +63,6 @@ public class TermsFilterBuilder implements FilterBuilder {
terms.add(BytesRef.deepCopyOf(bytes));
terms.add(BytesRef.deepCopyOf(bytes));
}
}
ts.end();
ts.end();
ts.close();
}
}
catch (IOException ioe) {
catch (IOException ioe) {
throw new RuntimeException("Error constructing terms from index:" + ioe);
throw new RuntimeException("Error constructing terms from index:" + ioe);
@ -51,8 +51,7 @@ public class TermsQueryBuilder implements QueryBuilder {
BooleanQuery bq = new BooleanQuery(DOMUtils.getAttribute(e, "disableCoord", false));
BooleanQuery bq = new BooleanQuery(DOMUtils.getAttribute(e, "disableCoord", false));
bq.setMinimumNumberShouldMatch(DOMUtils.getAttribute(e, "minimumNumberShouldMatch", 0));
bq.setMinimumNumberShouldMatch(DOMUtils.getAttribute(e, "minimumNumberShouldMatch", 0));
try {
try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
TokenStream ts = analyzer.tokenStream(fieldName, text);
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
Term term = null;
Term term = null;
BytesRef bytes = termAtt.getBytesRef();
BytesRef bytes = termAtt.getBytesRef();
@ -63,7 +62,6 @@ public class TermsQueryBuilder implements QueryBuilder {
bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
}
}
ts.end();
ts.end();
ts.close();
}
}
catch (IOException ioe) {
catch (IOException ioe) {
throw new RuntimeException("Error constructing terms from index:" + ioe);
throw new RuntimeException("Error constructing terms from index:" + ioe);
@ -193,67 +193,67 @@ public class FuzzyLikeThisQuery extends Query
|
||||||
|
|
||||||
private void addTerms(IndexReader reader, FieldVals f) throws IOException {
|
private void addTerms(IndexReader reader, FieldVals f) throws IOException {
|
||||||
if (f.queryString == null) return;
|
if (f.queryString == null) return;
|
||||||
TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString);
|
|
||||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
|
||||||
|
|
||||||
int corpusNumDocs = reader.numDocs();
|
|
||||||
HashSet<String> processedTerms = new HashSet<String>();
|
|
||||||
ts.reset();
|
|
||||||
final Terms terms = MultiFields.getTerms(reader, f.fieldName);
|
final Terms terms = MultiFields.getTerms(reader, f.fieldName);
|
||||||
if (terms == null) {
|
if (terms == null) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
while (ts.incrementToken()) {
|
try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
|
||||||
String term = termAtt.toString();
|
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
if (!processedTerms.contains(term)) {
|
|
||||||
processedTerms.add(term);
|
int corpusNumDocs = reader.numDocs();
|
||||||
ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
|
HashSet<String> processedTerms = new HashSet<String>();
|
||||||
float minScore = 0;
|
ts.reset();
|
||||||
Term startTerm = new Term(f.fieldName, term);
|
while (ts.incrementToken()) {
|
||||||
AttributeSource atts = new AttributeSource();
|
String term = termAtt.toString();
|
||||||
MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
if (!processedTerms.contains(term)) {
|
||||||
|
processedTerms.add(term);
|
||||||
|
ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
|
||||||
|
float minScore = 0;
|
||||||
|
Term startTerm = new Term(f.fieldName, term);
|
||||||
|
AttributeSource atts = new AttributeSource();
|
||||||
|
MaxNonCompetitiveBoostAttribute maxBoostAtt =
|
||||||
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
|
||||||
SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
|
SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
|
||||||
//store the df so all variants use same idf
|
//store the df so all variants use same idf
|
||||||
int df = reader.docFreq(startTerm);
|
int df = reader.docFreq(startTerm);
|
||||||
int numVariants = 0;
|
int numVariants = 0;
|
||||||
int totalVariantDocFreqs = 0;
|
int totalVariantDocFreqs = 0;
|
||||||
BytesRef possibleMatch;
|
BytesRef possibleMatch;
|
||||||
BoostAttribute boostAtt =
|
BoostAttribute boostAtt =
|
||||||
fe.attributes().addAttribute(BoostAttribute.class);
|
fe.attributes().addAttribute(BoostAttribute.class);
|
||||||
while ((possibleMatch = fe.next()) != null) {
|
while ((possibleMatch = fe.next()) != null) {
|
||||||
numVariants++;
|
numVariants++;
|
||||||
totalVariantDocFreqs += fe.docFreq();
|
totalVariantDocFreqs += fe.docFreq();
|
||||||
float score = boostAtt.getBoost();
|
float score = boostAtt.getBoost();
|
||||||
if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
|
if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
|
||||||
ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
|
ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
|
||||||
variantsQ.insertWithOverflow(st);
|
variantsQ.insertWithOverflow(st);
|
||||||
minScore = variantsQ.top().score; // maintain minScore
|
minScore = variantsQ.top().score; // maintain minScore
|
||||||
}
|
}
|
||||||
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
|
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
|
||||||
}
|
|
||||||
|
|
||||||
if (numVariants > 0) {
|
|
||||||
int avgDf = totalVariantDocFreqs / numVariants;
|
|
||||||
if (df == 0)//no direct match we can use as df for all variants
|
|
||||||
{
|
|
||||||
df = avgDf; //use avg df of all variants
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// take the top variants (scored by edit distance) and reset the score
|
if (numVariants > 0) {
|
||||||
// to include an IDF factor then add to the global queue for ranking
|
int avgDf = totalVariantDocFreqs / numVariants;
|
||||||
// overall top query terms
|
if (df == 0)//no direct match we can use as df for all variants
|
||||||
int size = variantsQ.size();
|
{
|
||||||
for (int i = 0; i < size; i++) {
|
df = avgDf; //use avg df of all variants
|
||||||
ScoreTerm st = variantsQ.pop();
|
}
|
||||||
st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
|
|
||||||
q.insertWithOverflow(st);
|
// take the top variants (scored by edit distance) and reset the score
|
||||||
|
// to include an IDF factor then add to the global queue for ranking
|
||||||
|
// overall top query terms
|
||||||
|
int size = variantsQ.size();
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
ScoreTerm st = variantsQ.pop();
|
||||||
|
st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
|
||||||
|
q.insertWithOverflow(st);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ts.end();
|
||||||
}
|
}
|
||||||
ts.end();
|
|
||||||
ts.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -352,9 +352,8 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
occur = BooleanClause.Occur.SHOULD;
occur = BooleanClause.Occur.SHOULD;
}
}
try {
try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
//long t0 = System.currentTimeMillis();
//long t0 = System.currentTimeMillis();
TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
ts.reset();
ts.reset();
final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
@ -464,40 +463,39 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
||||||
* result is set on each {@link
|
* result is set on each {@link
|
||||||
* LookupResult#highlightKey} member. */
|
* LookupResult#highlightKey} member. */
|
||||||
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
|
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
|
||||||
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
|
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
|
||||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||||
ts.reset();
|
ts.reset();
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
int upto = 0;
|
int upto = 0;
|
||||||
while (ts.incrementToken()) {
|
while (ts.incrementToken()) {
|
||||||
String token = termAtt.toString();
|
String token = termAtt.toString();
|
||||||
int startOffset = offsetAtt.startOffset();
|
int startOffset = offsetAtt.startOffset();
|
||||||
|
int endOffset = offsetAtt.endOffset();
|
||||||
|
if (upto < startOffset) {
|
||||||
|
addNonMatch(sb, text.substring(upto, startOffset));
|
||||||
|
upto = startOffset;
|
||||||
|
} else if (upto > startOffset) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (matchedTokens.contains(token)) {
|
||||||
|
// Token matches.
|
||||||
|
addWholeMatch(sb, text.substring(startOffset, endOffset), token);
|
||||||
|
upto = endOffset;
|
||||||
|
} else if (prefixToken != null && token.startsWith(prefixToken)) {
|
||||||
|
addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
|
||||||
|
upto = endOffset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ts.end();
|
||||||
int endOffset = offsetAtt.endOffset();
|
int endOffset = offsetAtt.endOffset();
|
||||||
if (upto < startOffset) {
|
if (upto < endOffset) {
|
||||||
addNonMatch(sb, text.substring(upto, startOffset));
|
addNonMatch(sb, text.substring(upto));
|
||||||
upto = startOffset;
|
|
||||||
} else if (upto > startOffset) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (matchedTokens.contains(token)) {
|
|
||||||
// Token matches.
|
|
||||||
addWholeMatch(sb, text.substring(startOffset, endOffset), token);
|
|
||||||
upto = endOffset;
|
|
||||||
} else if (prefixToken != null && token.startsWith(prefixToken)) {
|
|
||||||
addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
|
|
||||||
upto = endOffset;
|
|
||||||
}
|
}
|
||||||
|
return sb.toString();
|
||||||
}
|
}
|
||||||
ts.end();
|
|
||||||
int endOffset = offsetAtt.endOffset();
|
|
||||||
if (upto < endOffset) {
|
|
||||||
addNonMatch(sb, text.substring(upto));
|
|
||||||
}
|
|
||||||
ts.close();
|
|
||||||
|
|
||||||
return sb.toString();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Called while highlighting a single result, to append a
|
/** Called while highlighting a single result, to append a
|
||||||
|
|
|
@ -827,14 +827,15 @@ public class AnalyzingSuggester extends Lookup {
}
}
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form:
// Analyze surface form:
TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
Automaton automaton = null;
try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
// Create corresponding automaton: labels are bytes
// Create corresponding automaton: labels are bytes
// from each analyzed token, with byte 0 used as
// from each analyzed token, with byte 0 used as
// separator between tokens:
// separator between tokens:
Automaton automaton = ts2a.toAutomaton(ts);
automaton = ts2a.toAutomaton(ts);
ts.close();
}
replaceSep(automaton);
replaceSep(automaton);
automaton = convertAutomaton(automaton);
automaton = convertAutomaton(automaton);
@ -854,9 +855,10 @@ public class AnalyzingSuggester extends Lookup {
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
// TODO: is there a Reader from a CharSequence?
// TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton:
// Turn tokenstream into automaton:
TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
Automaton automaton = null;
Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
ts.close();
automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
}
// TODO: we could use the end offset to "guess"
// TODO: we could use the end offset to "guess"
// whether the final token was a partial token; this
// whether the final token was a partial token; this
@ -449,252 +449,251 @@ public class FreeTextSuggester extends Lookup {
|
||||||
|
|
||||||
/** Retrieve suggestions. */
|
/** Retrieve suggestions. */
|
||||||
public List<LookupResult> lookup(final CharSequence key, int num) throws IOException {
|
public List<LookupResult> lookup(final CharSequence key, int num) throws IOException {
|
||||||
TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
|
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
|
||||||
TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||||
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
|
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
|
||||||
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||||
ts.reset();
|
ts.reset();
|
||||||
|
|
||||||
BytesRef[] lastTokens = new BytesRef[grams];
|
BytesRef[] lastTokens = new BytesRef[grams];
|
||||||
//System.out.println("lookup: key='" + key + "'");
|
//System.out.println("lookup: key='" + key + "'");
|
||||||
|
|
||||||
// Run full analysis, but save only the
|
// Run full analysis, but save only the
|
||||||
// last 1gram, last 2gram, etc.:
|
// last 1gram, last 2gram, etc.:
|
||||||
BytesRef tokenBytes = termBytesAtt.getBytesRef();
|
BytesRef tokenBytes = termBytesAtt.getBytesRef();
|
||||||
int maxEndOffset = -1;
|
int maxEndOffset = -1;
|
||||||
boolean sawRealToken = false;
|
boolean sawRealToken = false;
|
||||||
while(ts.incrementToken()) {
|
while(ts.incrementToken()) {
|
||||||
termBytesAtt.fillBytesRef();
|
termBytesAtt.fillBytesRef();
|
||||||
sawRealToken |= tokenBytes.length > 0;
|
sawRealToken |= tokenBytes.length > 0;
|
||||||
// TODO: this is somewhat iffy; today, ShingleFilter
|
// TODO: this is somewhat iffy; today, ShingleFilter
|
||||||
// sets posLen to the gram count; maybe we should make
|
// sets posLen to the gram count; maybe we should make
|
||||||
// a separate dedicated att for this?
|
// a separate dedicated att for this?
|
||||||
int gramCount = posLenAtt.getPositionLength();
|
int gramCount = posLenAtt.getPositionLength();
|
||||||
|
|
||||||
assert gramCount <= grams;
|
assert gramCount <= grams;
|
||||||
|
|
||||||
// Safety: make sure the recalculated count "agrees":
|
// Safety: make sure the recalculated count "agrees":
|
||||||
if (countGrams(tokenBytes) != gramCount) {
|
if (countGrams(tokenBytes) != gramCount) {
|
||||||
throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
|
throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
|
||||||
|
}
|
||||||
|
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
|
||||||
|
lastTokens[gramCount-1] = BytesRef.deepCopyOf(tokenBytes);
|
||||||
}
|
}
|
||||||
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
|
ts.end();
|
||||||
lastTokens[gramCount-1] = BytesRef.deepCopyOf(tokenBytes);
|
|
||||||
}
|
if (!sawRealToken) {
|
||||||
ts.end();
|
throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
|
||||||
|
}
|
||||||
if (!sawRealToken) {
|
|
||||||
throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
|
// Carefully fill last tokens with _ tokens;
|
||||||
}
|
// ShingleFilter appraently won't emit "only hole"
|
||||||
|
// tokens:
|
||||||
// Carefully fill last tokens with _ tokens;
|
int endPosInc = posIncAtt.getPositionIncrement();
|
||||||
// ShingleFilter appraently won't emit "only hole"
|
|
||||||
// tokens:
|
// Note this will also be true if input is the empty
|
||||||
int endPosInc = posIncAtt.getPositionIncrement();
|
// string (in which case we saw no tokens and
|
||||||
|
// maxEndOffset is still -1), which in fact works out OK
|
||||||
// Note this will also be true if input is the empty
|
// because we fill the unigram with an empty BytesRef
|
||||||
// string (in which case we saw no tokens and
|
// below:
|
||||||
// maxEndOffset is still -1), which in fact works out OK
|
boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
|
||||||
// because we fill the unigram with an empty BytesRef
|
//System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());
|
||||||
// below:
|
|
||||||
boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
|
if (lastTokenEnded) {
|
||||||
ts.close();
|
//System.out.println(" lastTokenEnded");
|
||||||
//System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());
|
// If user hit space after the last token, then
|
||||||
|
// "upgrade" all tokens. This way "foo " will suggest
|
||||||
if (lastTokenEnded) {
|
// all bigrams starting w/ foo, and not any unigrams
|
||||||
//System.out.println(" lastTokenEnded");
|
// starting with "foo":
|
||||||
// If user hit space after the last token, then
|
for(int i=grams-1;i>0;i--) {
|
||||||
// "upgrade" all tokens. This way "foo " will suggest
|
BytesRef token = lastTokens[i-1];
|
||||||
// all bigrams starting w/ foo, and not any unigrams
|
if (token == null) {
|
||||||
// starting with "foo":
|
continue;
|
||||||
for(int i=grams-1;i>0;i--) {
|
}
|
||||||
BytesRef token = lastTokens[i-1];
|
token.grow(token.length+1);
|
||||||
if (token == null) {
|
token.bytes[token.length] = separator;
|
||||||
|
token.length++;
|
||||||
|
lastTokens[i] = token;
|
||||||
|
}
|
||||||
|
lastTokens[0] = new BytesRef();
|
||||||
|
}
|
||||||
|
|
||||||
|
Arc<Long> arc = new Arc<Long>();
|
||||||
|
|
||||||
|
BytesReader bytesReader = fst.getBytesReader();
|
||||||
|
|
||||||
|
// Try highest order models first, and if they return
|
||||||
|
// results, return that; else, fallback:
|
||||||
|
double backoff = 1.0;
|
||||||
|
|
||||||
|
List<LookupResult> results = new ArrayList<LookupResult>(num);
|
||||||
|
|
||||||
|
// We only add a given suffix once, from the highest
|
||||||
|
// order model that saw it; for subsequent lower order
|
||||||
|
// models we skip it:
|
||||||
|
final Set<BytesRef> seen = new HashSet<BytesRef>();
|
||||||
|
|
||||||
|
for(int gram=grams-1;gram>=0;gram--) {
|
||||||
|
BytesRef token = lastTokens[gram];
|
||||||
|
// Don't make unigram predictions from empty string:
|
||||||
|
if (token == null || (token.length == 0 && key.length() > 0)) {
|
||||||
|
// Input didn't have enough tokens:
|
||||||
|
//System.out.println(" gram=" + gram + ": skip: not enough input");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
token.grow(token.length+1);
|
|
||||||
token.bytes[token.length] = separator;
|
if (endPosInc > 0 && gram <= endPosInc) {
|
||||||
token.length++;
|
// Skip hole-only predictions; in theory we
|
||||||
lastTokens[i] = token;
|
// shouldn't have to do this, but we'd need to fix
|
||||||
}
|
// ShingleFilter to produce only-hole tokens:
|
||||||
lastTokens[0] = new BytesRef();
|
//System.out.println(" break: only holes now");
|
||||||
}
|
|
||||||
|
|
||||||
Arc<Long> arc = new Arc<Long>();
|
|
||||||
|
|
||||||
BytesReader bytesReader = fst.getBytesReader();
|
|
||||||
|
|
||||||
// Try highest order models first, and if they return
|
|
||||||
// results, return that; else, fallback:
|
|
||||||
double backoff = 1.0;
|
|
||||||
|
|
||||||
List<LookupResult> results = new ArrayList<LookupResult>(num);
|
|
||||||
|
|
||||||
// We only add a given suffix once, from the highest
|
|
||||||
// order model that saw it; for subsequent lower order
|
|
||||||
// models we skip it:
|
|
||||||
final Set<BytesRef> seen = new HashSet<BytesRef>();
|
|
||||||
|
|
||||||
for(int gram=grams-1;gram>=0;gram--) {
|
|
||||||
BytesRef token = lastTokens[gram];
|
|
||||||
// Don't make unigram predictions from empty string:
|
|
||||||
if (token == null || (token.length == 0 && key.length() > 0)) {
|
|
||||||
// Input didn't have enough tokens:
|
|
||||||
//System.out.println(" gram=" + gram + ": skip: not enough input");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (endPosInc > 0 && gram <= endPosInc) {
|
|
||||||
// Skip hole-only predictions; in theory we
|
|
||||||
// shouldn't have to do this, but we'd need to fix
|
|
||||||
// ShingleFilter to produce only-hole tokens:
|
|
||||||
//System.out.println(" break: only holes now");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
//System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
|
|
||||||
|
|
||||||
// TODO: we could add fuzziness here
|
|
||||||
// match the prefix portion exactly
|
|
||||||
//Pair<Long,BytesRef> prefixOutput = null;
|
|
||||||
Long prefixOutput = null;
|
|
||||||
try {
|
|
||||||
prefixOutput = lookupPrefix(fst, bytesReader, token, arc);
|
|
||||||
} catch (IOException bogus) {
|
|
||||||
throw new RuntimeException(bogus);
|
|
||||||
}
|
|
||||||
//System.out.println(" prefixOutput=" + prefixOutput);
|
|
||||||
|
|
||||||
if (prefixOutput == null) {
|
|
||||||
// This model never saw this prefix, e.g. the
|
|
||||||
// trigram model never saw context "purple mushroom"
|
|
||||||
backoff *= ALPHA;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: we could do this division at build time, and
|
|
||||||
// bake it into the FST?
|
|
||||||
|
|
||||||
// Denominator for computing scores from current
|
|
||||||
// model's predictions:
|
|
||||||
long contextCount = totTokens;
|
|
||||||
|
|
||||||
BytesRef lastTokenFragment = null;
|
|
||||||
|
|
||||||
for(int i=token.length-1;i>=0;i--) {
|
|
||||||
if (token.bytes[token.offset+i] == separator) {
|
|
||||||
BytesRef context = new BytesRef(token.bytes, token.offset, i);
|
|
||||||
Long output = Util.get(fst, Util.toIntsRef(context, new IntsRef()));
|
|
||||||
assert output != null;
|
|
||||||
contextCount = decodeWeight(output);
|
|
||||||
lastTokenFragment = new BytesRef(token.bytes, token.offset + i + 1, token.length - i - 1);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
//System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
|
||||||
final BytesRef finalLastToken;
|
|
||||||
|
// TODO: we could add fuzziness here
|
||||||
if (lastTokenFragment == null) {
|
// match the prefix portion exactly
|
||||||
finalLastToken = BytesRef.deepCopyOf(token);
|
//Pair<Long,BytesRef> prefixOutput = null;
|
||||||
} else {
|
Long prefixOutput = null;
|
||||||
finalLastToken = BytesRef.deepCopyOf(lastTokenFragment);
|
try {
|
||||||
}
|
prefixOutput = lookupPrefix(fst, bytesReader, token, arc);
|
||||||
assert finalLastToken.offset == 0;
|
} catch (IOException bogus) {
|
||||||
|
throw new RuntimeException(bogus);
|
||||||
CharsRef spare = new CharsRef();
|
}
|
||||||
|
//System.out.println(" prefixOutput=" + prefixOutput);
|
||||||
// complete top-N
|
|
||||||
MinResult<Long> completions[] = null;
|
if (prefixOutput == null) {
|
||||||
try {
|
// This model never saw this prefix, e.g. the
|
||||||
|
// trigram model never saw context "purple mushroom"
|
||||||
// Because we store multiple models in one FST
|
backoff *= ALPHA;
|
||||||
// (1gram, 2gram, 3gram), we must restrict the
|
continue;
|
||||||
// search so that it only considers the current
|
}
|
||||||
// model. For highest order model, this is not
|
|
||||||
// necessary since all completions in the FST
|
// TODO: we could do this division at build time, and
|
||||||
// must be from this model, but for lower order
|
// bake it into the FST?
|
||||||
// models we have to filter out the higher order
|
|
||||||
// ones:
|
// Denominator for computing scores from current
|
||||||
|
// model's predictions:
|
||||||
// Must do num+seen.size() for queue depth because we may
|
long contextCount = totTokens;
|
||||||
// reject up to seen.size() paths in acceptResult():
|
|
||||||
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num+seen.size(), weightComparator) {
|
BytesRef lastTokenFragment = null;
|
||||||
|
|
||||||
BytesRef scratchBytes = new BytesRef();
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void addIfCompetitive(Util.FSTPath<Long> path) {
|
|
||||||
if (path.arc.label != separator) {
|
|
||||||
//System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
|
|
||||||
super.addIfCompetitive(path);
|
|
||||||
} else {
|
|
||||||
//System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected boolean acceptResult(IntsRef input, Long output) {
|
|
||||||
Util.toBytesRef(input, scratchBytes);
|
|
||||||
finalLastToken.grow(finalLastToken.length + scratchBytes.length);
|
|
||||||
int lenSav = finalLastToken.length;
|
|
||||||
finalLastToken.append(scratchBytes);
|
|
||||||
//System.out.println(" accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
|
|
||||||
boolean ret = seen.contains(finalLastToken) == false;
|
|
||||||
|
|
||||||
finalLastToken.length = lenSav;
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// since this search is initialized with a single start node
|
|
||||||
// it is okay to start with an empty input path here
|
|
||||||
searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());
|
|
||||||
|
|
||||||
completions = searcher.search();
|
|
||||||
} catch (IOException bogus) {
|
|
||||||
throw new RuntimeException(bogus);
|
|
||||||
}
|
|
||||||
|
|
||||||
int prefixLength = token.length;
|
|
||||||
|
|
||||||
BytesRef suffix = new BytesRef(8);
|
|
||||||
//System.out.println(" " + completions.length + " completions");
|
|
||||||
|
|
||||||
nextCompletion:
|
|
||||||
for (MinResult<Long> completion : completions) {
|
|
||||||
token.length = prefixLength;
|
|
||||||
// append suffix
|
|
||||||
Util.toBytesRef(completion.input, suffix);
|
|
||||||
token.append(suffix);
|
|
||||||
|
|
||||||
//System.out.println(" completion " + token.utf8ToString());
|
|
||||||
|
|
||||||
// Skip this path if a higher-order model already
|
|
||||||
// saw/predicted its last token:
|
|
||||||
BytesRef lastToken = token;
|
|
||||||
for(int i=token.length-1;i>=0;i--) {
|
for(int i=token.length-1;i>=0;i--) {
|
||||||
if (token.bytes[token.offset+i] == separator) {
|
if (token.bytes[token.offset+i] == separator) {
|
||||||
assert token.length-i-1 > 0;
|
BytesRef context = new BytesRef(token.bytes, token.offset, i);
|
||||||
lastToken = new BytesRef(token.bytes, token.offset+i+1, token.length-i-1);
|
Long output = Util.get(fst, Util.toIntsRef(context, new IntsRef()));
|
||||||
|
assert output != null;
|
||||||
|
contextCount = decodeWeight(output);
|
||||||
|
lastTokenFragment = new BytesRef(token.bytes, token.offset + i + 1, token.length - i - 1);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (seen.contains(lastToken)) {
|
|
||||||
//System.out.println(" skip dup " + lastToken.utf8ToString());
|
final BytesRef finalLastToken;
|
||||||
continue nextCompletion;
|
|
||||||
|
if (lastTokenFragment == null) {
|
||||||
|
finalLastToken = BytesRef.deepCopyOf(token);
|
||||||
|
} else {
|
||||||
|
finalLastToken = BytesRef.deepCopyOf(lastTokenFragment);
|
||||||
}
|
}
|
||||||
seen.add(BytesRef.deepCopyOf(lastToken));
|
assert finalLastToken.offset == 0;
|
||||||
spare.grow(token.length);
|
|
||||||
UnicodeUtil.UTF8toUTF16(token, spare);
|
CharsRef spare = new CharsRef();
|
||||||
LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
|
|
||||||
results.add(result);
|
// complete top-N
|
||||||
assert results.size() == seen.size();
|
MinResult<Long> completions[] = null;
|
||||||
//System.out.println(" add result=" + result);
|
try {
|
||||||
|
|
||||||
|
// Because we store multiple models in one FST
|
||||||
|
// (1gram, 2gram, 3gram), we must restrict the
|
||||||
|
// search so that it only considers the current
|
||||||
|
// model. For highest order model, this is not
|
||||||
|
// necessary since all completions in the FST
|
||||||
|
// must be from this model, but for lower order
|
||||||
|
// models we have to filter out the higher order
|
||||||
|
// ones:
|
||||||
|
|
||||||
|
// Must do num+seen.size() for queue depth because we may
|
||||||
|
// reject up to seen.size() paths in acceptResult():
|
||||||
|
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num+seen.size(), weightComparator) {
|
||||||
|
|
||||||
|
BytesRef scratchBytes = new BytesRef();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void addIfCompetitive(Util.FSTPath<Long> path) {
|
||||||
|
if (path.arc.label != separator) {
|
||||||
|
//System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
|
||||||
|
super.addIfCompetitive(path);
|
||||||
|
} else {
|
||||||
|
//System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected boolean acceptResult(IntsRef input, Long output) {
|
||||||
|
Util.toBytesRef(input, scratchBytes);
|
||||||
|
finalLastToken.grow(finalLastToken.length + scratchBytes.length);
|
||||||
|
int lenSav = finalLastToken.length;
|
||||||
|
finalLastToken.append(scratchBytes);
|
||||||
|
//System.out.println(" accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
|
||||||
|
boolean ret = seen.contains(finalLastToken) == false;
|
||||||
|
|
||||||
|
finalLastToken.length = lenSav;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// since this search is initialized with a single start node
|
||||||
|
// it is okay to start with an empty input path here
|
||||||
|
searcher.addStartPaths(arc, prefixOutput, true, new IntsRef());
|
||||||
|
|
||||||
|
completions = searcher.search();
|
||||||
|
} catch (IOException bogus) {
|
||||||
|
throw new RuntimeException(bogus);
|
||||||
|
}
|
||||||
|
|
||||||
|
int prefixLength = token.length;
|
||||||
|
|
||||||
|
BytesRef suffix = new BytesRef(8);
|
||||||
|
//System.out.println(" " + completions.length + " completions");
|
||||||
|
|
||||||
|
nextCompletion:
|
||||||
|
for (MinResult<Long> completion : completions) {
|
||||||
|
token.length = prefixLength;
|
||||||
|
// append suffix
|
||||||
|
Util.toBytesRef(completion.input, suffix);
|
||||||
|
token.append(suffix);
|
||||||
|
|
||||||
|
//System.out.println(" completion " + token.utf8ToString());
|
||||||
|
|
||||||
|
// Skip this path if a higher-order model already
|
||||||
|
// saw/predicted its last token:
|
||||||
|
BytesRef lastToken = token;
|
||||||
|
for(int i=token.length-1;i>=0;i--) {
|
||||||
|
if (token.bytes[token.offset+i] == separator) {
|
||||||
|
assert token.length-i-1 > 0;
|
||||||
|
lastToken = new BytesRef(token.bytes, token.offset+i+1, token.length-i-1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (seen.contains(lastToken)) {
|
||||||
|
//System.out.println(" skip dup " + lastToken.utf8ToString());
|
||||||
|
continue nextCompletion;
|
||||||
|
}
|
||||||
|
seen.add(BytesRef.deepCopyOf(lastToken));
|
||||||
|
spare.grow(token.length);
|
||||||
|
UnicodeUtil.UTF8toUTF16(token, spare);
|
||||||
|
LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
|
||||||
|
results.add(result);
|
||||||
|
assert results.size() == seen.size();
|
||||||
|
//System.out.println(" add result=" + result);
|
||||||
|
}
|
||||||
|
backoff *= ALPHA;
|
||||||
}
|
}
|
||||||
backoff *= ALPHA;
|
|
||||||
}
|
Collections.sort(results, new Comparator<LookupResult>() {
|
||||||
|
|
||||||
Collections.sort(results, new Comparator<LookupResult>() {
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(LookupResult a, LookupResult b) {
|
public int compare(LookupResult a, LookupResult b) {
|
||||||
if (a.value > b.value) {
|
if (a.value > b.value) {
|
||||||
|
@ -707,12 +706,13 @@ public class FreeTextSuggester extends Lookup {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
if (results.size() > num) {
|
if (results.size() > num) {
|
||||||
results.subList(num, results.size()).clear();
|
results.subList(num, results.size()).clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** weight -> cost */
|
/** weight -> cost */
|
||||||
|
|
|
@ -165,43 +165,43 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
|
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
|
||||||
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
|
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
|
||||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||||
ts.reset();
|
ts.reset();
|
||||||
List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
|
List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
|
||||||
int upto = 0;
|
int upto = 0;
|
||||||
while (ts.incrementToken()) {
|
while (ts.incrementToken()) {
|
||||||
String token = termAtt.toString();
|
String token = termAtt.toString();
|
||||||
int startOffset = offsetAtt.startOffset();
|
int startOffset = offsetAtt.startOffset();
|
||||||
int endOffset = offsetAtt.endOffset();
|
int endOffset = offsetAtt.endOffset();
|
||||||
if (upto < startOffset) {
|
if (upto < startOffset) {
|
||||||
fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
|
fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
|
||||||
upto = startOffset;
|
upto = startOffset;
|
||||||
} else if (upto > startOffset) {
|
} else if (upto > startOffset) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (matchedTokens.contains(token)) {
|
if (matchedTokens.contains(token)) {
|
||||||
// Token matches.
|
// Token matches.
|
||||||
fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
|
fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
|
||||||
upto = endOffset;
|
upto = endOffset;
|
||||||
} else if (prefixToken != null && token.startsWith(prefixToken)) {
|
} else if (prefixToken != null && token.startsWith(prefixToken)) {
|
||||||
fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true));
|
fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true));
|
||||||
if (prefixToken.length() < token.length()) {
|
if (prefixToken.length() < token.length()) {
|
||||||
fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false));
|
fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false));
|
||||||
|
}
|
||||||
|
upto = endOffset;
|
||||||
}
|
}
|
||||||
upto = endOffset;
|
|
||||||
}
|
}
|
||||||
|
ts.end();
|
||||||
|
int endOffset = offsetAtt.endOffset();
|
||||||
|
if (upto < endOffset) {
|
||||||
|
fragments.add(new LookupHighlightFragment(text.substring(upto), false));
|
||||||
|
}
|
||||||
|
|
||||||
|
return fragments;
|
||||||
}
|
}
|
||||||
ts.end();
|
|
||||||
int endOffset = offsetAtt.endOffset();
|
|
||||||
if (upto < endOffset) {
|
|
||||||
fragments.add(new LookupHighlightFragment(text.substring(upto), false));
|
|
||||||
}
|
|
||||||
ts.close();
|
|
||||||
|
|
||||||
return fragments;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
||||||
|
|
|
@ -258,17 +258,17 @@ public abstract class CollationTestBase extends LuceneTestCase {
for (int i = 0; i < numTestPoints; i++) {
for (int i = 0; i < numTestPoints; i++) {
String term = _TestUtil.randomSimpleString(random());
String term = _TestUtil.randomSimpleString(random());
TokenStream ts = analyzer.tokenStream("fake", term);
try (TokenStream ts = analyzer.tokenStream("fake", term)) {
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
BytesRef bytes = termAtt.getBytesRef();
ts.reset();
ts.reset();
assertTrue(ts.incrementToken());
assertTrue(ts.incrementToken());
termAtt.fillBytesRef();
termAtt.fillBytesRef();
// ensure we make a copy of the actual bytes too
// ensure we make a copy of the actual bytes too
map.put(term, BytesRef.deepCopyOf(bytes));
map.put(term, BytesRef.deepCopyOf(bytes));
assertFalse(ts.incrementToken());
assertFalse(ts.incrementToken());
ts.end();
ts.end();
ts.close();
}
}
}
Thread threads[] = new Thread[numThreads];
Thread threads[] = new Thread[numThreads];
@ -280,16 +280,16 @@ public abstract class CollationTestBase extends LuceneTestCase {
for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
String term = mapping.getKey();
String term = mapping.getKey();
BytesRef expected = mapping.getValue();
BytesRef expected = mapping.getValue();
TokenStream ts = analyzer.tokenStream("fake", term);
try (TokenStream ts = analyzer.tokenStream("fake", term)) {
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
BytesRef bytes = termAtt.getBytesRef();
ts.reset();
ts.reset();
assertTrue(ts.incrementToken());
assertTrue(ts.incrementToken());
termAtt.fillBytesRef();
termAtt.fillBytesRef();
assertEquals(expected, bytes);
assertEquals(expected, bytes);
assertFalse(ts.incrementToken());
assertFalse(ts.incrementToken());
ts.end();
ts.end();
ts.close();
}
}
}
} catch (IOException e) {
} catch (IOException e) {
throw new RuntimeException(e);
throw new RuntimeException(e);
@ -234,36 +234,23 @@ public class ICUCollationField extends FieldType {
* simple (we already have a threadlocal clone in the reused TS)
* simple (we already have a threadlocal clone in the reused TS)
*/
*/
private BytesRef analyzeRangePart(String field, String part) {
private BytesRef analyzeRangePart(String field, String part) {
TokenStream source;
try (TokenStream source = analyzer.tokenStream(field, part)) {
try {
source = analyzer.tokenStream(field, part);
source.reset();
source.reset();
} catch (IOException e) {
throw new RuntimeException("Unable to initialize TokenStream to analyze range part: " + part, e);
}
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
BytesRef bytes = termAtt.getBytesRef();
BytesRef bytes = termAtt.getBytesRef();
// we control the analyzer here: most errors are impossible
// we control the analyzer here: most errors are impossible
try {
if (!source.incrementToken())
if (!source.incrementToken())
throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
termAtt.fillBytesRef();
termAtt.fillBytesRef();
assert !source.incrementToken();
assert !source.incrementToken();
} catch (IOException e) {
throw new RuntimeException("error analyzing range part: " + part, e);
}
try {
source.end();
source.end();
source.close();
return BytesRef.deepCopyOf(bytes);
} catch (IOException e) {
} catch (IOException e) {
throw new RuntimeException("Unable to end & close TokenStream after analyzing range part: " + part, e);
throw new RuntimeException("Unable analyze range part: " + part, e);
}
}
return BytesRef.deepCopyOf(bytes);
}
}
@Override
@Override
@ -85,15 +85,13 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
if (!TokenizerChain.class.isInstance(analyzer)) {
if (!TokenizerChain.class.isInstance(analyzer)) {
TokenStream tokenStream = null;
try (TokenStream tokenStream = analyzer.tokenStream(context.getFieldName(), value)) {
try {
NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();
tokenStream = analyzer.tokenStream(context.getFieldName(), value);
namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
return namedList;
} catch (IOException e) {
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
}
}
NamedList<List<NamedList>> namedList = new NamedList<List<NamedList>>();
namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
return namedList;
}
}
TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
@ -139,10 +137,8 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
* @param analyzer The analyzer to use.
* @param analyzer The analyzer to use.
*/
*/
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
TokenStream tokenStream = null;
try (TokenStream tokenStream = analyzer.tokenStream("", query)){
try {
final Set<BytesRef> tokens = new HashSet<BytesRef>();
final Set<BytesRef> tokens = new HashSet<BytesRef>();
tokenStream = analyzer.tokenStream("", query);
final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
final BytesRef bytes = bytesAtt.getBytesRef();
final BytesRef bytes = bytesAtt.getBytesRef();
@ -157,8 +153,6 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
return tokens;
return tokens;
} catch (IOException ioe) {
} catch (IOException ioe) {
throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
} finally {
IOUtils.closeWhileHandlingException(tokenStream);
}
}
}
}
@ -344,16 +344,16 @@ public class QueryElevationComponent extends SearchComponent implements SolrCore
return query;
return query;
}
}
StringBuilder norm = new StringBuilder();
StringBuilder norm = new StringBuilder();
TokenStream tokens = analyzer.tokenStream("", query);
try (TokenStream tokens = analyzer.tokenStream("", query)) {
tokens.reset();
tokens.reset();
CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
while (tokens.incrementToken()) {
while (tokens.incrementToken()) {
norm.append(termAtt.buffer(), 0, termAtt.length());
norm.append(termAtt.buffer(), 0, termAtt.length());
}
tokens.end();
return norm.toString();
}
}
tokens.end();
tokens.close();
return norm.toString();
}
}
//---------------------------------------------------------------------------------
//---------------------------------------------------------------------------------
@@ -463,29 +463,29 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
   private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
     Collection<Token> result = new ArrayList<Token>();
     assert analyzer != null;
-    TokenStream ts = analyzer.tokenStream("", q);
+    try (TokenStream ts = analyzer.tokenStream("", q)) {
       ts.reset();
       // TODO: support custom attributes
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
       OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
       TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
       FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
       PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
       PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
 
       while (ts.incrementToken()){
         Token token = new Token();
         token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
         token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
         token.setType(typeAtt.type());
         token.setFlags(flagsAtt.getFlags());
         token.setPayload(payloadAtt.getPayload());
         token.setPositionIncrement(posIncAtt.getPositionIncrement());
         result.add(token);
       }
       ts.end();
-    ts.close();
       return result;
+    }
   }
 
   protected SolrSpellChecker getSpellChecker(SolrParams params) {
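A compact sketch of reading several attributes per token, which is all getTokens() does above. It is illustrative only: the Solr Token class and the real query analyzer are replaced by a plain printout and a WhitespaceAnalyzer:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public final class TokenDump {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
    try (TokenStream ts = analyzer.tokenStream("", "quick brown fox")) {
      // addAttribute() registers the attribute if the chain did not already provide it.
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(termAtt.toString()
            + " offsets=[" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")"
            + " posInc=" + posIncAtt.getPositionIncrement());
      }
      ts.end();   // close() is implicit when the try block exits
    }
  }
}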
@@ -403,58 +403,49 @@ public abstract class SolrQueryParserBase {
     // Use the analyzer to get all the tokens, and then build a TermQuery,
     // PhraseQuery, or nothing based on the term count
 
-    TokenStream source;
-    try {
-      source = analyzer.tokenStream(field, queryText);
-      source.reset();
-    } catch (IOException e) {
-      throw new SyntaxError("Unable to initialize TokenStream to analyze query text", e);
-    }
-    CachingTokenFilter buffer = new CachingTokenFilter(source);
-    TermToBytesRefAttribute termAtt = null;
-    PositionIncrementAttribute posIncrAtt = null;
-    int numTokens = 0;
-
-    buffer.reset();
-
-    if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
-      termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
-    }
-    if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
-      posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
-    }
-
-    int positionCount = 0;
-    boolean severalTokensAtSamePosition = false;
-
-    boolean hasMoreTokens = false;
-    if (termAtt != null) {
-      try {
-        hasMoreTokens = buffer.incrementToken();
-        while (hasMoreTokens) {
-          numTokens++;
-          int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
-          if (positionIncrement != 0) {
-            positionCount += positionIncrement;
-          } else {
-            severalTokensAtSamePosition = true;
-          }
-          hasMoreTokens = buffer.incrementToken();
-        }
-      } catch (IOException e) {
-        // ignore
-      }
-    }
-    try {
-      // rewind the buffer stream
-      buffer.reset();
-
-      // close original stream - all tokens buffered
-      source.close();
-    }
-    catch (IOException e) {
-      throw new SyntaxError("Cannot close TokenStream analyzing query text", e);
-    }
+    CachingTokenFilter buffer = null;
+    TermToBytesRefAttribute termAtt = null;
+    PositionIncrementAttribute posIncrAtt = null;
+    int numTokens = 0;
+    int positionCount = 0;
+    boolean severalTokensAtSamePosition = false;
+
+    try (TokenStream source = analyzer.tokenStream(field, queryText)) {
+      source.reset();
+      buffer = new CachingTokenFilter(source);
+      buffer.reset();
+
+      if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
+        termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
+      }
+      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
+        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
+      }
+
+      boolean hasMoreTokens = false;
+      if (termAtt != null) {
+        try {
+          hasMoreTokens = buffer.incrementToken();
+          while (hasMoreTokens) {
+            numTokens++;
+            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
+            if (positionIncrement != 0) {
+              positionCount += positionIncrement;
+            } else {
+              severalTokensAtSamePosition = true;
+            }
+            hasMoreTokens = buffer.incrementToken();
+          }
+        } catch (IOException e) {
+          // ignore
+        }
+      }
+    } catch (IOException e) {
+      throw new SyntaxError("Error analyzing query text", e);
+    }
+
+    // rewind the buffer stream
+    buffer.reset();
 
     BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();
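The interesting part of this hunk is that the CachingTokenFilter outlives the stream that fed it: the first pass inside the try block pulls every token into the cache, so buffer.reset() after the block replays cached states without touching the now-closed source. A small sketch of that idiom, assuming the Lucene 4.x CachingTokenFilter semantics this code relies on (reset() rewinds the cache rather than the input); the field name and sample text are made up:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public final class BufferedAnalysis {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
    CachingTokenFilter buffer = null;
    int numTokens = 0;

    try (TokenStream source = analyzer.tokenStream("f", "a b c")) {
      source.reset();
      buffer = new CachingTokenFilter(source);
      while (buffer.incrementToken()) {   // first pass fills the cache from 'source'
        numTokens++;
      }
    }                                     // 'source' is closed here; the cached tokens survive

    buffer.reset();                       // rewinds the cache, not the closed source
    CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);
    StringBuilder replay = new StringBuilder();
    while (buffer.incrementToken()) {
      replay.append(termAtt.toString()).append(' ');
    }
    System.out.println(numTokens + " tokens, replayed: " + replay.toString().trim());
  }
}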
@@ -209,37 +209,23 @@ public class CollationField extends FieldType {
    * its just that all methods are synced), this keeps things
    * simple (we already have a threadlocal clone in the reused TS)
    */
   private BytesRef analyzeRangePart(String field, String part) {
-    TokenStream source;
-    try {
-      source = analyzer.tokenStream(field, part);
-      source.reset();
-    } catch (IOException e) {
-      throw new RuntimeException("Unable to initialize TokenStream to analyze range part: " + part, e);
-    }
-
-    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
-    BytesRef bytes = termAtt.getBytesRef();
-
-    // we control the analyzer here: most errors are impossible
-    try {
-      if (!source.incrementToken())
-        throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
-      termAtt.fillBytesRef();
-      assert !source.incrementToken();
-    } catch (IOException e) {
-      throw new RuntimeException("error analyzing range part: " + part, e);
-    }
-
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      throw new RuntimeException("Unable to end & close TokenStream after analyzing range part: " + part, e);
-    }
-
-    return BytesRef.deepCopyOf(bytes);
+    try (TokenStream source = analyzer.tokenStream(field, part)) {
+      source.reset();
+      TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
+      BytesRef bytes = termAtt.getBytesRef();
+
+      // we control the analyzer here: most errors are impossible
+      if (!source.incrementToken())
+        throw new IllegalArgumentException("analyzer returned no terms for range part: " + part);
+      termAtt.fillBytesRef();
+      assert !source.incrementToken();
+      source.end();
+      return BytesRef.deepCopyOf(bytes);
+    } catch (IOException e) {
+      throw new RuntimeException("Unable to analyze range part: " + part, e);
+    }
   }
 
   @Override
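Here three separate try blocks collapse into one try-with-resources whose single catch covers initialization, iteration and end(). A sketch of the underlying "analyze to exactly one BytesRef" idiom, assuming the 4.x TermToBytesRefAttribute API (getBytesRef()/fillBytesRef()) used in the hunk; the class and method names are invented, and the caller must supply an analyzer that emits one token per input:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

public final class SingleTermBytes {
  public static BytesRef analyzeSingleTerm(Analyzer analyzer, String field, String text) {
    try (TokenStream source = analyzer.tokenStream(field, text)) {
      source.reset();
      TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
      BytesRef bytes = termAtt.getBytesRef();      // shared, reused buffer
      if (!source.incrementToken()) {
        throw new IllegalArgumentException("analyzer returned no terms for: " + text);
      }
      termAtt.fillBytesRef();                      // copy the current term into 'bytes'
      assert !source.incrementToken();             // we expect exactly one token
      source.end();
      return BytesRef.deepCopyOf(bytes);           // detach the result from the reused buffer
    } catch (IOException e) {
      throw new RuntimeException("Unable to analyze: " + text, e);
    }
  }
}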
@@ -138,35 +138,23 @@ public class TextField extends FieldType {
   public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
     if (part == null || analyzerIn == null) return null;
 
-    TokenStream source;
-    try {
-      source = analyzerIn.tokenStream(field, part);
-      source.reset();
-    } catch (IOException e) {
-      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unable to initialize TokenStream to analyze multiTerm term: " + part, e);
-    }
-
-    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
-    BytesRef bytes = termAtt.getBytesRef();
-
-    try {
-      if (!source.incrementToken())
-        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned no terms for multiTerm term: " + part);
-      termAtt.fillBytesRef();
-      if (source.incrementToken())
-        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part);
-    } catch (IOException e) {
-      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"error analyzing range part: " + part, e);
-    }
-
-    try {
-      source.end();
-      source.close();
-    } catch (IOException e) {
-      throw new RuntimeException("Unable to end & close TokenStream after analyzing multiTerm term: " + part, e);
-    }
-
-    return BytesRef.deepCopyOf(bytes);
+    try (TokenStream source = analyzerIn.tokenStream(field, part)){
+      source.reset();
+
+      TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
+      BytesRef bytes = termAtt.getBytesRef();
+
+      if (!source.incrementToken())
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned no terms for multiTerm term: " + part);
+      termAtt.fillBytesRef();
+      if (source.incrementToken())
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"analyzer returned too many terms for multiTerm term: " + part);
+
+      source.end();
+      return BytesRef.deepCopyOf(bytes);
+    } catch (IOException e) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"error analyzing range part: " + part, e);
+    }
   }
 
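analyzeMultiTerm() keeps the stricter guard that the input must produce exactly one token: zero tokens and "too many" tokens are both errors. A hypothetical, Solr-free version of that validation, with SolrException replaced by IllegalArgumentException and the term returned as a String rather than a BytesRef:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public final class SingleTokenCheck {
  public static String expectSingleToken(Analyzer analyzer, String field, String part) {
    try (TokenStream source = analyzer.tokenStream(field, part)) {
      source.reset();
      CharTermAttribute termAtt = source.getAttribute(CharTermAttribute.class);
      if (!source.incrementToken()) {
        throw new IllegalArgumentException("analyzer returned no terms for: " + part);
      }
      String term = termAtt.toString();
      if (source.incrementToken()) {
        throw new IllegalArgumentException("analyzer returned too many terms for: " + part);
      }
      source.end();
      return term;
    } catch (IOException e) {
      throw new IllegalArgumentException("error analyzing: " + part, e);
    }
  }

  public static void main(String[] args) {
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
    System.out.println(expectSingleToken(analyzer, "f", "hello"));     // ok
    System.out.println(expectSingleToken(analyzer, "f", "two words")); // throws "too many terms"
  }
}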
@@ -178,58 +166,50 @@ public class TextField extends FieldType {
     // Use the analyzer to get all the tokens, and then build a TermQuery,
     // PhraseQuery, or nothing based on the term count
 
-    TokenStream source;
-    try {
-      source = analyzer.tokenStream(field, queryText);
-      source.reset();
-    } catch (IOException e) {
-      throw new RuntimeException("Unable to initialize TokenStream to analyze query text", e);
-    }
-    CachingTokenFilter buffer = new CachingTokenFilter(source);
-    CharTermAttribute termAtt = null;
-    PositionIncrementAttribute posIncrAtt = null;
-    int numTokens = 0;
-
-    buffer.reset();
-
-    if (buffer.hasAttribute(CharTermAttribute.class)) {
-      termAtt = buffer.getAttribute(CharTermAttribute.class);
-    }
-    if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
-      posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
-    }
-
-    int positionCount = 0;
-    boolean severalTokensAtSamePosition = false;
-
-    boolean hasMoreTokens = false;
-    if (termAtt != null) {
-      try {
-        hasMoreTokens = buffer.incrementToken();
-        while (hasMoreTokens) {
-          numTokens++;
-          int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
-          if (positionIncrement != 0) {
-            positionCount += positionIncrement;
-          } else {
-            severalTokensAtSamePosition = true;
-          }
-          hasMoreTokens = buffer.incrementToken();
-        }
-      } catch (IOException e) {
-        // ignore
-      }
-    }
-    try {
-      // rewind the buffer stream
-      buffer.reset();
-
-      // close original stream - all tokens buffered
-      source.close();
-    }
-    catch (IOException e) {
-      // ignore
-    }
+    CachingTokenFilter buffer = null;
+    CharTermAttribute termAtt = null;
+    PositionIncrementAttribute posIncrAtt = null;
+    int numTokens = 0;
+    int positionCount = 0;
+    boolean severalTokensAtSamePosition = false;
+
+    try (TokenStream source = analyzer.tokenStream(field, queryText)) {
+      source.reset();
+      buffer = new CachingTokenFilter(source);
+      buffer.reset();
+
+      if (buffer.hasAttribute(CharTermAttribute.class)) {
+        termAtt = buffer.getAttribute(CharTermAttribute.class);
+      }
+      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
+        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
+      }
+
+      boolean hasMoreTokens = false;
+      if (termAtt != null) {
+        try {
+          hasMoreTokens = buffer.incrementToken();
+          while (hasMoreTokens) {
+            numTokens++;
+            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
+            if (positionIncrement != 0) {
+              positionCount += positionIncrement;
+            } else {
+              severalTokensAtSamePosition = true;
+            }
+            hasMoreTokens = buffer.incrementToken();
+          }
+        } catch (IOException e) {
+          // ignore
+        }
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+
+    // rewind the buffer stream
+    buffer.reset();
 
     if (numTokens == 0)
       return null;
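Both getFieldQuery() conversions (here and in SolrQueryParserBase above) keep the same bookkeeping: count the tokens, sum the position increments, and flag tokens stacked at the same position (posInc == 0), which later decides between a TermQuery, a PhraseQuery and a MultiPhraseQuery. An illustrative, standalone version of just that bookkeeping; with a plain WhitespaceAnalyzer no stacking occurs, so the flag stays false:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public final class PositionStats {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
    int numTokens = 0;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    try (TokenStream source = analyzer.tokenStream("f", "fast quick brown fox")) {
      source.reset();
      CachingTokenFilter buffer = new CachingTokenFilter(source);
      // addAttribute() falls back to a default (increment 1) if the chain sets none.
      PositionIncrementAttribute posIncrAtt = buffer.addAttribute(PositionIncrementAttribute.class);
      while (buffer.incrementToken()) {
        numTokens++;
        int inc = posIncrAtt.getPositionIncrement();
        if (inc != 0) {
          positionCount += inc;
        } else {
          severalTokensAtSamePosition = true;   // e.g. synonyms injected at the same position
        }
      }
    }
    System.out.println(numTokens + " tokens over " + positionCount
        + " positions, stacked=" + severalTokensAtSamePosition);
  }
}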
@@ -40,10 +40,10 @@ class SimpleQueryConverter extends SpellingQueryConverter {
 
   @Override
   public Collection<Token> convert(String origQuery) {
-    try {
-      Collection<Token> result = new HashSet<Token>();
-      WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
-      TokenStream ts = analyzer.tokenStream("", origQuery);
+    Collection<Token> result = new HashSet<Token>();
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
+
+    try (TokenStream ts = analyzer.tokenStream("", origQuery)) {
       // TODO: support custom attributes
       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
       OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
@@ -64,9 +64,7 @@ class SimpleQueryConverter extends SpellingQueryConverter {
         tok.setType(typeAtt.type());
         result.add(tok);
       }
       ts.end();
-      ts.close();
-
       return result;
     } catch (IOException e) {
       throw new RuntimeException(e);
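The shape that falls out of this last conversion is worth noting: anything needed after analysis (here the result collection) is declared before the try-with-resources statement, and the IOException handler hangs off that same statement, so there is no longer an outer try wrapping the allocations. A minimal sketch of that layout, using plain Strings instead of Solr Token objects:

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public final class SimpleTermSet {
  public static Set<String> convert(String origQuery) {
    Set<String> result = new HashSet<String>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);

    try (TokenStream ts = analyzer.tokenStream("", origQuery)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        result.add(termAtt.toString());
      }
      ts.end();
      return result;
    } catch (IOException e) {
      // analyzing an in-memory string should not fail; surface it if it somehow does
      throw new RuntimeException(e);
    }
  }
}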