From 755ebafa490cc604c00a987b5d30a112675c8fb1 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Sat, 7 Apr 2012 23:06:12 +0000 Subject: [PATCH] LUCENE-3873: add MockGraphTokenFilter, inserting random graph tokens git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1310910 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/analysis/Token.java | 18 +- .../tokenattributes/OffsetAttributeImpl.java | 4 + .../lucene/analysis/TestGraphTokenizers.java | 382 ++++++++++++++++++ .../lucene/search/TestPositionIncrement.java | 1 + .../apache/lucene/util/TestRollingBuffer.java | 78 ++++ .../lucene/analysis/CannedTokenStream.java | 5 +- .../lucene/analysis/MockGraphTokenFilter.java | 318 +++++++++++++++ .../MockHoleInjectingTokenFilter.java | 62 +++ .../org/apache/lucene/util/RollingBuffer.java | 130 ++++++ .../analysis/core/TestStandardAnalyzer.java | 40 +- .../synonym/TestSynonymMapFilter.java | 58 ++- .../analysis/ja/TestJapaneseTokenizer.java | 17 +- 12 files changed, 1095 insertions(+), 18 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/TestRollingBuffer.java create mode 100644 lucene/test-framework/src/java/org/apache/lucene/analysis/MockGraphTokenFilter.java create mode 100644 lucene/test-framework/src/java/org/apache/lucene/analysis/MockHoleInjectingTokenFilter.java create mode 100644 lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java diff --git a/lucene/core/src/java/org/apache/lucene/analysis/Token.java b/lucene/core/src/java/org/apache/lucene/analysis/Token.java index 9f3d7babb79..72fce9937a6 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/Token.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/Token.java @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import org.apache.lucene.index.DocsAndPositionsEnum; // for javadoc @@ -121,13 +122,14 @@ import org.apache.lucene.util.AttributeReflector; */ public class Token extends CharTermAttributeImpl implements TypeAttribute, PositionIncrementAttribute, - FlagsAttribute, OffsetAttribute, PayloadAttribute { + FlagsAttribute, OffsetAttribute, PayloadAttribute, PositionLengthAttribute { private int startOffset,endOffset; private String type = DEFAULT_TYPE; private int flags; private Payload payload; private int positionIncrement = 1; + private int positionLength = 1; /** Constructs a Token will null text. */ public Token() { @@ -270,6 +272,20 @@ public class Token extends CharTermAttributeImpl return positionIncrement; } + /** Set the position length. + * @see PositionLengthAttribute */ + @Override + public void setPositionLength(int positionLength) { + this.positionLength = positionLength; + } + + /** Get the position length. + * @see PositionLengthAttribute */ + @Override + public int getPositionLength() { + return positionLength; + } + /** Returns this Token's starting offset, the position of the first character corresponding to this token in the source text. diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java index 72191f42b54..1a158d00d69 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java @@ -40,6 +40,10 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribut /** Set the starting and ending offset. @see #startOffset() and #endOffset()*/ public void setOffset(int startOffset, int endOffset) { + // TODO: check that these are valid! IE, each should be + // >= 0, and endOffset should be >= startOffset. + // Problem is this could "break" existing + // tokenizers/filters. this.startOffset = startOffset; this.endOffset = endOffset; } diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java new file mode 100644 index 00000000000..566c3e3e295 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestGraphTokenizers.java @@ -0,0 +1,382 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; + +public class TestGraphTokenizers extends BaseTokenStreamTestCase { + + // Makes a graph TokenStream from the string; separate + // positions with single space, multiple tokens at the same + // position with /, and add optional position length with + // :. EG "a b c" is a simple chain, "a/x b c" adds 'x' + // over 'a' at position 0 with posLen=1, "a/x:3 b c" adds + // 'x' over a with posLen=3. Tokens are in normal-form! + // So, offsets are computed based on the first token at a + // given position. NOTE: each token must be a single + // character! We assume this when computing offsets... + + // NOTE: all input tokens must be length 1!!! This means + // you cannot turn on MockCharFilter when random + // testing... + + private static class GraphTokenizer extends Tokenizer { + private List tokens; + private int upto; + private int inputLength; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class); + + public GraphTokenizer(Reader input) { + super(input); + } + + @Override + public void reset() { + tokens = null; + upto = 0; + } + + @Override + public boolean incrementToken() throws IOException { + if (tokens == null) { + fillTokens(); + } + //System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size()); + if (upto == tokens.size()) { + //System.out.println(" END @ " + tokens.size()); + return false; + } + final Token t = tokens.get(upto++); + //System.out.println(" return token=" + t); + clearAttributes(); + termAtt.append(t.toString()); + offsetAtt.setOffset(t.startOffset(), t.endOffset()); + posIncrAtt.setPositionIncrement(t.getPositionIncrement()); + posLengthAtt.setPositionLength(t.getPositionLength()); + return true; + } + + @Override + public void end() throws IOException { + super.end(); + // NOTE: somewhat... hackish, but we need this to + // satisfy BTSTC: + final int lastOffset; + if (tokens != null && !tokens.isEmpty()) { + lastOffset = tokens.get(tokens.size()-1).endOffset(); + } else { + lastOffset = 0; + } + offsetAtt.setOffset(correctOffset(lastOffset), + correctOffset(inputLength)); + } + + private void fillTokens() throws IOException { + final StringBuilder sb = new StringBuilder(); + final char[] buffer = new char[256]; + while (true) { + final int count = input.read(buffer); + if (count == -1) { + break; + } + sb.append(buffer, 0, count); + //System.out.println("got count=" + count); + } + //System.out.println("fillTokens: " + sb); + + inputLength = sb.length(); + + final String[] parts = sb.toString().split(" "); + + tokens = new ArrayList(); + int pos = 0; + int maxPos = -1; + int offset = 0; + //System.out.println("again"); + for(String part : parts) { + final String[] overlapped = part.split("/"); + boolean firstAtPos = true; + int minPosLength = Integer.MAX_VALUE; + for(String part2 : overlapped) { + final int colonIndex = part2.indexOf(':'); + final String token; + final int posLength; + if (colonIndex != -1) { + token = part2.substring(0, colonIndex); + posLength = Integer.parseInt(part2.substring(1+colonIndex)); + } else { + token = part2; + posLength = 1; + } + maxPos = Math.max(maxPos, pos + posLength); + minPosLength = Math.min(minPosLength, posLength); + final Token t = new Token(token, offset, offset + 2*posLength - 1); + t.setPositionLength(posLength); + t.setPositionIncrement(firstAtPos ? 1:0); + firstAtPos = false; + //System.out.println(" add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset()); + tokens.add(t); + } + pos += minPosLength; + offset = 2 * pos; + } + assert maxPos <= pos: "input string mal-formed: posLength>1 tokens hang over the end"; + } + } + + public void testMockGraphTokenFilterBasic() throws Exception { + + for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) { + + if (VERBOSE) { + System.out.println("\nTEST: iter=" + iter); + } + + // Make new analyzer each time, because MGTF has fixed + // seed: + final Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + final TokenStream t2 = new MockGraphTokenFilter(random, t); + return new TokenStreamComponents(t, t2); + } + }; + + checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k"); + } + } + + public void testMockGraphTokenFilterOnGraphInput() throws Exception { + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + + if (VERBOSE) { + System.out.println("\nTEST: iter=" + iter); + } + + // Make new analyzer each time, because MGTF has fixed + // seed: + final Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + final Tokenizer t = new GraphTokenizer(reader); + final TokenStream t2 = new MockGraphTokenFilter(random, t); + return new TokenStreamComponents(t, t2); + } + }; + + checkAnalysisConsistency(random, a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k"); + } + } + + // Just deletes (leaving hole) token 'a': + private final static class RemoveATokens extends TokenFilter { + private int pendingPosInc; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + + public RemoveATokens(TokenStream in) { + super(in); + } + + @Override + public void reset() throws IOException { + super.reset(); + pendingPosInc = 0; + } + + @Override + public void end() throws IOException { + super.end(); + posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement()); + } + + @Override + public boolean incrementToken() throws IOException { + while (true) { + final boolean gotOne = input.incrementToken(); + if (!gotOne) { + return false; + } else if (termAtt.toString().equals("a")) { + pendingPosInc += posIncAtt.getPositionIncrement(); + } else { + posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement()); + pendingPosInc = 0; + return true; + } + } + } + } + + public void testMockGraphTokenFilterBeforeHoles() throws Exception { + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + + if (VERBOSE) { + System.out.println("\nTEST: iter=" + iter); + } + + // Make new analyzer each time, because MGTF has fixed + // seed: + final Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + final TokenStream t2 = new MockGraphTokenFilter(random, t); + final TokenStream t3 = new RemoveATokens(t2); + return new TokenStreamComponents(t, t3); + } + }; + + checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k"); + checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k"); + checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a"); + checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y"); + } + } + + public void testMockGraphTokenFilterAfterHoles() throws Exception { + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + + if (VERBOSE) { + System.out.println("\nTEST: iter=" + iter); + } + + // Make new analyzer each time, because MGTF has fixed + // seed: + final Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + final TokenStream t2 = new RemoveATokens(t); + final TokenStream t3 = new MockGraphTokenFilter(random, t2); + return new TokenStreamComponents(t, t3); + } + }; + + checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k"); + checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k"); + checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a"); + checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y"); + } + } + + public void testMockGraphTokenFilterRandom() throws Exception { + for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) { + + if (VERBOSE) { + System.out.println("\nTEST: iter=" + iter); + } + + // Make new analyzer each time, because MGTF has fixed + // seed: + final Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + final TokenStream t2 = new MockGraphTokenFilter(random, t); + return new TokenStreamComponents(t, t2); + } + }; + + checkRandomData(random, a, 5, atLeast(1000)); + } + } + + // Two MockGraphTokenFilters + public void testDoubleMockGraphTokenFilterRandom() throws Exception { + for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) { + + if (VERBOSE) { + System.out.println("\nTEST: iter=" + iter); + } + + // Make new analyzer each time, because MGTF has fixed + // seed: + final Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + final TokenStream t1 = new MockGraphTokenFilter(random, t); + final TokenStream t2 = new MockGraphTokenFilter(random, t1); + return new TokenStreamComponents(t, t2); + } + }; + + checkRandomData(random, a, 5, atLeast(1000)); + } + } + + public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception { + for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) { + + if (VERBOSE) { + System.out.println("\nTEST: iter=" + iter); + } + + // Make new analyzer each time, because MGTF has fixed + // seed: + final Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + final TokenStream t1 = new MockGraphTokenFilter(random, t); + final TokenStream t2 = new MockHoleInjectingTokenFilter(random, t1); + return new TokenStreamComponents(t, t2); + } + }; + + checkRandomData(random, a, 5, atLeast(1000)); + } + } + + public void testMockGraphTokenFilterAfterHolesRandom() throws Exception { + for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) { + + if (VERBOSE) { + System.out.println("\nTEST: iter=" + iter); + } + + // Make new analyzer each time, because MGTF has fixed + // seed: + final Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + final TokenStream t1 = new MockHoleInjectingTokenFilter(random, t); + final TokenStream t2 = new MockGraphTokenFilter(random, t1); + return new TokenStreamComponents(t, t2); + } + }; + + checkRandomData(random, a, 5, atLeast(1000)); + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPositionIncrement.java b/lucene/core/src/test/org/apache/lucene/search/TestPositionIncrement.java index 88b4cb01203..85fa9c50e9d 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPositionIncrement.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPositionIncrement.java @@ -59,6 +59,7 @@ public class TestPositionIncrement extends LuceneTestCase { @Override public TokenStreamComponents createComponents(String fieldName, Reader reader) { return new TokenStreamComponents(new Tokenizer(reader) { + // TODO: use CannedTokenStream private final String[] TOKENS = {"1", "2", "3", "4", "5"}; private final int[] INCREMENTS = {0, 2, 1, 0, 1}; private int i = 0; diff --git a/lucene/core/src/test/org/apache/lucene/util/TestRollingBuffer.java b/lucene/core/src/test/org/apache/lucene/util/TestRollingBuffer.java new file mode 100644 index 00000000000..8c6d1355b07 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/TestRollingBuffer.java @@ -0,0 +1,78 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestRollingBuffer extends LuceneTestCase { + + private static class Position implements RollingBuffer.Resettable { + public int pos; + + @Override + public void reset() { + pos = -1; + } + } + + public void test() { + + final RollingBuffer buffer = new RollingBuffer() { + @Override + protected Position newInstance() { + final Position pos = new Position(); + pos.pos = -1; + return pos; + } + }; + + for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { + + int freeBeforePos = 0; + final int maxPos = atLeast(10000); + final FixedBitSet posSet = new FixedBitSet(maxPos + 1000); + int posUpto = 0; + while (freeBeforePos < maxPos) { + if (random.nextInt(4) == 1) { + final int limit = rarely() ? 1000 : 20; + final int inc = random.nextInt(limit); + final int pos = freeBeforePos + inc; + posUpto = Math.max(posUpto, pos); + if (VERBOSE) { + System.out.println(" check pos=" + pos + " posUpto=" + posUpto); + } + final Position posData = buffer.get(pos); + if (!posSet.getAndSet(pos)) { + assertEquals(-1, posData.pos); + posData.pos = pos; + } else { + assertEquals(pos, posData.pos); + } + } else { + if (posUpto > freeBeforePos) { + freeBeforePos += random.nextInt(posUpto - freeBeforePos); + } + if (VERBOSE) { + System.out.println(" freeBeforePos=" + freeBeforePos); + } + buffer.freeBefore(freeBeforePos); + } + } + + buffer.reset(); + } + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java index d4f39e99f47..655c5741ee2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; /** * TokenStream from a canned list of Tokens. @@ -32,10 +33,11 @@ public final class CannedTokenStream extends TokenStream { private int upto = 0; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); - public CannedTokenStream(Token[] tokens) { + public CannedTokenStream(Token... tokens) { this.tokens = tokens; } @@ -49,6 +51,7 @@ public final class CannedTokenStream extends TokenStream { termAtt.setEmpty(); termAtt.append(token.toString()); posIncrAtt.setPositionIncrement(token.getPositionIncrement()); + posLengthAtt.setPositionLength(token.getPositionLength()); offsetAtt.setOffset(token.startOffset(), token.endOffset()); payloadAtt.setPayload(token.getPayload()); return true; diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockGraphTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockGraphTokenFilter.java new file mode 100644 index 00000000000..49aa69560ab --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockGraphTokenFilter.java @@ -0,0 +1,318 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.RollingBuffer; +import org.apache.lucene.util._TestUtil; + +// TODO: sometimes remove tokens too...? + +/** Randomly inserts overlapped (posInc=0) tokens with + * posLength sometimes > 1. The chain must have + * an OffsetAttribute. */ + +public final class MockGraphTokenFilter extends TokenFilter { + + private static boolean DEBUG = false; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + private final long seed; + private Random random; + + // Don't init to -1 (caller must first call reset): + private int inputPos; + private int outputPos; + // Don't init to -1 (caller must first call reset): + private int lastOutputPos; + private boolean end; + + private final class Position implements RollingBuffer.Resettable { + final List states = new ArrayList(); + int nextRead; + + // Any token leaving from this position should have this startOffset: + int startOffset = -1; + + // Any token arriving to this positoin should have this endOffset: + int endOffset = -1; + + @Override + public void reset() { + states.clear(); + nextRead = 0; + startOffset = -1; + endOffset = -1; + } + + public void captureState() throws IOException { + assert startOffset == offsetAtt.startOffset(); + states.add(MockGraphTokenFilter.this.captureState()); + } + } + + private final RollingBuffer positions = new RollingBuffer() { + @Override + protected Position newInstance() { + return new Position(); + } + }; + + public MockGraphTokenFilter(Random random, TokenStream input) { + super(input); + seed = random.nextLong(); + } + + @Override + public void reset() throws IOException { + super.reset(); + end = false; + positions.reset(); + // NOTE: must be "deterministically random" because + // BaseTokenStreamTestCase pulls tokens twice on the + // same input and asserts they are the same: + this.random = new Random(seed); + inputPos = -1; + outputPos = 0; + lastOutputPos = -1; + } + + private enum TOKEN_POS {SAME, NEXT, END}; + + private TOKEN_POS nextInputToken() throws IOException { + assert !end; + if (DEBUG) { + System.out.println(" call input.incr"); + } + final boolean result = input.incrementToken(); + if (result) { + final int posInc = posIncAtt.getPositionIncrement(); + final int posLength = posLengthAtt.getPositionLength(); + + // NOTE: when posLength > 1, we have a hole... we + // don't allow injected tokens to start or end + // "inside" a hole, so we don't need to make up + // offsets inside it + + assert inputPos != -1 || posInc > 0; + inputPos += posInc; + if (DEBUG) { + System.out.println(" got token term=" + termAtt + " posLength=" + posLength + " posInc=" + posInc + " inputPos=" + inputPos); + } + final Position posData = positions.get(inputPos); + if (posInc == 0) { + assert posData.startOffset == offsetAtt.startOffset(); + } else { + assert posData.startOffset == -1; + posData.startOffset = offsetAtt.startOffset(); + if (DEBUG) { + System.out.println(" record startOffset[" + inputPos + "]=" + posData.startOffset); + } + } + + final Position posEndData = positions.get(inputPos + posLength); + if (posEndData.endOffset == -1) { + // First time we are seeing a token that + // arrives to this position: record the + // endOffset + posEndData.endOffset = offsetAtt.endOffset(); + if (DEBUG) { + System.out.println(" record endOffset[" + (inputPos+posLength) + "]=" + posEndData.endOffset); + } + } else { + // We've already seen a token arriving there; + // make sure its endOffset is the same (NOTE: + // some tokenizers, eg WDF, will fail + // this...): + assert posEndData.endOffset == offsetAtt.endOffset(): "posEndData.endOffset=" + posEndData.endOffset + " vs offsetAtt.endOffset()=" + offsetAtt.endOffset(); + } + if (posInc == 0) { + return TOKEN_POS.SAME; + } else { + return TOKEN_POS.NEXT; + } + } else { + if (DEBUG) { + System.out.println(" got END"); + } + return TOKEN_POS.END; + } + } + + private void pushOutputPos() { + posIncAtt.setPositionIncrement(outputPos - lastOutputPos); + if (DEBUG) { + System.out.println(" pushOutputPos: set posInc=" + posIncAtt.getPositionIncrement()); + } + lastOutputPos = outputPos; + positions.freeBefore(outputPos); + } + + @Override + public boolean incrementToken() throws IOException { + + if (DEBUG) { + System.out.println("MockGraphTF.incr inputPos=" + inputPos + " outputPos=" + outputPos); + } + + while (true) { + final Position posData = positions.get(outputPos); + if (posData.nextRead < posData.states.size()) { + // Serve up all buffered tokens from this position: + if (DEBUG) { + System.out.println(" restore buffered nextRead=" + posData.nextRead + " vs " + posData.states.size()); + } + restoreState(posData.states.get(posData.nextRead++)); + if (DEBUG) { + System.out.println(" term=" + termAtt + " outputPos=" + outputPos); + } + pushOutputPos(); + return true; + } + + boolean tokenPending = false; + + final int prevInputPos = inputPos; + + if (inputPos == -1 || inputPos == outputPos) { + // We've used up the buffered tokens; pull the next + // input token: + if (end) { + return false; + } + final TOKEN_POS result = nextInputToken(); + if (result == TOKEN_POS.SAME) { + return true; + } else if (result == TOKEN_POS.NEXT) { + tokenPending = true; + } else { + // NOTE: we don't set end=true here... because we + // are immediately passing through "the end" to + // caller (return false), and caller must not call + // us again: + return false; + } + } else { + assert inputPos > outputPos; + if (DEBUG) { + System.out.println(" done @ outputPos=" + outputPos); + } + } + + // We're done (above) serving up all tokens leaving + // from the same position; now maybe insert a token. + // Note that we may insert more than one token leaving + // from this position. We only inject tokens at + // positions where we've seen at least one input token + // (ie, we cannot inject inside holes): + + if (prevInputPos != -1 && positions.get(outputPos).startOffset != -1 && random.nextInt(7) == 5) { + if (DEBUG) { + System.out.println(" inject @ outputPos=" + outputPos); + } + + if (tokenPending) { + positions.get(inputPos).captureState(); + } + final int posLength = _TestUtil.nextInt(random, 1, 5); + final Position posEndData = positions.get(outputPos + posLength); + + // Pull enough tokens until we discover what our + // endOffset should be: + while (!end && posEndData.endOffset == -1 && inputPos <= (outputPos + posLength)) { + if (DEBUG) { + System.out.println(" lookahead [endPos=" + (outputPos + posLength) + "]..."); + } + final TOKEN_POS result = nextInputToken(); + if (result != TOKEN_POS.END) { + positions.get(inputPos).captureState(); + } else { + end = true; + if (DEBUG) { + System.out.println(" force end lookahead"); + } + break; + } + } + + // TODO: really, here, on hitting end-of-tokens, + // we'd like to know the ending "posInc", and allow + // our token to extend up until that. But: a + // TokenFilter is not allowed to call end() from + // within its incrementToken, so we can't do that. + // It may have been better if the ending + // posInc/offsets were set when incrementToken + // returned false (ie, without having to call the + // special end method): + + if (posEndData.endOffset != -1) { + assert posEndData.endOffset != -1; + clearAttributes(); + posLengthAtt.setPositionLength(posLength); + termAtt.append(_TestUtil.randomUnicodeString(random)); + pushOutputPos(); + offsetAtt.setOffset(positions.get(outputPos).startOffset, + positions.get(outputPos + posLength).endOffset); + if (DEBUG) { + System.out.println(" inject: outputPos=" + outputPos + " startOffset=" + offsetAtt.startOffset() + + " endOffset=" + offsetAtt.endOffset() + + " posLength=" + posLengthAtt.getPositionLength()); + } + // TODO: set TypeAtt too? + return true; + + } else { + // Either, we hit the end of the tokens (ie, our + // attempted posLength is too long because it + // hangs out over the end), or, our attempted + // posLength ended in the middle of a hole; just + // skip injecting in these cases. We will still + // test these cases by having a StopFilter after + // MockGraphTokenFilter... + } + + } else if (tokenPending) { + outputPos = inputPos; + if (DEBUG) { + System.out.println(" pass-through"); + } + pushOutputPos(); + return true; + } else { + // We are skipping over a hole (posInc > 1) from our input: + outputPos++; + if (DEBUG) { + System.out.println(" incr outputPos=" + outputPos); + } + } + } + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockHoleInjectingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockHoleInjectingTokenFilter.java new file mode 100644 index 00000000000..8bf7b86c822 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockHoleInjectingTokenFilter.java @@ -0,0 +1,62 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.IOException; +import java.util.Random; + +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util._TestUtil; + +// Randomly injects holes: +public final class MockHoleInjectingTokenFilter extends TokenFilter { + + private final long randomSeed; + private Random random; + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + + public MockHoleInjectingTokenFilter(Random random, TokenStream in) { + super(in); + randomSeed = random.nextLong(); + } + + @Override + public void reset() throws IOException { + super.reset(); + random = new Random(randomSeed); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final int posInc = posIncAtt.getPositionIncrement(); + if (posInc > 0 && random.nextInt(5) == 3) { + posIncAtt.setPositionIncrement(posInc + _TestUtil.nextInt(random, 1, 5)); + // TODO: should we tweak offsets...? + } + return true; + } else { + return false; + } + } + + // TODO: end? +} + + diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java b/lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java new file mode 100644 index 00000000000..cc049721c1e --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/util/RollingBuffer.java @@ -0,0 +1,130 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// TODO: probably move this to core at some point (eg, +// cutover kuromoji, synfilter, LookaheadTokenFilter) + +/** Acts like forever growing T[], but internally uses a + * circular buffer to reuse instances of T. + * + * @lucene.internal */ +public abstract class RollingBuffer { + + public static interface Resettable { + public void reset(); + } + + @SuppressWarnings("unchecked") private T[] buffer = (T[]) new RollingBuffer.Resettable[8]; + + // Next array index to write to: + private int nextWrite; + + // Next position to write: + private int nextPos; + + // How many valid Position are held in the + // array: + private int count; + + public RollingBuffer() { + for(int idx=0;idx 0) { + if (nextWrite == -1) { + nextWrite = buffer.length - 1; + } + buffer[nextWrite--].reset(); + count--; + } + nextWrite = 0; + nextPos = 0; + count = 0; + } + + // For assert: + private boolean inBounds(int pos) { + return pos < nextPos && pos >= nextPos - count; + } + + private int getIndex(int pos) { + int index = nextWrite - (nextPos - pos); + if (index < 0) { + index += buffer.length; + } + return index; + } + + /** Get T instance for this absolute position; + * this is allowed to be arbitrarily far "in the + * future" but cannot be before the last freeBefore. */ + public T get(int pos) { + //System.out.println("RA.get pos=" + pos + " nextPos=" + nextPos + " nextWrite=" + nextWrite + " count=" + count); + while (pos >= nextPos) { + if (count == buffer.length) { + @SuppressWarnings("unchecked") T[] newBuffer = (T[]) new Resettable[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + //System.out.println(" grow length=" + newBuffer.length); + System.arraycopy(buffer, nextWrite, newBuffer, 0, buffer.length-nextWrite); + System.arraycopy(buffer, 0, newBuffer, buffer.length-nextWrite, nextWrite); + for(int i=buffer.length;i index=" + index); + //assert buffer[index].pos == pos; + return buffer[index]; + } + + public void freeBefore(int pos) { + final int toFree = count - (nextPos - pos); + assert toFree >= 0; + assert toFree <= count: "toFree=" + toFree + " count=" + count; + int index = nextWrite - count; + if (index < 0) { + index += buffer.length; + } + for(int i=0;i