From c884384d5c3a1a5cc59861a6411834be89ad4d86 Mon Sep 17 00:00:00 2001
From: Uwe Schindler
Date: Sat, 11 Jun 2011 16:46:59 +0000
Subject: [PATCH] SOLR-2400: Field- and DocumentAnalysisRequestHandler now
 provide a position history for each token, so you can follow the token
 through all analysis stages

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1134685 13f79535-47bb-0310-9956-ffa450edef68
---
 solr/CHANGES.txt                            |   6 +
 .../handler/AnalysisRequestHandlerBase.java | 147 ++++++++----
 .../AnalysisRequestHandlerTestBase.java     |   8 +
 .../DocumentAnalysisRequestHandlerTest.java |  72 +++---
 .../FieldAnalysisRequestHandlerTest.java    | 213 +++++++++++-------
 5 files changed, 282 insertions(+), 164 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 8983d445e6c..1663a89d98e 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -285,6 +285,12 @@ New Features
   compared to ternary trees and jaspell and very fast lookups at runtime.
   (Dawid Weiss)
 
+* SOLR-2400: Field- and DocumentAnalysisRequestHandler now provide a position
+  history for each token, so you can follow the token through all analysis stages.
+  The output contains a separate string attribute that is a "/"-delimited string
+  containing all positions from previous Tokenizers/TokenFilters.
+  (Uwe Schindler)
+
 Optimizations
 ----------------------
 
diff --git a/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java b/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
index dd83506462e..13f915f556e 100644
--- a/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
+++ b/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
@@ -25,10 +25,11 @@ import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.AttributeReflector;
 import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.SorterTemplate;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.solr.analysis.CharFilterFactory;
 import org.apache.solr.analysis.TokenFilterFactory;
 import org.apache.solr.analysis.TokenizerChain;
@@ -120,10 +121,13 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
     ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokens);
 
     for (TokenFilterFactory tokenFilterFactory : filtfacs) {
+      for (final AttributeSource tok : tokens) {
+        tok.getAttribute(TokenTrackingAttribute.class).freezeStage();
+      }
       tokenStream = tokenFilterFactory.create(listBasedTokenStream);
-      List<AttributeSource> tokenList = analyzeTokenStream(tokenStream);
-      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokenList, context));
-      listBasedTokenStream = new ListBasedTokenStream(tokenList);
+      tokens = analyzeTokenStream(tokenStream);
+      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
+      listBasedTokenStream = new ListBasedTokenStream(tokens);
     }
 
     return namedList;
@@ -160,15 +164,19 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
    * @return List of tokens produced from the TokenStream
    */
   private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
-    List<AttributeSource> tokens = new ArrayList<AttributeSource>();
+    final List<AttributeSource> tokens = new ArrayList<AttributeSource>();
+    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
+    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
     // for backwards compatibility, add all "common" attributes
-    tokenStream.addAttribute(PositionIncrementAttribute.class);
     tokenStream.addAttribute(OffsetAttribute.class);
     tokenStream.addAttribute(TypeAttribute.class);
     final BytesRef bytes = new BytesRef();
     try {
       tokenStream.reset();
+      int position = 0;
       while (tokenStream.incrementToken()) {
+        position += posIncrAtt.getPositionIncrement();
+        trackerAtt.setActPosition(position);
         tokens.add(tokenStream.cloneAttributes());
       }
     } catch (IOException ioe) {
@@ -183,6 +191,8 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
     put(OffsetAttribute.class.getName() + "#startOffset", "start");
     put(OffsetAttribute.class.getName() + "#endOffset", "end");
     put(TypeAttribute.class.getName() + "#type", "type");
+    put(TokenTrackingAttribute.class.getName() + "#position", "position");
+    put(TokenTrackingAttribute.class.getName() + "#positionHistory", "positionHistory");
   }});
 
   /**
@@ -193,49 +203,35 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
    *
    * @return List of NamedLists containing the relevant information taken from the tokens
    */
-  private List<NamedList> convertTokensToNamedLists(final List<AttributeSource> tokens, AnalysisContext context) {
+  private List<NamedList> convertTokensToNamedLists(final List<AttributeSource> tokenList, AnalysisContext context) {
     final List<NamedList> tokensNamedLists = new ArrayList<NamedList>();
-
-    final int[] positions = new int[tokens.size()];
-    int position = 0;
-    for (int i = 0, c = tokens.size(); i < c; i++) {
-      AttributeSource token = tokens.get(i);
-      position += token.addAttribute(PositionIncrementAttribute.class).getPositionIncrement();
-      positions[i] = position;
-    }
+    final FieldType fieldType = context.getFieldType();
+    final AttributeSource[] tokens = tokenList.toArray(new AttributeSource[tokenList.size()]);
 
     // sort the tokens by absoulte position
-    new SorterTemplate() {
-      @Override
-      protected void swap(int i, int j) {
-        final int p = positions[i];
-        positions[i] = positions[j];
-        positions[j] = p;
-        Collections.swap(tokens, i, j);
+    ArrayUtil.mergeSort(tokens, new Comparator<AttributeSource>() {
+      public int compare(AttributeSource a, AttributeSource b) {
+        return arrayCompare(
+          a.getAttribute(TokenTrackingAttribute.class).getPositions(),
+          b.getAttribute(TokenTrackingAttribute.class).getPositions()
+        );
       }
-      @Override
-      protected int compare(int i, int j) {
-        return positions[i] - positions[j];
+      private int arrayCompare(int[] a, int[] b) {
+        int p = 0;
+        final int stop = Math.min(a.length, b.length);
+        while(p < stop) {
+          int diff = a[p] - b[p];
+          if (diff != 0) return diff;
+          p++;
+        }
+        // One is a prefix of the other, or, they are equal:
+        return a.length - b.length;
       }
+    });
-      @Override
-      protected void setPivot(int i) {
-        pivot = positions[i];
-      }
-
-      @Override
-      protected int comparePivot(int j) {
-        return pivot - positions[j];
-      }
-
-      private int pivot;
-    }.mergeSort(0, tokens.size() - 1);
-
-    FieldType fieldType = context.getFieldType();
-
-    for (int i = 0, c = tokens.size(); i < c; i++) {
-      AttributeSource token = tokens.get(i);
+    for (int i = 0; i < tokens.length; i++) {
+      AttributeSource token = tokens[i];
       final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();
       final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
       BytesRef rawBytes = termAtt.getBytesRef();
@@ -256,8 +252,6 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
         tokenNamedList.add("match", true);
       }
 
-      tokenNamedList.add("position", positions[i]);
-
       token.reflectWith(new AttributeReflector() {
         public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
           // leave out position and bytes term
@@ -312,8 +306,8 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
 
   /**
    * TokenStream that iterates over a list of pre-existing Tokens
+   * @lucene.internal
    */
-  // TODO refactor to support custom attributes
  protected final static class ListBasedTokenStream extends TokenStream {
     private final List<AttributeSource> tokens;
     private Iterator<AttributeSource> tokenIterator;
@@ -350,6 +344,69 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
     }
   }
 
+  /** This is an {@link Attribute} used to track the positions of tokens
+   * in the analysis chain.
+   * @lucene.internal This class is only public for usage by the {@link AttributeSource} API.
+   */
+  public interface TokenTrackingAttribute extends Attribute {
+    void freezeStage();
+    void setActPosition(int pos);
+    int[] getPositions();
+    void reset(int[] basePositions, int position);
+  }
+
+  /** Implementation of {@link TokenTrackingAttribute}.
+   * @lucene.internal This class is only public for usage by the {@link AttributeSource} API.
+   */
+  public static final class TokenTrackingAttributeImpl extends AttributeImpl implements TokenTrackingAttribute {
+    private int[] basePositions = new int[0];
+    private int position = 0;
+
+    public void freezeStage() {
+      this.basePositions = getPositions();
+      this.position = 0;
+    }
+
+    public void setActPosition(int pos) {
+      this.position = pos;
+    }
+
+    public int[] getPositions() {
+      final int[] positions = new int[basePositions.length + 1];
+      System.arraycopy(basePositions, 0, positions, 0, basePositions.length);
+      positions[basePositions.length] = position;
+      return positions;
+    }
+
+    public void reset(int[] basePositions, int position) {
+      this.basePositions = basePositions;
+      this.position = position;
+    }
+
+    @Override
+    public void clear() {
+      // we do nothing here, as all attribute values are controlled externally by consumer
+    }
+
+    @Override
+    public void reflectWith(AttributeReflector reflector) {
+      final int[] positions = getPositions();
+      final StringBuilder sb = new StringBuilder(positions.length * 2);
+      for (int p : positions) {
+        if (sb.length() > 0) sb.append('/');
+        sb.append(p);
+      }
+      reflector.reflect(TokenTrackingAttribute.class, "positionHistory", sb.toString());
+      reflector.reflect(TokenTrackingAttribute.class, "position", position);
+    }
+
+    @Override
+    public void copyTo(AttributeImpl target) {
+      final TokenTrackingAttribute t = (TokenTrackingAttribute) target;
+      t.reset(basePositions, position);
+    }
+  }
+
   /**
   * Serves as the context of an analysis process. This context contains the following constructs
   */
diff --git a/solr/src/test/org/apache/solr/handler/AnalysisRequestHandlerTestBase.java b/solr/src/test/org/apache/solr/handler/AnalysisRequestHandlerTestBase.java
index feb827e0a34..5244d7dd2d0 100644
--- a/solr/src/test/org/apache/solr/handler/AnalysisRequestHandlerTestBase.java
+++ b/solr/src/test/org/apache/solr/handler/AnalysisRequestHandlerTestBase.java
@@ -37,6 +37,7 @@ public abstract class AnalysisRequestHandlerTestBase extends SolrTestCaseJ4 {
     assertEquals(new Integer(info.getStart()), token.get("start"));
     assertEquals(new Integer(info.getEnd()), token.get("end"));
     assertEquals(new Integer(info.getPosition()), token.get("position"));
+    assertEquals(info.getPositionHistory(), token.get("positionHistory"));
     if (info.isMatch()) {
       assertEquals(Boolean.TRUE, token.get("match"));
     }
@@ -57,6 +58,7 @@ public abstract class AnalysisRequestHandlerTestBase extends SolrTestCaseJ4 {
     private int end;
     private String payload;
     private int position;
+    private String positionHistory;
     private boolean match;
 
     public TokenInfo(
@@ -66,6 +68,7 @@ public abstract class AnalysisRequestHandlerTestBase extends SolrTestCaseJ4 {
         int start,
         int end,
         int position,
+        String positionHistory,
         String payload,
         boolean match) {
 
@@ -75,6 +78,7 @@ public abstract class AnalysisRequestHandlerTestBase extends SolrTestCaseJ4 {
       this.start = start;
       this.end = end;
       this.position = position;
+      this.positionHistory = positionHistory;
      this.payload = payload;
       this.match = match;
     }
@@ -107,6 +111,10 @@ public abstract class AnalysisRequestHandlerTestBase extends SolrTestCaseJ4 {
       return position;
     }
 
+    public String getPositionHistory() {
+      return positionHistory;
+    }
+
     public boolean isMatch() {
       return match;
     }
diff --git a/solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java b/solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java
index cddf4946ec0..1f2ed7612be 100644
--- a/solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java
+++ b/solr/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java
@@ -235,7 +235,7 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
     assertTrue("Only the default analyzer should be applied", name.matches("org.apache.solr.schema.FieldType\\$DefaultAnalyzer.*"));
     List<NamedList> tokenList = (List<NamedList>) queryResult.getVal(0);
     assertEquals("Query has only one token", 1, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "word", 0, 7, 1, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "word", 0, 7, 1, "1", null, false));
 
     NamedList<Object> indexResult = idResult.get("index");
     assertEquals("The id field has only a single value", 1, indexResult.size());
@@ -245,7 +245,7 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
     assertTrue("Only the default analyzer should be applied", name.matches("org.apache.solr.schema.FieldType\\$DefaultAnalyzer.*"));
     tokenList = valueResult.getVal(0);
     assertEquals("The 'id' field value has only one token", 1, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("1", null, "word", 0, 1, 1, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("1", null, "word", 0, 1, 1, "1", null, false));
     ***/
 
     // the name field
@@ -255,14 +255,14 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
     tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.WhitespaceTokenizer");
     assertNotNull("Expecting the 'WhitespaceTokenizer' to be applied on the query for the 'whitetok' field", tokenList);
     assertEquals("Query has only one token", 1, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "word", 0, 7, 1, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "word", 0, 7, 1, "1", null, false));
     indexResult = whitetokResult.get("index");
     assertEquals("The 'whitetok' field has only a single value", 1, indexResult.size());
     valueResult = (NamedList<List<NamedList>>) indexResult.get("Jumping Jack");
     tokenList = valueResult.getVal(0);
     assertEquals("Expecting 2 tokens to be present", 2, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("Jumping", null, "word", 0, 7, 1, null, false));
-    assertToken(tokenList.get(1), new TokenInfo("Jack", null, "word", 8, 12, 2, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("Jumping", null, "word", 0, 7, 1, "1", null, false));
+    assertToken(tokenList.get(1), new TokenInfo("Jack", null, "word", 8, 12, 2, "2", null, false));
 
     // the text field
     NamedList<NamedList<Object>> textResult = documentResult.get("text");
@@ -271,66 +271,66 @@ public class DocumentAnalysisRequestHandlerTest extends AnalysisRequestHandlerTe
     tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.standard.StandardTokenizer");
     assertNotNull("Expecting the 'StandardTokenizer' to be applied on the query for the 'text' field", tokenList);
     assertEquals("Query has only one token", 1, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, "1", null, false));
     tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.standard.StandardFilter");
     assertNotNull("Expecting the 'StandardFilter' to be applied on the query for the 'text' field", tokenList);
     assertEquals("Query has only one token", 1, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, "1/1", null, false));
     tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.LowerCaseFilter");
     assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the query for the 'text' field", tokenList);
     assertEquals("Query has only one token", 1, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, "1/1/1", null, false));
     tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.StopFilter");
     assertNotNull("Expecting the 'StopFilter' to be applied on the query for the 'text' field", tokenList);
     assertEquals("Query has only one token", 1, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, "1/1/1/1", null, false));
     tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.en.PorterStemFilter");
     assertNotNull("Expecting the 'PorterStemFilter' to be applied on the query for the 'text' field", tokenList);
     assertEquals("Query has only one token", 1, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("jump", null, "<ALPHANUM>", 0, 7, 1, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("jump", null, "<ALPHANUM>", 0, 7, 1, "1/1/1/1/1", null, false));
     indexResult = textResult.get("index");
     assertEquals("The 'text' field has only a single value", 1, indexResult.size());
     valueResult = (NamedList<List<NamedList>>) indexResult.get("The Fox Jumped Over The Dogs");
     tokenList = valueResult.get("org.apache.lucene.analysis.standard.StandardTokenizer");
     assertNotNull("Expecting the 'StandardTokenizer' to be applied on the index for the 'text' field", tokenList);
     assertEquals("Expecting 6 tokens", 6, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("The", null, "<ALPHANUM>", 0, 3, 1, null, false));
-    assertToken(tokenList.get(1), new TokenInfo("Fox", null, "<ALPHANUM>", 4, 7, 2, null, false));
-    assertToken(tokenList.get(2), new TokenInfo("Jumped", null, "<ALPHANUM>", 8, 14, 3, null, false));
-    assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, null, false));
-    assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, null, false));
-    assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("The", null, "<ALPHANUM>", 0, 3, 1, "1", null, false));
+    assertToken(tokenList.get(1), new TokenInfo("Fox", null, "<ALPHANUM>", 4, 7, 2, "2", null, false));
+    assertToken(tokenList.get(2), new TokenInfo("Jumped", null, "<ALPHANUM>", 8, 14, 3, "3", null, false));
+    assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, "4", null, false));
+    assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, "5", null, false));
+    assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, "6", null, false));
     tokenList = valueResult.get("org.apache.lucene.analysis.standard.StandardFilter");
     assertNotNull("Expecting the 'StandardFilter' to be applied on the index for the 'text' field", tokenList);
     assertEquals("Expecting 6 tokens", 6, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("The", null, "<ALPHANUM>", 0, 3, 1, null, false));
-    assertToken(tokenList.get(1), new TokenInfo("Fox", null, "<ALPHANUM>", 4, 7, 2, null, false));
-    assertToken(tokenList.get(2), new TokenInfo("Jumped", null, "<ALPHANUM>", 8, 14, 3, null, false));
-    assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, null, false));
-    assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, null, false));
-    assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("The", null, "<ALPHANUM>", 0, 3, 1, "1/1", null, false));
+    assertToken(tokenList.get(1), new TokenInfo("Fox", null, "<ALPHANUM>", 4, 7, 2, "2/2", null, false));
+    assertToken(tokenList.get(2), new TokenInfo("Jumped", null, "<ALPHANUM>", 8, 14, 3, "3/3", null, false));
+    assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, "4/4", null, false));
+    assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, "5/5", null, false));
+    assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, "6/6", null, false));
     tokenList = valueResult.get("org.apache.lucene.analysis.core.LowerCaseFilter");
     assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the index for the 'text' field", tokenList);
     assertEquals("Expecting 6 tokens", 6, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, null, false));
-    assertToken(tokenList.get(1), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, null, false));
-    assertToken(tokenList.get(2), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, null, false));
-    assertToken(tokenList.get(3), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, null, false));
-    assertToken(tokenList.get(4), new TokenInfo("the", null, "<ALPHANUM>", 20, 23, 5, null, false));
-    assertToken(tokenList.get(5), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, "1/1/1", null, false));
+    assertToken(tokenList.get(1), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, "2/2/2", null, false));
+    assertToken(tokenList.get(2), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, "3/3/3", null, false));
+    assertToken(tokenList.get(3), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, "4/4/4", null, false));
+    assertToken(tokenList.get(4), new TokenInfo("the", null, "<ALPHANUM>", 20, 23, 5, "5/5/5", null, false));
+    assertToken(tokenList.get(5), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, "6/6/6", null, false));
     tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter");
     assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList);
     assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, null, false));
-    assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 2, null, false));
-    assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, null, false));
-    assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 4, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, "2/2/2/1", null, false));
+    assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 2, "3/3/3/2", null, false));
+    assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, "4/4/4/3", null, false));
+    assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 4, "6/6/6/4", null, false));
     tokenList = valueResult.get("org.apache.lucene.analysis.en.PorterStemFilter");
     assertNotNull("Expecting the 'PorterStemFilter' to be applied on the index for the 'text' field", tokenList);
     assertEquals("Expecting 4 tokens", 4, tokenList.size());
-    assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, null, false));
-    assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 2, null, true));
-    assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, null, false));
-    assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 4, null, false));
+    assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, "2/2/2/1/1", null, false));
+    assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 2, "3/3/3/2/2", null, true));
+    assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, "4/4/4/3/3", null, false));
+    assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 4, "6/6/6/4/4", null, false));
   }
 }
diff --git a/solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java b/solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
index da11b5f1d6a..400b1020130 100644
--- a/solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
+++ b/solr/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
@@ -139,64 +139,64 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
     List<NamedList> tokenList = indexPart.get("org.apache.lucene.analysis.standard.StandardTokenizer");
     assertNotNull("Expcting StandardTokenizer analysis breakdown", tokenList);
     assertEquals(tokenList.size(), 10);
-    assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, null, false));
-    assertToken(tokenList.get(1), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, null, false));
null, "", 10, 13, 3, null, false)); - assertToken(tokenList.get(3), new TokenInfo("fox", null, "", 14, 17, 4, null, true)); - assertToken(tokenList.get(4), new TokenInfo("jumped", null, "", 18, 24, 5, null, false)); - assertToken(tokenList.get(5), new TokenInfo("over", null, "", 25, 29, 6, null, false)); - assertToken(tokenList.get(6), new TokenInfo("the", null, "", 30, 33, 7, null, false)); - assertToken(tokenList.get(7), new TokenInfo("lazy", null, "", 34, 38, 8, null, false)); - assertToken(tokenList.get(8), new TokenInfo("brown", null, "", 39, 44, 9, null, true)); - assertToken(tokenList.get(9), new TokenInfo("dogs", null, "", 45, 49, 10, null, false)); + assertToken(tokenList.get(0), new TokenInfo("the", null, "", 0, 3, 1, "1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("quick", null, "", 4, 9, 2, "2", null, false)); + assertToken(tokenList.get(2), new TokenInfo("red", null, "", 10, 13, 3, "3", null, false)); + assertToken(tokenList.get(3), new TokenInfo("fox", null, "", 14, 17, 4, "4", null, true)); + assertToken(tokenList.get(4), new TokenInfo("jumped", null, "", 18, 24, 5, "5", null, false)); + assertToken(tokenList.get(5), new TokenInfo("over", null, "", 25, 29, 6, "6", null, false)); + assertToken(tokenList.get(6), new TokenInfo("the", null, "", 30, 33, 7, "7", null, false)); + assertToken(tokenList.get(7), new TokenInfo("lazy", null, "", 34, 38, 8, "8", null, false)); + assertToken(tokenList.get(8), new TokenInfo("brown", null, "", 39, 44, 9, "9", null, true)); + assertToken(tokenList.get(9), new TokenInfo("dogs", null, "", 45, 49, 10, "10", null, false)); tokenList = indexPart.get("org.apache.lucene.analysis.standard.StandardFilter"); assertNotNull("Expcting StandardFilter analysis breakdown", tokenList); assertEquals(tokenList.size(), 10); - assertToken(tokenList.get(0), new TokenInfo("the", null, "", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("quick", null, "", 4, 9, 2, null, false)); - assertToken(tokenList.get(2), new TokenInfo("red", null, "", 10, 13, 3, null, false)); - assertToken(tokenList.get(3), new TokenInfo("fox", null, "", 14, 17, 4, null, true)); - assertToken(tokenList.get(4), new TokenInfo("jumped", null, "", 18, 24, 5, null, false)); - assertToken(tokenList.get(5), new TokenInfo("over", null, "", 25, 29, 6, null, false)); - assertToken(tokenList.get(6), new TokenInfo("the", null, "", 30, 33, 7, null, false)); - assertToken(tokenList.get(7), new TokenInfo("lazy", null, "", 34, 38, 8, null, false)); - assertToken(tokenList.get(8), new TokenInfo("brown", null, "", 39, 44, 9, null, true)); - assertToken(tokenList.get(9), new TokenInfo("dogs", null, "", 45, 49, 10, null, false)); + assertToken(tokenList.get(0), new TokenInfo("the", null, "", 0, 3, 1, "1/1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("quick", null, "", 4, 9, 2, "2/2", null, false)); + assertToken(tokenList.get(2), new TokenInfo("red", null, "", 10, 13, 3, "3/3", null, false)); + assertToken(tokenList.get(3), new TokenInfo("fox", null, "", 14, 17, 4, "4/4", null, true)); + assertToken(tokenList.get(4), new TokenInfo("jumped", null, "", 18, 24, 5, "5/5", null, false)); + assertToken(tokenList.get(5), new TokenInfo("over", null, "", 25, 29, 6, "6/6", null, false)); + assertToken(tokenList.get(6), new TokenInfo("the", null, "", 30, 33, 7, "7/7", null, false)); + assertToken(tokenList.get(7), new TokenInfo("lazy", null, "", 34, 38, 8, "8/8", null, false)); + assertToken(tokenList.get(8), new TokenInfo("brown", null, "", 39, 44, 9, "9/9", null, 
true)); + assertToken(tokenList.get(9), new TokenInfo("dogs", null, "", 45, 49, 10, "10/10", null, false)); tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter"); assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList); assertEquals(tokenList.size(), 10); - assertToken(tokenList.get(0), new TokenInfo("the", null, "", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("quick", null, "", 4, 9, 2, null, false)); - assertToken(tokenList.get(2), new TokenInfo("red", null, "", 10, 13, 3, null, false)); - assertToken(tokenList.get(3), new TokenInfo("fox", null, "", 14, 17, 4, null, true)); - assertToken(tokenList.get(4), new TokenInfo("jumped", null, "", 18, 24, 5, null, false)); - assertToken(tokenList.get(5), new TokenInfo("over", null, "", 25, 29, 6, null, false)); - assertToken(tokenList.get(6), new TokenInfo("the", null, "", 30, 33, 7, null, false)); - assertToken(tokenList.get(7), new TokenInfo("lazy", null, "", 34, 38, 8, null, false)); - assertToken(tokenList.get(8), new TokenInfo("brown", null, "", 39, 44, 9, null, true)); - assertToken(tokenList.get(9), new TokenInfo("dogs", null, "", 45, 49, 10, null, false)); + assertToken(tokenList.get(0), new TokenInfo("the", null, "", 0, 3, 1, "1/1/1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("quick", null, "", 4, 9, 2, "2/2/2", null, false)); + assertToken(tokenList.get(2), new TokenInfo("red", null, "", 10, 13, 3, "3/3/3", null, false)); + assertToken(tokenList.get(3), new TokenInfo("fox", null, "", 14, 17, 4, "4/4/4", null, true)); + assertToken(tokenList.get(4), new TokenInfo("jumped", null, "", 18, 24, 5, "5/5/5", null, false)); + assertToken(tokenList.get(5), new TokenInfo("over", null, "", 25, 29, 6, "6/6/6", null, false)); + assertToken(tokenList.get(6), new TokenInfo("the", null, "", 30, 33, 7, "7/7/7", null, false)); + assertToken(tokenList.get(7), new TokenInfo("lazy", null, "", 34, 38, 8, "8/8/8", null, false)); + assertToken(tokenList.get(8), new TokenInfo("brown", null, "", 39, 44, 9, "9/9/9", null, true)); + assertToken(tokenList.get(9), new TokenInfo("dogs", null, "", 45, 49, 10, "10/10/10", null, false)); tokenList = indexPart.get("org.apache.lucene.analysis.core.StopFilter"); assertNotNull("Expcting StopFilter analysis breakdown", tokenList); assertEquals(tokenList.size(), 8); - assertToken(tokenList.get(0), new TokenInfo("quick", null, "", 4, 9, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("red", null, "", 10, 13, 2, null, false)); - assertToken(tokenList.get(2), new TokenInfo("fox", null, "", 14, 17, 3, null, true)); - assertToken(tokenList.get(3), new TokenInfo("jumped", null, "", 18, 24, 4, null, false)); - assertToken(tokenList.get(4), new TokenInfo("over", null, "", 25, 29, 5, null, false)); - assertToken(tokenList.get(5), new TokenInfo("lazy", null, "", 34, 38, 6, null, false)); - assertToken(tokenList.get(6), new TokenInfo("brown", null, "", 39, 44, 7, null, true)); - assertToken(tokenList.get(7), new TokenInfo("dogs", null, "", 45, 49, 8, null, false)); + assertToken(tokenList.get(0), new TokenInfo("quick", null, "", 4, 9, 1, "2/2/2/1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("red", null, "", 10, 13, 2, "3/3/3/2", null, false)); + assertToken(tokenList.get(2), new TokenInfo("fox", null, "", 14, 17, 3, "4/4/4/3", null, true)); + assertToken(tokenList.get(3), new TokenInfo("jumped", null, "", 18, 24, 4, "5/5/5/4", null, false)); + assertToken(tokenList.get(4), new TokenInfo("over", null, "", 25, 29, 5, 
"6/6/6/5", null, false)); + assertToken(tokenList.get(5), new TokenInfo("lazy", null, "", 34, 38, 6, "8/8/8/6", null, false)); + assertToken(tokenList.get(6), new TokenInfo("brown", null, "", 39, 44, 7, "9/9/9/7", null, true)); + assertToken(tokenList.get(7), new TokenInfo("dogs", null, "", 45, 49, 8, "10/10/10/8", null, false)); tokenList = indexPart.get("org.apache.lucene.analysis.en.PorterStemFilter"); assertNotNull("Expcting PorterStemFilter analysis breakdown", tokenList); assertEquals(tokenList.size(), 8); - assertToken(tokenList.get(0), new TokenInfo("quick", null, "", 4, 9, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("red", null, "", 10, 13, 2, null, false)); - assertToken(tokenList.get(2), new TokenInfo("fox", null, "", 14, 17, 3, null, true)); - assertToken(tokenList.get(3), new TokenInfo("jump", null, "", 18, 24, 4, null, false)); - assertToken(tokenList.get(4), new TokenInfo("over", null, "", 25, 29, 5, null, false)); - assertToken(tokenList.get(5), new TokenInfo("lazi", null, "", 34, 38, 6, null, false)); - assertToken(tokenList.get(6), new TokenInfo("brown", null, "", 39, 44, 7, null, true)); - assertToken(tokenList.get(7), new TokenInfo("dog", null, "", 45, 49, 8, null, false)); + assertToken(tokenList.get(0), new TokenInfo("quick", null, "", 4, 9, 1, "2/2/2/1/1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("red", null, "", 10, 13, 2, "3/3/3/2/2", null, false)); + assertToken(tokenList.get(2), new TokenInfo("fox", null, "", 14, 17, 3, "4/4/4/3/3", null, true)); + assertToken(tokenList.get(3), new TokenInfo("jump", null, "", 18, 24, 4, "5/5/5/4/4", null, false)); + assertToken(tokenList.get(4), new TokenInfo("over", null, "", 25, 29, 5, "6/6/6/5/5", null, false)); + assertToken(tokenList.get(5), new TokenInfo("lazi", null, "", 34, 38, 6, "8/8/8/6/6", null, false)); + assertToken(tokenList.get(6), new TokenInfo("brown", null, "", 39, 44, 7, "9/9/9/7/7", null, true)); + assertToken(tokenList.get(7), new TokenInfo("dog", null, "", 45, 49, 8, "10/10/10/8/8", null, false)); NamedList> queryPart = textType.get("query"); assertNotNull("expecting a query token analysis for field type 'text'", queryPart); @@ -204,28 +204,28 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB tokenList = queryPart.get("org.apache.lucene.analysis.standard.StandardTokenizer"); assertNotNull("Expecting StandardTokenizer analysis breakdown", tokenList); assertEquals("Expecting StandardTokenizer to produce 2 tokens from '" + request.getQuery() + "'", 2, tokenList.size()); - assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, null, false)); + assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, "1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, "2", null, false)); tokenList = queryPart.get("org.apache.lucene.analysis.standard.StandardFilter"); assertNotNull("Expcting StandardFilter analysis breakdown", tokenList); assertEquals(2, tokenList.size()); - assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, null, false)); + assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, "1/1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, "2/2", null, false)); tokenList = 
queryPart.get("org.apache.lucene.analysis.core.LowerCaseFilter"); assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList); assertEquals(2, tokenList.size()); - assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, null, false)); + assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, "1/1/1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, "2/2/2", null, false)); tokenList = queryPart.get("org.apache.lucene.analysis.core.StopFilter"); assertNotNull("Expcting StopFilter analysis breakdown", tokenList); assertEquals(2, tokenList.size()); - assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, null, false)); + assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, "1/1/1/1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, "2/2/2/2", null, false)); tokenList = queryPart.get("org.apache.lucene.analysis.en.PorterStemFilter"); assertNotNull("Expcting PorterStemFilter analysis breakdown", tokenList); assertEquals(2, tokenList.size()); - assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, null, false)); + assertToken(tokenList.get(0), new TokenInfo("fox", null, "", 0, 3, 1, "1/1/1/1/1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("brown", null, "", 4, 9, 2, "2/2/2/2/2", null, false)); NamedList nameTextType = fieldTypes.get("nametext"); assertNotNull("expecting result for field type 'nametext'", nameTextType); @@ -236,22 +236,22 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB tokenList = indexPart.get("org.apache.lucene.analysis.core.WhitespaceTokenizer"); assertNotNull("Expcting WhitespaceTokenizer analysis breakdown", tokenList); assertEquals(10, tokenList.size()); - assertToken(tokenList.get(0), new TokenInfo("the", null, "word", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("quick", null, "word", 4, 9, 2, null, false)); - assertToken(tokenList.get(2), new TokenInfo("red", null, "word", 10, 13, 3, null, false)); - assertToken(tokenList.get(3), new TokenInfo("fox", null, "word", 14, 17, 4, null, true)); - assertToken(tokenList.get(4), new TokenInfo("jumped", null, "word", 18, 24, 5, null, false)); - assertToken(tokenList.get(5), new TokenInfo("over", null, "word", 25, 29, 6, null, false)); - assertToken(tokenList.get(6), new TokenInfo("the", null, "word", 30, 33, 7, null, false)); - assertToken(tokenList.get(7), new TokenInfo("lazy", null, "word", 34, 38, 8, null, false)); - assertToken(tokenList.get(8), new TokenInfo("brown", null, "word", 39, 44, 9, null, true)); - assertToken(tokenList.get(9), new TokenInfo("dogs", null, "word", 45, 49, 10, null, false)); + assertToken(tokenList.get(0), new TokenInfo("the", null, "word", 0, 3, 1, "1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("quick", null, "word", 4, 9, 2, "2", null, false)); + assertToken(tokenList.get(2), new TokenInfo("red", null, "word", 10, 13, 3, "3", null, false)); + assertToken(tokenList.get(3), new TokenInfo("fox", null, "word", 14, 17, 4, "4", null, true)); + assertToken(tokenList.get(4), new TokenInfo("jumped", null, "word", 18, 24, 5, "5", null, false)); + assertToken(tokenList.get(5), new 
TokenInfo("over", null, "word", 25, 29, 6, "6", null, false)); + assertToken(tokenList.get(6), new TokenInfo("the", null, "word", 30, 33, 7, "7", null, false)); + assertToken(tokenList.get(7), new TokenInfo("lazy", null, "word", 34, 38, 8, "8", null, false)); + assertToken(tokenList.get(8), new TokenInfo("brown", null, "word", 39, 44, 9, "9", null, true)); + assertToken(tokenList.get(9), new TokenInfo("dogs", null, "word", 45, 49, 10, "10", null, false)); queryPart = nameTextType.get("query"); assertNotNull("expecting a query token analysis for field type 'nametext'", queryPart); tokenList = queryPart.get(WhitespaceTokenizer.class.getName()); - assertToken(tokenList.get(0), new TokenInfo("fox", null, "word", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("brown", null, "word", 4, 9, 2, null, false)); + assertToken(tokenList.get(0), new TokenInfo("fox", null, "word", 0, 3, 1, "1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("brown", null, "word", 4, 9, 2, "2", null, false)); NamedList fieldNames = result.get("field_names"); assertNotNull("field_nameds should never be null", fieldNames); @@ -265,16 +265,16 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB tokenList = indexPart.get(WhitespaceTokenizer.class.getName()); assertNotNull("expecting only WhitespaceTokenizer to be applied", tokenList); assertEquals("expecting WhitespaceTokenizer to produce 10 tokens", 10, tokenList.size()); - assertToken(tokenList.get(0), new TokenInfo("the", null, "word", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("quick", null, "word", 4, 9, 2, null, false)); - assertToken(tokenList.get(2), new TokenInfo("red", null, "word", 10, 13, 3, null, false)); - assertToken(tokenList.get(3), new TokenInfo("fox", null, "word", 14, 17, 4, null, true)); - assertToken(tokenList.get(4), new TokenInfo("jumped", null, "word", 18, 24, 5, null, false)); - assertToken(tokenList.get(5), new TokenInfo("over", null, "word", 25, 29, 6, null, false)); - assertToken(tokenList.get(6), new TokenInfo("the", null, "word", 30, 33, 7, null, false)); - assertToken(tokenList.get(7), new TokenInfo("lazy", null, "word", 34, 38, 8, null, false)); - assertToken(tokenList.get(8), new TokenInfo("brown", null, "word", 39, 44, 9, null, true)); - assertToken(tokenList.get(9), new TokenInfo("dogs", null, "word", 45, 49, 10, null, false)); + assertToken(tokenList.get(0), new TokenInfo("the", null, "word", 0, 3, 1, "1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("quick", null, "word", 4, 9, 2, "2", null, false)); + assertToken(tokenList.get(2), new TokenInfo("red", null, "word", 10, 13, 3, "3", null, false)); + assertToken(tokenList.get(3), new TokenInfo("fox", null, "word", 14, 17, 4, "4", null, true)); + assertToken(tokenList.get(4), new TokenInfo("jumped", null, "word", 18, 24, 5, "5", null, false)); + assertToken(tokenList.get(5), new TokenInfo("over", null, "word", 25, 29, 6, "6", null, false)); + assertToken(tokenList.get(6), new TokenInfo("the", null, "word", 30, 33, 7, "7", null, false)); + assertToken(tokenList.get(7), new TokenInfo("lazy", null, "word", 34, 38, 8, "8", null, false)); + assertToken(tokenList.get(8), new TokenInfo("brown", null, "word", 39, 44, 9, "9", null, true)); + assertToken(tokenList.get(9), new TokenInfo("dogs", null, "word", 45, 49, 10, "10", null, false)); queryPart = whitetok.get("query"); assertNotNull("expecting a query token analysis for field 'whitetok'", queryPart); @@ -282,8 +282,8 @@ public class 
FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB tokenList = queryPart.get(WhitespaceTokenizer.class.getName()); assertNotNull("expecting only WhitespaceTokenizer to be applied", tokenList); assertEquals("expecting WhitespaceTokenizer to produce 2 tokens", 2, tokenList.size()); - assertToken(tokenList.get(0), new TokenInfo("fox", null, "word", 0, 3, 1, null, false)); - assertToken(tokenList.get(1), new TokenInfo("brown", null, "word", 4, 9, 2, null, false)); + assertToken(tokenList.get(0), new TokenInfo("fox", null, "word", 0, 3, 1, "1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("brown", null, "word", 4, 9, 2, "2", null, false)); NamedList keywordtok = fieldNames.get("keywordtok"); assertNotNull("expecting result for field 'keywordtok'", keywordtok); @@ -294,7 +294,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB tokenList = indexPart.get(KeywordTokenizer.class.getName()); assertNotNull("expecting only KeywordTokenizer to be applied", tokenList); assertEquals("expecting KeywordTokenizer to produce 1 token", 1, tokenList.size()); - assertToken(tokenList.get(0), new TokenInfo("the quick red fox jumped over the lazy brown dogs", null, "word", 0, 49, 1, null, false)); + assertToken(tokenList.get(0), new TokenInfo("the quick red fox jumped over the lazy brown dogs", null, "word", 0, 49, 1, "1", null, false)); queryPart = keywordtok.get("query"); assertNotNull("expecting a query token analysis for field 'keywordtok'", queryPart); @@ -302,7 +302,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB tokenList = queryPart.get(KeywordTokenizer.class.getName()); assertNotNull("expecting only KeywordTokenizer to be applied", tokenList); assertEquals("expecting KeywordTokenizer to produce 1 token", 1, tokenList.size()); - assertToken(tokenList.get(0), new TokenInfo("fox brown", null, "word", 0, 9, 1, null, false)); + assertToken(tokenList.get(0), new TokenInfo("fox brown", null, "word", 0, 9, 1, "1", null, false)); } @@ -331,6 +331,53 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB List tokenList = (List)indexPart.get("org.apache.lucene.analysis.core.WhitespaceTokenizer"); assertNotNull("Expecting WhitespaceTokenizer analysis breakdown", tokenList); assertEquals(tokenList.size(), 1); - assertToken(tokenList.get(0), new TokenInfo("whatever", null, "word", 12, 20, 1, null, false)); + assertToken(tokenList.get(0), new TokenInfo("whatever", null, "word", 12, 20, 1, "1", null, false)); } + + @Test + public void testPositionHistoryWithWDF() throws Exception { + + FieldAnalysisRequest request = new FieldAnalysisRequest(); + request.addFieldType("skutype1"); + request.setFieldValue("hi, 3456-12 a Test"); + request.setShowMatch(false); + + NamedList result = handler.handleAnalysisRequest(request, h.getCore().getSchema()); + assertTrue("result is null and it shouldn't be", result != null); + + NamedList fieldTypes = result.get("field_types"); + assertNotNull("field_types should never be null", fieldTypes); + NamedList textType = fieldTypes.get("skutype1"); + assertNotNull("expecting result for field type 'skutype1'", textType); + + NamedList> indexPart = textType.get("index"); + assertNotNull("expecting an index token analysis for field type 'skutype1'", indexPart); + + List tokenList = indexPart.get("org.apache.lucene.analysis.core.WhitespaceTokenizer"); + assertNotNull("Expcting WhitespaceTokenizer analysis breakdown", tokenList); + assertEquals(4, tokenList.size()); + 
assertToken(tokenList.get(0), new TokenInfo("hi,", null, "word", 0, 3, 1, "1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("3456-12", null, "word", 4, 11, 2, "2", null, false)); + assertToken(tokenList.get(2), new TokenInfo("a", null, "word", 12, 13, 3, "3", null, false)); + assertToken(tokenList.get(3), new TokenInfo("Test", null, "word", 14, 18, 4, "4", null, false)); + tokenList = indexPart.get("org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter"); + assertNotNull("Expcting WordDelimiterFilter analysis breakdown", tokenList); + assertEquals(6, tokenList.size()); + assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, "1/1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("3456", null, "word", 4, 8, 2, "2/2", null, false)); + assertToken(tokenList.get(2), new TokenInfo("12", null, "word", 9, 11, 3, "2/3", null, false)); + assertToken(tokenList.get(3), new TokenInfo("345612", null, "word", 4, 11, 3, "2/3", null, false)); + assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, "3/4", null, false)); + assertToken(tokenList.get(5), new TokenInfo("Test", null, "word", 14, 18, 5, "4/5", null, false)); + tokenList = indexPart.get("org.apache.lucene.analysis.core.LowerCaseFilter"); + assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList); + assertEquals(6, tokenList.size()); + assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, "1/1/1", null, false)); + assertToken(tokenList.get(1), new TokenInfo("3456", null, "word", 4, 8, 2, "2/2/2", null, false)); + assertToken(tokenList.get(2), new TokenInfo("12", null, "word", 9, 11, 3, "2/3/3", null, false)); + assertToken(tokenList.get(3), new TokenInfo("345612", null, "word", 4, 11, 3, "2/3/3", null, false)); + assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, "3/4/4", null, false)); + assertToken(tokenList.get(5), new TokenInfo("test", null, "word", 14, 18, 5, "4/5/5", null, false)); + } + }