LUCENE-3071: Add ReversePathHierarchyTokenizer and enable skip on PathHierarchyTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1099999 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Ryan McKinley 2011-05-05 23:30:05 +00:00
parent 6af8928c95
commit 96878534a0
5 changed files with 494 additions and 41 deletions

View File

@ -29,53 +29,67 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* Take something like: * Take something like:
* *
* <pre> * <pre>
* /soemthing/something/else * /something/something/else
* </pre> * </pre>
* *
* and make: * and make:
* *
* <pre> * <pre>
* /soemthing * /something
* /soemthing/something * /something/something
* /soemthing/something/else * /something/something/else
* </pre> * </pre>
*
*/ */
public class PathHierarchyTokenizer extends Tokenizer { public class PathHierarchyTokenizer extends Tokenizer {
public PathHierarchyTokenizer(Reader input) { public PathHierarchyTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER); this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
}
public PathHierarchyTokenizer(Reader input, int skip) {
this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, skip);
} }
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter) { public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter) {
this(input, bufferSize, delimiter, delimiter); this(input, bufferSize, delimiter, delimiter, DEFAULT_SKIP);
} }
public PathHierarchyTokenizer(Reader input, char delimiter, char replacement) { public PathHierarchyTokenizer(Reader input, char delimiter, char replacement) {
this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement); this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, DEFAULT_SKIP);
} }
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement) { public PathHierarchyTokenizer(Reader input, char delimiter, char replacement, int skip) {
this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, skip);
}
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
super(input); super(input);
termAtt.resizeBuffer(bufferSize); termAtt.resizeBuffer(bufferSize);
this.delimiter = delimiter; this.delimiter = delimiter;
this.replacement = replacement; this.replacement = replacement;
endDelimiter = false; this.skip = skip;
resultToken = new StringBuilder(bufferSize); resultToken = new StringBuilder(bufferSize);
} }
private static final int DEFAULT_BUFFER_SIZE = 1024; private static final int DEFAULT_BUFFER_SIZE = 1024;
public static final char DEFAULT_DELIMITER = '/'; public static final char DEFAULT_DELIMITER = '/';
public static final int DEFAULT_SKIP = 0;
private final char delimiter; private final char delimiter;
private final char replacement; private final char replacement;
private final int skip;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
private int startPosition = 0;
private int finalOffset = 0; private int finalOffset = 0;
private boolean endDelimiter; private int skipped = 0;
private boolean endDelimiter = false;
private StringBuilder resultToken; private StringBuilder resultToken;
@Override @Override
public final boolean incrementToken() throws IOException { public final boolean incrementToken() throws IOException {
clearAttributes(); clearAttributes();
@ -98,37 +112,63 @@ public class PathHierarchyTokenizer extends Tokenizer {
while (true) { while (true) {
int c = input.read(); int c = input.read();
if( c < 0 ){ if( c < 0 ){
if( skipped > skip ) {
length += resultToken.length(); length += resultToken.length();
termAtt.setLength(length); termAtt.setLength(length);
finalOffset = correctOffset(length); finalOffset = correctOffset(startPosition + length);
offsetAtt.setOffset(correctOffset(0), finalOffset); offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
if( added ){ if( added ){
resultToken.setLength(0); resultToken.setLength(0);
resultToken.append(termAtt.buffer(), 0, length); resultToken.append(termAtt.buffer(), 0, length);
} }
return added; return added;
} }
else{
finalOffset = correctOffset(startPosition + length);
return false;
}
}
if( !added ){
added = true; added = true;
skipped++;
if( skipped > skip ){
termAtt.append(c == delimiter ? replacement : (char)c);
length++;
}
else {
startPosition++;
}
}
else {
if( c == delimiter ){ if( c == delimiter ){
if( length > 0 ){ if( skipped > skip ){
endDelimiter = true; endDelimiter = true;
break; break;
} }
else{ skipped++;
if( skipped > skip ){
termAtt.append(replacement); termAtt.append(replacement);
length++; length++;
} }
else {
startPosition++;
}
} }
else { else {
if( skipped > skip ){
termAtt.append((char)c); termAtt.append((char)c);
length++; length++;
} }
else {
startPosition++;
}
}
}
} }
length += resultToken.length(); length += resultToken.length();
termAtt.setLength(length); termAtt.setLength(length);
finalOffset = correctOffset(length); finalOffset = correctOffset(startPosition + length);
offsetAtt.setOffset(correctOffset(0), finalOffset); offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
resultToken.setLength(0); resultToken.setLength(0);
resultToken.append(termAtt.buffer(), 0, length); resultToken.append(termAtt.buffer(), 0, length);
return true; return true;
@ -146,5 +186,6 @@ public class PathHierarchyTokenizer extends Tokenizer {
resultToken.setLength(0); resultToken.setLength(0);
finalOffset = 0; finalOffset = 0;
endDelimiter = false; endDelimiter = false;
skipped = 0;
} }
} }

View File

@ -0,0 +1,173 @@
package org.apache.lucene.analysis.path;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
 * Tokenizer for hierarchies whose most specific part comes first, such as
 * host names. The whole input is read on the first call to
 * {@link #incrementToken()}; the tokenizer then emits the full (possibly
 * truncated) text followed by every suffix that starts right after a
 * delimiter.
 *
 * <p>With <code>'.'</code> as the delimiter (the default delimiter is
 * {@link #DEFAULT_DELIMITER}), something like:
 *
 * <pre>
 * www.site.co.uk
 * </pre>
 *
 * becomes:
 *
 * <pre>
 * www.site.co.uk
 * site.co.uk
 * co.uk
 * uk
 * </pre>
 *
 * <p>The <code>skip</code> parameter drops that many trailing segments from
 * every emitted token. If <code>skip</code> is larger than the number of
 * segments in the input, no tokens are produced.
 *
 * <p>The first token has a position increment of 1; all following tokens
 * have a position increment of 0.
 */
public class ReversePathHierarchyTokenizer extends Tokenizer {

  private static final int DEFAULT_BUFFER_SIZE = 1024;
  public static final char DEFAULT_DELIMITER = '/';
  public static final int DEFAULT_SKIP = 0;

  /** Character that separates segments in the input. */
  private final char delimiter;
  /** Character written to the emitted terms in place of each delimiter. */
  private final char replacement;
  /** Number of trailing segments removed from every emitted token. */
  private final int skip;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);

  /** End offset (exclusive, in input characters) shared by all emitted tokens. */
  private int endPosition = 0;
  /** Corrected offset of the end of the input, reported by {@link #end()}. */
  private int finalOffset = 0;
  /** Number of tokens already emitted for the current input. */
  private int skipped = 0;
  /** Scratch buffer that collects the input with delimiters replaced. */
  private final StringBuilder resultToken;
  /** Start offsets of every segment, plus the input length as last entry. */
  private final List<Integer> delimiterPositions;
  /** Size of {@link #delimiterPositions}; -1 until the input has been read. */
  private int delimitersCount = -1;
  /** Flat copy of the replaced input from which terms are sliced. */
  private char[] resultTokenBuffer;

  /** Uses the default buffer size, delimiter and skip. */
  public ReversePathHierarchyTokenizer(Reader input) {
    this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
  }

  /** Uses the default buffer size and delimiter with the given skip. */
  public ReversePathHierarchyTokenizer(Reader input, int skip) {
    this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, skip);
  }

  /** Uses the delimiter as its own replacement and the default skip. */
  public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter) {
    this(input, bufferSize, delimiter, delimiter, DEFAULT_SKIP);
  }

  /** Uses the default buffer size and the default skip. */
  public ReversePathHierarchyTokenizer(Reader input, char delimiter, char replacement) {
    this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, DEFAULT_SKIP);
  }

  /** Uses the default skip. */
  public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement) {
    this(input, bufferSize, delimiter, replacement, DEFAULT_SKIP);
  }

  /** Uses the default buffer size and the delimiter as its own replacement. */
  public ReversePathHierarchyTokenizer(Reader input, char delimiter, int skip) {
    this(input, DEFAULT_BUFFER_SIZE, delimiter, delimiter, skip);
  }

  /** Uses the default buffer size. */
  public ReversePathHierarchyTokenizer(Reader input, char delimiter, char replacement, int skip) {
    this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, skip);
  }

  /**
   * Creates a tokenizer with every option given explicitly.
   *
   * @param input       source of the text to tokenize
   * @param bufferSize  initial capacity of the internal buffers
   * @param delimiter   character that separates segments in the input
   * @param replacement character emitted in place of each delimiter
   * @param skip        number of trailing segments to drop from every token
   * @throws IllegalArgumentException if <code>bufferSize</code> or
   *         <code>skip</code> is negative
   */
  public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
    super(input);
    // Fail fast: a negative value could never work and previously surfaced
    // later as an unrelated array/index exception.
    if (bufferSize < 0) {
      throw new IllegalArgumentException("bufferSize cannot be negative");
    }
    if (skip < 0) {
      throw new IllegalArgumentException("skip cannot be negative");
    }
    termAtt.resizeBuffer(bufferSize);
    this.delimiter = delimiter;
    this.replacement = replacement;
    this.skip = skip;
    resultToken = new StringBuilder(bufferSize);
    resultTokenBuffer = new char[bufferSize];
    delimiterPositions = new ArrayList<Integer>(bufferSize/10);
  }

  /**
   * Emits the next token. The first call consumes the whole input and
   * records where each segment starts; each call then emits the text from
   * the next segment start up to the common end position.
   *
   * @return <code>true</code> if a token was produced, <code>false</code>
   *         once all tokens have been emitted
   */
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();
    if (delimitersCount == -1) {
      int length = 0;
      // The first segment always starts at offset 0.
      delimiterPositions.add(0);
      while (true) {
        int c = input.read();
        if (c < 0) {
          break;
        }
        length++;
        if (c == delimiter) {
          // A new segment starts right after the delimiter.
          delimiterPositions.add(length);
          resultToken.append(replacement);
        } else {
          resultToken.append((char) c);
        }
      }
      delimitersCount = delimiterPositions.size();
      // Make sure the last entry is the end of the input, so that it can be
      // used as an end position when the input does not end in a delimiter.
      if (delimiterPositions.get(delimitersCount - 1) < length) {
        delimiterPositions.add(length);
        delimitersCount++;
      }
      if (resultTokenBuffer.length < resultToken.length()) {
        resultTokenBuffer = new char[resultToken.length()];
      }
      resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
      resultToken.setLength(0);
      // When skip is larger than the number of recorded positions the index
      // is negative; nothing will be emitted below, so leave endPosition
      // untouched instead of failing with an IndexOutOfBoundsException.
      int endIndex = delimitersCount - 1 - skip;
      if (endIndex >= 0) {
        endPosition = delimiterPositions.get(endIndex);
      }
      finalOffset = correctOffset(length);
      posAtt.setPositionIncrement(1);
    } else {
      posAtt.setPositionIncrement(0);
    }

    // One token per segment start that lies before the common end position.
    if (skipped < delimitersCount - skip - 1) {
      int start = delimiterPositions.get(skipped);
      termAtt.copyBuffer(resultTokenBuffer, start, endPosition - start);
      offsetAtt.setOffset(correctOffset(start), correctOffset(endPosition));
      skipped++;
      return true;
    }
    return false;
  }

  /** Reports the end of the input as the final offset. */
  @Override
  public final void end() {
    // set final offset
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /** Clears all per-input state so the tokenizer can be reused on new input. */
  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
    resultToken.setLength(0);
    finalOffset = 0;
    endPosition = 0;
    skipped = 0;
    delimitersCount = -1;
    delimiterPositions.clear();
  }
}

View File

@ -127,4 +127,70 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
new int[]{1, 0, 0, 0}, new int[]{1, 0, 0, 0},
path.length()); path.length());
} }
/** Skip of 1 on "/a/b/c": expects the terms "/b" and "/b/c". */
public void testBasicSkip() throws Exception {
  final String input = "/a/b/c";
  final String[] terms = {"/b", "/b/c"};
  final int[] startOffsets = {2, 2};
  final int[] endOffsets = {4, 6};
  final int[] posIncrements = {1, 0};
  assertTokenStreamContents(
      new PathHierarchyTokenizer(new StringReader(input), 1),
      terms, startOffsets, endOffsets, posIncrements, input.length());
}
/** Skip of 1 on "/a/b/c/": the trailing delimiter yields a third term "/b/c/". */
public void testEndOfDelimiterSkip() throws Exception {
  final String input = "/a/b/c/";
  final String[] terms = {"/b", "/b/c", "/b/c/"};
  final int[] startOffsets = {2, 2, 2};
  final int[] endOffsets = {4, 6, 7};
  final int[] posIncrements = {1, 0, 0};
  assertTokenStreamContents(
      new PathHierarchyTokenizer(new StringReader(input), 1),
      terms, startOffsets, endOffsets, posIncrements, input.length());
}
/** Skip of 1 on "a/b/c" (no leading delimiter): expects "/b" and "/b/c". */
public void testStartOfCharSkip() throws Exception {
  final String input = "a/b/c";
  final String[] terms = {"/b", "/b/c"};
  final int[] startOffsets = {1, 1};
  final int[] endOffsets = {3, 5};
  final int[] posIncrements = {1, 0};
  assertTokenStreamContents(
      new PathHierarchyTokenizer(new StringReader(input), 1),
      terms, startOffsets, endOffsets, posIncrements, input.length());
}
/** Skip of 1 on "a/b/c/": expects "/b", "/b/c" and "/b/c/". */
public void testStartOfCharEndOfDelimiterSkip() throws Exception {
  final String input = "a/b/c/";
  final String[] terms = {"/b", "/b/c", "/b/c/"};
  final int[] startOffsets = {1, 1, 1};
  final int[] endOffsets = {3, 5, 6};
  final int[] posIncrements = {1, 0, 0};
  assertTokenStreamContents(
      new PathHierarchyTokenizer(new StringReader(input), 1),
      terms, startOffsets, endOffsets, posIncrements, input.length());
}
/** Skip of 1 on a lone "/": expects no terms at all. */
public void testOnlyDelimiterSkip() throws Exception {
  final String input = "/";
  final String[] terms = {};
  final int[] startOffsets = {};
  final int[] endOffsets = {};
  final int[] posIncrements = {};
  assertTokenStreamContents(
      new PathHierarchyTokenizer(new StringReader(input), 1),
      terms, startOffsets, endOffsets, posIncrements, input.length());
}
/** Skip of 1 on "//": expects the single term "/" covering offsets 1 to 2. */
public void testOnlyDelimitersSkip() throws Exception {
  final String input = "//";
  final String[] terms = {"/"};
  final int[] startOffsets = {1};
  final int[] endOffsets = {2};
  final int[] posIncrements = {1};
  assertTokenStreamContents(
      new PathHierarchyTokenizer(new StringReader(input), 1),
      terms, startOffsets, endOffsets, posIncrements, input.length());
}
} }

View File

@ -0,0 +1,157 @@
package org.apache.lucene.analysis.path;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
/**
 * Checks the terms, offsets and position increments produced by
 * {@link ReversePathHierarchyTokenizer}, with and without a skip value.
 */
public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {

  /** Tokenizes {@code path} through the single-argument constructor and checks the stream. */
  private void checkTokens(String path, String[] terms, int[] starts, int[] ends, int[] posIncs)
      throws Exception {
    ReversePathHierarchyTokenizer tokenizer =
        new ReversePathHierarchyTokenizer(new StringReader(path));
    assertTokenStreamContents(tokenizer, terms, starts, ends, posIncs, path.length());
  }

  /** Tokenizes {@code path} with the given {@code skip} and checks the stream. */
  private void checkTokens(String path, int skip, String[] terms, int[] starts, int[] ends,
      int[] posIncs) throws Exception {
    ReversePathHierarchyTokenizer tokenizer =
        new ReversePathHierarchyTokenizer(new StringReader(path), skip);
    assertTokenStreamContents(tokenizer, terms, starts, ends, posIncs, path.length());
  }

  // ---- no skip ----

  public void testBasicReverse() throws Exception {
    checkTokens("/a/b/c",
        new String[]{"/a/b/c", "a/b/c", "b/c", "c"},
        new int[]{0, 1, 3, 5},
        new int[]{6, 6, 6, 6},
        new int[]{1, 0, 0, 0});
  }

  public void testEndOfDelimiterReverse() throws Exception {
    checkTokens("/a/b/c/",
        new String[]{"/a/b/c/", "a/b/c/", "b/c/", "c/"},
        new int[]{0, 1, 3, 5},
        new int[]{7, 7, 7, 7},
        new int[]{1, 0, 0, 0});
  }

  public void testStartOfCharReverse() throws Exception {
    checkTokens("a/b/c",
        new String[]{"a/b/c", "b/c", "c"},
        new int[]{0, 2, 4},
        new int[]{5, 5, 5},
        new int[]{1, 0, 0});
  }

  public void testStartOfCharEndOfDelimiterReverse() throws Exception {
    checkTokens("a/b/c/",
        new String[]{"a/b/c/", "b/c/", "c/"},
        new int[]{0, 2, 4},
        new int[]{6, 6, 6},
        new int[]{1, 0, 0});
  }

  public void testOnlyDelimiterReverse() throws Exception {
    checkTokens("/",
        new String[]{"/"},
        new int[]{0},
        new int[]{1},
        new int[]{1});
  }

  public void testOnlyDelimitersReverse() throws Exception {
    checkTokens("//",
        new String[]{"//", "/"},
        new int[]{0, 1},
        new int[]{2, 2},
        new int[]{1, 0});
  }

  // ---- skip = 1 ----

  public void testEndOfDelimiterReverseSkip() throws Exception {
    checkTokens("/a/b/c/", 1,
        new String[]{"/a/b/", "a/b/", "b/"},
        new int[]{0, 1, 3},
        new int[]{5, 5, 5},
        new int[]{1, 0, 0});
  }

  public void testStartOfCharReverseSkip() throws Exception {
    checkTokens("a/b/c", 1,
        new String[]{"a/b/", "b/"},
        new int[]{0, 2},
        new int[]{4, 4},
        new int[]{1, 0});
  }

  public void testStartOfCharEndOfDelimiterReverseSkip() throws Exception {
    checkTokens("a/b/c/", 1,
        new String[]{"a/b/", "b/"},
        new int[]{0, 2},
        new int[]{4, 4},
        new int[]{1, 0});
  }

  public void testOnlyDelimiterReverseSkip() throws Exception {
    checkTokens("/", 1,
        new String[]{},
        new int[]{},
        new int[]{},
        new int[]{});
  }

  public void testOnlyDelimitersReverseSkip() throws Exception {
    checkTokens("//", 1,
        new String[]{"/"},
        new int[]{0},
        new int[]{1},
        new int[]{1});
  }

  // ---- skip = 2 ----

  public void testReverseSkip2() throws Exception {
    checkTokens("/a/b/c/", 2,
        new String[]{"/a/", "a/"},
        new int[]{0, 1},
        new int[]{3, 3},
        new int[]{1, 0});
  }
}

View File

@ -21,6 +21,7 @@ import java.util.Map;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer; import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
/** /**
@ -37,6 +38,8 @@ public class PathHierarchyTokenizerFactory extends BaseTokenizerFactory {
private char delimiter; private char delimiter;
private char replacement; private char replacement;
private boolean reverse = false;
private int skip = PathHierarchyTokenizer.DEFAULT_SKIP;
/** /**
* Require a configured pattern * Require a configured pattern
@ -70,10 +73,23 @@ public class PathHierarchyTokenizerFactory extends BaseTokenizerFactory {
else{ else{
replacement = delimiter; replacement = delimiter;
} }
v = args.get( "reverse" );
if( v != null ){
reverse = "true".equals( v );
}
v = args.get( "skip" );
if( v != null ){
skip = Integer.parseInt( v );
}
} }
public Tokenizer create(Reader input) { public Tokenizer create(Reader input) {
return new PathHierarchyTokenizer(input, delimiter, replacement); if( reverse ) {
return new ReversePathHierarchyTokenizer(input, delimiter, replacement, skip);
}
return new PathHierarchyTokenizer(input, delimiter, replacement, skip);
} }
} }