mirror of https://github.com/apache/lucene.git
LUCENE-3071: Add ReversePathHierarchyTokenizer and enable skip on PathHierarchyTokenizer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1099999 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6af8928c95
commit
96878534a0
|
@ -29,53 +29,67 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
* Take something like:
|
* Take something like:
|
||||||
*
|
*
|
||||||
* <pre>
|
* <pre>
|
||||||
* /soemthing/something/else
|
* /something/something/else
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
*
|
||||||
* and make:
|
* and make:
|
||||||
*
|
*
|
||||||
* <pre>
|
* <pre>
|
||||||
* /soemthing
|
* /something
|
||||||
* /soemthing/something
|
* /something/something
|
||||||
* /soemthing/something/else
|
* /something/something/else
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
public class PathHierarchyTokenizer extends Tokenizer {
|
public class PathHierarchyTokenizer extends Tokenizer {
|
||||||
|
|
||||||
public PathHierarchyTokenizer(Reader input) {
|
public PathHierarchyTokenizer(Reader input) {
|
||||||
this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER);
|
this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
|
||||||
|
}
|
||||||
|
|
||||||
|
public PathHierarchyTokenizer(Reader input, int skip) {
|
||||||
|
this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, skip);
|
||||||
}
|
}
|
||||||
|
|
||||||
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter) {
|
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter) {
|
||||||
this(input, bufferSize, delimiter, delimiter);
|
this(input, bufferSize, delimiter, delimiter, DEFAULT_SKIP);
|
||||||
}
|
}
|
||||||
|
|
||||||
public PathHierarchyTokenizer(Reader input, char delimiter, char replacement) {
|
public PathHierarchyTokenizer(Reader input, char delimiter, char replacement) {
|
||||||
this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement);
|
this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, DEFAULT_SKIP);
|
||||||
}
|
}
|
||||||
|
|
||||||
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement) {
|
public PathHierarchyTokenizer(Reader input, char delimiter, char replacement, int skip) {
|
||||||
|
this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, skip);
|
||||||
|
}
|
||||||
|
|
||||||
|
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
|
||||||
super(input);
|
super(input);
|
||||||
termAtt.resizeBuffer(bufferSize);
|
termAtt.resizeBuffer(bufferSize);
|
||||||
|
|
||||||
this.delimiter = delimiter;
|
this.delimiter = delimiter;
|
||||||
this.replacement = replacement;
|
this.replacement = replacement;
|
||||||
endDelimiter = false;
|
this.skip = skip;
|
||||||
resultToken = new StringBuilder(bufferSize);
|
resultToken = new StringBuilder(bufferSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final int DEFAULT_BUFFER_SIZE = 1024;
|
private static final int DEFAULT_BUFFER_SIZE = 1024;
|
||||||
public static final char DEFAULT_DELIMITER = '/';
|
public static final char DEFAULT_DELIMITER = '/';
|
||||||
|
public static final int DEFAULT_SKIP = 0;
|
||||||
|
|
||||||
private final char delimiter;
|
private final char delimiter;
|
||||||
private final char replacement;
|
private final char replacement;
|
||||||
|
private final int skip;
|
||||||
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
private int startPosition = 0;
|
||||||
private int finalOffset = 0;
|
private int finalOffset = 0;
|
||||||
private boolean endDelimiter;
|
private int skipped = 0;
|
||||||
|
private boolean endDelimiter = false;
|
||||||
private StringBuilder resultToken;
|
private StringBuilder resultToken;
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final boolean incrementToken() throws IOException {
|
public final boolean incrementToken() throws IOException {
|
||||||
clearAttributes();
|
clearAttributes();
|
||||||
|
@ -98,37 +112,63 @@ public class PathHierarchyTokenizer extends Tokenizer {
|
||||||
while (true) {
|
while (true) {
|
||||||
int c = input.read();
|
int c = input.read();
|
||||||
if( c < 0 ){
|
if( c < 0 ){
|
||||||
|
if( skipped > skip ) {
|
||||||
length += resultToken.length();
|
length += resultToken.length();
|
||||||
termAtt.setLength(length);
|
termAtt.setLength(length);
|
||||||
finalOffset = correctOffset(length);
|
finalOffset = correctOffset(startPosition + length);
|
||||||
offsetAtt.setOffset(correctOffset(0), finalOffset);
|
offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
|
||||||
if( added ){
|
if( added ){
|
||||||
resultToken.setLength(0);
|
resultToken.setLength(0);
|
||||||
resultToken.append(termAtt.buffer(), 0, length);
|
resultToken.append(termAtt.buffer(), 0, length);
|
||||||
}
|
}
|
||||||
return added;
|
return added;
|
||||||
}
|
}
|
||||||
|
else{
|
||||||
|
finalOffset = correctOffset(startPosition + length);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if( !added ){
|
||||||
added = true;
|
added = true;
|
||||||
|
skipped++;
|
||||||
|
if( skipped > skip ){
|
||||||
|
termAtt.append(c == delimiter ? replacement : (char)c);
|
||||||
|
length++;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
startPosition++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
if( c == delimiter ){
|
if( c == delimiter ){
|
||||||
if( length > 0 ){
|
if( skipped > skip ){
|
||||||
endDelimiter = true;
|
endDelimiter = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else{
|
skipped++;
|
||||||
|
if( skipped > skip ){
|
||||||
termAtt.append(replacement);
|
termAtt.append(replacement);
|
||||||
length++;
|
length++;
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
startPosition++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
if( skipped > skip ){
|
||||||
termAtt.append((char)c);
|
termAtt.append((char)c);
|
||||||
length++;
|
length++;
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
startPosition++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
length += resultToken.length();
|
length += resultToken.length();
|
||||||
termAtt.setLength(length);
|
termAtt.setLength(length);
|
||||||
finalOffset = correctOffset(length);
|
finalOffset = correctOffset(startPosition + length);
|
||||||
offsetAtt.setOffset(correctOffset(0), finalOffset);
|
offsetAtt.setOffset(correctOffset(startPosition), finalOffset);
|
||||||
resultToken.setLength(0);
|
resultToken.setLength(0);
|
||||||
resultToken.append(termAtt.buffer(), 0, length);
|
resultToken.append(termAtt.buffer(), 0, length);
|
||||||
return true;
|
return true;
|
||||||
|
@ -146,5 +186,6 @@ public class PathHierarchyTokenizer extends Tokenizer {
|
||||||
resultToken.setLength(0);
|
resultToken.setLength(0);
|
||||||
finalOffset = 0;
|
finalOffset = 0;
|
||||||
endDelimiter = false;
|
endDelimiter = false;
|
||||||
|
skipped = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,173 @@
|
||||||
|
package org.apache.lucene.analysis.path;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* Take something like:
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* www.site.co.uk
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* and make:
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* www.site.co.uk
|
||||||
|
* site.co.uk
|
||||||
|
* co.uk
|
||||||
|
* uk
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class ReversePathHierarchyTokenizer extends Tokenizer {
|
||||||
|
|
||||||
|
public ReversePathHierarchyTokenizer(Reader input) {
|
||||||
|
this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReversePathHierarchyTokenizer(Reader input, int skip) {
|
||||||
|
this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, skip);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter) {
|
||||||
|
this(input, bufferSize, delimiter, delimiter, DEFAULT_SKIP);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReversePathHierarchyTokenizer(Reader input, char delimiter, char replacement) {
|
||||||
|
this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, DEFAULT_SKIP);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement) {
|
||||||
|
this(input, bufferSize, delimiter, replacement, DEFAULT_SKIP);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReversePathHierarchyTokenizer(Reader input, char delimiter, int skip) {
|
||||||
|
this(input, DEFAULT_BUFFER_SIZE, delimiter, delimiter, skip);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReversePathHierarchyTokenizer(Reader input, char delimiter, char replacement, int skip) {
|
||||||
|
this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, skip);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) {
|
||||||
|
super(input);
|
||||||
|
termAtt.resizeBuffer(bufferSize);
|
||||||
|
this.delimiter = delimiter;
|
||||||
|
this.replacement = replacement;
|
||||||
|
this.skip = skip;
|
||||||
|
resultToken = new StringBuilder(bufferSize);
|
||||||
|
resultTokenBuffer = new char[bufferSize];
|
||||||
|
delimiterPositions = new ArrayList<Integer>(bufferSize/10);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final int DEFAULT_BUFFER_SIZE = 1024;
|
||||||
|
public static final char DEFAULT_DELIMITER = '/';
|
||||||
|
public static final int DEFAULT_SKIP = 0;
|
||||||
|
|
||||||
|
private final char delimiter;
|
||||||
|
private final char replacement;
|
||||||
|
private final int skip;
|
||||||
|
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
|
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
|
||||||
|
private int endPosition = 0;
|
||||||
|
private int finalOffset = 0;
|
||||||
|
private int skipped = 0;
|
||||||
|
private StringBuilder resultToken;
|
||||||
|
|
||||||
|
private List<Integer> delimiterPositions;
|
||||||
|
private int delimitersCount = -1;
|
||||||
|
private char[] resultTokenBuffer;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public final boolean incrementToken() throws IOException {
|
||||||
|
clearAttributes();
|
||||||
|
if(delimitersCount == -1){
|
||||||
|
int length = 0;
|
||||||
|
delimiterPositions.add(0);
|
||||||
|
while (true) {
|
||||||
|
int c = input.read();
|
||||||
|
if( c < 0 ) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
length++;
|
||||||
|
if( c == delimiter ) {
|
||||||
|
delimiterPositions.add(length);
|
||||||
|
resultToken.append(replacement);
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
resultToken.append((char)c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
delimitersCount = delimiterPositions.size();
|
||||||
|
if( delimiterPositions.get(delimitersCount-1) < length ){
|
||||||
|
delimiterPositions.add(length);
|
||||||
|
delimitersCount++;
|
||||||
|
}
|
||||||
|
if( resultTokenBuffer.length < resultToken.length() ){
|
||||||
|
resultTokenBuffer = new char[resultToken.length()];
|
||||||
|
}
|
||||||
|
resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0);
|
||||||
|
resultToken.setLength(0);
|
||||||
|
endPosition = delimiterPositions.get(delimitersCount-1 - skip);
|
||||||
|
finalOffset = correctOffset(length);
|
||||||
|
posAtt.setPositionIncrement(1);
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
posAtt.setPositionIncrement(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
while( skipped < delimitersCount-skip-1 ){
|
||||||
|
int start = delimiterPositions.get(skipped);
|
||||||
|
termAtt.copyBuffer(resultTokenBuffer, start, endPosition - start);
|
||||||
|
offsetAtt.setOffset(correctOffset(start), correctOffset(endPosition));
|
||||||
|
skipped++;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public final void end() {
|
||||||
|
// set final offset
|
||||||
|
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset(Reader input) throws IOException {
|
||||||
|
super.reset(input);
|
||||||
|
resultToken.setLength(0);
|
||||||
|
finalOffset = 0;
|
||||||
|
skipped = 0;
|
||||||
|
delimitersCount = -1;
|
||||||
|
delimiterPositions.clear();
|
||||||
|
}
|
||||||
|
}
|
|
@ -127,4 +127,70 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
||||||
new int[]{1, 0, 0, 0},
|
new int[]{1, 0, 0, 0},
|
||||||
path.length());
|
path.length());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testBasicSkip() throws Exception {
|
||||||
|
String path = "/a/b/c";
|
||||||
|
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/b", "/b/c"},
|
||||||
|
new int[]{2, 2},
|
||||||
|
new int[]{4, 6},
|
||||||
|
new int[]{1, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEndOfDelimiterSkip() throws Exception {
|
||||||
|
String path = "/a/b/c/";
|
||||||
|
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/b", "/b/c", "/b/c/"},
|
||||||
|
new int[]{2, 2, 2},
|
||||||
|
new int[]{4, 6, 7},
|
||||||
|
new int[]{1, 0, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStartOfCharSkip() throws Exception {
|
||||||
|
String path = "a/b/c";
|
||||||
|
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/b", "/b/c"},
|
||||||
|
new int[]{1, 1},
|
||||||
|
new int[]{3, 5},
|
||||||
|
new int[]{1, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStartOfCharEndOfDelimiterSkip() throws Exception {
|
||||||
|
String path = "a/b/c/";
|
||||||
|
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/b", "/b/c", "/b/c/"},
|
||||||
|
new int[]{1, 1, 1},
|
||||||
|
new int[]{3, 5, 6},
|
||||||
|
new int[]{1, 0, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOnlyDelimiterSkip() throws Exception {
|
||||||
|
String path = "/";
|
||||||
|
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{},
|
||||||
|
new int[]{},
|
||||||
|
new int[]{},
|
||||||
|
new int[]{},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOnlyDelimitersSkip() throws Exception {
|
||||||
|
String path = "//";
|
||||||
|
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/"},
|
||||||
|
new int[]{1},
|
||||||
|
new int[]{2},
|
||||||
|
new int[]{1},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,157 @@
|
||||||
|
package org.apache.lucene.analysis.path;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
|
||||||
|
public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
public void testBasicReverse() throws Exception {
|
||||||
|
String path = "/a/b/c";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/a/b/c", "a/b/c", "b/c", "c"},
|
||||||
|
new int[]{0, 1, 3, 5},
|
||||||
|
new int[]{6, 6, 6, 6},
|
||||||
|
new int[]{1, 0, 0, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEndOfDelimiterReverse() throws Exception {
|
||||||
|
String path = "/a/b/c/";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/a/b/c/", "a/b/c/", "b/c/", "c/"},
|
||||||
|
new int[]{0, 1, 3, 5},
|
||||||
|
new int[]{7, 7, 7, 7},
|
||||||
|
new int[]{1, 0, 0, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStartOfCharReverse() throws Exception {
|
||||||
|
String path = "a/b/c";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"a/b/c", "b/c", "c"},
|
||||||
|
new int[]{0, 2, 4},
|
||||||
|
new int[]{5, 5, 5},
|
||||||
|
new int[]{1, 0, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStartOfCharEndOfDelimiterReverse() throws Exception {
|
||||||
|
String path = "a/b/c/";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"a/b/c/", "b/c/", "c/"},
|
||||||
|
new int[]{0, 2, 4},
|
||||||
|
new int[]{6, 6, 6},
|
||||||
|
new int[]{1, 0, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOnlyDelimiterReverse() throws Exception {
|
||||||
|
String path = "/";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/"},
|
||||||
|
new int[]{0},
|
||||||
|
new int[]{1},
|
||||||
|
new int[]{1},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOnlyDelimitersReverse() throws Exception {
|
||||||
|
String path = "//";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"//", "/"},
|
||||||
|
new int[]{0, 1},
|
||||||
|
new int[]{2, 2},
|
||||||
|
new int[]{1, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEndOfDelimiterReverseSkip() throws Exception {
|
||||||
|
String path = "/a/b/c/";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/a/b/", "a/b/", "b/"},
|
||||||
|
new int[]{0, 1, 3},
|
||||||
|
new int[]{5, 5, 5},
|
||||||
|
new int[]{1, 0, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStartOfCharReverseSkip() throws Exception {
|
||||||
|
String path = "a/b/c";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"a/b/", "b/"},
|
||||||
|
new int[]{0, 2},
|
||||||
|
new int[]{4, 4},
|
||||||
|
new int[]{1, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStartOfCharEndOfDelimiterReverseSkip() throws Exception {
|
||||||
|
String path = "a/b/c/";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"a/b/", "b/"},
|
||||||
|
new int[]{0, 2},
|
||||||
|
new int[]{4, 4},
|
||||||
|
new int[]{1, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOnlyDelimiterReverseSkip() throws Exception {
|
||||||
|
String path = "/";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{},
|
||||||
|
new int[]{},
|
||||||
|
new int[]{},
|
||||||
|
new int[]{},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOnlyDelimitersReverseSkip() throws Exception {
|
||||||
|
String path = "//";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 1 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/"},
|
||||||
|
new int[]{0},
|
||||||
|
new int[]{1},
|
||||||
|
new int[]{1},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testReverseSkip2() throws Exception {
|
||||||
|
String path = "/a/b/c/";
|
||||||
|
ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 2 );
|
||||||
|
assertTokenStreamContents(t,
|
||||||
|
new String[]{"/a/", "a/"},
|
||||||
|
new int[]{0, 1},
|
||||||
|
new int[]{3, 3},
|
||||||
|
new int[]{1, 0},
|
||||||
|
path.length());
|
||||||
|
}
|
||||||
|
}
|
|
@ -21,6 +21,7 @@ import java.util.Map;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
|
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
|
||||||
|
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -37,6 +38,8 @@ public class PathHierarchyTokenizerFactory extends BaseTokenizerFactory {
|
||||||
|
|
||||||
private char delimiter;
|
private char delimiter;
|
||||||
private char replacement;
|
private char replacement;
|
||||||
|
private boolean reverse = false;
|
||||||
|
private int skip = PathHierarchyTokenizer.DEFAULT_SKIP;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Require a configured pattern
|
* Require a configured pattern
|
||||||
|
@ -70,10 +73,23 @@ public class PathHierarchyTokenizerFactory extends BaseTokenizerFactory {
|
||||||
else{
|
else{
|
||||||
replacement = delimiter;
|
replacement = delimiter;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
v = args.get( "reverse" );
|
||||||
|
if( v != null ){
|
||||||
|
reverse = "true".equals( v );
|
||||||
|
}
|
||||||
|
|
||||||
|
v = args.get( "skip" );
|
||||||
|
if( v != null ){
|
||||||
|
skip = Integer.parseInt( v );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Tokenizer create(Reader input) {
|
public Tokenizer create(Reader input) {
|
||||||
return new PathHierarchyTokenizer(input, delimiter, replacement);
|
if( reverse ) {
|
||||||
|
return new ReversePathHierarchyTokenizer(input, delimiter, replacement, skip);
|
||||||
|
}
|
||||||
|
return new PathHierarchyTokenizer(input, delimiter, replacement, skip);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue