SOLR-1057: Add PathHierarchyTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1067131 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2011-02-04 10:19:52 +00:00
parent 1f9a474116
commit 6f31407109
6 changed files with 363 additions and 0 deletions

View File

@ -80,6 +80,9 @@ New Features
- o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
- o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
* SOLR-1057: Add PathHierarchyTokenizer that represents file path hierarchies as synonyms of
/something, /something/something, /something/something/else. (Ryan McKinley, Koji Sekiguchi)
Build
* LUCENE-2413: All analyzers in contrib/analyzers and contrib/icu were moved to the

View File

@ -0,0 +1,150 @@
package org.apache.lucene.analysis.path;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
*
* Take something like:
*
* <pre>
* /soemthing/something/else
* </pre>
*
* and make:
*
* <pre>
* /soemthing
* /soemthing/something
* /soemthing/something/else
* </pre>
*
*/
public class PathHierarchyTokenizer extends Tokenizer {
public PathHierarchyTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER);
}
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter) {
this(input, bufferSize, delimiter, delimiter);
}
public PathHierarchyTokenizer(Reader input, char delimiter, char replacement) {
this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement);
}
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement) {
super(input);
termAtt.resizeBuffer(bufferSize);
this.delimiter = delimiter;
this.replacement = replacement;
endDelimiter = false;
resultToken = new StringBuilder(bufferSize);
}
private static final int DEFAULT_BUFFER_SIZE = 1024;
public static final char DEFAULT_DELIMITER = '/';
private final char delimiter;
private final char replacement;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
private int finalOffset = 0;
private boolean endDelimiter;
private StringBuilder resultToken;
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
termAtt.append( resultToken );
if(resultToken.length() == 0){
posAtt.setPositionIncrement(1);
}
else{
posAtt.setPositionIncrement(0);
}
int length = 0;
boolean added = false;
if( endDelimiter ){
termAtt.append(replacement);
length++;
endDelimiter = false;
added = true;
}
while (true) {
int c = input.read();
if( c < 0 ) {
length += resultToken.length();
termAtt.setLength(length);
finalOffset = correctOffset(length);
offsetAtt.setOffset(correctOffset(0), finalOffset);
if( added ){
resultToken.setLength(0);
resultToken.append(termAtt.buffer(), 0, length);
}
return added;
}
added = true;
if( c == delimiter ) {
if( length > 0 ){
endDelimiter = true;
break;
}
else{
termAtt.append(replacement);
length++;
}
}
else {
termAtt.append((char)c);
length++;
}
}
length += resultToken.length();
termAtt.setLength(length);
finalOffset = correctOffset(length);
offsetAtt.setOffset(correctOffset(0), finalOffset);
resultToken.setLength(0);
resultToken.append(termAtt.buffer(), 0, length);
return true;
}
@Override
public final void end() {
// set final offset
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
resultToken.setLength(0);
finalOffset = 0;
endDelimiter = false;
}
}

View File

@ -0,0 +1,130 @@
package org.apache.lucene.analysis.path;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
public void testBasic() throws Exception {
String path = "/a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"/a", "/a/b", "/a/b/c"},
new int[]{0, 0, 0},
new int[]{2, 4, 6},
new int[]{1, 0, 0},
path.length());
}
public void testEndOfDelimiter() throws Exception {
String path = "/a/b/c/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"/a", "/a/b", "/a/b/c", "/a/b/c/"},
new int[]{0, 0, 0, 0},
new int[]{2, 4, 6, 7},
new int[]{1, 0, 0, 0},
path.length());
}
public void testStartOfChar() throws Exception {
String path = "a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"a", "a/b", "a/b/c"},
new int[]{0, 0, 0},
new int[]{1, 3, 5},
new int[]{1, 0, 0},
path.length());
}
public void testStartOfCharEndOfDelimiter() throws Exception {
String path = "a/b/c/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"a", "a/b", "a/b/c", "a/b/c/"},
new int[]{0, 0, 0, 0},
new int[]{1, 3, 5, 6},
new int[]{1, 0, 0, 0},
path.length());
}
public void testOnlyDelimiter() throws Exception {
String path = "/";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"/"},
new int[]{0},
new int[]{1},
new int[]{1},
path.length());
}
public void testOnlyDelimiters() throws Exception {
String path = "//";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
assertTokenStreamContents(t,
new String[]{"/", "//"},
new int[]{0, 0},
new int[]{1, 2},
new int[]{1, 0},
path.length());
}
public void testReplace() throws Exception {
String path = "/a/b/c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), '/', '\\' );
assertTokenStreamContents(t,
new String[]{"\\a", "\\a\\b", "\\a\\b\\c"},
new int[]{0, 0, 0},
new int[]{2, 4, 6},
new int[]{1, 0, 0},
path.length());
}
public void testWindowsPath() throws Exception {
String path = "c:\\a\\b\\c";
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), '\\', '\\' );
assertTokenStreamContents(t,
new String[]{"c:", "c:\\a", "c:\\a\\b", "c:\\a\\b\\c"},
new int[]{0, 0, 0, 0},
new int[]{2, 4, 6, 8},
new int[]{1, 0, 0, 0},
path.length());
}
public void testNormalizeWinDelimToLinuxDelim() throws Exception {
NormalizeCharMap normMap = new NormalizeCharMap();
normMap.add("\\", "/");
String path = "c:\\a\\b\\c";
CharStream cs = new MappingCharFilter(normMap, new StringReader(path));
PathHierarchyTokenizer t = new PathHierarchyTokenizer( cs );
assertTokenStreamContents(t,
new String[]{"c:", "c:/a", "c:/a/b", "c:/a/b/c"},
new int[]{0, 0, 0, 0},
new int[]{2, 4, 6, 8},
new int[]{1, 0, 0, 0},
path.length());
}
}

View File

@ -441,6 +441,8 @@ New Features
* SOLR-860: Add debug output for MoreLikeThis. (koji)
* SOLR-1057: Add PathHierarchyTokenizerFactory. (ryan, koji)
Optimizations
----------------------

View File

@ -376,6 +376,11 @@
</analyzer>
</fieldType>
<fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.PathTokenizerFactory"/>
</analyzer>
</fieldType>
<!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. -->

View File

@ -0,0 +1,73 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
/**
* @version $Id$
*/
public class PathHierarchyTokenizerFactory extends BaseTokenizerFactory {
private char delimiter;
private char replacement;
/**
* Require a configured pattern
*/
@Override
public void init(Map<String,String> args){
super.init( args );
String v = args.get( "delimiter" );
if( v != null ){
if( v.length() != 1 ){
throw new IllegalArgumentException( "delimiter should be a char. \"" + v + "\" is invalid" );
}
else{
delimiter = v.charAt(0);
}
}
else{
delimiter = PathHierarchyTokenizer.DEFAULT_DELIMITER;
}
v = args.get( "replace" );
if( v != null ){
if( v.length() != 1 ){
throw new IllegalArgumentException( "replace should be a char. \"" + v + "\" is invalid" );
}
else{
replacement = v.charAt(0);
}
}
else{
replacement = delimiter;
}
}
public Tokenizer create(Reader input) {
return new PathHierarchyTokenizer(input, delimiter, replacement);
}
}