mirror of https://github.com/apache/lucene.git
SOLR-1057: Add PathHierarchyTokenizer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1067131 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1f9a474116
commit
6f31407109
|
@ -80,6 +80,9 @@ New Features
|
|||
- o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
|
||||
- o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
|
||||
|
||||
* SOLR-1057: Add PathHierarchyTokenizer that represents file path hierarchies as synonyms of
|
||||
/something, /something/something, /something/something/else. (Ryan McKinley, Koji Sekiguchi)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2413: All analyzers in contrib/analyzers and contrib/icu were moved to the
|
||||
|
|
|
@ -0,0 +1,150 @@
|
|||
package org.apache.lucene.analysis.path;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
||||
/**
|
||||
*
|
||||
* Take something like:
|
||||
*
|
||||
* <pre>
|
||||
* /soemthing/something/else
|
||||
* </pre>
|
||||
*
|
||||
* and make:
|
||||
*
|
||||
* <pre>
|
||||
* /soemthing
|
||||
* /soemthing/something
|
||||
* /soemthing/something/else
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
public class PathHierarchyTokenizer extends Tokenizer {
|
||||
|
||||
public PathHierarchyTokenizer(Reader input) {
|
||||
this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER);
|
||||
}
|
||||
|
||||
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter) {
|
||||
this(input, bufferSize, delimiter, delimiter);
|
||||
}
|
||||
|
||||
public PathHierarchyTokenizer(Reader input, char delimiter, char replacement) {
|
||||
this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement);
|
||||
}
|
||||
|
||||
public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement) {
|
||||
super(input);
|
||||
termAtt.resizeBuffer(bufferSize);
|
||||
this.delimiter = delimiter;
|
||||
this.replacement = replacement;
|
||||
endDelimiter = false;
|
||||
resultToken = new StringBuilder(bufferSize);
|
||||
}
|
||||
|
||||
private static final int DEFAULT_BUFFER_SIZE = 1024;
|
||||
public static final char DEFAULT_DELIMITER = '/';
|
||||
private final char delimiter;
|
||||
private final char replacement;
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private int finalOffset = 0;
|
||||
private boolean endDelimiter;
|
||||
private StringBuilder resultToken;
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
termAtt.append( resultToken );
|
||||
if(resultToken.length() == 0){
|
||||
posAtt.setPositionIncrement(1);
|
||||
}
|
||||
else{
|
||||
posAtt.setPositionIncrement(0);
|
||||
}
|
||||
int length = 0;
|
||||
boolean added = false;
|
||||
if( endDelimiter ){
|
||||
termAtt.append(replacement);
|
||||
length++;
|
||||
endDelimiter = false;
|
||||
added = true;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
int c = input.read();
|
||||
if( c < 0 ) {
|
||||
length += resultToken.length();
|
||||
termAtt.setLength(length);
|
||||
finalOffset = correctOffset(length);
|
||||
offsetAtt.setOffset(correctOffset(0), finalOffset);
|
||||
if( added ){
|
||||
resultToken.setLength(0);
|
||||
resultToken.append(termAtt.buffer(), 0, length);
|
||||
}
|
||||
return added;
|
||||
}
|
||||
added = true;
|
||||
if( c == delimiter ) {
|
||||
if( length > 0 ){
|
||||
endDelimiter = true;
|
||||
break;
|
||||
}
|
||||
else{
|
||||
termAtt.append(replacement);
|
||||
length++;
|
||||
}
|
||||
}
|
||||
else {
|
||||
termAtt.append((char)c);
|
||||
length++;
|
||||
}
|
||||
}
|
||||
|
||||
length += resultToken.length();
|
||||
termAtt.setLength(length);
|
||||
finalOffset = correctOffset(length);
|
||||
offsetAtt.setOffset(correctOffset(0), finalOffset);
|
||||
resultToken.setLength(0);
|
||||
resultToken.append(termAtt.buffer(), 0, length);
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
resultToken.setLength(0);
|
||||
finalOffset = 0;
|
||||
endDelimiter = false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,130 @@
|
|||
package org.apache.lucene.analysis.path;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
|
||||
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
|
||||
|
||||
public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testBasic() throws Exception {
|
||||
String path = "/a/b/c";
|
||||
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
|
||||
assertTokenStreamContents(t,
|
||||
new String[]{"/a", "/a/b", "/a/b/c"},
|
||||
new int[]{0, 0, 0},
|
||||
new int[]{2, 4, 6},
|
||||
new int[]{1, 0, 0},
|
||||
path.length());
|
||||
}
|
||||
|
||||
public void testEndOfDelimiter() throws Exception {
|
||||
String path = "/a/b/c/";
|
||||
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
|
||||
assertTokenStreamContents(t,
|
||||
new String[]{"/a", "/a/b", "/a/b/c", "/a/b/c/"},
|
||||
new int[]{0, 0, 0, 0},
|
||||
new int[]{2, 4, 6, 7},
|
||||
new int[]{1, 0, 0, 0},
|
||||
path.length());
|
||||
}
|
||||
|
||||
public void testStartOfChar() throws Exception {
|
||||
String path = "a/b/c";
|
||||
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
|
||||
assertTokenStreamContents(t,
|
||||
new String[]{"a", "a/b", "a/b/c"},
|
||||
new int[]{0, 0, 0},
|
||||
new int[]{1, 3, 5},
|
||||
new int[]{1, 0, 0},
|
||||
path.length());
|
||||
}
|
||||
|
||||
public void testStartOfCharEndOfDelimiter() throws Exception {
|
||||
String path = "a/b/c/";
|
||||
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
|
||||
assertTokenStreamContents(t,
|
||||
new String[]{"a", "a/b", "a/b/c", "a/b/c/"},
|
||||
new int[]{0, 0, 0, 0},
|
||||
new int[]{1, 3, 5, 6},
|
||||
new int[]{1, 0, 0, 0},
|
||||
path.length());
|
||||
}
|
||||
|
||||
public void testOnlyDelimiter() throws Exception {
|
||||
String path = "/";
|
||||
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
|
||||
assertTokenStreamContents(t,
|
||||
new String[]{"/"},
|
||||
new int[]{0},
|
||||
new int[]{1},
|
||||
new int[]{1},
|
||||
path.length());
|
||||
}
|
||||
|
||||
public void testOnlyDelimiters() throws Exception {
|
||||
String path = "//";
|
||||
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path) );
|
||||
assertTokenStreamContents(t,
|
||||
new String[]{"/", "//"},
|
||||
new int[]{0, 0},
|
||||
new int[]{1, 2},
|
||||
new int[]{1, 0},
|
||||
path.length());
|
||||
}
|
||||
|
||||
public void testReplace() throws Exception {
|
||||
String path = "/a/b/c";
|
||||
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), '/', '\\' );
|
||||
assertTokenStreamContents(t,
|
||||
new String[]{"\\a", "\\a\\b", "\\a\\b\\c"},
|
||||
new int[]{0, 0, 0},
|
||||
new int[]{2, 4, 6},
|
||||
new int[]{1, 0, 0},
|
||||
path.length());
|
||||
}
|
||||
|
||||
public void testWindowsPath() throws Exception {
|
||||
String path = "c:\\a\\b\\c";
|
||||
PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), '\\', '\\' );
|
||||
assertTokenStreamContents(t,
|
||||
new String[]{"c:", "c:\\a", "c:\\a\\b", "c:\\a\\b\\c"},
|
||||
new int[]{0, 0, 0, 0},
|
||||
new int[]{2, 4, 6, 8},
|
||||
new int[]{1, 0, 0, 0},
|
||||
path.length());
|
||||
}
|
||||
|
||||
public void testNormalizeWinDelimToLinuxDelim() throws Exception {
|
||||
NormalizeCharMap normMap = new NormalizeCharMap();
|
||||
normMap.add("\\", "/");
|
||||
String path = "c:\\a\\b\\c";
|
||||
CharStream cs = new MappingCharFilter(normMap, new StringReader(path));
|
||||
PathHierarchyTokenizer t = new PathHierarchyTokenizer( cs );
|
||||
assertTokenStreamContents(t,
|
||||
new String[]{"c:", "c:/a", "c:/a/b", "c:/a/b/c"},
|
||||
new int[]{0, 0, 0, 0},
|
||||
new int[]{2, 4, 6, 8},
|
||||
new int[]{1, 0, 0, 0},
|
||||
path.length());
|
||||
}
|
||||
}
|
|
@ -441,6 +441,8 @@ New Features
|
|||
|
||||
* SOLR-860: Add debug output for MoreLikeThis. (koji)
|
||||
|
||||
* SOLR-1057: Add PathHierarchyTokenizerFactory. (ryan, koji)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -376,6 +376,11 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.PathTokenizerFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- since fields of this type are by default not stored or indexed,
|
||||
any data added to them will be ignored outright. -->
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
package org.apache.solr.analysis;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
|
||||
|
||||
|
||||
/**
|
||||
* @version $Id$
|
||||
*/
|
||||
public class PathHierarchyTokenizerFactory extends BaseTokenizerFactory {
|
||||
|
||||
private char delimiter;
|
||||
private char replacement;
|
||||
|
||||
/**
|
||||
* Require a configured pattern
|
||||
*/
|
||||
@Override
|
||||
public void init(Map<String,String> args){
|
||||
super.init( args );
|
||||
|
||||
String v = args.get( "delimiter" );
|
||||
if( v != null ){
|
||||
if( v.length() != 1 ){
|
||||
throw new IllegalArgumentException( "delimiter should be a char. \"" + v + "\" is invalid" );
|
||||
}
|
||||
else{
|
||||
delimiter = v.charAt(0);
|
||||
}
|
||||
}
|
||||
else{
|
||||
delimiter = PathHierarchyTokenizer.DEFAULT_DELIMITER;
|
||||
}
|
||||
|
||||
v = args.get( "replace" );
|
||||
if( v != null ){
|
||||
if( v.length() != 1 ){
|
||||
throw new IllegalArgumentException( "replace should be a char. \"" + v + "\" is invalid" );
|
||||
}
|
||||
else{
|
||||
replacement = v.charAt(0);
|
||||
}
|
||||
}
|
||||
else{
|
||||
replacement = delimiter;
|
||||
}
|
||||
}
|
||||
|
||||
public Tokenizer create(Reader input) {
|
||||
return new PathHierarchyTokenizer(input, delimiter, replacement);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue