mirror of https://github.com/apache/lucene.git
Upgraded to Lucene 2.9-dev r794238.
Other changes include: LUCENE-1614 - Use Lucene's DocIdSetIterator.NO_MORE_DOCS as the sentinel value. LUCENE-1630 - Add acceptsDocsOutOfOrder method to Collector implementations. LUCENE-1673, LUCENE-1701 - Trie has moved to Lucene core and renamed to NumericRangeQuery. LUCENE-1662, LUCENE-1687 - Replace usage of ExtendedFieldCache by FieldCache. SOLR-1241: Solr's CharFilter has been moved to Lucene. Remove CharFilter and related classes from Solr and use Lucene's corresponding code. SOLR-1261: Lucene trunk renamed RangeQuery & Co to TermRangeQuery. git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@794328 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c9eb4ea1ee
commit
03cf5cdad6
11
CHANGES.txt
11
CHANGES.txt
|
@ -552,6 +552,17 @@ Other Changes
|
||||||
hitting "/admin/logging.jsp" should switch to "/admin/logging".
|
hitting "/admin/logging.jsp" should switch to "/admin/logging".
|
||||||
(hossman)
|
(hossman)
|
||||||
|
|
||||||
|
42. Upgraded to Lucene 2.9-dev r794238. Other changes include:
|
||||||
|
LUCENE-1614 - Use Lucene's DocIdSetIterator.NO_MORE_DOCS as the sentinel value.
|
||||||
|
LUCENE-1630 - Add acceptsDocsOutOfOrder method to Collector implementations.
|
||||||
|
LUCENE-1673, LUCENE-1701 - Trie has moved to Lucene core and renamed to NumericRangeQuery.
|
||||||
|
LUCENE-1662, LUCENE-1687 - Replace usage of ExtendedFieldCache by FieldCache.
|
||||||
|
(shalin)
|
||||||
|
|
||||||
|
43. SOLR-1241: Solr's CharFilter has been moved to Lucene. Remove CharFilter and related classes
|
||||||
|
from Solr and use Lucene's corresponding code (koji via shalin)
|
||||||
|
|
||||||
|
44. SOLR-1261: Lucene trunk renamed RangeQuery & Co to TermRangeQuery (Uwe Schindler via shalin)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
----------------------
|
----------------------
|
||||||
|
|
|
@ -252,12 +252,12 @@
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
<!-- charFilter + "CharStream aware" WhitespaceTokenizer -->
|
<!-- charFilter + WhitespaceTokenizer -->
|
||||||
<!--
|
<!--
|
||||||
<fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
|
<fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
|
||||||
<analyzer>
|
<analyzer>
|
||||||
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
|
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
|
||||||
<tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
-->
|
-->
|
||||||
|
@ -347,7 +347,7 @@
|
||||||
<!-- "default" values can be specified for fields, indicating which
|
<!-- "default" values can be specified for fields, indicating which
|
||||||
value should be used if no value is specified when adding a document.
|
value should be used if no value is specified when adding a document.
|
||||||
-->
|
-->
|
||||||
<field name="popularity" type="sint" indexed="true" stored="true" default="0"/>
|
<field name="popularity" type="sint" indexed="true" stored="true"/>
|
||||||
<field name="inStock" type="boolean" indexed="true" stored="true"/>
|
<field name="inStock" type="boolean" indexed="true" stored="true"/>
|
||||||
|
|
||||||
<!-- Some sample docs exist solely to demonstrate the spellchecker
|
<!-- Some sample docs exist solely to demonstrate the spellchecker
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
AnyObjectId[2e6629706d0dc36f2a3d6bef6f6bbc2dec9716cb] was removed in git history.
|
AnyObjectId[3913f541b7e2915956524f4fc7ee4254dabc1449] was removed in git history.
|
||||||
Apache SVN contains full history.
|
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
||||||
AnyObjectId[e2eea446e51f23e97164689936003016f555f807] was removed in git history.
|
AnyObjectId[51b1184b0a653dbe09561e08cb7bb30936ccdd19] was removed in git history.
|
||||||
Apache SVN contains full history.
|
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
||||||
AnyObjectId[c072989b47055f39817417190760c4180da20bd1] was removed in git history.
|
AnyObjectId[8732882f60d8c2c314257d02e1fb35e662313c14] was removed in git history.
|
||||||
Apache SVN contains full history.
|
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
||||||
AnyObjectId[358e8188673e16f06243bfd926405a2a5659e0c8] was removed in git history.
|
AnyObjectId[4a6bad8fd3391c2dabdd8762d7fdff47511c8012] was removed in git history.
|
||||||
Apache SVN contains full history.
|
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
||||||
AnyObjectId[150d8a5ed1d794a503ded3becfaab7d0ef16a131] was removed in git history.
|
AnyObjectId[89ffe35842473c57edcbecd24a116b6993826ae1] was removed in git history.
|
||||||
Apache SVN contains full history.
|
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
||||||
AnyObjectId[4b73ea8711ca57acc3e4ad465e488dbed07880a2] was removed in git history.
|
AnyObjectId[a2210e09cef58fe74c62b3cd67b995263477c999] was removed in git history.
|
||||||
Apache SVN contains full history.
|
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
||||||
AnyObjectId[b039d8393c248d6d4bb093137d247d7ee157be68] was removed in git history.
|
AnyObjectId[53b91de6e65f2610ba49f5870efb18df8b6f8398] was removed in git history.
|
||||||
Apache SVN contains full history.
|
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
||||||
AnyObjectId[b4ae0e32b791ab75b21d23a27eae798a6fccc499] was removed in git history.
|
AnyObjectId[668555685e6f196033d4aff7aaf22e1913205c23] was removed in git history.
|
||||||
Apache SVN contains full history.
|
Apache SVN contains full history.
|
|
@ -1,75 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
* @since Solr 1.4
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public abstract class BaseCharFilter extends CharFilter {
|
|
||||||
|
|
||||||
private List<OffCorrectMap> pcmList;
|
|
||||||
|
|
||||||
public BaseCharFilter( CharStream in ){
|
|
||||||
super(in);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected int correct( int currentOff ){
|
|
||||||
if( pcmList == null || pcmList.isEmpty() ) return currentOff;
|
|
||||||
for( int i = pcmList.size() - 1; i >= 0; i-- ){
|
|
||||||
if( currentOff >= pcmList.get( i ).off )
|
|
||||||
return currentOff + pcmList.get( i ).cumulativeDiff;
|
|
||||||
}
|
|
||||||
return currentOff;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected int getLastCumulativeDiff(){
|
|
||||||
return pcmList == null || pcmList.isEmpty() ? 0 : pcmList.get( pcmList.size() - 1 ).cumulativeDiff;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void addOffCorrectMap( int off, int cumulativeDiff ){
|
|
||||||
if( pcmList == null ) pcmList = new ArrayList<OffCorrectMap>();
|
|
||||||
pcmList.add( new OffCorrectMap( off, cumulativeDiff ) );
|
|
||||||
}
|
|
||||||
|
|
||||||
static class OffCorrectMap {
|
|
||||||
|
|
||||||
int off;
|
|
||||||
int cumulativeDiff;
|
|
||||||
|
|
||||||
OffCorrectMap( int off, int cumulativeDiff ){
|
|
||||||
this.off = off;
|
|
||||||
this.cumulativeDiff = cumulativeDiff;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toString(){
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
sb.append('(');
|
|
||||||
sb.append(off);
|
|
||||||
sb.append(',');
|
|
||||||
sb.append(cumulativeDiff);
|
|
||||||
sb.append(')');
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,75 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* Subclasses of CharFilter can be chained to filter CharStream.
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
* @since Solr 1.4
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public abstract class CharFilter extends CharStream {
|
|
||||||
|
|
||||||
protected CharStream input;
|
|
||||||
|
|
||||||
protected CharFilter( CharStream in ){
|
|
||||||
input = in;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* Subclass may want to override to correct the current offset.
|
|
||||||
*
|
|
||||||
* @param currentOff current offset
|
|
||||||
* @return corrected offset
|
|
||||||
*/
|
|
||||||
protected int correct( int currentOff ){
|
|
||||||
return currentOff;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public final int correctOffset(int currentOff) {
|
|
||||||
return input.correctOffset( correct( currentOff ) );
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() throws IOException {
|
|
||||||
input.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
|
||||||
return input.read(cbuf, off, len);
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean markSupported(){
|
|
||||||
return input.markSupported();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void mark( int readAheadLimit ) throws IOException {
|
|
||||||
input.mark(readAheadLimit);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void reset() throws IOException {
|
|
||||||
input.reset();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -19,6 +19,8 @@ package org.apache.solr.analysis;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
|
|
|
@ -1,69 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Reader;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* CharReader is a Reader wrapper. It reads chars from Reader and outputs CharStream.
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
* @since Solr 1.4
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public final class CharReader extends CharStream {
|
|
||||||
|
|
||||||
protected Reader input;
|
|
||||||
|
|
||||||
public static CharStream get( Reader input ){
|
|
||||||
return input instanceof CharStream ?
|
|
||||||
(CharStream)input : new CharReader(input);
|
|
||||||
}
|
|
||||||
|
|
||||||
private CharReader( Reader in ){
|
|
||||||
input = in;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int correctOffset(int currentOff) {
|
|
||||||
return currentOff;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() throws IOException {
|
|
||||||
input.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
|
||||||
return input.read(cbuf, off, len );
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean markSupported(){
|
|
||||||
return input.markSupported();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void mark( int readAheadLimit ) throws IOException {
|
|
||||||
input.mark(readAheadLimit);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void reset() throws IOException {
|
|
||||||
input.reset();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,38 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import java.io.Reader;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* CharStream adds <a href="#correctOffset(int)">correctOffset</a> functionality over Reader.
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
* @since Solr 1.4
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public abstract class CharStream extends Reader {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* called by CharFilter(s) and Tokenizer to correct token offset.
|
|
||||||
*
|
|
||||||
* @param currentOff current offset
|
|
||||||
* @return corrected token offset
|
|
||||||
*/
|
|
||||||
public abstract int correctOffset( int currentOff );
|
|
||||||
}
|
|
|
@ -1,276 +0,0 @@
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
|
||||||
|
|
||||||
import java.io.Reader;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* CJKTokenizer was modified from StopTokenizer which does a decent job for
|
|
||||||
* most European languages. It performs other token methods for double-byte
|
|
||||||
* Characters: the token will return at each two characters with overlap match.<br>
|
|
||||||
* Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4"; it
|
|
||||||
* also needs to filter out zero-length tokens ("")<br>
|
|
||||||
* for Digit: digits, '+' and '#' are tokenized like letters<br>
|
|
||||||
* for more info on Asia language(Chinese Japanese Korean) text segmentation:
|
|
||||||
* please search <a
|
|
||||||
* href="http://www.google.com/search?q=word+chinese+segment">google</a>
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* LUCENE-973 is applied
|
|
||||||
*/
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
* @since Solr 1.4
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public final class CharStreamAwareCJKTokenizer extends Tokenizer {
|
|
||||||
//~ Static fields/initializers ---------------------------------------------
|
|
||||||
/** Word token type */
|
|
||||||
static final int WORD_TYPE = 0;
|
|
||||||
|
|
||||||
/** Single byte token type */
|
|
||||||
static final int SINGLE_TOKEN_TYPE = 1;
|
|
||||||
|
|
||||||
/** Double byte token type */
|
|
||||||
static final int DOUBLE_TOKEN_TYPE = 2;
|
|
||||||
|
|
||||||
/** Names for token types */
|
|
||||||
static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
|
|
||||||
|
|
||||||
/** Max word length */
|
|
||||||
private static final int MAX_WORD_LEN = 255;
|
|
||||||
|
|
||||||
/** buffer size: */
|
|
||||||
private static final int IO_BUFFER_SIZE = 256;
|
|
||||||
|
|
||||||
//~ Instance fields --------------------------------------------------------
|
|
||||||
|
|
||||||
/** word offset, used to indicate which character (in the input) is being parsed */
|
|
||||||
private int offset = 0;
|
|
||||||
|
|
||||||
/** the index used only for ioBuffer */
|
|
||||||
private int bufferIndex = 0;
|
|
||||||
|
|
||||||
/** data length */
|
|
||||||
private int dataLen = 0;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* character buffer, store the characters which are used to compose <br>
|
|
||||||
* the returned Token
|
|
||||||
*/
|
|
||||||
private final char[] buffer = new char[MAX_WORD_LEN];
|
|
||||||
|
|
||||||
/**
|
|
||||||
* I/O buffer, used to store the content of the input(one of the <br>
|
|
||||||
* members of Tokenizer)
|
|
||||||
*/
|
|
||||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
|
||||||
|
|
||||||
/** word type: single=>ASCII double=>non-ASCII word=>default */
|
|
||||||
private int tokenType = WORD_TYPE;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* tag: previous character is a cached double-byte character "C1C2C3C4"
|
|
||||||
* ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
|
|
||||||
* C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
|
|
||||||
*/
|
|
||||||
private boolean preIsTokened = false;
|
|
||||||
|
|
||||||
//~ Constructors -----------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Construct a token stream processing the given input.
|
|
||||||
*
|
|
||||||
* @param in I/O reader
|
|
||||||
*/
|
|
||||||
public CharStreamAwareCJKTokenizer(CharStream in) {
|
|
||||||
input = in;
|
|
||||||
}
|
|
||||||
|
|
||||||
//~ Methods ----------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the next token in the stream, or null at EOS.
|
|
||||||
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
|
|
||||||
* for detail.
|
|
||||||
*
|
|
||||||
* @param reusableToken a reusable token
|
|
||||||
* @return Token
|
|
||||||
*
|
|
||||||
* @throws java.io.IOException - throw IOException when read error <br>
|
|
||||||
* happened in the InputStream
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
|
||||||
/** how many character(s) has been stored in buffer */
|
|
||||||
assert reusableToken != null;
|
|
||||||
int length = 0;
|
|
||||||
|
|
||||||
/** the position used to create Token */
|
|
||||||
int start = offset;
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
/** current character */
|
|
||||||
char c;
|
|
||||||
|
|
||||||
/** unicode block of current character for detail */
|
|
||||||
Character.UnicodeBlock ub;
|
|
||||||
|
|
||||||
offset++;
|
|
||||||
|
|
||||||
if (bufferIndex >= dataLen) {
|
|
||||||
dataLen = input.read(ioBuffer);
|
|
||||||
bufferIndex = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (dataLen == -1) {
|
|
||||||
if (length > 0) {
|
|
||||||
if (preIsTokened == true) {
|
|
||||||
length = 0;
|
|
||||||
preIsTokened = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
//get current character
|
|
||||||
c = ioBuffer[bufferIndex++];
|
|
||||||
|
|
||||||
//get the UnicodeBlock of the current character
|
|
||||||
ub = Character.UnicodeBlock.of(c);
|
|
||||||
}
|
|
||||||
|
|
||||||
//if the current character is ASCII or Extend ASCII
|
|
||||||
if ((ub == Character.UnicodeBlock.BASIC_LATIN)
|
|
||||||
|| (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
|
|
||||||
) {
|
|
||||||
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
|
|
||||||
// convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
|
|
||||||
int i = (int) c;
|
|
||||||
i = i - 65248;
|
|
||||||
c = (char) i;
|
|
||||||
}
|
|
||||||
|
|
||||||
// if the current character is a letter or "_" "+" "#"
|
|
||||||
if (Character.isLetterOrDigit(c)
|
|
||||||
|| ((c == '_') || (c == '+') || (c == '#'))
|
|
||||||
) {
|
|
||||||
if (length == 0) {
|
|
||||||
// "javaC1C2C3C4linux" <br>
|
|
||||||
// ^--: the current character begin to token the ASCII
|
|
||||||
// letter
|
|
||||||
start = offset - 1;
|
|
||||||
} else if (tokenType == DOUBLE_TOKEN_TYPE) {
|
|
||||||
// "javaC1C2C3C4linux" <br>
|
|
||||||
// ^--: the previous non-ASCII
|
|
||||||
// : the current character
|
|
||||||
offset--;
|
|
||||||
bufferIndex--;
|
|
||||||
|
|
||||||
if (preIsTokened == true) {
|
|
||||||
// there is only one non-ASCII has been stored
|
|
||||||
length = 0;
|
|
||||||
preIsTokened = false;
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// store the LowerCase(c) in the buffer
|
|
||||||
buffer[length++] = Character.toLowerCase(c);
|
|
||||||
tokenType = SINGLE_TOKEN_TYPE;
|
|
||||||
|
|
||||||
// break the procedure if buffer overflowed!
|
|
||||||
if (length == MAX_WORD_LEN) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else if (length > 0) {
|
|
||||||
if (preIsTokened == true) {
|
|
||||||
length = 0;
|
|
||||||
preIsTokened = false;
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// non-ASCII letter, e.g."C1C2C3C4"
|
|
||||||
if (Character.isLetter(c)) {
|
|
||||||
if (length == 0) {
|
|
||||||
start = offset - 1;
|
|
||||||
buffer[length++] = c;
|
|
||||||
tokenType = DOUBLE_TOKEN_TYPE;
|
|
||||||
} else {
|
|
||||||
if (tokenType == SINGLE_TOKEN_TYPE) {
|
|
||||||
offset--;
|
|
||||||
bufferIndex--;
|
|
||||||
|
|
||||||
//return the previous ASCII characters
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
buffer[length++] = c;
|
|
||||||
tokenType = DOUBLE_TOKEN_TYPE;
|
|
||||||
|
|
||||||
if (length == 2) {
|
|
||||||
offset--;
|
|
||||||
bufferIndex--;
|
|
||||||
preIsTokened = true;
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (length > 0) {
|
|
||||||
if (preIsTokened == true) {
|
|
||||||
// empty the buffer
|
|
||||||
length = 0;
|
|
||||||
preIsTokened = false;
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (length > 0) {
|
|
||||||
// Because of "CharStream aware" tokenizer, using correctOffset() to
|
|
||||||
// correct start/end offsets
|
|
||||||
return reusableToken.reinit
|
|
||||||
(buffer, 0, length,
|
|
||||||
((CharStream)input).correctOffset( start ),
|
|
||||||
((CharStream)input).correctOffset( start+length ),
|
|
||||||
TOKEN_TYPE_NAMES[tokenType]);
|
|
||||||
} else if (dataLen != -1) {
|
|
||||||
// Don't return an empty string - recurse to get the next token
|
|
||||||
return next(reusableToken);
|
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,33 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import java.io.Reader;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
* @since Solr 1.4
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public class CharStreamAwareCJKTokenizerFactory extends BaseTokenizerFactory {
|
|
||||||
|
|
||||||
public CharStreamAwareCJKTokenizer create(Reader input) {
|
|
||||||
return new CharStreamAwareCJKTokenizer( CharReader.get(input) );
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,102 +0,0 @@
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Reader;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
|
||||||
|
|
||||||
/** An abstract base class for simple, character-oriented tokenizers.*/
|
|
||||||
public abstract class CharStreamAwareCharTokenizer extends Tokenizer {
|
|
||||||
public CharStreamAwareCharTokenizer(CharStream input) {
|
|
||||||
super(input);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int offset = 0, bufferIndex = 0, dataLen = 0;
|
|
||||||
private static final int MAX_WORD_LEN = 255;
|
|
||||||
private static final int IO_BUFFER_SIZE = 4096;
|
|
||||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
|
||||||
|
|
||||||
/** Returns true iff a character should be included in a token. This
|
|
||||||
* tokenizer generates as tokens adjacent sequences of characters which
|
|
||||||
* satisfy this predicate. Characters for which this is false are used to
|
|
||||||
* define token boundaries and are not included in tokens. */
|
|
||||||
protected abstract boolean isTokenChar(char c);
|
|
||||||
|
|
||||||
/** Called on each token character to normalize it before it is added to the
|
|
||||||
* token. The default implementation does nothing. Subclasses may use this
|
|
||||||
* to, e.g., lowercase tokens. */
|
|
||||||
protected char normalize(char c) {
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
public final Token next(final Token reusableToken) throws IOException {
|
|
||||||
assert reusableToken != null;
|
|
||||||
reusableToken.clear();
|
|
||||||
int length = 0;
|
|
||||||
int start = bufferIndex;
|
|
||||||
char[] buffer = reusableToken.termBuffer();
|
|
||||||
while (true) {
|
|
||||||
|
|
||||||
if (bufferIndex >= dataLen) {
|
|
||||||
offset += dataLen;
|
|
||||||
dataLen = input.read(ioBuffer);
|
|
||||||
if (dataLen == -1) {
|
|
||||||
if (length > 0)
|
|
||||||
break;
|
|
||||||
else
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
bufferIndex = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
final char c = ioBuffer[bufferIndex++];
|
|
||||||
|
|
||||||
if (isTokenChar(c)) { // if it's a token char
|
|
||||||
|
|
||||||
if (length == 0) // start of token
|
|
||||||
start = offset + bufferIndex - 1;
|
|
||||||
else if (length == buffer.length)
|
|
||||||
buffer = reusableToken.resizeTermBuffer(1+length);
|
|
||||||
|
|
||||||
buffer[length++] = normalize(c); // buffer it, normalized
|
|
||||||
|
|
||||||
if (length == MAX_WORD_LEN) // buffer overflow!
|
|
||||||
break;
|
|
||||||
|
|
||||||
} else if (length > 0) // at non-Letter w/ chars
|
|
||||||
break; // return 'em
|
|
||||||
}
|
|
||||||
|
|
||||||
reusableToken.setTermLength(length);
|
|
||||||
// Because of "CharStream aware" tokenizer, using correctOffset() to
|
|
||||||
// correct start/end offsets
|
|
||||||
reusableToken.setStartOffset(((CharStream)input).correctOffset(start));
|
|
||||||
reusableToken.setEndOffset(((CharStream)input).correctOffset(start+length));
|
|
||||||
return reusableToken;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void reset(Reader input) throws IOException {
|
|
||||||
super.reset(input);
|
|
||||||
bufferIndex = 0;
|
|
||||||
offset = 0;
|
|
||||||
dataLen = 0;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,33 +0,0 @@
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
|
|
||||||
* Adjacent sequences of non-Whitespace characters form tokens. */
|
|
||||||
public class CharStreamAwareWhitespaceTokenizer extends CharStreamAwareCharTokenizer {
|
|
||||||
/** Construct a new WhitespaceTokenizer. */
|
|
||||||
public CharStreamAwareWhitespaceTokenizer(CharStream in) {
|
|
||||||
super(in);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Collects only characters which do not satisfy
|
|
||||||
* {@link Character#isWhitespace(char)}.*/
|
|
||||||
protected boolean isTokenChar(char c) {
|
|
||||||
return !Character.isWhitespace(c);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,33 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import java.io.Reader;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
* @since Solr 1.4
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public class CharStreamAwareWhitespaceTokenizerFactory extends BaseTokenizerFactory {
|
|
||||||
|
|
||||||
public CharStreamAwareWhitespaceTokenizer create(Reader input) {
|
|
||||||
return new CharStreamAwareWhitespaceTokenizer( CharReader.get(input) );
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,123 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
* @since Solr 1.4
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public class MappingCharFilter extends BaseCharFilter {
|
|
||||||
|
|
||||||
private final NormalizeMap normMap;
|
|
||||||
private LinkedList<Character> buffer;
|
|
||||||
private String replacement;
|
|
||||||
private int charPointer;
|
|
||||||
private int nextCharCounter;
|
|
||||||
|
|
||||||
public MappingCharFilter( NormalizeMap normMap, CharStream in ){
|
|
||||||
super( in );
|
|
||||||
this.normMap = normMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int read() throws IOException {
|
|
||||||
while( true ){
|
|
||||||
if( replacement != null && charPointer < replacement.length() )
|
|
||||||
return replacement.charAt( charPointer++ );
|
|
||||||
|
|
||||||
int firstChar = nextChar();
|
|
||||||
if( firstChar == -1 ) return -1;
|
|
||||||
NormalizeMap nm = normMap.submap != null ?
|
|
||||||
normMap.submap.get( (char)firstChar ) : null;
|
|
||||||
if( nm == null ) return firstChar;
|
|
||||||
NormalizeMap result = match( nm );
|
|
||||||
if( result == null ) return firstChar;
|
|
||||||
replacement = result.normStr;
|
|
||||||
charPointer = 0;
|
|
||||||
if( result.diff != 0 ){
|
|
||||||
int prevCumulativeDiff = getLastCumulativeDiff();
|
|
||||||
if( result.diff < 0 ){
|
|
||||||
for( int i = 0; i < -result.diff ; i++ )
|
|
||||||
addOffCorrectMap( nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i );
|
|
||||||
}
|
|
||||||
else{
|
|
||||||
addOffCorrectMap( nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff ) ;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private int nextChar() throws IOException {
|
|
||||||
nextCharCounter++;
|
|
||||||
if( buffer != null && !buffer.isEmpty() )
|
|
||||||
return buffer.removeFirst();
|
|
||||||
return input.read();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void pushChar( int c ){
|
|
||||||
nextCharCounter--;
|
|
||||||
if( buffer == null )
|
|
||||||
buffer = new LinkedList<Character>();
|
|
||||||
buffer.addFirst( (char)c );
|
|
||||||
}
|
|
||||||
|
|
||||||
private void pushLastChar( int c ){
|
|
||||||
if( buffer == null )
|
|
||||||
buffer = new LinkedList<Character>();
|
|
||||||
buffer.addLast( (char)c );
|
|
||||||
}
|
|
||||||
|
|
||||||
private NormalizeMap match( NormalizeMap map ) throws IOException {
|
|
||||||
NormalizeMap result = null;
|
|
||||||
if( map.submap != null ){
|
|
||||||
int chr = nextChar();
|
|
||||||
if( chr != -1 ){
|
|
||||||
NormalizeMap subMap = map.submap.get( (char)chr );
|
|
||||||
if( subMap != null ){
|
|
||||||
result = match( subMap );
|
|
||||||
}
|
|
||||||
if( result == null )
|
|
||||||
pushChar( chr );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if( result == null && map.normStr != null )
|
|
||||||
result = map;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int read( char[] cbuf, int off, int len ) throws IOException {
|
|
||||||
char[] tmp = new char[len];
|
|
||||||
int l = input.read( tmp, 0, len );
|
|
||||||
if( l != -1 ){
|
|
||||||
for( int i = 0; i < l; i++ )
|
|
||||||
pushLastChar( tmp[i] );
|
|
||||||
}
|
|
||||||
l = 0;
|
|
||||||
for( int i = off; i < off + len; i++ ){
|
|
||||||
int c = read();
|
|
||||||
if( c == -1 ) break;
|
|
||||||
cbuf[i] = (char)c;
|
|
||||||
l++;
|
|
||||||
}
|
|
||||||
return l == 0 ? -1 : l;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -24,6 +24,9 @@ import java.util.List;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharStream;
|
||||||
|
import org.apache.lucene.analysis.MappingCharFilter;
|
||||||
|
import org.apache.lucene.analysis.NormalizeCharMap;
|
||||||
import org.apache.solr.common.ResourceLoader;
|
import org.apache.solr.common.ResourceLoader;
|
||||||
import org.apache.solr.common.util.StrUtils;
|
import org.apache.solr.common.util.StrUtils;
|
||||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
|
@ -37,7 +40,7 @@ import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
public class MappingCharFilterFactory extends BaseCharFilterFactory implements
|
public class MappingCharFilterFactory extends BaseCharFilterFactory implements
|
||||||
ResourceLoaderAware {
|
ResourceLoaderAware {
|
||||||
|
|
||||||
protected NormalizeMap normMap;
|
protected NormalizeCharMap normMap;
|
||||||
private String mapping;
|
private String mapping;
|
||||||
|
|
||||||
public void inform(ResourceLoader loader) {
|
public void inform(ResourceLoader loader) {
|
||||||
|
@ -62,7 +65,7 @@ public class MappingCharFilterFactory extends BaseCharFilterFactory implements
|
||||||
catch( IOException e ){
|
catch( IOException e ){
|
||||||
throw new RuntimeException( e );
|
throw new RuntimeException( e );
|
||||||
}
|
}
|
||||||
normMap = new NormalizeMap();
|
normMap = new NormalizeCharMap();
|
||||||
parseRules( wlist, normMap );
|
parseRules( wlist, normMap );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -74,7 +77,7 @@ public class MappingCharFilterFactory extends BaseCharFilterFactory implements
|
||||||
// "source" => "target"
|
// "source" => "target"
|
||||||
static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
|
static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
|
||||||
|
|
||||||
protected void parseRules( List<String> rules, NormalizeMap normMap ){
|
protected void parseRules( List<String> rules, NormalizeCharMap normMap ){
|
||||||
for( String rule : rules ){
|
for( String rule : rules ){
|
||||||
Matcher m = p.matcher( rule );
|
Matcher m = p.matcher( rule );
|
||||||
if( !m.find() )
|
if( !m.find() )
|
||||||
|
|
|
@ -1,55 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
* @since Solr 1.4
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public class NormalizeMap {
|
|
||||||
|
|
||||||
Map<Character, NormalizeMap> submap;
|
|
||||||
String normStr;
|
|
||||||
int diff;
|
|
||||||
|
|
||||||
public void add( String singleMatch, String replacement ){
|
|
||||||
NormalizeMap currMap = this;
|
|
||||||
for( int i = 0; i < singleMatch.length(); i++ ){
|
|
||||||
char c = singleMatch.charAt( i );
|
|
||||||
if( currMap.submap == null ){
|
|
||||||
currMap.submap = new HashMap<Character, NormalizeMap>( 1 );
|
|
||||||
}
|
|
||||||
NormalizeMap map = currMap.submap.get( c );
|
|
||||||
if( map == null ){
|
|
||||||
map = new NormalizeMap();
|
|
||||||
currMap.submap.put( c, map );
|
|
||||||
}
|
|
||||||
currMap = map;
|
|
||||||
}
|
|
||||||
if( currMap.normStr != null ){
|
|
||||||
throw new RuntimeException( "MappingCharFilter: there is already a mapping for " + singleMatch );
|
|
||||||
}
|
|
||||||
currMap.normStr = replacement;
|
|
||||||
currMap.diff = singleMatch.length() - replacement.length();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -18,7 +18,8 @@
|
||||||
package org.apache.solr.analysis;
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.solr.analysis.TokenizerFactory;
|
import org.apache.lucene.analysis.CharStream;
|
||||||
|
import org.apache.lucene.analysis.CharReader;
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
|
|
|
@ -1,81 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.KeywordTokenizer;
|
|
||||||
import org.apache.lucene.search.trie.TrieUtils;
|
|
||||||
import org.apache.solr.common.SolrException;
|
|
||||||
import org.apache.solr.schema.DateField;
|
|
||||||
import org.apache.solr.schema.TrieField;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Reader;
|
|
||||||
import java.io.StringReader;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Query time tokenizer for trie fields. It uses methods in TrieUtils to create a prefix coded representation of the
|
|
||||||
* given number which is used for term queries.
|
|
||||||
* <p/>
|
|
||||||
* Note that queries on trie date types are not tokenized and returned as is.
|
|
||||||
*
|
|
||||||
* @version $Id$
|
|
||||||
* @see org.apache.lucene.search.trie.TrieUtils
|
|
||||||
* @see org.apache.solr.schema.TrieField
|
|
||||||
* @since solr 1.4
|
|
||||||
*/
|
|
||||||
public class TrieQueryTokenizerFactory extends BaseTokenizerFactory {
|
|
||||||
protected static final DateField dateField = new DateField();
|
|
||||||
protected final TrieField.TrieTypes type;
|
|
||||||
|
|
||||||
public TrieQueryTokenizerFactory(TrieField.TrieTypes type) {
|
|
||||||
this.type = type;
|
|
||||||
}
|
|
||||||
|
|
||||||
public TokenStream create(Reader reader) {
|
|
||||||
try {
|
|
||||||
StringBuilder builder = new StringBuilder();
|
|
||||||
char[] buf = new char[8];
|
|
||||||
int len;
|
|
||||||
while ((len = reader.read(buf)) != -1)
|
|
||||||
builder.append(buf, 0, len);
|
|
||||||
String value, number = builder.toString();
|
|
||||||
switch (type) {
|
|
||||||
case INTEGER:
|
|
||||||
value = TrieUtils.intToPrefixCoded(Integer.parseInt(number));
|
|
||||||
break;
|
|
||||||
case FLOAT:
|
|
||||||
value = TrieUtils.intToPrefixCoded(TrieUtils.floatToSortableInt(Float.parseFloat(number)));
|
|
||||||
break;
|
|
||||||
case LONG:
|
|
||||||
value = TrieUtils.longToPrefixCoded(Long.parseLong(number));
|
|
||||||
break;
|
|
||||||
case DOUBLE:
|
|
||||||
value = TrieUtils.longToPrefixCoded(TrieUtils.doubleToSortableLong(Double.parseDouble(number)));
|
|
||||||
break;
|
|
||||||
case DATE:
|
|
||||||
value = TrieUtils.longToPrefixCoded(dateField.parseMath(null, number).getTime());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
|
|
||||||
}
|
|
||||||
return new KeywordTokenizer(new StringReader(value));
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create trie query tokenizer", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -16,12 +16,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.analysis;
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.NumericTokenStream;
|
||||||
import org.apache.lucene.search.trie.TrieUtils;
|
|
||||||
import org.apache.lucene.search.trie.IntTrieTokenStream;
|
|
||||||
import org.apache.lucene.search.trie.LongTrieTokenStream;
|
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.schema.DateField;
|
import org.apache.solr.schema.DateField;
|
||||||
import static org.apache.solr.schema.TrieField.TrieTypes;
|
import static org.apache.solr.schema.TrieField.TrieTypes;
|
||||||
|
@ -30,22 +26,23 @@ import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Index time tokenizer for trie fields. It uses methods in TrieUtils to create multiple trie encoded string per number.
|
* Tokenizer for trie fields. It uses NumericTokenStream to create multiple trie encoded string per number.
|
||||||
* Each string created by this tokenizer for a given number differs from the previous by the given precisionStep.
|
* Each string created by this tokenizer for a given number differs from the previous by the given precisionStep.
|
||||||
|
* For query time token streams that only contain the highest precision term, use 32/64 as precisionStep.
|
||||||
* <p/>
|
* <p/>
|
||||||
* Refer to {@linkplain org.apache.lucene.search.trie package description} for more details.
|
* Refer to {@link org.apache.lucene.search.NumericRangeQuery} for more details.
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
* @see org.apache.lucene.search.trie.TrieUtils
|
* @see org.apache.lucene.search.NumericRangeQuery
|
||||||
* @see org.apache.solr.schema.TrieField
|
* @see org.apache.solr.schema.TrieField
|
||||||
* @since solr 1.4
|
* @since solr 1.4
|
||||||
*/
|
*/
|
||||||
public class TrieIndexTokenizerFactory extends BaseTokenizerFactory {
|
public class TrieTokenizerFactory extends BaseTokenizerFactory {
|
||||||
protected static final DateField dateField = new DateField();
|
protected static final DateField dateField = new DateField();
|
||||||
protected final int precisionStep;
|
protected final int precisionStep;
|
||||||
protected final TrieTypes type;
|
protected final TrieTypes type;
|
||||||
|
|
||||||
public TrieIndexTokenizerFactory(TrieTypes type, int precisionStep) {
|
public TrieTokenizerFactory(TrieTypes type, int precisionStep) {
|
||||||
this.type = type;
|
this.type = type;
|
||||||
this.precisionStep = precisionStep;
|
this.precisionStep = precisionStep;
|
||||||
}
|
}
|
||||||
|
@ -59,15 +56,15 @@ public class TrieIndexTokenizerFactory extends BaseTokenizerFactory {
|
||||||
builder.append(buf, 0, len);
|
builder.append(buf, 0, len);
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case INTEGER:
|
case INTEGER:
|
||||||
return new IntTrieTokenStream(Integer.parseInt(builder.toString()), precisionStep);
|
return new NumericTokenStream(precisionStep).setIntValue(Integer.parseInt(builder.toString()));
|
||||||
case FLOAT:
|
case FLOAT:
|
||||||
return new IntTrieTokenStream(TrieUtils.floatToSortableInt(Float.parseFloat(builder.toString())), precisionStep);
|
return new NumericTokenStream(precisionStep).setFloatValue(Float.parseFloat(builder.toString()));
|
||||||
case LONG:
|
case LONG:
|
||||||
return new LongTrieTokenStream(Long.parseLong(builder.toString()), precisionStep);
|
return new NumericTokenStream(precisionStep).setLongValue(Long.parseLong(builder.toString()));
|
||||||
case DOUBLE:
|
case DOUBLE:
|
||||||
return new LongTrieTokenStream(TrieUtils.doubleToSortableLong(Double.parseDouble(builder.toString())), precisionStep);
|
return new NumericTokenStream(precisionStep).setDoubleValue(Double.parseDouble(builder.toString()));
|
||||||
case DATE:
|
case DATE:
|
||||||
return new LongTrieTokenStream(dateField.parseMath(null, builder.toString()).getTime(), precisionStep);
|
return new NumericTokenStream(precisionStep).setLongValue(dateField.parseMath(null, builder.toString()).getTime());
|
||||||
default:
|
default:
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
|
||||||
}
|
}
|
|
@ -21,8 +21,6 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
import org.apache.lucene.store.FSDirectory;
|
||||||
import org.apache.lucene.store.NIOFSDirectory;
|
|
||||||
import org.apache.lucene.util.Constants;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Directory provider which mimics original Solr FSDirectory based behavior.
|
* Directory provider which mimics original Solr FSDirectory based behavior.
|
||||||
|
@ -31,10 +29,6 @@ import org.apache.lucene.util.Constants;
|
||||||
public class StandardDirectoryFactory extends DirectoryFactory {
|
public class StandardDirectoryFactory extends DirectoryFactory {
|
||||||
|
|
||||||
public Directory open(String path) throws IOException {
|
public Directory open(String path) throws IOException {
|
||||||
if (!Constants.WINDOWS) {
|
return FSDirectory.open(new File(path));
|
||||||
return new NIOFSDirectory(new File(path), null);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new FSDirectory(new File(path), null);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,12 +21,11 @@ import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.search.*;
|
import org.apache.lucene.search.*;
|
||||||
import org.apache.lucene.search.ExtendedFieldCache.DoubleParser;
|
import org.apache.lucene.search.FieldCache.DoubleParser;
|
||||||
import org.apache.lucene.search.ExtendedFieldCache.LongParser;
|
import org.apache.lucene.search.FieldCache.LongParser;
|
||||||
import org.apache.lucene.search.FieldCache.FloatParser;
|
import org.apache.lucene.search.FieldCache.FloatParser;
|
||||||
import org.apache.lucene.search.FieldCache.IntParser;
|
import org.apache.lucene.search.FieldCache.IntParser;
|
||||||
import org.apache.lucene.search.FieldCache.Parser;
|
import org.apache.lucene.search.FieldCache.Parser;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.queryParser.ParseException;
|
import org.apache.lucene.queryParser.ParseException;
|
||||||
import org.apache.solr.common.SolrDocument;
|
import org.apache.solr.common.SolrDocument;
|
||||||
import org.apache.solr.common.SolrDocumentList;
|
import org.apache.solr.common.SolrDocumentList;
|
||||||
|
@ -45,7 +44,6 @@ import org.apache.solr.search.*;
|
||||||
import org.apache.solr.util.SolrPluginUtils;
|
import org.apache.solr.util.SolrPluginUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.text.Collator;
|
import java.text.Collator;
|
||||||
|
@ -615,7 +613,7 @@ public class QueryComponent extends SearchComponent
|
||||||
static ScoreDocComparator comparatorLong (final IndexReader reader, final String fieldname, Parser parser)
|
static ScoreDocComparator comparatorLong (final IndexReader reader, final String fieldname, Parser parser)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
final String field = fieldname.intern();
|
final String field = fieldname.intern();
|
||||||
final long[] fieldOrder = parser == null ? ExtendedFieldCache.EXT_DEFAULT.getLongs(reader, field) : ExtendedFieldCache.EXT_DEFAULT.getLongs(reader, field, (LongParser) parser);
|
final long[] fieldOrder = parser == null ? FieldCache.DEFAULT.getLongs(reader, field) : FieldCache.DEFAULT.getLongs(reader, field, (LongParser) parser);
|
||||||
return new ScoreDocComparator() {
|
return new ScoreDocComparator() {
|
||||||
|
|
||||||
public final int compare (final ScoreDoc i, final ScoreDoc j) {
|
public final int compare (final ScoreDoc i, final ScoreDoc j) {
|
||||||
|
@ -680,7 +678,7 @@ public class QueryComponent extends SearchComponent
|
||||||
static ScoreDocComparator comparatorDouble(final IndexReader reader, final String fieldname, Parser parser)
|
static ScoreDocComparator comparatorDouble(final IndexReader reader, final String fieldname, Parser parser)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
final String field = fieldname.intern();
|
final String field = fieldname.intern();
|
||||||
final double[] fieldOrder = parser == null ? ExtendedFieldCache.EXT_DEFAULT.getDoubles(reader, field) : ExtendedFieldCache.EXT_DEFAULT.getDoubles(reader, field, (DoubleParser) parser);
|
final double[] fieldOrder = parser == null ? FieldCache.DEFAULT.getDoubles(reader, field) : FieldCache.DEFAULT.getDoubles(reader, field, (DoubleParser) parser);
|
||||||
return new ScoreDocComparator () {
|
return new ScoreDocComparator () {
|
||||||
|
|
||||||
public final int compare (final ScoreDoc i, final ScoreDoc j) {
|
public final int compare (final ScoreDoc i, final ScoreDoc j) {
|
||||||
|
|
|
@ -309,7 +309,7 @@ public class SimpleFacets {
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
DocSet hasVal = searcher.getDocSet
|
DocSet hasVal = searcher.getDocSet
|
||||||
(new ConstantScoreRangeQuery(fieldName, null, null, false, false));
|
(new TermRangeQuery(fieldName, null, null, false, false));
|
||||||
return docs.andNotSize(hasVal);
|
return docs.andNotSize(hasVal);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -665,14 +665,13 @@ public class SimpleFacets {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Macro for getting the numDocs of a ConstantScoreRangeQuery over docs
|
* Macro for getting the numDocs of a TermRangeQuery over docs
|
||||||
* @see SolrIndexSearcher#numDocs
|
* @see SolrIndexSearcher#numDocs
|
||||||
* @see ConstantScoreRangeQuery
|
* @see TermRangeQuery
|
||||||
*/
|
*/
|
||||||
protected int rangeCount(String field, String low, String high,
|
protected int rangeCount(String field, String low, String high,
|
||||||
boolean iLow, boolean iHigh) throws IOException {
|
boolean iLow, boolean iHigh) throws IOException {
|
||||||
return searcher.numDocs(new ConstantScoreRangeQuery(field,low,high,
|
return searcher.numDocs(new TermRangeQuery(field,low,high,iLow,iHigh),
|
||||||
iLow,iHigh),
|
|
||||||
base);
|
base);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.search.SortField;
|
import org.apache.lucene.search.SortField;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.RangeQuery;
|
import org.apache.lucene.search.TermRangeQuery;
|
||||||
import org.apache.solr.search.function.ValueSource;
|
import org.apache.solr.search.function.ValueSource;
|
||||||
import org.apache.solr.search.function.OrdFieldSource;
|
import org.apache.solr.search.function.OrdFieldSource;
|
||||||
import org.apache.solr.search.Sorting;
|
import org.apache.solr.search.Sorting;
|
||||||
|
@ -436,23 +436,22 @@ public abstract class FieldType extends FieldProperties {
|
||||||
* handle nulls in part1 and/or part2 as well as unequal minInclusive and maxInclusive parameters gracefully.
|
* handle nulls in part1 and/or part2 as well as unequal minInclusive and maxInclusive parameters gracefully.
|
||||||
*
|
*
|
||||||
* @param parser
|
* @param parser
|
||||||
*@param field the name of the field
|
* @param field the name of the field
|
||||||
* @param part1 the lower boundary of the range, nulls are allowed.
|
* @param part1 the lower boundary of the range, nulls are allowed.
|
||||||
* @param part2 the upper boundary of the range, nulls are allowed
|
* @param part2 the upper boundary of the range, nulls are allowed
|
||||||
* @param minInclusive whether the minimum of the range is inclusive or not
|
* @param minInclusive whether the minimum of the range is inclusive or not
|
||||||
* @param maxInclusive whether the maximum of the range is inclusive or not
|
* @param maxInclusive whether the maximum of the range is inclusive or not
|
||||||
* @return a Query instance to perform range search according to given parameters
|
* @return a Query instance to perform range search according to given parameters
|
||||||
*
|
*
|
||||||
* @see org.apache.solr.search.SolrQueryParser#getRangeQuery(String, String, String, boolean)
|
* @see org.apache.solr.search.SolrQueryParser#getRangeQuery(String, String, String, boolean)
|
||||||
*/
|
*/
|
||||||
public Query getRangeQuery(QParser parser, String field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
|
public Query getRangeQuery(QParser parser, String field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
|
||||||
RangeQuery rangeQuery = new RangeQuery(
|
// constant score mode is now enabled per default
|
||||||
|
return new TermRangeQuery(
|
||||||
field,
|
field,
|
||||||
part1 == null ? null : toInternal(part1),
|
part1 == null ? null : toInternal(part1),
|
||||||
part2 == null ? null : toInternal(part2),
|
part2 == null ? null : toInternal(part2),
|
||||||
minInclusive, maxInclusive);
|
minInclusive, maxInclusive);
|
||||||
rangeQuery.setConstantScoreRewrite(true);
|
|
||||||
return rangeQuery;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,9 +19,8 @@ package org.apache.solr.schema;
|
||||||
import org.apache.lucene.document.Fieldable;
|
import org.apache.lucene.document.Fieldable;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.SortField;
|
import org.apache.lucene.search.SortField;
|
||||||
import org.apache.lucene.search.trie.IntTrieRangeQuery;
|
import org.apache.lucene.search.NumericRangeQuery;
|
||||||
import org.apache.lucene.search.trie.LongTrieRangeQuery;
|
import org.apache.lucene.search.FieldCache;
|
||||||
import org.apache.lucene.search.trie.TrieUtils;
|
|
||||||
import org.apache.solr.analysis.*;
|
import org.apache.solr.analysis.*;
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.request.TextResponseWriter;
|
import org.apache.solr.request.TextResponseWriter;
|
||||||
|
@ -33,8 +32,9 @@ import java.io.IOException;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides field types to support for Lucene's Trie Range Queries. See {@linkplain org.apache.lucene.search.trie
|
* Provides field types to support for Lucene's Trie Range Queries.
|
||||||
* package description} for more details. It supports integer, float, long, double and date types.
|
* See {@link org.apache.lucene.search.NumericRangeQuery} for more details.
|
||||||
|
* It supports integer, float, long, double and date types.
|
||||||
* <p/>
|
* <p/>
|
||||||
* For each number being added to this field, multiple terms are generated as per the algorithm described in the above
|
* For each number being added to this field, multiple terms are generated as per the algorithm described in the above
|
||||||
* link. The possible number of terms increases dramatically with higher precision steps (factor 2^precisionStep). For
|
* link. The possible number of terms increases dramatically with higher precision steps (factor 2^precisionStep). For
|
||||||
|
@ -46,7 +46,7 @@ import java.util.Map;
|
||||||
* generated, range search will be no faster than any other number field, but sorting will be possible.
|
* generated, range search will be no faster than any other number field, but sorting will be possible.
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
* @see org.apache.lucene.search.trie.TrieUtils
|
* @see org.apache.lucene.search.NumericRangeQuery
|
||||||
* @since solr 1.4
|
* @since solr 1.4
|
||||||
*/
|
*/
|
||||||
public class TrieField extends FieldType {
|
public class TrieField extends FieldType {
|
||||||
|
@ -81,8 +81,9 @@ public class TrieField extends FieldType {
|
||||||
|
|
||||||
CharFilterFactory[] filterFactories = new CharFilterFactory[0];
|
CharFilterFactory[] filterFactories = new CharFilterFactory[0];
|
||||||
TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
|
TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
|
||||||
analyzer = new TokenizerChain(filterFactories, new TrieIndexTokenizerFactory(type, precisionStep), tokenFilterFactories);
|
analyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(type, precisionStep), tokenFilterFactories);
|
||||||
queryAnalyzer = new TokenizerChain(filterFactories, new TrieQueryTokenizerFactory(type), tokenFilterFactories);
|
// for query time we only need one token, so we use the biggest possible precisionStep:
|
||||||
|
queryAnalyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(type, Integer.MAX_VALUE), tokenFilterFactories);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -107,12 +108,14 @@ public class TrieField extends FieldType {
|
||||||
public SortField getSortField(SchemaField field, boolean top) {
|
public SortField getSortField(SchemaField field, boolean top) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case INTEGER:
|
case INTEGER:
|
||||||
|
return new SortField(field.getName(), FieldCache.NUMERIC_UTILS_INT_PARSER, top);
|
||||||
case FLOAT:
|
case FLOAT:
|
||||||
return TrieUtils.getIntSortField(field.getName(), top);
|
return new SortField(field.getName(), FieldCache.NUMERIC_UTILS_FLOAT_PARSER, top);
|
||||||
case LONG:
|
|
||||||
case DOUBLE:
|
|
||||||
case DATE:
|
case DATE:
|
||||||
return TrieUtils.getLongSortField(field.getName(), top);
|
case LONG:
|
||||||
|
return new SortField(field.getName(), FieldCache.NUMERIC_UTILS_LONG_PARSER, top);
|
||||||
|
case DOUBLE:
|
||||||
|
return new SortField(field.getName(), FieldCache.NUMERIC_UTILS_DOUBLE_PARSER, top);
|
||||||
default:
|
default:
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + field.name);
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + field.name);
|
||||||
}
|
}
|
||||||
|
@ -121,15 +124,14 @@ public class TrieField extends FieldType {
|
||||||
public ValueSource getValueSource(SchemaField field) {
|
public ValueSource getValueSource(SchemaField field) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case INTEGER:
|
case INTEGER:
|
||||||
return new IntFieldSource(field.getName(), TrieUtils.FIELD_CACHE_INT_PARSER);
|
return new IntFieldSource(field.getName(), FieldCache.NUMERIC_UTILS_INT_PARSER);
|
||||||
case FLOAT:
|
case FLOAT:
|
||||||
return new FloatFieldSource(field.getName(), TrieUtils.FIELD_CACHE_FLOAT_PARSER);
|
return new FloatFieldSource(field.getName(), FieldCache.NUMERIC_UTILS_FLOAT_PARSER);
|
||||||
case LONG:
|
|
||||||
return new LongFieldSource(field.getName(), TrieUtils.FIELD_CACHE_LONG_PARSER);
|
|
||||||
case DOUBLE:
|
|
||||||
return new DoubleFieldSource(field.getName(), TrieUtils.FIELD_CACHE_DOUBLE_PARSER);
|
|
||||||
case DATE:
|
case DATE:
|
||||||
return new LongFieldSource(field.getName(), TrieUtils.FIELD_CACHE_LONG_PARSER);
|
case LONG:
|
||||||
|
return new LongFieldSource(field.getName(), FieldCache.NUMERIC_UTILS_LONG_PARSER);
|
||||||
|
case DOUBLE:
|
||||||
|
return new DoubleFieldSource(field.getName(), FieldCache.NUMERIC_UTILS_DOUBLE_PARSER);
|
||||||
default:
|
default:
|
||||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + field.name);
|
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + field.name);
|
||||||
}
|
}
|
||||||
|
@ -167,31 +169,31 @@ public class TrieField extends FieldType {
|
||||||
Query query = null;
|
Query query = null;
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case INTEGER:
|
case INTEGER:
|
||||||
query = new IntTrieRangeQuery(field, precisionStep,
|
query = NumericRangeQuery.newIntRange(field, precisionStep,
|
||||||
min == null ? null : Integer.parseInt(min),
|
min == null ? null : Integer.parseInt(min),
|
||||||
max == null ? null : Integer.parseInt(max),
|
max == null ? null : Integer.parseInt(max),
|
||||||
minInclusive, maxInclusive);
|
minInclusive, maxInclusive);
|
||||||
break;
|
break;
|
||||||
case FLOAT:
|
case FLOAT:
|
||||||
query = new IntTrieRangeQuery(field, precisionStep,
|
query = NumericRangeQuery.newFloatRange(field, precisionStep,
|
||||||
min == null ? null : TrieUtils.floatToSortableInt(Float.parseFloat(min)),
|
min == null ? null : Float.parseFloat(min),
|
||||||
max == null ? null : TrieUtils.floatToSortableInt(Float.parseFloat(max)),
|
max == null ? null : Float.parseFloat(max),
|
||||||
minInclusive, maxInclusive);
|
minInclusive, maxInclusive);
|
||||||
break;
|
break;
|
||||||
case LONG:
|
case LONG:
|
||||||
query = new LongTrieRangeQuery(field, precisionStep,
|
query = NumericRangeQuery.newLongRange(field, precisionStep,
|
||||||
min == null ? null : Long.parseLong(min),
|
min == null ? null : Long.parseLong(min),
|
||||||
max == null ? null : Long.parseLong(max),
|
max == null ? null : Long.parseLong(max),
|
||||||
minInclusive, maxInclusive);
|
minInclusive, maxInclusive);
|
||||||
break;
|
break;
|
||||||
case DOUBLE:
|
case DOUBLE:
|
||||||
query = new LongTrieRangeQuery(field, precisionStep,
|
query = NumericRangeQuery.newDoubleRange(field, precisionStep,
|
||||||
min == null ? null : TrieUtils.doubleToSortableLong(Double.parseDouble(min)),
|
min == null ? null : Double.parseDouble(min),
|
||||||
max == null ? null : TrieUtils.doubleToSortableLong(Double.parseDouble(max)),
|
max == null ? null : Double.parseDouble(max),
|
||||||
minInclusive, maxInclusive);
|
minInclusive, maxInclusive);
|
||||||
break;
|
break;
|
||||||
case DATE:
|
case DATE:
|
||||||
query = new LongTrieRangeQuery(field, precisionStep,
|
query = NumericRangeQuery.newLongRange(field, precisionStep,
|
||||||
min == null ? null : dateField.parseMath(null, min).getTime(),
|
min == null ? null : dateField.parseMath(null, min).getTime(),
|
||||||
max == null ? null : dateField.parseMath(null, max).getTime(),
|
max == null ? null : dateField.parseMath(null, max).getTime(),
|
||||||
minInclusive, maxInclusive);
|
minInclusive, maxInclusive);
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.solr.search;
|
||||||
|
|
||||||
import org.apache.lucene.util.OpenBitSet;
|
import org.apache.lucene.util.OpenBitSet;
|
||||||
import org.apache.lucene.util.OpenBitSetIterator;
|
import org.apache.lucene.util.OpenBitSetIterator;
|
||||||
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <code>BitDocSet</code> represents an unordered set of Lucene Document Ids
|
* <code>BitDocSet</code> represents an unordered set of Lucene Document Ids
|
||||||
|
@ -84,7 +85,7 @@ public class BitDocSet extends DocSetBase {
|
||||||
private final OpenBitSetIterator iter = new OpenBitSetIterator(bits);
|
private final OpenBitSetIterator iter = new OpenBitSetIterator(bits);
|
||||||
private int pos = iter.nextDoc();
|
private int pos = iter.nextDoc();
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
return pos>=0;
|
return pos != DocIdSetIterator.NO_MORE_DOCS;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Integer next() {
|
public Integer next() {
|
||||||
|
|
|
@ -84,6 +84,10 @@ class DocSetCollector extends Collector {
|
||||||
public void setNextReader(IndexReader reader, int docBase) throws IOException {
|
public void setNextReader(IndexReader reader, int docBase) throws IOException {
|
||||||
this.base = docBase;
|
this.base = docBase;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean acceptsDocsOutOfOrder() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class DocSetDelegateCollector extends DocSetCollector {
|
class DocSetDelegateCollector extends DocSetCollector {
|
||||||
|
|
|
@ -317,13 +317,13 @@ public class QueryParsing {
|
||||||
Term t = q.getTerm();
|
Term t = q.getTerm();
|
||||||
FieldType ft = writeFieldName(t.field(), schema, out, flags);
|
FieldType ft = writeFieldName(t.field(), schema, out, flags);
|
||||||
writeFieldVal(t.text(), ft, out, flags);
|
writeFieldVal(t.text(), ft, out, flags);
|
||||||
} else if (query instanceof ConstantScoreRangeQuery) {
|
} else if (query instanceof TermRangeQuery) {
|
||||||
ConstantScoreRangeQuery q = (ConstantScoreRangeQuery)query;
|
TermRangeQuery q = (TermRangeQuery)query;
|
||||||
String fname = q.getField();
|
String fname = q.getField();
|
||||||
FieldType ft = writeFieldName(fname, schema, out, flags);
|
FieldType ft = writeFieldName(fname, schema, out, flags);
|
||||||
out.append( q.includesLower() ? '[' : '{' );
|
out.append( q.includesLower() ? '[' : '{' );
|
||||||
String lt = q.getLowerVal();
|
String lt = q.getLowerTerm();
|
||||||
String ut = q.getUpperVal();
|
String ut = q.getUpperTerm();
|
||||||
if (lt==null) {
|
if (lt==null) {
|
||||||
out.append('*');
|
out.append('*');
|
||||||
} else {
|
} else {
|
||||||
|
@ -339,17 +339,17 @@ public class QueryParsing {
|
||||||
}
|
}
|
||||||
|
|
||||||
out.append( q.includesUpper() ? ']' : '}' );
|
out.append( q.includesUpper() ? ']' : '}' );
|
||||||
} else if (query instanceof RangeQuery) {
|
} else if (query instanceof NumericRangeQuery) {
|
||||||
RangeQuery q = (RangeQuery)query;
|
NumericRangeQuery q = (NumericRangeQuery)query;
|
||||||
String fname = q.getField();
|
String fname = q.getField();
|
||||||
FieldType ft = writeFieldName(fname, schema, out, flags);
|
FieldType ft = writeFieldName(fname, schema, out, flags);
|
||||||
out.append( q.isInclusive() ? '[' : '{' );
|
out.append( q.includesMin() ? '[' : '{' );
|
||||||
Term lt = q.getLowerTerm();
|
Number lt = q.getMin();
|
||||||
Term ut = q.getUpperTerm();
|
Number ut = q.getMax();
|
||||||
if (lt==null) {
|
if (lt==null) {
|
||||||
out.append('*');
|
out.append('*');
|
||||||
} else {
|
} else {
|
||||||
writeFieldVal(lt.text(), ft, out, flags);
|
writeFieldVal(lt.toString(), ft, out, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
out.append(" TO ");
|
out.append(" TO ");
|
||||||
|
@ -357,11 +357,10 @@ public class QueryParsing {
|
||||||
if (ut==null) {
|
if (ut==null) {
|
||||||
out.append('*');
|
out.append('*');
|
||||||
} else {
|
} else {
|
||||||
writeFieldVal(ut.text(), ft, out, flags);
|
writeFieldVal(ut.toString(), ft, out, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
out.append( q.isInclusive() ? ']' : '}' );
|
out.append( q.includesMax() ? ']' : '}' );
|
||||||
|
|
||||||
} else if (query instanceof BooleanQuery) {
|
} else if (query instanceof BooleanQuery) {
|
||||||
BooleanQuery q = (BooleanQuery)query;
|
BooleanQuery q = (BooleanQuery)query;
|
||||||
boolean needParens=false;
|
boolean needParens=false;
|
||||||
|
|
|
@ -934,6 +934,9 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
|
||||||
}
|
}
|
||||||
public void setNextReader(IndexReader reader, int docBase) throws IOException {
|
public void setNextReader(IndexReader reader, int docBase) throws IOException {
|
||||||
}
|
}
|
||||||
|
public boolean acceptsDocsOutOfOrder() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
collector = new Collector() {
|
collector = new Collector() {
|
||||||
|
@ -948,6 +951,9 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
|
||||||
}
|
}
|
||||||
public void setNextReader(IndexReader reader, int docBase) throws IOException {
|
public void setNextReader(IndexReader reader, int docBase) throws IOException {
|
||||||
}
|
}
|
||||||
|
public boolean acceptsDocsOutOfOrder() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1051,6 +1057,9 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
|
||||||
}
|
}
|
||||||
public void setNextReader(IndexReader reader, int docBase) throws IOException {
|
public void setNextReader(IndexReader reader, int docBase) throws IOException {
|
||||||
}
|
}
|
||||||
|
public boolean acceptsDocsOutOfOrder() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@
|
||||||
package org.apache.solr.search.function;
|
package org.apache.solr.search.function;
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.search.ExtendedFieldCache;
|
import org.apache.lucene.search.FieldCache;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
@ -31,13 +31,13 @@ import java.io.IOException;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class DoubleFieldSource extends FieldCacheSource {
|
public class DoubleFieldSource extends FieldCacheSource {
|
||||||
protected ExtendedFieldCache.DoubleParser parser;
|
protected FieldCache.DoubleParser parser;
|
||||||
|
|
||||||
public DoubleFieldSource(String field) {
|
public DoubleFieldSource(String field) {
|
||||||
this(field, null);
|
this(field, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DoubleFieldSource(String field, ExtendedFieldCache.DoubleParser parser) {
|
public DoubleFieldSource(String field, FieldCache.DoubleParser parser) {
|
||||||
super(field);
|
super(field);
|
||||||
this.parser = parser;
|
this.parser = parser;
|
||||||
}
|
}
|
||||||
|
@ -48,8 +48,8 @@ public class DoubleFieldSource extends FieldCacheSource {
|
||||||
|
|
||||||
public DocValues getValues(IndexReader reader) throws IOException {
|
public DocValues getValues(IndexReader reader) throws IOException {
|
||||||
final double[] arr = (parser == null) ?
|
final double[] arr = (parser == null) ?
|
||||||
((ExtendedFieldCache) cache).getDoubles(reader, field) :
|
((FieldCache) cache).getDoubles(reader, field) :
|
||||||
((ExtendedFieldCache) cache).getDoubles(reader, field, parser);
|
((FieldCache) cache).getDoubles(reader, field, parser);
|
||||||
return new DocValues() {
|
return new DocValues() {
|
||||||
public float floatVal(int doc) {
|
public float floatVal(int doc) {
|
||||||
return (float) arr[doc];
|
return (float) arr[doc];
|
||||||
|
|
|
@ -18,7 +18,6 @@
|
||||||
package org.apache.solr.search.function;
|
package org.apache.solr.search.function;
|
||||||
|
|
||||||
import org.apache.lucene.search.FieldCache;
|
import org.apache.lucene.search.FieldCache;
|
||||||
import org.apache.lucene.search.ExtendedFieldCache;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A base class for ValueSource implementations that retrieve values for
|
* A base class for ValueSource implementations that retrieve values for
|
||||||
|
@ -28,7 +27,7 @@ import org.apache.lucene.search.ExtendedFieldCache;
|
||||||
*/
|
*/
|
||||||
public abstract class FieldCacheSource extends ValueSource {
|
public abstract class FieldCacheSource extends ValueSource {
|
||||||
protected String field;
|
protected String field;
|
||||||
protected FieldCache cache = ExtendedFieldCache.EXT_DEFAULT;
|
protected FieldCache cache = FieldCache.DEFAULT;
|
||||||
|
|
||||||
public FieldCacheSource(String field) {
|
public FieldCacheSource(String field) {
|
||||||
this.field=field;
|
this.field=field;
|
||||||
|
|
|
@ -18,7 +18,7 @@
|
||||||
package org.apache.solr.search.function;
|
package org.apache.solr.search.function;
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.search.ExtendedFieldCache;
|
import org.apache.lucene.search.FieldCache;
|
||||||
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -32,13 +32,13 @@ import java.io.IOException;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class LongFieldSource extends FieldCacheSource {
|
public class LongFieldSource extends FieldCacheSource {
|
||||||
protected ExtendedFieldCache.LongParser parser;
|
protected FieldCache.LongParser parser;
|
||||||
|
|
||||||
public LongFieldSource(String field) {
|
public LongFieldSource(String field) {
|
||||||
this(field, null);
|
this(field, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public LongFieldSource(String field, ExtendedFieldCache.LongParser parser) {
|
public LongFieldSource(String field, FieldCache.LongParser parser) {
|
||||||
super(field);
|
super(field);
|
||||||
this.parser = parser;
|
this.parser = parser;
|
||||||
}
|
}
|
||||||
|
@ -49,8 +49,8 @@ public class LongFieldSource extends FieldCacheSource {
|
||||||
|
|
||||||
public DocValues getValues(IndexReader reader) throws IOException {
|
public DocValues getValues(IndexReader reader) throws IOException {
|
||||||
final long[] arr = (parser == null) ?
|
final long[] arr = (parser == null) ?
|
||||||
((ExtendedFieldCache) cache).getLongs(reader, field) :
|
((FieldCache) cache).getLongs(reader, field) :
|
||||||
((ExtendedFieldCache) cache).getLongs(reader, field, parser);
|
((FieldCache) cache).getLongs(reader, field, parser);
|
||||||
return new DocValues() {
|
return new DocValues() {
|
||||||
public float floatVal(int doc) {
|
public float floatVal(int doc) {
|
||||||
return (float) arr[doc];
|
return (float) arr[doc];
|
||||||
|
|
|
@ -18,7 +18,6 @@
|
||||||
package org.apache.solr.search.function;
|
package org.apache.solr.search.function;
|
||||||
|
|
||||||
import org.apache.lucene.search.FieldCache;
|
import org.apache.lucene.search.FieldCache;
|
||||||
import org.apache.lucene.search.ExtendedFieldCache;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
|
@ -1,69 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import java.io.StringReader;
|
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
|
||||||
|
|
||||||
public class TestCharFilter extends TestCase {
|
|
||||||
|
|
||||||
public void testCharFilter1() throws Exception {
|
|
||||||
CharStream cs = new CharFilter1( CharReader.get( new StringReader("") ) );
|
|
||||||
assertEquals( "corrected offset is invalid", 1, cs.correctOffset( 0 ) );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testCharFilter2() throws Exception {
|
|
||||||
CharStream cs = new CharFilter2( CharReader.get( new StringReader("") ) );
|
|
||||||
assertEquals( "corrected offset is invalid", 2, cs.correctOffset( 0 ) );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testCharFilter12() throws Exception {
|
|
||||||
CharStream cs = new CharFilter2( new CharFilter1( CharReader.get( new StringReader("") ) ) );
|
|
||||||
assertEquals( "corrected offset is invalid", 3, cs.correctOffset( 0 ) );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testCharFilter11() throws Exception {
|
|
||||||
CharStream cs = new CharFilter1( new CharFilter1( CharReader.get( new StringReader("") ) ) );
|
|
||||||
assertEquals( "corrected offset is invalid", 2, cs.correctOffset( 0 ) );
|
|
||||||
}
|
|
||||||
|
|
||||||
static class CharFilter1 extends CharFilter {
|
|
||||||
|
|
||||||
protected CharFilter1(CharStream in) {
|
|
||||||
super(in);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected int correct(int currentOff) {
|
|
||||||
return currentOff + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static class CharFilter2 extends CharFilter {
|
|
||||||
|
|
||||||
protected CharFilter2(CharStream in) {
|
|
||||||
super(in);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected int correct(int currentOff) {
|
|
||||||
return currentOff + 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,176 +0,0 @@
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.solr.analysis;
|
|
||||||
|
|
||||||
import java.io.StringReader;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
|
|
||||||
public class TestMappingCharFilter extends BaseTokenTestCase {
|
|
||||||
|
|
||||||
NormalizeMap normMap;
|
|
||||||
|
|
||||||
public void setUp() throws Exception {
|
|
||||||
normMap = new NormalizeMap();
|
|
||||||
|
|
||||||
normMap.add( "aa", "a" );
|
|
||||||
normMap.add( "bbb", "b" );
|
|
||||||
normMap.add( "cccc", "cc" );
|
|
||||||
|
|
||||||
normMap.add( "h", "i" );
|
|
||||||
normMap.add( "j", "jj" );
|
|
||||||
normMap.add( "k", "kkk" );
|
|
||||||
normMap.add( "ll", "llll" );
|
|
||||||
|
|
||||||
normMap.add( "empty", "" );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testReaderReset() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
|
|
||||||
char[] buf = new char[10];
|
|
||||||
int len = cs.read(buf, 0, 10);
|
|
||||||
assertEquals( 1, len );
|
|
||||||
assertEquals( 'x', buf[0]) ;
|
|
||||||
len = cs.read(buf, 0, 10);
|
|
||||||
assertEquals( -1, len );
|
|
||||||
|
|
||||||
// rewind
|
|
||||||
cs.reset();
|
|
||||||
len = cs.read(buf, 0, 10);
|
|
||||||
assertEquals( 1, len );
|
|
||||||
assertEquals( 'x', buf[0]) ;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testNothingChange() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
List<Token> expect = tokens( "x" );
|
|
||||||
assertTokEqualOff( expect, real );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void test1to1() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h" ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
List<Token> expect = tokens( "i" );
|
|
||||||
assertTokEqualOff( expect, real );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void test1to2() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "j" ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
List<Token> expect = tokens( "jj,1,0,1" );
|
|
||||||
assertTokEqualOff( expect, real );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void test1to3() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "k" ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
List<Token> expect = tokens( "kkk,1,0,1" );
|
|
||||||
assertTokEqualOff( expect, real );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void test2to4() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "ll" ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
List<Token> expect = tokens( "llll,1,0,2" );
|
|
||||||
assertTokEqualOff( expect, real );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void test2to1() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "aa" ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
List<Token> expect = tokens( "a,1,0,2" );
|
|
||||||
assertTokEqualOff( expect, real );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void test3to1() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "bbb" ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
List<Token> expect = tokens( "b,1,0,3" );
|
|
||||||
assertTokEqualOff( expect, real );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void test4to2() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "cccc" ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
List<Token> expect = tokens( "cc,1,0,4" );
|
|
||||||
assertTokEqualOff( expect, real );
|
|
||||||
}
|
|
||||||
|
|
||||||
public void test5to0() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "empty" ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
assertEquals( 0, real.size() );
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// 1111111111222
|
|
||||||
// 01234567890123456789012
|
|
||||||
//(in) h i j k ll cccc bbb aa
|
|
||||||
//
|
|
||||||
// 1111111111222
|
|
||||||
// 01234567890123456789012
|
|
||||||
//(out) i i jj kkk llll cc b a
|
|
||||||
//
|
|
||||||
// h, 0, 1 => i, 0, 1
|
|
||||||
// i, 2, 3 => i, 2, 3
|
|
||||||
// j, 4, 5 => jj, 4, 5
|
|
||||||
// k, 6, 7 => kkk, 6, 7
|
|
||||||
// ll, 8,10 => llll, 8,10
|
|
||||||
// cccc,11,15 => cc,11,15
|
|
||||||
// bbb,16,19 => b,16,19
|
|
||||||
// aa,20,22 => a,20,22
|
|
||||||
//
|
|
||||||
public void testTokenStream() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h i j k ll cccc bbb aa" ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
List<Token> expect = tokens( "i,1,0,1 i,1,2,3 jj,1,4,5 kkk,1,6,7 llll,1,8,10 cc,1,11,15 b,1,16,19 a,1,20,22" );
|
|
||||||
assertTokEqualOff( expect, real );
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// 0123456789
|
|
||||||
//(in) aaaa ll h
|
|
||||||
//(out-1) aa llll i
|
|
||||||
//(out-2) a llllllll i
|
|
||||||
//
|
|
||||||
// aaaa,0,4 => a,0,4
|
|
||||||
// ll,5,7 => llllllll,5,7
|
|
||||||
// h,8,9 => i,8,9
|
|
||||||
public void testChained() throws Exception {
|
|
||||||
CharStream cs = new MappingCharFilter( normMap,
|
|
||||||
new MappingCharFilter( normMap, CharReader.get( new StringReader( "aaaa ll h" ) ) ) );
|
|
||||||
TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
|
|
||||||
List<Token> real = getTokens( ts );
|
|
||||||
List<Token> expect = tokens( "a,1,0,4 llllllll,1,5,7 i,1,8,9" );
|
|
||||||
assertTokEqualOff( expect, real );
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -23,6 +23,7 @@ import java.util.Random;
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
|
|
||||||
import org.apache.lucene.util.OpenBitSetIterator;
|
import org.apache.lucene.util.OpenBitSetIterator;
|
||||||
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @deprecated
|
* @deprecated
|
||||||
|
@ -62,7 +63,7 @@ public class TestOpenBitSet extends TestCase {
|
||||||
iterator.skipTo(bb+1);
|
iterator.skipTo(bb+1);
|
||||||
bb = iterator.doc();
|
bb = iterator.doc();
|
||||||
}
|
}
|
||||||
assertEquals(aa,bb);
|
assertEquals(aa == -1 ? DocIdSetIterator.NO_MORE_DOCS : aa, bb);
|
||||||
} while (aa>=0);
|
} while (aa>=0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,8 +19,8 @@
|
||||||
org.apache.lucene.analysis.Token,
|
org.apache.lucene.analysis.Token,
|
||||||
org.apache.lucene.analysis.TokenStream,
|
org.apache.lucene.analysis.TokenStream,
|
||||||
org.apache.lucene.index.Payload,
|
org.apache.lucene.index.Payload,
|
||||||
org.apache.solr.analysis.CharReader,
|
org.apache.lucene.analysis.CharReader,
|
||||||
org.apache.solr.analysis.CharStream,
|
org.apache.lucene.analysis.CharStream,
|
||||||
org.apache.solr.analysis.CharFilterFactory,
|
org.apache.solr.analysis.CharFilterFactory,
|
||||||
org.apache.solr.analysis.TokenFilterFactory,
|
org.apache.solr.analysis.TokenFilterFactory,
|
||||||
org.apache.solr.analysis.TokenizerChain,
|
org.apache.solr.analysis.TokenizerChain,
|
||||||
|
|
Loading…
Reference in New Issue