Fix for LUCENE-4362, ban tabs-indent

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1386681 13f79535-47bb-0310-9956-ffa450edef68
Erick Erickson 2012-09-17 16:01:56 +00:00
parent 310eb39792
commit ded01621a4
189 changed files with 5405 additions and 5433 deletions
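The change itself is mechanical: tabs used for indentation are replaced with spaces across the tree, and the JFlex-generated scanners are regenerated, which also reorders some generated constants and case labels. As an illustration of the rule being enforced, here is a hypothetical standalone tab-indent detector; the actual enforcement in Lucene lives in the build validation, not in a class like this.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

// Hypothetical sketch of a "ban tabs-indent" check, for illustration only.
public class TabIndentCheck {
  public static void main(String[] args) throws IOException {
    int violations = 0;
    for (String file : args) {
      List<String> lines = Files.readAllLines(Paths.get(file), StandardCharsets.UTF_8);
      for (int i = 0; i < lines.size(); i++) {
        String line = lines.get(i);
        // Walk the leading whitespace; any tab found there is a violation.
        for (int j = 0; j < line.length() && Character.isWhitespace(line.charAt(j)); j++) {
          if (line.charAt(j) == '\t') {
            System.out.println(file + ":" + (i + 1) + ": tab used for indentation");
            violations++;
            break;
          }
        }
      }
    }
    if (violations > 0) System.exit(1);
  }
}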

View File

@@ -50,7 +50,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
/** File containing default Brazilian Portuguese stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
@@ -74,19 +74,19 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
}
/**
* Contains words that should be indexed but not stemmed.
*/
private CharArraySet excltable = CharArraySet.EMPTY_SET;
/**
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
*/
public BrazilianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion

View File

@@ -25,37 +25,37 @@ import java.util.Locale;
public class BrazilianStemmer {
private static final Locale locale = new Locale("pt", "BR");
/**
* Changed term
*/
private String TERM ;
private String CT ;
private String R1 ;
private String R2 ;
private String RV ;
public BrazilianStemmer() {
}
/**
* Stems the given term to an unique <tt>discriminator</tt>.
*
* @param term The term that should be stemmed.
* @return Discriminator for <tt>term</tt>
*/
protected String stem( String term ) {
boolean altered = false ; // altered the term
// creates CT
createCT(term) ;
if ( !isIndexable( CT ) ) {
return null;
}
if ( !isStemmable( CT ) ) {
return CT ;
}
R1 = getR1(CT) ;
R2 = getR1(R1) ;
@@ -76,38 +76,38 @@ public class BrazilianStemmer {
step5() ;
return CT ;
}
/**
* Checks a term if it can be processed correctly.
*
* @return true if, and only if, the given term consists in letters.
*/
private boolean isStemmable( String term ) {
for ( int c = 0; c < term.length(); c++ ) {
// Discard terms that contain non-letter characters.
if ( !Character.isLetter(term.charAt(c))) {
return false;
}
}
return true;
}
/**
* Checks a term if it can be processed indexed.
*
* @return true if it can be indexed
*/
private boolean isIndexable( String term ) {
return (term.length() < 30) && (term.length() > 2) ;
}
/**
* See if string is 'a','e','i','o','u'
*
* @return true if is vowel
*/
private boolean isVowel( char value ) {
return (value == 'a') ||
(value == 'e') ||
(value == 'i') ||
@@ -115,16 +115,16 @@ public class BrazilianStemmer {
(value == 'u') ;
}
/**
* Gets R1
*
* R1 - is the region after the first non-vowel following a vowel,
* or is the null region at the end of the word if there is
* no such non-vowel.
*
* @return null or a string representing R1
*/
private String getR1( String value ) {
int i;
int j;
@@ -159,8 +159,8 @@ public class BrazilianStemmer {
return value.substring(j+1) ;
}
/**
* Gets RV
*
* RV - IF the second letter is a consonant, RV is the region after
* the next following vowel,
@@ -175,8 +175,8 @@ public class BrazilianStemmer {
* found.
*
* @return null or a string representing RV
*/
private String getRV( String value ) {
int i;
int j;
@@ -229,15 +229,15 @@ public class BrazilianStemmer {
return null ;
}
/**
* 1) Turn to lowercase
* 2) Remove accents
* 3) ã -> a ; õ -> o
* 4) ç -> c
*
* @return null or a string transformed
*/
private String changeTerm( String value ) {
int j;
String r = "" ;
@@ -282,12 +282,12 @@ public class BrazilianStemmer {
return r ;
}
/**
* Check if a string ends with a suffix
*
* @return true if the string ends with the specified suffix
*/
private boolean suffix( String value, String suffix ) {
// be-safe !!!
if ((value == null) || (suffix == null)) {
@@ -301,12 +301,12 @@ public class BrazilianStemmer {
return value.substring(value.length()-suffix.length()).equals(suffix);
}
/**
* Replace a string suffix by another
*
* @return the replaced String
*/
private String replaceSuffix( String value, String toReplace, String changeTo ) {
String vvalue ;
// be-safe !!!
@@ -325,12 +325,12 @@ public class BrazilianStemmer {
}
}
/**
* Remove a string suffix
*
* @return the String without the suffix
*/
private String removeSuffix( String value, String toRemove ) {
// be-safe !!!
if ((value == null) ||
(toRemove == null) ||
@@ -341,12 +341,12 @@ public class BrazilianStemmer {
return value.substring(0,value.length()-toRemove.length()) ;
}
/**
* See if a suffix is preceded by a String
*
* @return true if the suffix is preceded
*/
private boolean suffixPreceded( String value, String suffix, String preceded ) {
// be-safe !!!
if ((value == null) ||
(suffix == null) ||
@@ -358,10 +358,10 @@ public class BrazilianStemmer {
return suffix(removeSuffix(value,suffix),preceded) ;
}
/**
* Creates CT (changed term) , substituting * 'ã' and 'õ' for 'a~' and 'o~'.
*/
private void createCT( String term ) {
CT = changeTerm(term) ;
if (CT.length() < 2) return ;
@@ -396,14 +396,14 @@ public class BrazilianStemmer {
}
/**
* Standard suffix removal.
* Search for the longest among the following suffixes, and perform
* the following actions:
*
* @return false if no ending was removed
*/
private boolean step1() {
if (CT == null) return false ;
// suffix length = 7
@@ -559,15 +559,15 @@ public class BrazilianStemmer {
}
/**
* Verb suffixes.
*
* Search for the longest among the following suffixes in RV,
* and if found, delete.
*
* @return false if no ending was removed
*/
private boolean step2() {
if (RV == null) return false ;
// suffix lenght = 7
@@ -941,11 +941,11 @@ public class BrazilianStemmer {
return false ;
}
/**
* Delete suffix 'i' if in RV and preceded by 'c'
*
*/
private void step3() {
if (RV == null) return ;
if (suffix(RV,"i") && suffixPreceded(RV,"i","c")) {
@@ -954,14 +954,14 @@ public class BrazilianStemmer {
}
/**
* Residual suffix
*
* If the word ends with one of the suffixes (os a i o á í ó)
* in RV, delete it
*
*/
private void step4() {
if (RV == null) return ;
if (suffix(RV,"os")) {
@@ -979,15 +979,15 @@ public class BrazilianStemmer {
}
/**
* If the word ends with one of ( e é ê) in RV,delete it,
* and if preceded by 'gu' (or 'ci') with the 'u' (or 'i') in RV,
* delete the 'u' (or 'i')
*
* Or if the word ends ç remove the cedilha
*
*/
private void step5() {
if (RV == null) return ;
if (suffix(RV,"e")) {
@@ -1007,18 +1007,18 @@ public class BrazilianStemmer {
}
}
/**
* For log and debug purpose
*
* @return TERM, CT, RV, R1 and R2
*/
public String log() {
return " (TERM = " + TERM + ")" +
" (CT = " + CT +")" +
" (RV = " + RV +")" +
" (R1 = " + R1 +")" +
" (R2 = " + R2 +")" ;
}
}
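For orientation, the R1 region that the getR1() javadoc above defines ("the region after the first non-vowel following a vowel, or the null region at the end of the word if there is no such non-vowel") fits in a few lines. This is a sketch of the definition only, not the class's implementation, which also handles null input and works on the normalized CT:

// Sketch of the R1 definition from the getR1() javadoc above.
static String r1(String word) {
  for (int i = 0; i + 1 < word.length(); i++) {
    if (isVowel(word.charAt(i)) && !isVowel(word.charAt(i + 1))) {
      return word.substring(i + 2); // region after that non-vowel
    }
  }
  return ""; // null region at the end of the word
}

static boolean isVowel(char c) {
  return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u';
}

// Example: r1("beleza") == "eza"; applying the rule again gives R2,
// matching R2 = getR1(R1) in stem() above: r1("eza") == "a".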

View File

@@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/17/12 9:29 AM */
package org.apache.lucene.analysis.charfilter;
@@ -40,8 +40,8 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 8/6/12 11:57 AM from the specification file
* <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
* on 9/17/12 9:29 AM from the specification file
* <tt>/Users/Erick/apache/trunk_4326/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
*/
public final class HTMLStripCharFilter extends BaseCharFilter {
@@ -52,29 +52,29 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
private static final int ZZ_BUFFERSIZE = 16384;
/** lexical states */
private static final int DOUBLE_QUOTED_STRING = 28;
private static final int CHARACTER_REFERENCE_TAIL = 6;
private static final int START_TAG_TAIL_EXCLUDE = 38;
private static final int SCRIPT = 14;
private static final int CDATA = 22;
private static final int LEFT_ANGLE_BRACKET = 8;
private static final int END_TAG_TAIL_EXCLUDE = 32;
private static final int SERVER_SIDE_INCLUDE = 24;
private static final int END_TAG_TAIL_SUBSTITUTE = 34;
private static final int SINGLE_QUOTED_STRING = 26;
private static final int YYINITIAL = 0;
private static final int STYLE = 42;
private static final int START_TAG_TAIL_INCLUDE = 36;
private static final int AMPERSAND = 2;
private static final int BANG = 10;
private static final int LEFT_ANGLE_BRACKET_SLASH = 18;
private static final int START_TAG_TAIL_SUBSTITUTE = 40;
private static final int COMMENT = 12;
private static final int SCRIPT_COMMENT = 16;
private static final int LEFT_ANGLE_BRACKET_SPACE = 20;
private static final int STYLE_COMMENT = 44;
private static final int NUMERIC_CHARACTER = 4;
private static final int CHARACTER_REFERENCE_TAIL = 6;
private static final int LEFT_ANGLE_BRACKET = 8;
private static final int BANG = 10;
private static final int COMMENT = 12;
private static final int SCRIPT = 14;
private static final int SCRIPT_COMMENT = 16;
private static final int LEFT_ANGLE_BRACKET_SLASH = 18;
private static final int LEFT_ANGLE_BRACKET_SPACE = 20;
private static final int CDATA = 22;
private static final int SERVER_SIDE_INCLUDE = 24;
private static final int SINGLE_QUOTED_STRING = 26;
private static final int DOUBLE_QUOTED_STRING = 28;
private static final int END_TAG_TAIL_INCLUDE = 30;
private static final int END_TAG_TAIL_EXCLUDE = 32;
private static final int END_TAG_TAIL_SUBSTITUTE = 34;
private static final int START_TAG_TAIL_INCLUDE = 36;
private static final int START_TAG_TAIL_EXCLUDE = 38;
private static final int START_TAG_TAIL_SUBSTITUTE = 40;
private static final int STYLE = 42;
private static final int STYLE_COMMENT = 44;
/**
* ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
@@ -30967,7 +30967,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
}
}
// numRead < 0
return true;
}
@@ -31247,135 +31247,24 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 15:
{
case 1:
{ return zzBuffer[zzStartRead];
}
case 54: break;
case 39:
{ yybegin(STYLE);
}
case 55: break;
case 27:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
}
case 56: break;
case 30:
{ int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
entitySegment.clear();
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
entitySegment.append(ch);
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
}
case 57: break;
case 48:
{ inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position the offset correction at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = STYLE_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
case 58: break;
case 8:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_SUBSTITUTE);
}
}
case 59: break;
case 2:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('<');
yybegin(LEFT_ANGLE_BRACKET);
}
case 60: break;
case 44:
{ restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
case 55: break;
case 3:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
}
case 61: break;
case 21:
{ previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(SINGLE_QUOTED_STRING);
}
case 62: break;
case 11:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
case 63: break;
case 35:
{ yybegin(SCRIPT);
}
case 64: break;
case 42:
{ restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 65: break;
case 10:
{ inputSegment.append('!'); yybegin(BANG);
}
case 66: break;
case 51:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
char lowSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try { // Low surrogates are in decimal range [56320, 57343]
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(9, 14) + "'";
}
if (Character.isLowSurrogate(lowSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 67: break;
case 56: break;
case 4:
{ yypushback(1);
outputSegment = inputSegment;
@@ -31383,166 +31272,11 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 68: break;
case 43:
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
case 57: break;
case 5:
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
}
case 69: break;
case 52:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 70: break;
case 28:
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 71: break;
case 50:
{ // Handle paired UTF-16 surrogates.
outputSegment = entitySegment;
outputSegment.clear();
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
case 72: break;
case 16:
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 73: break;
case 22:
{ previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(DOUBLE_QUOTED_STRING);
}
case 74: break;
case 26:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
yybegin(YYINITIAL);
}
case 75: break;
case 20:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
}
case 76: break;
case 47:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
case 77: break;
case 33:
{ yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
}
case 78: break;
case 23:
{ yybegin(restoreState); restoreState = previousRestoreState;
}
case 79: break;
case 32:
{ yybegin(COMMENT);
}
case 80: break;
case 24:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 81: break;
case 3:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
}
case 82: break;
case 46:
{ yybegin(SCRIPT);
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
case 83: break;
case 14:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
case 84: break;
case 58: break;
case 6:
{ int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
@@ -31576,50 +31310,26 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
return outputSegment.nextChar();
}
}
case 85: break;
case 34:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
cumulativeDiff += yychar - inputStart + yylength();
// position the correction at (already output length) [ + (substitution length) = 0]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
case 59: break;
case 7:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 86: break;
case 5:
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
}
case 87: break;
case 13:
{ inputSegment.append(zzBuffer[zzStartRead]);
}
case 88: break;
case 18:
case 60: break;
case 8:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_SUBSTITUTE);
yybegin(START_TAG_TAIL_SUBSTITUTE);
}
}
case 89: break;
case 40:
{ yybegin(SCRIPT_COMMENT);
}
case 90: break;
case 37:
{ // add (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
case 91: break;
case 12:
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
}
case 92: break;
case 61: break;
case 9:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
@@ -31629,57 +31339,55 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(START_TAG_TAIL_EXCLUDE);
}
}
case 93: break;
case 49:
{ inputSegment.clear();
case 62: break;
case 10:
{ inputSegment.append('!'); yybegin(BANG);
}
case 63: break;
case 11:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
case 64: break;
case 12:
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
}
case 65: break;
case 13:
{ inputSegment.append(zzBuffer[zzStartRead]);
}
case 66: break;
case 14:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = SCRIPT_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
case 94: break;
case 29:
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
case 67: break;
case 15:
{
}
case 95: break;
case 68: break;
case 16:
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 69: break;
case 17:
{ restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
}
case 96: break;
case 45:
{ yybegin(STYLE);
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
case 70: break;
case 18:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_SUBSTITUTE);
}
}
case 97: break;
case 7:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 98: break;
case 71: break;
case 19:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
@@ -31689,7 +31397,34 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(END_TAG_TAIL_EXCLUDE);
}
}
case 99: break;
case 72: break;
case 20:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
}
case 73: break;
case 21:
{ previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(SINGLE_QUOTED_STRING);
}
case 74: break;
case 22:
{ previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(DOUBLE_QUOTED_STRING);
}
case 75: break;
case 23:
{ yybegin(restoreState); restoreState = previousRestoreState;
}
case 76: break;
case 24:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 77: break;
case 25:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
@@ -31699,7 +31434,45 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
yybegin(YYINITIAL);
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
}
case 100: break;
case 78: break;
case 26:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
yybegin(YYINITIAL);
}
case 79: break;
case 27:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
}
case 80: break;
case 28:
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 81: break;
case 29:
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
}
case 82: break;
case 30:
{ int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
entitySegment.clear();
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
entitySegment.append(ch);
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
}
case 83: break;
case 31:
{ int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
@@ -31734,7 +31507,262 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
return outputSegment.nextChar();
}
}
case 84: break;
case 32:
{ yybegin(COMMENT);
}
case 85: break;
case 33:
{ yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
}
case 86: break;
case 34:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
cumulativeDiff += yychar - inputStart + yylength();
// position the correction at (already output length) [ + (substitution length) = 0]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
case 87: break;
case 35:
{ yybegin(SCRIPT);
}
case 88: break;
case 36:
{ yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_END_TAG_REPLACEMENT;
}
}
case 89: break;
case 37:
{ // add (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
case 90: break;
case 38:
{ yybegin(restoreState);
}
case 91: break;
case 39:
{ yybegin(STYLE);
}
case 92: break;
case 40:
{ yybegin(SCRIPT_COMMENT);
}
case 93: break;
case 41:
{ yybegin(STYLE_COMMENT);
}
case 94: break;
case 42:
{ restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 95: break;
case 43:
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 96: break;
case 44:
{ restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 97: break;
case 45:
{ yybegin(STYLE);
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
case 98: break;
case 46:
{ yybegin(SCRIPT);
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
case 99: break;
case 47:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
case 100: break;
case 48:
{ inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position the offset correction at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = STYLE_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
case 101: break;
case 49:
{ inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = SCRIPT_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
case 102: break;
case 50:
{ // Handle paired UTF-16 surrogates.
outputSegment = entitySegment;
outputSegment.clear();
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
case 103: break;
case 51:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
char lowSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try { // Low surrogates are in decimal range [56320, 57343]
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(9, 14) + "'";
}
if (Character.isLowSurrogate(lowSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 104: break;
case 52:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 105: break;
case 53:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
@@ -31770,34 +31798,6 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 102: break;
case 36:
{ yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_END_TAG_REPLACEMENT;
}
}
case 103: break;
case 38:
{ yybegin(restoreState);
}
case 104: break;
case 41:
{ yybegin(STYLE_COMMENT);
}
case 105: break;
case 1:
{ return zzBuffer[zzStartRead];
}
case 106: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
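The surrogate-pair actions above (cases 50 through 53) are mostly scanner bookkeeping around one idea: two numeric character references that form a valid UTF-16 high/low surrogate pair (decimal 55296-56319 and 56320-57343, as the generated comments note) are decoded together into a single code point. A minimal sketch of that decoding step, separated from the scanner state:

// Sketch only: combine two numeric character references when they form a
// valid high/low surrogate pair, as the generated cases above do.
static String decodeSurrogatePair(int high, int low) {
  char hi = (char) high; // e.g. 55296 == 0xD800
  char lo = (char) low;  // e.g. 56320 == 0xDC00
  if (Character.isHighSurrogate(hi) && Character.isLowSurrogate(lo)) {
    return new String(Character.toChars(Character.toCodePoint(hi, lo)));
  }
  return null; // not a valid pair; the scanner falls back to NUMERIC_CHARACTER
}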

View File

@@ -90,18 +90,18 @@ EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
[bB][lL][uU][rR] |
[cC][hH][aA][nN][gG][eE] |
[cC][lL][iI][cC][kK] |
[dD][bB][lL][cC][lL][iI][cC][kK] |
[eE][rR][rR][oO][rR] |
[fF][oO][cC][uU][sS] |
[kK][eE][yY][dD][oO][wW][nN] |
[kK][eE][yY][pP][rR][eE][sS][sS] |
[kK][eE][yY][uU][pP] |
[lL][oO][aA][dD] |
[mM][oO][uU][sS][eE][dD][oO][wW][nN] |
[mM][oO][uU][sS][eE][mM][oO][vV][eE] |
[mM][oO][uU][sS][eE][oO][uU][tT] |
[mM][oO][uU][sS][eE][oO][vV][eE][rR] |
[mM][oO][uU][sS][eE][uU][pP] |
[rR][eE][sS][eE][tT] |
[sS][eE][lL][eE][cC][tT] |
[sS][uU][bB][mM][iI][tT] |

View File

@@ -30,7 +30,7 @@ import java.io.IOException;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.DictionaryCompoundWordTokenFilterFactory" dictionary="dictionary.txt"
* minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="true"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
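The XML above configures the filter declaratively; at runtime the factory builds a DictionaryCompoundWordTokenFilter with those parameters. A rough usage sketch, assuming the Lucene 4.x constructor signatures (illustrative only, not part of this commit):

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

// Assumed-API sketch: decompound "softball" against a two-word dictionary,
// mirroring the minWordSize/minSubwordSize/maxSubwordSize attributes above.
public class CompoundDemo {
  public static void main(String[] args) throws Exception {
    CharArraySet dict = new CharArraySet(Version.LUCENE_40, Arrays.asList("soft", "ball"), true);
    TokenStream ts = new DictionaryCompoundWordTokenFilter(
        Version.LUCENE_40,
        new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("softball")),
        dict, 5, 2, 15, true); // minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // "softball", "soft", "ball"
    }
    ts.end();
    ts.close();
  }
}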

View File

@@ -50,7 +50,7 @@ import org.xml.sax.InputSource;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.HyphenationCompoundWordTokenFilterFactory" hyphenator="hyphenator.xml" encoding="UTF-8"
* dictionary="dictionary.txt" minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="false"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*

View File

@@ -50,24 +50,24 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
*
* @return a set of default Czech-stopwords
*/
public static final CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_SET;
}
private static class DefaultSetHolder {
private static final CharArraySet DEFAULT_SET;
static {
try {
DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set");
}
}
}
private final CharArraySet stemExclusionTable;
@@ -77,9 +77,9 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
*
* @param matchVersion Lucene version to match
*/
public CzechAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_SET);
}
/**
* Builds an analyzer with the given stop words.
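The DefaultSetHolder above is the standard initialization-on-demand holder idiom: the nested class is not initialized until getDefaultStopSet() first touches it, so the stopword file is read lazily, exactly once, with the JVM's class-initialization locking providing thread safety. A generic sketch of the idiom (names here are hypothetical):

import java.util.Collections;
import java.util.Set;

// Illustrative sketch of the holder idiom used by DefaultSetHolder above.
final class LazyStopwords {
  private LazyStopwords() {}

  private static class Holder {
    // Runs in Holder's static initializer: on first access only, thread-safe.
    static final Set<String> WORDS = load();

    private static Set<String> load() {
      // Stand-in for reading stopwords.txt from the JAR; a failure here
      // surfaces much like the RuntimeException thrown in the diff above.
      return Collections.singleton("a");
    }
  }

  static Set<String> get() {
    return Holder.WORDS; // triggers Holder's initialization on first call
  }
}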

View File

@@ -49,8 +49,8 @@ import java.util.StringTokenizer;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.CapitalizationFilterFactory" onlyFirstWord="true"
* keep="java solr lucene" keepIgnoreCase="false"
* okPrefix="McK McD McA"/&gt;
* keep="java solr lucene" keepIgnoreCase="false"
* okPrefix="McK McD McA"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*

View File

@@ -31,8 +31,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* Example field definition in schema.xml:
* <pre class="prettyprint">
* &lt;fieldtype name="text" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer type="index"&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/&gt;
* &lt;filter class="solr.StopFilterFactory" ignoreCase="true"/&gt;
* &lt;filter class="solr.HyphenatedWordsFilterFactory"/&gt;

View File

@@ -33,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
*
*/
public class HyphenatedWordsFilterFactory extends TokenFilterFactory {
public HyphenatedWordsFilter create(TokenStream input) {
return new HyphenatedWordsFilter(input);
}
}
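The factory above does nothing beyond wrapping the stream in a HyphenatedWordsFilter, which rejoins words split by a hyphen at a line break. A short usage sketch, assuming the Lucene 4.x analysis APIs (illustrative only):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

// Assumed-API sketch: "hyphen- ated" is rejoined into a single token.
public class HyphenatedWordsDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new HyphenatedWordsFilter(
        new WhitespaceTokenizer(Version.LUCENE_40, new StringReader("hyphen- ated words")));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // "hyphenated", then "words"
    }
    ts.end();
    ts.close();
  }
}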

View File

@@ -43,10 +43,10 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
* <pre class="prettyprint" >
* &lt;fieldType name="descendent_path" class="solr.TextField"&gt;
* &lt;analyzer type="index"&gt;
* &lt;tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" /&gt;
* &lt;tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" /&gt;
* &lt;/analyzer&gt;
* &lt;analyzer type="query"&gt;
* &lt;tokenizer class="solr.KeywordTokenizerFactory" /&gt;
* &lt;tokenizer class="solr.KeywordTokenizerFactory" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
@@ -61,10 +61,10 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
* <pre class="prettyprint" >
* &lt;fieldType name="descendent_path" class="solr.TextField"&gt;
* &lt;analyzer type="index"&gt;
* &lt;tokenizer class="solr.KeywordTokenizerFactory" /&gt;
* &lt;tokenizer class="solr.KeywordTokenizerFactory" /&gt;
* &lt;/analyzer&gt;
* &lt;analyzer type="query"&gt;
* &lt;tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" /&gt;
* &lt;tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>

View File

@@ -211,6 +211,6 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
}
}
return allStopWords.toArray(new Term[allStopWords.size()]);
}
}

View File

@@ -395,7 +395,7 @@ public final class ShingleFilter extends TokenFilter {
exhausted = true;
}
return newTarget;
}
/**
* <p>Fills {@link #inputWindow} with input stream tokens, if available,

View File

@@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/17/12 9:28 AM */
package org.apache.lucene.analysis.standard;
@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 8/6/12 11:57 AM from the specification file
* <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
* on 9/17/12 9:28 AM from the specification file
* <tt>/Users/Erick/apache/trunk_4326/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
*/
class ClassicTokenizerImpl implements StandardTokenizerInterface {
@@ -453,7 +453,7 @@ public final void getText(CharTermAttribute t) {
}
}
// numRead < 0
return true;
}
@@ -674,44 +674,44 @@ public final void getText(CharTermAttribute t) {
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 10:
{ return EMAIL;
case 1:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
case 11: break;
case 2:
{ return ALPHANUM;
}
case 12: break;
case 4:
{ return HOST;
case 3:
{ return CJ;
}
case 13: break;
case 8:
{ return ACRONYM_DEP;
case 4:
{ return HOST;
}
case 14: break;
case 5:
{ return NUM;
}
case 15: break;
case 1:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
case 16: break;
case 9:
{ return ACRONYM;
}
case 17: break;
case 7:
{ return COMPANY;
}
case 18: break;
case 6:
{ return APOSTROPHE;
}
case 16: break;
case 7:
{ return COMPANY;
}
case 17: break;
case 8:
{ return ACRONYM_DEP;
}
case 18: break;
case 9:
{ return ACRONYM;
}
case 19: break;
case 3:
{ return CJ;
case 10:
{ return EMAIL;
}
case 20: break;
default:

View File

@@ -79,7 +79,7 @@ APOSTROPHE = {ALPHA} ("'" {ALPHA})+
// use a post-filter to remove dots
ACRONYM = {LETTER} "." ({LETTER} ".")+
ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
// company names like AT&T and Excite@Home.
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
@@ -100,7 +100,7 @@ NUM = ({ALPHANUM} {P} {HAS_DIGIT}
| {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
// punctuation
P = ("_"|"-"|"/"|"."|",")
P = ("_"|"-"|"/"|"."|",")
// at least one digit
HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*

View File

@@ -14,7 +14,7 @@
* limitations under the License.
*/
// Generated using ICU4J 49.1.0.0 on Monday, August 6, 2012 3:57:23 PM UTC
// Generated using ICU4J 49.1.0.0 on Monday, September 17, 2012 1:28:46 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros

View File

@@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/17/12 9:28 AM */
package org.apache.lucene.analysis.standard;
@@ -936,7 +936,7 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
}
}
// numRead < 0
return true;
}
@@ -1157,36 +1157,36 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 1:
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 9: break;
case 2:
{ return WORD_TYPE;
}
case 9: break;
case 5:
{ return SOUTH_EAST_ASIAN_TYPE;
}
case 10: break;
case 3:
{ return NUMERIC_TYPE;
}
case 11: break;
case 4:
{ return KATAKANA_TYPE;
}
case 11: break;
case 6:
{ return IDEOGRAPHIC_TYPE;
}
case 12: break;
case 8:
{ return HANGUL_TYPE;
case 5:
{ return SOUTH_EAST_ASIAN_TYPE;
}
case 13: break;
case 3:
{ return NUMERIC_TYPE;
case 6:
{ return IDEOGRAPHIC_TYPE;
}
case 14: break;
case 7:
{ return HIRAGANA_TYPE;
}
case 15: break;
case 1:
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
case 8:
{ return HANGUL_TYPE;
}
case 16: break;
default:

View File

@@ -115,8 +115,8 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }

View File

@@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/17/12 9:29 AM */
package org.apache.lucene.analysis.standard;
@@ -4126,7 +4126,7 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
}
}
// numRead < 0
return true;
}
@@ -4347,51 +4347,51 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 11:
// lookahead expression with fixed base length
zzMarkedPos = zzStartRead + 6;
{ return WORD_TYPE;
case 1:
{ /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 12: break;
case 2:
{ return WORD_TYPE;
}
case 13: break;
case 5:
{ return SOUTH_EAST_ASIAN_TYPE;
}
case 14: break;
case 1:
{ /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
}
case 15: break;
case 10:
{ return URL_TYPE;
}
case 16: break;
case 9:
{ return EMAIL_TYPE;
}
case 17: break;
case 4:
{ return KATAKANA_TYPE;
}
case 18: break;
case 6:
{ return IDEOGRAPHIC_TYPE;
}
case 19: break;
case 8:
{ return HANGUL_TYPE;
}
case 20: break;
case 3:
{ return NUMERIC_TYPE;
}
case 21: break;
case 14: break;
case 4:
{ return KATAKANA_TYPE;
}
case 15: break;
case 5:
{ return SOUTH_EAST_ASIAN_TYPE;
}
case 16: break;
case 6:
{ return IDEOGRAPHIC_TYPE;
}
case 17: break;
case 7:
{ return HIRAGANA_TYPE;
}
case 18: break;
case 8:
{ return HANGUL_TYPE;
}
case 19: break;
case 9:
{ return EMAIL_TYPE;
}
case 20: break;
case 10:
{ return URL_TYPE;
}
case 21: break;
case 11:
// lookahead expression with fixed base length
zzMarkedPos = zzStartRead + 6;
{ return WORD_TYPE;
}
case 22: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {

View File

@@ -200,8 +200,8 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }

View File

@@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/17/12 9:29 AM */
package org.apache.lucene.analysis.wikipedia;
@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 8/6/12 11:57 AM from the specification file
* <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
* on 9/17/12 9:29 AM from the specification file
* <tt>/Users/Erick/apache/trunk_4326/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {
@@ -37,16 +37,16 @@ class WikipediaTokenizerImpl {
private static final int ZZ_BUFFERSIZE = 4096;
/** lexical states */
public static final int THREE_SINGLE_QUOTES_STATE = 10;
public static final int EXTERNAL_LINK_STATE = 6;
public static final int DOUBLE_EQUALS_STATE = 14;
public static final int INTERNAL_LINK_STATE = 4;
public static final int DOUBLE_BRACE_STATE = 16;
public static final int CATEGORY_STATE = 2;
public static final int YYINITIAL = 0;
public static final int STRING = 18;
public static final int FIVE_SINGLE_QUOTES_STATE = 12;
public static final int CATEGORY_STATE = 2;
public static final int INTERNAL_LINK_STATE = 4;
public static final int EXTERNAL_LINK_STATE = 6;
public static final int TWO_SINGLE_QUOTES_STATE = 8;
public static final int THREE_SINGLE_QUOTES_STATE = 10;
public static final int FIVE_SINGLE_QUOTES_STATE = 12;
public static final int DOUBLE_EQUALS_STATE = 14;
public static final int DOUBLE_BRACE_STATE = 16;
public static final int STRING = 18;
/**
* ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
@@ -589,7 +589,7 @@ final void reset() {
}
}
// numRead < 0
return true;
}
@@ -810,188 +810,188 @@ final void reset() {
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 44:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
case 1:
{ numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
case 47: break;
case 37:
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 48: break;
case 16:
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
}
case 49: break;
case 20:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 50: break;
case 40:
{ positionInc = 1; return ACRONYM;
}
case 51: break;
case 5:
{ positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
case 52: break;
case 36:
{ positionInc = 1; return COMPANY;
}
case 53: break;
case 10:
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 54: break;
case 15:
{ currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
}
case 55: break;
case 22:
{ numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
}
case 56: break;
case 35:
{ positionInc = 1; return NUM;
}
case 57: break;
case 33:
{ positionInc = 1; return APOSTROPHE;
}
case 58: break;
case 21:
{ yybegin(STRING); return currentTokType;/*pipe*/
}
case 59: break;
case 18:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
}
case 60: break;
case 2:
{ positionInc = 1; return ALPHANUM;
}
case 48: break;
case 3:
{ positionInc = 1; return CJ;
}
case 49: break;
case 4:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 50: break;
case 5:
{ positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
}
case 51: break;
case 6:
{ yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
}
case 52: break;
case 7:
{ yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
}
case 53: break;
case 8:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
case 54: break;
case 9:
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
}
case 55: break;
case 10:
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 56: break;
case 11:
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 57: break;
case 12:
{ currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
}
case 58: break;
case 13:
{ currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 59: break;
case 14:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;
}
case 60: break;
case 15:
{ currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
}
case 61: break;
case 1:
{ numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
case 16:
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
}
case 62: break;
case 17:
{ yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
}
case 63: break;
case 39:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
}
case 18:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
}
case 64: break;
case 29:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 65: break;
case 46:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 66: break;
case 27:
{ numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 67: break;
case 4:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 68: break;
case 38:
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
}
case 69: break;
case 13:
{ currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 70: break;
case 3:
{ positionInc = 1; return CJ;
}
case 71: break;
case 45:
{ currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 72: break;
case 6:
{ yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
}
case 73: break;
case 11:
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 74: break;
case 25:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 75: break;
case 8:
{ /* Break so we don't hit fall-through warning: */ break;/* ignore */
}
case 76: break;
case 19:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
}
case 77: break;
case 43:
{ positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
}
case 65: break;
case 20:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 78: break;
case 42:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
}
case 66: break;
case 21:
{ yybegin(STRING); return currentTokType;/*pipe*/
}
case 79: break;
case 30:
{ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 67: break;
case 22:
{ numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
}
case 80: break;
case 14:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;
}
case 81: break;
case 9:
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
}
case 82: break;
case 7:
{ yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
}
case 83: break;
case 41:
{ positionInc = 1; return EMAIL;
}
case 84: break;
case 28:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 85: break;
case 68: break;
case 23:
{ numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 86: break;
case 34:
{ positionInc = 1; return HOST;
}
case 87: break;
case 32:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 88: break;
case 12:
{ currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
}
case 89: break;
case 69: break;
case 24:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 90: break;
case 70: break;
case 25:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 71: break;
case 26:
{ yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
}
case 72: break;
case 27:
{ numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 73: break;
case 28:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 74: break;
case 29:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 75: break;
case 30:
{ yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
}
case 76: break;
case 31:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
}
case 77: break;
case 32:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 78: break;
case 33:
{ positionInc = 1; return APOSTROPHE;
}
case 79: break;
case 34:
{ positionInc = 1; return HOST;
}
case 80: break;
case 35:
{ positionInc = 1; return NUM;
}
case 81: break;
case 36:
{ positionInc = 1; return COMPANY;
}
case 82: break;
case 37:
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 83: break;
case 38:
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
}
case 84: break;
case 39:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
}
case 85: break;
case 40:
{ positionInc = 1; return ACRONYM;
}
case 86: break;
case 41:
{ positionInc = 1; return EMAIL;
}
case 87: break;
case 42:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
}
case 88: break;
case 43:
{ positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
}
case 89: break;
case 44:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
}
case 90: break;
case 45:
{ currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 91: break;
case 26:
{ yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
}
case 46:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
}
case 92: break;
default:

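A note on the generated code above: JFlex emits every scanner action as a braced body that ends in break, followed by a synthetic numbered case whose only job is to keep javac's fall-through lint quiet. A minimal standalone sketch of that shape — the action numbers, token codes, and the classify name are hypothetical, not Lucene's:

  static int classify(int action) {
    int tokenType = -1;
    switch (action) {
      case 1:
        { tokenType = 10; /* Break so we don't hit fall-through warning: */ break;
        }
      case 47: break; // synthetic case: separates the bodies so -Xlint:fallthrough stays quiet
      case 2:
        { tokenType = 20; break;
        }
      case 48: break;
      default:
        break;
    }
    return tokenType;
  }

The paired "case N: break;" lines account for the otherwise odd case numbering seen in the hunks above.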
View File

@@ -136,7 +136,7 @@ NUM = ({ALPHANUM} {P} {HAS_DIGIT}
TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"
// punctuation
P = ("_"|"-"|"/"|"."|",")
P = ("_"|"-"|"/"|"."|",")
// at least one digit
HAS_DIGIT =

View File

@@ -43,25 +43,26 @@ import java.lang.reflect.Method;
* reflection calls (Lovins, etc) use EMPTY_ARGS/EMPTY_PARAMS
*/
public class Among {
private static final Class<?>[] EMPTY_PARAMS = new Class[0];
public Among (String s, int substring_i, int result,
String methodname, SnowballProgram methodobject) {
this.s_size = s.length();
this.s = s.toCharArray();
this.substring_i = substring_i;
this.result = result;
this.methodobject = methodobject;
if (methodname.length() == 0) {
this.method = null;
} else {
try {
this.method = methodobject.getClass().
getDeclaredMethod(methodname, EMPTY_PARAMS);
} catch (NoSuchMethodException e) {
throw new RuntimeException(e);
}
}
private static final Class<?>[] EMPTY_PARAMS = new Class[0];
public Among(String s, int substring_i, int result,
String methodname, SnowballProgram methodobject) {
this.s_size = s.length();
this.s = s.toCharArray();
this.substring_i = substring_i;
this.result = result;
this.methodobject = methodobject;
if (methodname.length() == 0) {
this.method = null;
} else {
try {
this.method = methodobject.getClass().
getDeclaredMethod(methodname, EMPTY_PARAMS);
} catch (NoSuchMethodException e) {
throw new RuntimeException(e);
}
}
}
public final int s_size; /* search string */
public final char[] s; /* search string */

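For context, generated Snowball stemmers feed this constructor from static suffix tables that find_among_b matches backwards. A hypothetical fragment in that style — DemoStemmer, r_guard, and the entries are illustrative, not taken from any shipped stemmer:

  // Each entry: (suffix, index of a shorter entry that is a suffix of this one
  // or -1, result code, optional guard method resolved reflectively as above).
  private final static DemoStemmer methodObject = new DemoStemmer();

  private final static Among a_0[] = {
    new Among("s",    -1, 3, "", methodObject),
    new Among("ies",   0, 2, "", methodObject),       // entry 0 ("s") is a suffix of "ies"
    new Among("sses",  0, 1, "r_guard", methodObject) // guarded by DemoStemmer.r_guard()
  };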
View File

@@ -51,8 +51,8 @@ public abstract class SnowballProgram {
protected SnowballProgram()
{
current = new char[8];
setCurrent("");
current = new char[8];
setCurrent("");
}
public abstract boolean stem();
@@ -62,12 +62,12 @@ public abstract class SnowballProgram {
*/
public void setCurrent(String value)
{
current = value.toCharArray();
cursor = 0;
limit = value.length();
limit_backward = 0;
bra = cursor;
ket = limit;
current = value.toCharArray();
cursor = 0;
limit = value.length();
limit_backward = 0;
bra = cursor;
ket = limit;
}
/**
@@ -130,354 +130,350 @@ public abstract class SnowballProgram {
protected void copy_from(SnowballProgram other)
{
current = other.current;
cursor = other.cursor;
limit = other.limit;
limit_backward = other.limit_backward;
bra = other.bra;
ket = other.ket;
current = other.current;
cursor = other.cursor;
limit = other.limit;
limit_backward = other.limit_backward;
bra = other.bra;
ket = other.ket;
}
protected boolean in_grouping(char [] s, int min, int max)
{
if (cursor >= limit) return false;
char ch = current[cursor];
if (ch > max || ch < min) return false;
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
cursor++;
return true;
if (cursor >= limit) return false;
char ch = current[cursor];
if (ch > max || ch < min) return false;
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
cursor++;
return true;
}
protected boolean in_grouping_b(char [] s, int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if (ch > max || ch < min) return false;
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
cursor--;
return true;
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if (ch > max || ch < min) return false;
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
cursor--;
return true;
}
protected boolean out_grouping(char [] s, int min, int max)
{
if (cursor >= limit) return false;
char ch = current[cursor];
if (ch > max || ch < min) {
cursor++;
return true;
}
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
cursor ++;
return true;
}
return false;
if (cursor >= limit) return false;
char ch = current[cursor];
if (ch > max || ch < min) {
cursor++;
return true;
}
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
cursor ++;
return true;
}
return false;
}
protected boolean out_grouping_b(char [] s, int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if (ch > max || ch < min) {
cursor--;
return true;
}
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
cursor--;
return true;
}
return false;
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if (ch > max || ch < min) {
cursor--;
return true;
}
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
cursor--;
return true;
}
return false;
}
protected boolean in_range(int min, int max)
{
if (cursor >= limit) return false;
char ch = current[cursor];
if (ch > max || ch < min) return false;
cursor++;
return true;
if (cursor >= limit) return false;
char ch = current[cursor];
if (ch > max || ch < min) return false;
cursor++;
return true;
}
protected boolean in_range_b(int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if (ch > max || ch < min) return false;
cursor--;
return true;
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if (ch > max || ch < min) return false;
cursor--;
return true;
}
protected boolean out_range(int min, int max)
{
if (cursor >= limit) return false;
char ch = current[cursor];
if (!(ch > max || ch < min)) return false;
cursor++;
return true;
if (cursor >= limit) return false;
char ch = current[cursor];
if (!(ch > max || ch < min)) return false;
cursor++;
return true;
}
protected boolean out_range_b(int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if(!(ch > max || ch < min)) return false;
cursor--;
return true;
if (cursor <= limit_backward) return false;
char ch = current[cursor - 1];
if(!(ch > max || ch < min)) return false;
cursor--;
return true;
}
protected boolean eq_s(int s_size, CharSequence s)
{
if (limit - cursor < s_size) return false;
int i;
for (i = 0; i != s_size; i++) {
if (current[cursor + i] != s.charAt(i)) return false;
}
cursor += s_size;
return true;
if (limit - cursor < s_size) return false;
int i;
for (i = 0; i != s_size; i++) {
if (current[cursor + i] != s.charAt(i)) return false;
}
cursor += s_size;
return true;
}
protected boolean eq_s_b(int s_size, CharSequence s)
{
if (cursor - limit_backward < s_size) return false;
int i;
for (i = 0; i != s_size; i++) {
if (current[cursor - s_size + i] != s.charAt(i)) return false;
}
cursor -= s_size;
return true;
if (cursor - limit_backward < s_size) return false;
int i;
for (i = 0; i != s_size; i++) {
if (current[cursor - s_size + i] != s.charAt(i)) return false;
}
cursor -= s_size;
return true;
}
protected boolean eq_v(CharSequence s)
{
return eq_s(s.length(), s);
return eq_s(s.length(), s);
}
protected boolean eq_v_b(CharSequence s)
{ return eq_s_b(s.length(), s);
{
return eq_s_b(s.length(), s);
}
protected int find_among(Among v[], int v_size)
{
int i = 0;
int j = v_size;
int i = 0;
int j = v_size;
int c = cursor;
int l = limit;
int c = cursor;
int l = limit;
int common_i = 0;
int common_j = 0;
int common_i = 0;
int common_j = 0;
boolean first_key_inspected = false;
boolean first_key_inspected = false;
while(true) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j; // smaller
Among w = v[k];
int i2;
for (i2 = common; i2 < w.s_size; i2++) {
if (c + common == l) {
diff = -1;
break;
}
diff = current[c + common] - w.s[i2];
if (diff != 0) break;
common++;
}
if (diff < 0) {
j = k;
common_j = common;
} else {
i = k;
common_i = common;
}
if (j - i <= 1) {
if (i > 0) break; // v->s has been inspected
if (j == i) break; // only one item in v
while (true) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j; // smaller
Among w = v[k];
int i2;
for (i2 = common; i2 < w.s_size; i2++) {
if (c + common == l) {
diff = -1;
break;
}
diff = current[c + common] - w.s[i2];
if (diff != 0) break;
common++;
}
if (diff < 0) {
j = k;
common_j = common;
} else {
i = k;
common_i = common;
}
if (j - i <= 1) {
if (i > 0) break; // v->s has been inspected
if (j == i) break; // only one item in v
// - but now we need to go round once more to get
// v->s inspected. This looks messy, but is actually
// the optimal approach.
// - but now we need to go round once more to get
// v->s inspected. This looks messy, but is actually
// the optimal approach.
if (first_key_inspected) break;
first_key_inspected = true;
}
}
while(true) {
Among w = v[i];
if (common_i >= w.s_size) {
cursor = c + w.s_size;
if (w.method == null) return w.result;
boolean res;
try {
Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
res = resobj.toString().equals("true");
} catch (InvocationTargetException e) {
res = false;
// FIXME - debug message
} catch (IllegalAccessException e) {
res = false;
// FIXME - debug message
}
cursor = c + w.s_size;
if (res) return w.result;
}
i = w.substring_i;
if (i < 0) return 0;
}
if (first_key_inspected) break;
first_key_inspected = true;
}
}
while (true) {
Among w = v[i];
if (common_i >= w.s_size) {
cursor = c + w.s_size;
if (w.method == null) return w.result;
boolean res;
try {
Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
res = resobj.toString().equals("true");
} catch (InvocationTargetException e) {
res = false;
// FIXME - debug message
} catch (IllegalAccessException e) {
res = false;
// FIXME - debug message
}
cursor = c + w.s_size;
if (res) return w.result;
}
i = w.substring_i;
if (i < 0) return 0;
}
}
// find_among_b is for backwards processing. Same comments apply
// find_among_b is for backwards processing. Same comments apply
protected int find_among_b(Among v[], int v_size)
{
int i = 0;
int j = v_size;
int i = 0;
int j = v_size;
int c = cursor;
int lb = limit_backward;
int c = cursor;
int lb = limit_backward;
int common_i = 0;
int common_j = 0;
int common_i = 0;
int common_j = 0;
boolean first_key_inspected = false;
boolean first_key_inspected = false;
while(true) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j;
Among w = v[k];
int i2;
for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
if (c - common == lb) {
diff = -1;
break;
}
diff = current[c - 1 - common] - w.s[i2];
if (diff != 0) break;
common++;
}
if (diff < 0) {
j = k;
common_j = common;
} else {
i = k;
common_i = common;
}
if (j - i <= 1) {
if (i > 0) break;
if (j == i) break;
if (first_key_inspected) break;
first_key_inspected = true;
}
}
while(true) {
Among w = v[i];
if (common_i >= w.s_size) {
cursor = c - w.s_size;
if (w.method == null) return w.result;
while (true) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j;
Among w = v[k];
int i2;
for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
if (c - common == lb) {
diff = -1;
break;
}
diff = current[c - 1 - common] - w.s[i2];
if (diff != 0) break;
common++;
}
if (diff < 0) {
j = k;
common_j = common;
} else {
i = k;
common_i = common;
}
if (j - i <= 1) {
if (i > 0) break;
if (j == i) break;
if (first_key_inspected) break;
first_key_inspected = true;
}
}
while (true) {
Among w = v[i];
if (common_i >= w.s_size) {
cursor = c - w.s_size;
if (w.method == null) return w.result;
boolean res;
try {
Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
res = resobj.toString().equals("true");
} catch (InvocationTargetException e) {
res = false;
// FIXME - debug message
} catch (IllegalAccessException e) {
res = false;
// FIXME - debug message
}
cursor = c - w.s_size;
if (res) return w.result;
}
i = w.substring_i;
if (i < 0) return 0;
}
boolean res;
try {
Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
res = resobj.toString().equals("true");
} catch (InvocationTargetException e) {
res = false;
// FIXME - debug message
} catch (IllegalAccessException e) {
res = false;
// FIXME - debug message
}
cursor = c - w.s_size;
if (res) return w.result;
}
i = w.substring_i;
if (i < 0) return 0;
}
}
/* to replace chars between c_bra and c_ket in current by the
/* to replace chars between c_bra and c_ket in current by the
* chars in s.
*/
protected int replace_s(int c_bra, int c_ket, CharSequence s)
{
final int adjustment = s.length() - (c_ket - c_bra);
final int newLength = limit + adjustment;
//resize if necessary
if (newLength > current.length) {
char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(current, 0, newBuffer, 0, limit);
current = newBuffer;
}
// if the substring being replaced is longer or shorter than the
// replacement, need to shift things around
if (adjustment != 0 && c_ket < limit) {
System.arraycopy(current, c_ket, current, c_bra + s.length(),
limit - c_ket);
}
// insert the replacement text
// Note, faster is s.getChars(0, s.length(), current, c_bra);
// but would have to duplicate this method for both String and StringBuilder
for (int i = 0; i < s.length(); i++)
current[c_bra + i] = s.charAt(i);
limit += adjustment;
if (cursor >= c_ket) cursor += adjustment;
else if (cursor > c_bra) cursor = c_bra;
return adjustment;
protected int replace_s(int c_bra, int c_ket, CharSequence s) {
final int adjustment = s.length() - (c_ket - c_bra);
final int newLength = limit + adjustment;
//resize if necessary
if (newLength > current.length) {
char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
System.arraycopy(current, 0, newBuffer, 0, limit);
current = newBuffer;
}
protected void slice_check()
{
if (bra < 0 ||
bra > ket ||
ket > limit)
{
throw new IllegalArgumentException("faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
// FIXME: report error somehow.
/*
fprintf(stderr, "faulty slice operation:\n");
debug(z, -1, 0);
exit(1);
*/
}
// if the substring being replaced is longer or shorter than the
// replacement, need to shift things around
if (adjustment != 0 && c_ket < limit) {
System.arraycopy(current, c_ket, current, c_bra + s.length(),
limit - c_ket);
}
// insert the replacement text
// Note, faster is s.getChars(0, s.length(), current, c_bra);
// but would have to duplicate this method for both String and StringBuilder
for (int i = 0; i < s.length(); i++)
current[c_bra + i] = s.charAt(i);
protected void slice_from(CharSequence s)
{
slice_check();
replace_s(bra, ket, s);
limit += adjustment;
if (cursor >= c_ket) cursor += adjustment;
else if (cursor > c_bra) cursor = c_bra;
return adjustment;
}
protected void slice_check() {
if (bra < 0 ||
bra > ket ||
ket > limit) {
throw new IllegalArgumentException("faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
// FIXME: report error somehow.
/*
fprintf(stderr, "faulty slice operation:\n");
debug(z, -1, 0);
exit(1);
*/
}
}
protected void slice_del()
{
slice_from((CharSequence)"");
}
protected void slice_from(CharSequence s) {
slice_check();
replace_s(bra, ket, s);
}
protected void insert(int c_bra, int c_ket, CharSequence s)
protected void slice_del() {
slice_from((CharSequence) "");
}
protected void insert(int c_bra, int c_ket, CharSequence s)
{
int adjustment = replace_s(c_bra, c_ket, s);
if (c_bra <= bra) bra += adjustment;
if (c_bra <= ket) ket += adjustment;
int adjustment = replace_s(c_bra, c_ket, s);
if (c_bra <= bra) bra += adjustment;
if (c_bra <= ket) ket += adjustment;
}
/* Copy the slice into the supplied StringBuffer */
protected StringBuilder slice_to(StringBuilder s)
{
slice_check();
int len = ket - bra;
s.setLength(0);
s.append(current, bra, len);
return s;
slice_check();
int len = ket - bra;
s.setLength(0);
s.append(current, bra, len);
return s;
}
protected StringBuilder assign_to(StringBuilder s)
{
s.setLength(0);
s.append(current, 0, limit);
return s;
s.setLength(0);
s.append(current, 0, limit);
return s;
}
/*

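The in_grouping/out_grouping family above tests character-class membership against a packed bit table: offsets are taken relative to min, with eight flags stored per char slot. A standalone sketch with a hand-computed example group — inGroup and G_V are illustrative names, not the library API:

  static boolean inGroup(char[] bits, int min, int max, char ch) {
    if (ch > max || ch < min) return false;
    int off = ch - min;                           // offset into the bit table
    return (bits[off >> 3] & (1 << (off & 7))) != 0;
  }

  // Worked example: the vowel group {a,e,i,o,u,y} over min='a', max='y'.
  // Offsets a=0, e=4 give bits[0] = 1+16 = 17; i=8, o=14 give bits[1] = 1+64 = 65;
  // u=20 gives bits[2] = 16; y=24 gives bits[3] = 1.
  static final char[] G_V = { 17, 65, 16, 1 };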
View File

@@ -38,87 +38,87 @@ import org.apache.lucene.analysis.util.CharArraySet;
public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
public void testWithSnowballExamples() throws Exception {
check("boa", "boa");
check("boainain", "boainain");
check("boas", "boas");
check("bôas", "boas"); // removes diacritic: different from snowball portugese
check("boassu", "boassu");
check("boataria", "boat");
check("boate", "boat");
check("boates", "boat");
check("boatos", "boat");
check("bob", "bob");
check("boba", "bob");
check("bobagem", "bobag");
check("bobagens", "bobagens");
check("bobalhões", "bobalho"); // removes diacritic: different from snowball portugese
check("bobear", "bob");
check("bobeira", "bobeir");
check("bobinho", "bobinh");
check("bobinhos", "bobinh");
check("bobo", "bob");
check("bobs", "bobs");
check("boca", "boc");
check("bocadas", "boc");
check("bocadinho", "bocadinh");
check("bocado", "boc");
check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
check("bocarra", "bocarr");
check("bocas", "boc");
check("bode", "bod");
check("bodoque", "bodoqu");
check("body", "body");
check("boeing", "boeing");
check("boem", "boem");
check("boemia", "boem");
check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
check("bogotá", "bogot");
check("boi", "boi");
check("bóia", "boi"); // removes diacritic: different from snowball portuguese
check("boiando", "boi");
check("quiabo", "quiab");
check("quicaram", "quic");
check("quickly", "quickly");
check("quieto", "quiet");
check("quietos", "quiet");
check("quilate", "quilat");
check("quilates", "quilat");
check("quilinhos", "quilinh");
check("quilo", "quil");
check("quilombo", "quilomb");
check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
check("quilômetro", "quilometr"); // removes diacritic: different from snowball portoguese
check("quilômetros", "quilometr"); // removes diacritic: different from snowball portoguese
check("quilos", "quil");
check("quimica", "quimic");
check("quilos", "quil");
check("quimica", "quimic");
check("quimicas", "quimic");
check("quimico", "quimic");
check("quimicos", "quimic");
check("quimioterapia", "quimioterap");
check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portoguese
check("quimono", "quimon");
check("quincas", "quinc");
check("quinhão", "quinha"); // removes diacritic: different from snowball portoguese
check("quinhentos", "quinhent");
check("quinn", "quinn");
check("quino", "quin");
check("quinta", "quint");
check("quintal", "quintal");
check("quintana", "quintan");
check("quintanilha", "quintanilh");
check("quintão", "quinta"); // removes diacritic: different from snowball portoguese
check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
check("quintino", "quintin");
check("quinto", "quint");
check("quintos", "quint");
check("quintuplicou", "quintuplic");
check("quinze", "quinz");
check("quinzena", "quinzen");
check("quiosque", "quiosqu");
check("boa", "boa");
check("boainain", "boainain");
check("boas", "boas");
check("bôas", "boas"); // removes diacritic: different from snowball portugese
check("boassu", "boassu");
check("boataria", "boat");
check("boate", "boat");
check("boates", "boat");
check("boatos", "boat");
check("bob", "bob");
check("boba", "bob");
check("bobagem", "bobag");
check("bobagens", "bobagens");
check("bobalhões", "bobalho"); // removes diacritic: different from snowball portugese
check("bobear", "bob");
check("bobeira", "bobeir");
check("bobinho", "bobinh");
check("bobinhos", "bobinh");
check("bobo", "bob");
check("bobs", "bobs");
check("boca", "boc");
check("bocadas", "boc");
check("bocadinho", "bocadinh");
check("bocado", "boc");
check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
check("bocarra", "bocarr");
check("bocas", "boc");
check("bode", "bod");
check("bodoque", "bodoqu");
check("body", "body");
check("boeing", "boeing");
check("boem", "boem");
check("boemia", "boem");
check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
check("bogotá", "bogot");
check("boi", "boi");
check("bóia", "boi"); // removes diacritic: different from snowball portuguese
check("boiando", "boi");
check("quiabo", "quiab");
check("quicaram", "quic");
check("quickly", "quickly");
check("quieto", "quiet");
check("quietos", "quiet");
check("quilate", "quilat");
check("quilates", "quilat");
check("quilinhos", "quilinh");
check("quilo", "quil");
check("quilombo", "quilomb");
check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
check("quilômetro", "quilometr"); // removes diacritic: different from snowball portoguese
check("quilômetros", "quilometr"); // removes diacritic: different from snowball portoguese
check("quilos", "quil");
check("quimica", "quimic");
check("quilos", "quil");
check("quimica", "quimic");
check("quimicas", "quimic");
check("quimico", "quimic");
check("quimicos", "quimic");
check("quimioterapia", "quimioterap");
check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portoguese
check("quimono", "quimon");
check("quincas", "quinc");
check("quinhão", "quinha"); // removes diacritic: different from snowball portoguese
check("quinhentos", "quinhent");
check("quinn", "quinn");
check("quino", "quin");
check("quinta", "quint");
check("quintal", "quintal");
check("quintana", "quintan");
check("quintanilha", "quintanilh");
check("quintão", "quinta"); // removes diacritic: different from snowball portoguese
check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
check("quintino", "quintin");
check("quinto", "quint");
check("quintos", "quint");
check("quintuplicou", "quintuplic");
check("quinze", "quinz");
check("quinzena", "quinzen");
check("quiosque", "quiosqu");
}
public void testNormalization() throws Exception {

View File

@@ -31,93 +31,92 @@ import org.apache.lucene.util.Version;
public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
public void testAnalyzer() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
public void testAnalyzer() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(fa, "", new String[] {
});
assertAnalyzesTo(fa, "", new String[] {
});
assertAnalyzesTo(
fa,
"chien chat cheval",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
"chien chat cheval",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
"chien CHAT CHEVAL",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
"chien CHAT CHEVAL",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
" chien ,? + = - CHAT /: > CHEVAL",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(
fa,
" chien ,? + = - CHAT /: > CHEVAL",
new String[] { "chien", "chat", "cheval" });
assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
assertAnalyzesTo(
fa,
"mot \"entreguillemet\"",
new String[] { "mot", "entreguilemet" });
assertAnalyzesTo(
fa,
"mot \"entreguillemet\"",
new String[] { "mot", "entreguilemet" });
// let's do some french specific tests now
// let's do some french specific tests now
/* 1. couldn't resist
I would expect this to stay one term as in French the minus
sign is often used for composing words */
assertAnalyzesTo(
fa,
"Jean-François",
new String[] { "jean", "francoi" });
/* 1. couldn't resist
I would expect this to stay one term as in French the minus
sign is often used for composing words */
assertAnalyzesTo(
fa,
"Jean-François",
new String[] { "jean", "francoi" });
// 2. stopwords
assertAnalyzesTo(
fa,
"le la chien les aux chat du des à cheval",
new String[] { "chien", "chat", "cheval" });
// 2. stopwords
assertAnalyzesTo(
fa,
"le la chien les aux chat du des à cheval",
new String[] { "chien", "chat", "cheval" });
// some nouns and adjectives
assertAnalyzesTo(
fa,
"lances chismes habitable chiste éléments captifs",
new String[] {
"lanc",
"chism",
"habitabl",
"chist",
"element",
"captif" });
// some nouns and adjectives
assertAnalyzesTo(
fa,
"lances chismes habitable chiste éléments captifs",
new String[] {
"lanc",
"chism",
"habitabl",
"chist",
"element",
"captif" });
// some verbs
assertAnalyzesTo(
fa,
"finissions souffrirent rugissante",
new String[] { "finision", "soufrirent", "rugisant" });
// some verbs
assertAnalyzesTo(
fa,
"finissions souffrirent rugissante",
new String[] { "finision", "soufrirent", "rugisant" });
// some everything else
// aujourd'hui stays one term which is OK
assertAnalyzesTo(
fa,
"C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
new String[] {
"c3po",
"aujourd'hui",
"oeuf",
"ïaöuaä",
"anticonstitutionel",
"java" });
// some everything else
// aujourd'hui stays one term which is OK
assertAnalyzesTo(
fa,
"C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
new String[] {
"c3po",
"aujourd'hui",
"oeuf",
"ïaöuaä",
"anticonstitutionel",
"java" });
// some more everything else
// here 1940-1945 stays as one term, 1940:1945 not ?
assertAnalyzesTo(
fa,
"33Bis 1940-1945 1940:1945 (---i+++)*",
new String[] { "33bi", "1940", "1945", "1940", "1945", "i" });
// some more everything else
// here 1940-1945 stays as one term, 1940:1945 not ?
assertAnalyzesTo(
fa,
"33Bis 1940-1945 1940:1945 (---i+++)*",
new String[] { "33bi", "1940", "1945", "1940", "1945", "i" });
}
}
public void testReusableTokenStream() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
// stopwords
public void testReusableTokenStream() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
// stopwords
assertAnalyzesToReuse(
fa,
"le la chien les aux chat du des à cheval",
@@ -134,7 +133,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
"chist",
"element",
"captif" });
}
}
public void testExclusionTableViaCtor() throws Exception {
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);

View File

@@ -32,36 +32,36 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
* HyphenatedWordsFilter test
*/
public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
public void testHyphenatedWords() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
// first test
TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
}
public void testHyphenatedWords() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
// first test
TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
}
/**
* Test that HyphenatedWordsFilter behaves correctly with a final hyphen
*/
public void testHyphenAtEnd() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
// first test
TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
}
/**
* Test that HyphenatedWordsFilter behaves correctly with a final hyphen
*/
public void testHyphenAtEnd() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
// first test
TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
}
public void testOffsets() throws Exception {
String input = "abc- def geh 1234- 5678-";
public void testOffsets() throws Exception {
String input = "abc- def geh 1234- 5678-";
TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
ts = new HyphenatedWordsFilter(ts);
assertTokenStreamContents(ts,
new String[] { "abcdef", "geh", "12345678-" },
new int[] { 0, 9, 13 },
new int[] { 8, 12, 24 });
}
}
/** blast some random strings through the analyzer */
public void testRandomString() throws Exception {

View File

@@ -34,83 +34,83 @@ import org.apache.lucene.util.Version;
public class TestDutchStemmer extends BaseTokenStreamTestCase {
public void testWithSnowballExamples() throws Exception {
check("lichaamsziek", "lichaamsziek");
check("lichamelijk", "licham");
check("lichamelijke", "licham");
check("lichamelijkheden", "licham");
check("lichamen", "licham");
check("lichere", "licher");
check("licht", "licht");
check("lichtbeeld", "lichtbeeld");
check("lichtbruin", "lichtbruin");
check("lichtdoorlatende", "lichtdoorlat");
check("lichte", "licht");
check("lichten", "licht");
check("lichtende", "lichtend");
check("lichtenvoorde", "lichtenvoord");
check("lichter", "lichter");
check("lichtere", "lichter");
check("lichters", "lichter");
check("lichtgevoeligheid", "lichtgevoel");
check("lichtgewicht", "lichtgewicht");
check("lichtgrijs", "lichtgrijs");
check("lichthoeveelheid", "lichthoevel");
check("lichtintensiteit", "lichtintensiteit");
check("lichtje", "lichtj");
check("lichtjes", "lichtjes");
check("lichtkranten", "lichtkrant");
check("lichtkring", "lichtkring");
check("lichtkringen", "lichtkring");
check("lichtregelsystemen", "lichtregelsystem");
check("lichtste", "lichtst");
check("lichtstromende", "lichtstrom");
check("lichtte", "licht");
check("lichtten", "licht");
check("lichttoetreding", "lichttoetred");
check("lichtverontreinigde", "lichtverontreinigd");
check("lichtzinnige", "lichtzinn");
check("lid", "lid");
check("lidia", "lidia");
check("lidmaatschap", "lidmaatschap");
check("lidstaten", "lidstat");
check("lidvereniging", "lidveren");
check("opgingen", "opging");
check("opglanzing", "opglanz");
check("opglanzingen", "opglanz");
check("opglimlachten", "opglimlacht");
check("opglimpen", "opglimp");
check("opglimpende", "opglimp");
check("opglimping", "opglimp");
check("opglimpingen", "opglimp");
check("opgraven", "opgrav");
check("opgrijnzen", "opgrijnz");
check("opgrijzende", "opgrijz");
check("opgroeien", "opgroei");
check("opgroeiende", "opgroei");
check("opgroeiplaats", "opgroeiplat");
check("ophaal", "ophal");
check("ophaaldienst", "ophaaldienst");
check("ophaalkosten", "ophaalkost");
check("ophaalsystemen", "ophaalsystem");
check("ophaalt", "ophaalt");
check("ophaaltruck", "ophaaltruck");
check("ophalen", "ophal");
check("ophalend", "ophal");
check("ophalers", "ophaler");
check("ophef", "ophef");
check("opheldering", "ophelder");
check("ophemelde", "ophemeld");
check("ophemelen", "ophemel");
check("opheusden", "opheusd");
check("ophief", "ophief");
check("ophield", "ophield");
check("ophieven", "ophiev");
check("ophoepelt", "ophoepelt");
check("ophoog", "ophog");
check("ophoogzand", "ophoogzand");
check("ophopen", "ophop");
check("ophoping", "ophop");
check("ophouden", "ophoud");
check("lichaamsziek", "lichaamsziek");
check("lichamelijk", "licham");
check("lichamelijke", "licham");
check("lichamelijkheden", "licham");
check("lichamen", "licham");
check("lichere", "licher");
check("licht", "licht");
check("lichtbeeld", "lichtbeeld");
check("lichtbruin", "lichtbruin");
check("lichtdoorlatende", "lichtdoorlat");
check("lichte", "licht");
check("lichten", "licht");
check("lichtende", "lichtend");
check("lichtenvoorde", "lichtenvoord");
check("lichter", "lichter");
check("lichtere", "lichter");
check("lichters", "lichter");
check("lichtgevoeligheid", "lichtgevoel");
check("lichtgewicht", "lichtgewicht");
check("lichtgrijs", "lichtgrijs");
check("lichthoeveelheid", "lichthoevel");
check("lichtintensiteit", "lichtintensiteit");
check("lichtje", "lichtj");
check("lichtjes", "lichtjes");
check("lichtkranten", "lichtkrant");
check("lichtkring", "lichtkring");
check("lichtkringen", "lichtkring");
check("lichtregelsystemen", "lichtregelsystem");
check("lichtste", "lichtst");
check("lichtstromende", "lichtstrom");
check("lichtte", "licht");
check("lichtten", "licht");
check("lichttoetreding", "lichttoetred");
check("lichtverontreinigde", "lichtverontreinigd");
check("lichtzinnige", "lichtzinn");
check("lid", "lid");
check("lidia", "lidia");
check("lidmaatschap", "lidmaatschap");
check("lidstaten", "lidstat");
check("lidvereniging", "lidveren");
check("opgingen", "opging");
check("opglanzing", "opglanz");
check("opglanzingen", "opglanz");
check("opglimlachten", "opglimlacht");
check("opglimpen", "opglimp");
check("opglimpende", "opglimp");
check("opglimping", "opglimp");
check("opglimpingen", "opglimp");
check("opgraven", "opgrav");
check("opgrijnzen", "opgrijnz");
check("opgrijzende", "opgrijz");
check("opgroeien", "opgroei");
check("opgroeiende", "opgroei");
check("opgroeiplaats", "opgroeiplat");
check("ophaal", "ophal");
check("ophaaldienst", "ophaaldienst");
check("ophaalkosten", "ophaalkost");
check("ophaalsystemen", "ophaalsystem");
check("ophaalt", "ophaalt");
check("ophaaltruck", "ophaaltruck");
check("ophalen", "ophal");
check("ophalend", "ophal");
check("ophalers", "ophaler");
check("ophef", "ophef");
check("opheldering", "ophelder");
check("ophemelde", "ophemeld");
check("ophemelen", "ophemel");
check("opheusden", "opheusd");
check("ophief", "ophief");
check("ophield", "ophield");
check("ophieven", "ophiev");
check("ophoepelt", "ophoepelt");
check("ophoog", "ophog");
check("ophoogzand", "ophoogzand");
check("ophopen", "ophop");
check("ophoping", "ophop");
check("ophouden", "ophoud");
}
public void testSnowballCorrectness() throws Exception {

View File

@@ -37,7 +37,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class TestPatternTokenizer extends BaseTokenStreamTestCase
{
public void testSplitting() throws Exception
public void testSplitting() throws Exception
{
String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
String[][] tests = {
@@ -71,7 +71,7 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
}
}*/
}
}
}
public void testOffsetCorrection() throws Exception {
final String INPUT = "G&uuml;nther G&uuml;nther is here";

View File

@@ -44,25 +44,25 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
super.setUp();
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
}
/*
* testcase for offsets
*/
public void testOffsets() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "การที่ได้ต้องแสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
}
/*
* testcase for offsets
*/
public void testOffsets() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "การที่ได้ต้องแสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
}
public void testStopWords() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
new String[] { "แสดง", "งาน", "ดี" },
new int[] { 13, 20, 23 },
new int[] { 17, 23, 25 },
new int[] { 5, 2, 1 });
}
public void testStopWords() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
new String[] { "แสดง", "งาน", "ดี" },
new int[] { 13, 20, 23 },
new int[] { 17, 23, 25 },
new int[] { 5, 2, 1 });
}
public void testTokenType() throws Exception {
public void testTokenType() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
@@ -70,31 +70,31 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<NUM>" });
}
}
/*
* Test that position increments are adjusted correctly for stopwords.
*/
// note this test uses stopfilter's stopset
public void testPositionIncrements() throws Exception {
final ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
/*
* Test that position increments are adjusted correctly for stopwords.
*/
// note this test uses stopfilter's stopset
public void testPositionIncrements() throws Exception {
final ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
assertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
// case that a stopword is adjacent to thai text, with no whitespace
// case that a stopword is adjacent to thai text, with no whitespace
assertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
}
}
public void testReusableTokenStream() throws Exception {
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
assertAnalyzesToReuse(analyzer, "", new String[] {});
public void testReusableTokenStream() throws Exception {
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
assertAnalyzesToReuse(analyzer, "", new String[] {});
assertAnalyzesToReuse(
analyzer,
@@ -105,7 +105,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
}
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {

View File

@@ -46,8 +46,8 @@ public class TestWordlistLoader extends LuceneTestCase {
private void checkSet(CharArraySet wordset) {
assertEquals(3, wordset.size());
assertTrue(wordset.contains("ONE")); // case is not modified
assertTrue(wordset.contains("two")); // surrounding whitespace is removed
assertTrue(wordset.contains("ONE")); // case is not modified
assertTrue(wordset.contains("two")); // surrounding whitespace is removed
assertTrue(wordset.contains("three"));
assertFalse(wordset.contains("four"));
}

View File

@@ -294,7 +294,7 @@ public final class JapaneseTokenizer extends Tokenizer {
break;
}
}
if (allKanji) { // Process only Kanji keywords
if (allKanji) { // Process only Kanji keywords
return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
} else if (length > SEARCH_MODE_OTHER_LENGTH) {
return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;

View File

@@ -162,7 +162,7 @@ public abstract class BinaryDictionary implements Dictionary {
@Override
public int getWordCost(int wordId) {
return buffer.getShort(wordId + 2); // Skip id
return buffer.getShort(wordId + 2); // Skip id
}
@Override

View File

@@ -28,21 +28,21 @@ public interface Dictionary {
/**
* Get left id of specified word
* @param wordId
* @return left id
* @return left id
*/
public int getLeftId(int wordId);
/**
* Get right id of specified word
* @param wordId
* @return right id
* @return right id
*/
public int getRightId(int wordId);
/**
* Get word cost of specified word
* @param wordId
* @return word cost
* @return word cost
*/
public int getWordCost(int wordId);

View File

@@ -326,12 +326,12 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
public void testSegmentation() throws Exception {
// Skip tests for Michelle Kwan -- UniDic segments Kwan as ワン
// String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
// String[] surfaceForms = {
// "ミシェル", "", "クワン", "", "優勝", "", "まし", "", "",
// "スペース", "ステーション", "", "行き", "ます", "",
// "うたがわしい", ""
// };
// String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
// String[] surfaceForms = {
// "ミシェル", "", "クワン", "", "優勝", "", "まし", "", "",
// "スペース", "ステーション", "", "行き", "ます", "",
// "うたがわしい", ""
// };
String input = "スペースステーションに行きます。うたがわしい。";
String[] surfaceForms = {
"スペース", "ステーション", "", "行き", "ます", "",

View File

@@ -174,26 +174,26 @@ public class TokenInfoDictionaryBuilder {
/*
* IPADIC features
*
* 0 - surface
* 1 - left cost
* 2 - right cost
* 3 - word cost
* 4-9 - pos
* 10 - base form
* 11 - reading
* 12 - pronunciation
* 0 - surface
* 1 - left cost
* 2 - right cost
* 3 - word cost
* 4-9 - pos
* 10 - base form
* 11 - reading
* 12 - pronunciation
*
* UniDic features
*
* 0 - surface
* 1 - left cost
* 2 - right cost
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
* 11 - base form
* 12 - surface form
* 13 - surface reading
* 0 - surface
* 1 - left cost
* 2 - right cost
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
* 11 - base form
* 12 - surface form
* 13 - surface reading
*/
public String[] formatEntry(String[] features) {

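A hypothetical reading of the IPADIC layout documented above — the column indices come from the comment, while the method and variable names are illustrative, not the builder's API:

  static void printIpadicEntry(String csvLine) {
    String[] f = csvLine.split(",");
    String surface  = f[0];
    int leftCost    = Integer.parseInt(f[1]);
    int rightCost   = Integer.parseInt(f[2]);
    int wordCost    = Integer.parseInt(f[3]);      // columns 4-9 hold the part-of-speech chain
    String baseForm = f[10];
    String reading  = f[11];
    String pron     = f[12];                       // pronunciation
    System.out.println(surface + " [" + reading + "] left=" + leftCost
        + " right=" + rightCost + " cost=" + wordCost + " base=" + baseForm + " pron=" + pron);
  }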
View File

@@ -107,8 +107,8 @@ public class UnknownDictionaryBuilder {
continue;
}
if(line.startsWith("0x")) { // Category mapping
String[] values = line.split(" ", 2); // Split only first space
if(line.startsWith("0x")) { // Category mapping
String[] values = line.split(" ", 2); // Split only first space
if(!values[0].contains("..")) {
int cp = Integer.decode(values[0]).intValue();
@@ -122,7 +122,7 @@ public class UnknownDictionaryBuilder {
dictionary.putCharacterCategory(i, values[1]);
}
}
} else { // Invoke definition
} else { // Invoke definition
String[] values = line.split(" "); // Consecutive space is merged above
String characterClassName = values[0];
int invoke = Integer.parseInt(values[1]);

View File

@@ -175,23 +175,23 @@ public class TestPhoneticFilterFactory extends BaseTokenStreamTestCase {
}
public void testSpeed() throws Exception {
checkSpeedEncoding("Metaphone", "easgasg", "ESKS");
checkSpeedEncoding("DoubleMetaphone", "easgasg", "ASKS");
checkSpeedEncoding("Soundex", "easgasg", "E220");
checkSpeedEncoding("RefinedSoundex", "easgasg", "E034034");
checkSpeedEncoding("Caverphone", "Carlene", "KLN1111111");
checkSpeedEncoding("ColognePhonetic", "Schmitt", "862");
checkSpeedEncoding("Metaphone", "easgasg", "ESKS");
checkSpeedEncoding("DoubleMetaphone", "easgasg", "ASKS");
checkSpeedEncoding("Soundex", "easgasg", "E220");
checkSpeedEncoding("RefinedSoundex", "easgasg", "E034034");
checkSpeedEncoding("Caverphone", "Carlene", "KLN1111111");
checkSpeedEncoding("ColognePhonetic", "Schmitt", "862");
}
private void checkSpeedEncoding(String encoder, String toBeEncoded, String estimated) throws Exception {
long start = System.currentTimeMillis();
for ( int i=0; i<REPEATS; i++) {
assertAlgorithm(encoder, "false", toBeEncoded,
new String[] { estimated });
}
long duration = System.currentTimeMillis()-start;
if (VERBOSE)
System.out.println(encoder + " encodings per msec: "+(REPEATS/duration));
long start = System.currentTimeMillis();
for ( int i=0; i<REPEATS; i++) {
assertAlgorithm(encoder, "false", toBeEncoded,
new String[] { estimated });
}
long duration = System.currentTimeMillis()-start;
if (VERBOSE)
System.out.println(encoder + " encodings per msec: "+(REPEATS/duration));
}
}

View File

@@ -115,7 +115,7 @@ abstract class AbstractDictionary {
}
int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
// Therefore, each code page only has 16*6-2=94 characters.
// Therefore, each code page only has 16*6-2=94 characters.
return (short) (b0 * 94 + b1);
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);

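The arithmetic above flattens the two GB2312 bytes into one linear index: rows and cells both start at 0xA1, and each row holds 94 usable characters. A worked sketch (the method name is hypothetical):

  static short gb2312Index(byte hi, byte lo) {
    int b0 = (hi & 0xFF) - 161; // row:  first byte, rows start at 0xA1
    int b1 = (lo & 0xFF) - 161; // cell: second byte, 94 usable cells per row
    return (short) (b0 * 94 + b1);
  }
  // e.g. bytes 0xB0 0xA1: b0 = 0xB0-0xA1 = 15, b1 = 0, so the index is 15*94 = 1410.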
View File

@@ -37,43 +37,43 @@ import org.apache.commons.compress.compressors.CompressorStreamFactory;
*/
public class StreamUtils {
/** Buffer size used across the benchmark package */
public static final int BUFFER_SIZE = 1 << 16; // 64K
/** Buffer size used across the benchmark package */
public static final int BUFFER_SIZE = 1 << 16; // 64K
/** File format type */
public enum Type {
/** BZIP2 is automatically used for <b>.bz2</b> and <b>.bzip2</b> extensions. */
BZIP2(CompressorStreamFactory.BZIP2),
/** GZIP is automatically used for <b>.gz</b> and <b>.gzip</b> extensions. */
GZIP(CompressorStreamFactory.GZIP),
/** Plain text is used for anything which is not GZIP or BZIP. */
PLAIN(null);
private final String csfType;
Type(String csfType) {
this.csfType = csfType;
}
private InputStream inputStream(InputStream in) throws IOException {
try {
return csfType==null ? in : new CompressorStreamFactory().createCompressorInputStream(csfType, in);
} catch (CompressorException e) {
IOException ioe = new IOException(e.getMessage());
ioe.initCause(e);
throw ioe; }
}
private OutputStream outputStream(OutputStream os) throws IOException {
try {
return csfType==null ? os : new CompressorStreamFactory().createCompressorOutputStream(csfType, os);
} catch (CompressorException e) {
IOException ioe = new IOException(e.getMessage());
ioe.initCause(e);
throw ioe;
}
}
}
/** File format type */
public enum Type {
/** BZIP2 is automatically used for <b>.bz2</b> and <b>.bzip2</b> extensions. */
BZIP2(CompressorStreamFactory.BZIP2),
/** GZIP is automatically used for <b>.gz</b> and <b>.gzip</b> extensions. */
GZIP(CompressorStreamFactory.GZIP),
/** Plain text is used for anything which is not GZIP or BZIP. */
PLAIN(null);
private final String csfType;
Type(String csfType) {
this.csfType = csfType;
}
private InputStream inputStream(InputStream in) throws IOException {
try {
return csfType==null ? in : new CompressorStreamFactory().createCompressorInputStream(csfType, in);
} catch (CompressorException e) {
IOException ioe = new IOException(e.getMessage());
ioe.initCause(e);
throw ioe; }
}
private OutputStream outputStream(OutputStream os) throws IOException {
try {
return csfType==null ? os : new CompressorStreamFactory().createCompressorOutputStream(csfType, os);
} catch (CompressorException e) {
IOException ioe = new IOException(e.getMessage());
ioe.initCause(e);
throw ioe;
}
}
}
private static final Map<String,Type> extensionToType = new HashMap<String,Type>();
static {
// these are in lower case; we will lower-case at the test as well
// these are in lower case; we will lower-case at the test as well
extensionToType.put(".bz2", Type.BZIP2);
extensionToType.put(".bzip", Type.BZIP2);
extensionToType.put(".gz", Type.GZIP);
@@ -95,14 +95,14 @@ public class StreamUtils {
/** Return the type of the file, or null if unknown */
private static Type fileType(File file) {
Type type = null;
Type type = null;
String fileName = file.getName();
int idx = fileName.lastIndexOf('.');
if (idx != -1) {
type = extensionToType.get(fileName.substring(idx).toLowerCase(Locale.ROOT));
}
return type==null ? Type.PLAIN : type;
}
}
/**
* Returns an {@link OutputStream} over the requested file, identifying

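The tests further down exercise exactly this extension lookup; a usage sketch, assuming the benchmark module's package and a hypothetical file name:

  import java.io.File;
  import java.io.OutputStream;
  import org.apache.lucene.benchmark.byTask.utils.StreamUtils;

  public class StreamUtilsDemo {
    public static void main(String[] args) throws Exception {
      File f = new File("docs.line.bz2");            // ".bz2" maps to Type.BZIP2
      OutputStream os = StreamUtils.outputStream(f); // codec chosen from the extension alone
      os.write("one line\n".getBytes("UTF-8"));
      os.close();
      // StreamUtils.inputStream(f) would transparently decompress the same file.
    }
  }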
View File

@@ -157,16 +157,16 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
String expDate, String expBody) throws Exception {
InputStream in = new FileInputStream(file);
switch(fileType) {
case BZIP2:
in = csFactory.createCompressorInputStream(CompressorStreamFactory.BZIP2, in);
break;
case GZIP:
in = csFactory.createCompressorInputStream(CompressorStreamFactory.GZIP, in);
break;
case PLAIN:
break; // nothing to do
default:
assertFalse("Unknown file type!",true); //fail, should not happen
case BZIP2:
in = csFactory.createCompressorInputStream(CompressorStreamFactory.BZIP2, in);
break;
case GZIP:
in = csFactory.createCompressorInputStream(CompressorStreamFactory.GZIP, in);
break;
case PLAIN:
break; // nothing to do
default:
assertFalse("Unknown file type!",true); //fail, should not happen
}
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
try {

View File

@@ -57,38 +57,38 @@ public class StreamUtilsTest extends BenchmarkTestCase {
@Test
public void testGetInputStreamBzip2() throws Exception {
assertReadText(rawBzip2File("bz2"));
assertReadText(rawBzip2File("bzip"));
assertReadText(rawBzip2File("BZ2"));
assertReadText(rawBzip2File("BZIP"));
assertReadText(rawBzip2File("bz2"));
assertReadText(rawBzip2File("bzip"));
assertReadText(rawBzip2File("BZ2"));
assertReadText(rawBzip2File("BZIP"));
}
@Test
public void testGetOutputStreamBzip2() throws Exception {
assertReadText(autoOutFile("bz2"));
assertReadText(autoOutFile("bzip"));
assertReadText(autoOutFile("BZ2"));
assertReadText(autoOutFile("BZIP"));
assertReadText(autoOutFile("bz2"));
assertReadText(autoOutFile("bzip"));
assertReadText(autoOutFile("BZ2"));
assertReadText(autoOutFile("BZIP"));
}
@Test
public void testGetOutputStreamGzip() throws Exception {
assertReadText(autoOutFile("gz"));
assertReadText(autoOutFile("gzip"));
assertReadText(autoOutFile("GZ"));
assertReadText(autoOutFile("GZIP"));
assertReadText(autoOutFile("gz"));
assertReadText(autoOutFile("gzip"));
assertReadText(autoOutFile("GZ"));
assertReadText(autoOutFile("GZIP"));
}
@Test
public void testGetOutputStreamPlain() throws Exception {
assertReadText(autoOutFile("txt"));
assertReadText(autoOutFile("text"));
assertReadText(autoOutFile("TXT"));
assertReadText(autoOutFile("TEXT"));
assertReadText(autoOutFile("txt"));
assertReadText(autoOutFile("text"));
assertReadText(autoOutFile("TXT"));
assertReadText(autoOutFile("TEXT"));
}
private File rawTextFile(String ext) throws Exception {
File f = new File(testDir,"testfile." + ext);
File f = new File(testDir,"testfile." + ext);
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), IOUtils.CHARSET_UTF_8));
w.write(TEXT);
w.newLine();
@@ -97,32 +97,32 @@ public class StreamUtilsTest extends BenchmarkTestCase {
}

private File rawGzipFile(String ext) throws Exception {
  File f = new File(testDir,"testfile." + ext);
  OutputStream os = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, new FileOutputStream(f));
  writeText(os);
  return f;
}

private File rawBzip2File(String ext) throws Exception {
  File f = new File(testDir,"testfile." + ext);
  OutputStream os = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(f));
  writeText(os);
  return f;
}

private File autoOutFile(String ext) throws Exception {
  File f = new File(testDir,"testfile." + ext);
  OutputStream os = StreamUtils.outputStream(f);
  writeText(os);
  return f;
}

private void writeText(OutputStream os) throws IOException {
  BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os, IOUtils.CHARSET_UTF_8));
  w.write(TEXT);
  w.newLine();
  w.close();
}

private void assertReadText(File f) throws Exception {
  InputStream ir = StreamUtils.inputStream(f);

View File

@@ -170,7 +170,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {

@Override
public long seek(BytesRef target) {
  int lo = 0; // binary search
  int hi = fieldIndex.numIndexTerms - 1;
  assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;

@@ -163,7 +163,7 @@ final class BitVector implements Cloneable, MutableBits {
  int c = 0;
  int end = bits.length;
  for (int i = 0; i < end; i++) {
    c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
  }
  count = c;
}

@@ -176,12 +176,12 @@ final class BitVector implements Cloneable, MutableBits {
  int c = 0;
  int end = bits.length;
  for (int i = 0; i < end; i++) {
    c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
  }
  return c;
}

private static final byte[] BYTE_COUNTS = { // table of bits/byte
  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
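The table is truncated here by the diff. For intuition, a standalone sketch (hypothetical names) that builds the same bits-per-byte table and counts bits the way BitVector does:

public class BitCountSketch {
  // BYTE_COUNTS[b] = number of set bits in the byte value b (0..255)
  private static final byte[] BYTE_COUNTS = new byte[256];
  static {
    for (int b = 0; b < 256; b++) {
      BYTE_COUNTS[b] = (byte) Integer.bitCount(b);
    }
  }

  static int count(byte[] bits) {
    int c = 0;
    for (int i = 0; i < bits.length; i++) {
      c += BYTE_COUNTS[bits[i] & 0xFF]; // & 0xFF avoids sign-extending negative bytes
    }
    return c;
  }

  public static void main(String[] args) {
    System.out.println(count(new byte[] { (byte) 0xFF, 0x01 })); // prints 9
  }
}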

@@ -1672,7 +1672,7 @@ public class CheckIndex {
" times, to check more than one segment, eg '-segment _2 -segment _a'.\n" +
" You can't use this with the -fix option\n" +
" -dir-impl X: use a specific " + FSDirectory.class.getSimpleName() + " implementation. " +
"If no package is specified the " + FSDirectory.class.getPackage().getName() + " package will be used.\n" +
"\n" +
"**WARNING**: -fix should only be used on an emergency basis as it will cause\n" +
"documents (perhaps many) to be permanently removed from the index. Always make\n" +

@@ -571,7 +571,7 @@ final class IndexFileDeleter {
infoStream.message("IFD", "delete \"" + fileName + "\"");
}
directory.deleteFile(fileName);
} catch (IOException e) { // if delete fails
if (directory.fileExists(fileName)) {
// Some operating systems (e.g. Windows) don't

@@ -2847,7 +2847,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
final boolean anySegmentFlushed;
synchronized (fullFlushLock) {
boolean flushSuccess = false;
try {
anySegmentFlushed = docWriter.flushAllThreads();
flushSuccess = true;

@@ -42,9 +42,9 @@ public final class SegmentInfo {
public static final int NO = -1;   // e.g. no norms; no deletes;
public static final int YES = 1;   // e.g. have norms; have deletes;

public final String name;    // unique name in dir
private int docCount;        // number of docs in seg
public final Directory dir;  // where segment resides
private boolean isCompoundFile;

@@ -404,7 +404,7 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
public Query rewrite(IndexReader reader) throws IOException {
  if (minNrShouldMatch == 0 && clauses.size() == 1) { // optimize 1-clause queries
    BooleanClause c = clauses.get(0);
    if (!c.isProhibited()) { // just return clause
      Query query = c.getQuery().rewrite(reader); // rewrite first

@@ -475,7 +475,7 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
  Query subQuery = c.getQuery();
  if (subQuery != null) {
    if (subQuery instanceof BooleanQuery) { // wrap sub-bools in parens
      buffer.append("(");
      buffer.append(subQuery.toString(field));
      buffer.append(")");

@@ -110,7 +110,7 @@ public class MultiPhraseQuery extends Query {
 * Do not modify the List or its contents.
 */
public List<Term[]> getTermArrays() {
  return Collections.unmodifiableList(termArrays);
}
/**

@@ -24,13 +24,13 @@ import org.apache.lucene.index.*;
 * Position of a term in a document that takes into account the term offset within the phrase.
 */
final class PhrasePositions {
  int doc;            // current doc
  int position;       // position in doc
  int count;          // remaining pos in this doc
  int offset;         // position in phrase
  final int ord;                        // unique across all PhrasePositions instances
  final DocsAndPositionsEnum postings;  // stream of docs & positions
  PhrasePositions next;                 // used to make lists
  int rptGroup = -1;  // >=0 indicates that this is a repeating PP
  int rptInd;         // index in the rptGroup
  final Term[] terms; // for repetitions initialization

@@ -42,7 +42,7 @@ final class PhrasePositions {
    this.terms = terms;
  }

  final boolean next() throws IOException { // increments to next doc
    doc = postings.nextDoc();
    if (doc == DocIdSetIterator.NO_MORE_DOCS) {
      return false;

@@ -59,7 +59,7 @@ final class PhrasePositions {
  }

  final void firstPosition() throws IOException {
    count = postings.freq(); // read first pos
    nextPosition();
  }

@@ -70,7 +70,7 @@ final class PhrasePositions {
 * have exactly the same <code>position</code>.
 */
  final boolean nextPosition() throws IOException {
    if (count-- > 0) { // read subsequent pos's
      position = postings.nextPosition() - offset;
      return true;
    } else

@@ -281,7 +281,7 @@ public class PhraseQuery extends Query {
  ArrayUtil.mergeSort(postingsFreqs);
}

if (slop == 0) { // optimize exact case
  ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactSimScorer(stats, context));
  if (s.noDocs) {
    return null;

@@ -24,12 +24,12 @@ package org.apache.lucene.search.payloads;
public class MinPayloadFunction extends PayloadFunction {

  @Override
  public float currentScore(int docId, String field, int start, int end, int numPayloadsSeen, float currentScore, float currentPayloadScore) {
    if (numPayloadsSeen == 0) {
      return currentPayloadScore;
    } else {
      return Math.min(currentPayloadScore, currentScore);
    }
  }
@Override

@@ -56,10 +56,10 @@ public abstract class PayloadFunction {
public abstract float docScore(int docId, String field, int numPayloadsSeen, float payloadScore);

public Explanation explain(int docId, String field, int numPayloadsSeen, float payloadScore){
  Explanation result = new Explanation();
  result.setDescription(getClass().getSimpleName() + ".docScore()");
  result.setValue(docScore(docId, field, numPayloadsSeen, payloadScore));
  return result;
}
@Override

@@ -117,7 +117,7 @@ public class NearSpansOrdered extends Spans {
public int end() { return matchEnd; }

public Spans[] getSubSpans() {
  return subSpans;
}
// TODO: Remove warning after API has been finalized

@@ -151,7 +151,7 @@ public class NearSpansUnordered extends Spans {
  }
}

public Spans[] getSubSpans() {
  return subSpans;
}

@Override
public boolean next() throws IOException {

@@ -286,7 +286,7 @@ public class NearSpansUnordered extends Spans {
}

private void addToList(SpansCell cell) {
  if (last != null) { // add next to end of list
    last.next = cell;
  } else
    first = cell;

@@ -295,7 +295,7 @@ public class NearSpansUnordered extends Spans {
}

private void firstToLast() {
  last.next = first; // move first to end of list
  last = first;
  first = first.next;
  last.next = null;

@@ -92,9 +92,9 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
@Override
public void extractTerms(Set<Term> terms) {
  for (final SpanQuery clause : clauses) {
    clause.extractTerms(terms);
  }
}

@@ -57,7 +57,7 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
@Override
public void extractTerms(Set<Term> terms) {
  match.extractTerms(terms);
}
/**

@@ -34,7 +34,7 @@ public abstract class Spans {
 *   boolean skipTo(int target) {
 *     do {
 *       if (!next())
 *         return false;
 *     } while (target > doc());
 *     return true;
 *   }

@@ -41,9 +41,9 @@ public abstract class BufferedIndexInput extends IndexInput {
protected byte[] buffer;

private long bufferStart = 0;    // position in file of buffer
private int bufferLength = 0;    // end of valid bytes
private int bufferPosition = 0;  // next byte to read

@Override
public final byte readByte() throws IOException {

@@ -259,7 +259,7 @@ public abstract class BufferedIndexInput extends IndexInput {
private void refill() throws IOException {
  long start = bufferStart + bufferPosition;
  long end = start + bufferSize;
  if (end > length())  // don't read past EOF
    end = length();
  int newLength = (int)(end - start);
  if (newLength <= 0)

@@ -294,7 +294,7 @@ public abstract class BufferedIndexInput extends IndexInput {
  else {
    bufferStart = pos;
    bufferPosition = 0;
    bufferLength = 0;  // trigger refill() on read()
    seekInternal(pos);
  }
}

@@ -135,7 +135,7 @@ public abstract class Lock {
      return doBody();
    } finally {
      if (locked)
        lock.release();
    }
  }
}
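The finally block above completes Lock.With's acquire/run/release idiom. A runnable sketch of the same shape, transplanted onto a JDK ReentrantLock so it stands alone (names are illustrative; this is not the Lock API itself):

import java.util.concurrent.Callable;
import java.util.concurrent.locks.ReentrantLock;

public class WithLockSketch {
  // tryLock() stands in for Lock.obtain(lockWaitTimeout); unlock() for Lock.release()
  static <T> T runLocked(ReentrantLock lock, Callable<T> body) throws Exception {
    boolean locked = false;
    try {
      locked = lock.tryLock();
      if (!locked) {
        throw new IllegalStateException("could not obtain lock");
      }
      return body.call(); // the critical section, i.e. doBody() above
    } finally {
      if (locked) {
        lock.unlock();    // only release a lock we actually acquired
      }
    }
  }

  public static void main(String[] args) throws Exception {
    String out = runLocked(new ReentrantLock(), new Callable<String>() {
      public String call() { return "ran under lock"; }
    });
    System.out.println(out);
  }
}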

@@ -26,7 +26,7 @@ import org.apache.lucene.LucenePackage;
 **/
public final class Constants {
  private Constants() {}  // can't construct
/** JVM vendor info. */
public static final String JVM_VENDOR = System.getProperty("java.vm.vendor");

@@ -177,11 +177,11 @@ public abstract class PriorityQueue<T> {
    time. */
public final T pop() {
  if (size > 0) {
    T result = heap[1];    // save first value
    heap[1] = heap[size];  // move last to first
    heap[size] = null;     // permit GC of objects
    size--;
    downHeap();            // adjust heap
    return result;
  } else
    return null;

@@ -226,26 +226,26 @@ public abstract class PriorityQueue<T> {
private final void upHeap() {
  int i = size;
  T node = heap[i];        // save bottom node
  int j = i >>> 1;
  while (j > 0 && lessThan(node, heap[j])) {
    heap[i] = heap[j];     // shift parents down
    i = j;
    j = j >>> 1;
  }
  heap[i] = node;          // install saved node
}

private final void downHeap() {
  int i = 1;
  T node = heap[i];        // save top node
  int j = i << 1;          // find smaller child
  int k = j + 1;
  if (k <= size && lessThan(heap[k], heap[j])) {
    j = k;
  }
  while (j <= size && lessThan(heap[j], node)) {
    heap[i] = heap[j];     // shift up child
    i = j;
    j = i << 1;
    k = j + 1;

@@ -253,7 +253,7 @@ public abstract class PriorityQueue<T> {
    j = k;
  }
  heap[i] = node;          // install saved node
}
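A hedged usage sketch of the heap above: a subclass (hypothetical, modeled on FragmentQueue later in this commit) supplies lessThan(), and pop() then yields the top element and re-heapifies via downHeap(). Assumes Lucene's PriorityQueue on the classpath:

class MinIntQueue extends org.apache.lucene.util.PriorityQueue<Integer> {
  public MinIntQueue(int size) {
    super(size);
  }
  @Override
  public boolean lessThan(Integer a, Integer b) {
    return a.intValue() < b.intValue(); // the smallest value sits at heap[1]
  }
}

// with this ordering, insertWithOverflow() retains the `size` largest values seen,
// and pop() returns the smallest retained value first:
//   MinIntQueue q = new MinIntQueue(3);
//   q.insertWithOverflow(5); q.insertWithOverflow(1); q.insertWithOverflow(9);
//   int smallest = q.pop();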
/** This method returns the internal heap array as Object[].

@@ -111,26 +111,26 @@ public class TestLongPostings extends LuceneTestCase {
}

final IndexReader r;
final IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
  .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
  .setMergePolicy(newLogMergePolicy());
iwc.setRAMBufferSizeMB(16.0 + 16.0 * random().nextDouble());
iwc.setMaxBufferedDocs(-1);
final RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);

for(int idx=0;idx<NUM_DOCS;idx++) {
  final Document doc = new Document();
  String s = isS1.get(idx) ? s1 : s2;
  final Field f = newTextField("field", s, Field.Store.NO);
  final int count = _TestUtil.nextInt(random(), 1, 4);
  for(int ct=0;ct<count;ct++) {
    doc.add(f);
  }
  riw.addDocument(doc);
}

r = riw.getReader();
riw.close();
/*
if (VERBOSE) {

@@ -145,7 +145,7 @@ public class TestCachingCollector extends LuceneTestCase {
try {
  cc.replay(new NoOpCollector(false)); // this call should fail
  fail("should have failed if an in-order Collector was given to replay(), " +
       "while CachingCollector was initialized with out-of-order collection");
} catch (IllegalArgumentException e) {
// ok
}

@@ -162,8 +162,8 @@ public class TestPayloadNearQuery extends LuceneTestCase {
assertEquals("should be 100 hits", 100, hits.totalHits);
for (int j = 0; j < hits.scoreDocs.length; j++) {
  ScoreDoc doc = hits.scoreDocs[j];
  // System.out.println("Doc: " + doc.toString());
  // System.out.println("Explain: " + searcher.explain(query, doc.doc));
assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
}
}
@@ -192,71 +192,71 @@ public class TestPayloadNearQuery extends LuceneTestCase {
}

public void testAverageFunction() throws IOException {
  PayloadNearQuery query;
  TopDocs hits;

  query = newPhraseQuery("field", "twenty two", true, new AveragePayloadFunction());
  QueryUtils.check(query);
  // all 10 hits should have score = 3 because adjacent terms have payloads of 2,4
  // and all the similarity factors are set to 1
  hits = searcher.search(query, null, 100);
  assertTrue("hits is null and it shouldn't be", hits != null);
  assertTrue("should be 10 hits", hits.totalHits == 10);
  for (int j = 0; j < hits.scoreDocs.length; j++) {
    ScoreDoc doc = hits.scoreDocs[j];
    assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
    Explanation explain = searcher.explain(query, hits.scoreDocs[j].doc);
    String exp = explain.toString();
    assertTrue(exp, exp.indexOf("AveragePayloadFunction") > -1);
    assertTrue(hits.scoreDocs[j].score + " explain value does not equal: " + 3, explain.getValue() == 3f);
  }
}

public void testMaxFunction() throws IOException {
  PayloadNearQuery query;
  TopDocs hits;

  query = newPhraseQuery("field", "twenty two", true, new MaxPayloadFunction());
  QueryUtils.check(query);
  // all 10 hits should have score = 4 (max payload value)
  hits = searcher.search(query, null, 100);
  assertTrue("hits is null and it shouldn't be", hits != null);
  assertTrue("should be 10 hits", hits.totalHits == 10);
  for (int j = 0; j < hits.scoreDocs.length; j++) {
    ScoreDoc doc = hits.scoreDocs[j];
    assertTrue(doc.score + " does not equal: " + 4, doc.score == 4);
    Explanation explain = searcher.explain(query, hits.scoreDocs[j].doc);
    String exp = explain.toString();
    assertTrue(exp, exp.indexOf("MaxPayloadFunction") > -1);
    assertTrue(hits.scoreDocs[j].score + " explain value does not equal: " + 4, explain.getValue() == 4f);
  }
}

public void testMinFunction() throws IOException {
  PayloadNearQuery query;
  TopDocs hits;

  query = newPhraseQuery("field", "twenty two", true, new MinPayloadFunction());
  QueryUtils.check(query);
  // all 10 hits should have score = 2 (min payload value)
  hits = searcher.search(query, null, 100);
  assertTrue("hits is null and it shouldn't be", hits != null);
  assertTrue("should be 10 hits", hits.totalHits == 10);
  for (int j = 0; j < hits.scoreDocs.length; j++) {
    ScoreDoc doc = hits.scoreDocs[j];
    assertTrue(doc.score + " does not equal: " + 2, doc.score == 2);
    Explanation explain = searcher.explain(query, hits.scoreDocs[j].doc);
    String exp = explain.toString();
    assertTrue(exp, exp.indexOf("MinPayloadFunction") > -1);
    assertTrue(hits.scoreDocs[j].score + " explain value does not equal: " + 2, explain.getValue() == 2f);
  }
}

private SpanQuery[] getClauses() {
  SpanNearQuery q1, q2;
  q1 = spanNearQuery("field2", "twenty two");
  q2 = spanNearQuery("field2", "twenty three");
  SpanQuery[] clauses = new SpanQuery[2];
  clauses[0] = q1;
  clauses[1] = q2;
  return clauses;
}

private SpanNearQuery spanNearQuery(String fieldName, String words) {
  String[] wordList = words.split("[\\s]+");

@@ -274,8 +274,8 @@ public class TestPayloadNearQuery extends LuceneTestCase {
  hits = searcher.search(query, null, 100);
  assertTrue("hits is null and it shouldn't be", hits != null);
  ScoreDoc doc = hits.scoreDocs[0];
  // System.out.println("Doc: " + doc.toString());
  // System.out.println("Explain: " + searcher.explain(query, doc.doc));
  assertTrue("there should only be one hit", hits.totalHits == 1);
  // should have score = 3 because adjacent terms have payloads of 2,4
  assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);

@@ -299,8 +299,8 @@ public class TestPayloadNearQuery extends LuceneTestCase {
  assertTrue("should only be one hit", hits.scoreDocs.length == 1);
  // the score should be 3 - the average of all the underlying payloads
  ScoreDoc doc = hits.scoreDocs[0];
  // System.out.println("Doc: " + doc.toString());
  // System.out.println("Explain: " + searcher.explain(query, doc.doc));
  assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
}

@@ -582,21 +582,21 @@ public class TestBasics extends LuceneTestCase {

@Test
public void testSpansSkipTo() throws Exception {
  SpanTermQuery t1 = new SpanTermQuery(new Term("field", "seventy"));
  SpanTermQuery t2 = new SpanTermQuery(new Term("field", "seventy"));
  Spans s1 = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), t1);
  Spans s2 = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), t2);

  assertTrue(s1.next());
  assertTrue(s2.next());

  boolean hasMore = true;

  do {
    hasMore = skipToAccoringToJavaDocs(s1, s1.doc());
    assertEquals(hasMore, s2.skipTo(s2.doc()));
    assertEquals(s1.doc(), s2.doc());
  } while (hasMore);
}
/** Skips to the first match beyond the current, whose document number is

@@ -71,12 +71,12 @@ public class TestBitUtil extends LuceneTestCase {
  long sumRes = 0;
  while (iters-- >= 0) {
    for (int i = 1; i <= 63; i++) {
      long a = testArg(i);
      sumRes += BitUtil.nlz(a);
      sumRes += BitUtil.nlz(a + 1);
      sumRes += BitUtil.nlz(a - 1);
      sumRes += BitUtil.nlz(a + 10);
      sumRes += BitUtil.nlz(a - 10);
    }
  }
  return sumRes;

@@ -86,12 +86,12 @@ public class TestBitUtil extends LuceneTestCase {
  long sumRes = 0;
  while (iters-- >= 0) {
    for (int i = 1; i <= 63; i++) {
      long a = testArg(i);
      sumRes += Long.numberOfLeadingZeros(a);
      sumRes += Long.numberOfLeadingZeros(a + 1);
      sumRes += Long.numberOfLeadingZeros(a - 1);
      sumRes += Long.numberOfLeadingZeros(a + 10);
      sumRes += Long.numberOfLeadingZeros(a - 10);
    }
  }
  return sumRes;

@@ -49,7 +49,7 @@ public class TestFixedBitSet extends LuceneTestCase {
// aa = a.prevSetBit(aa-1);
aa--;
while ((aa >= 0) && (! a.get(aa))) {
  aa--;
}
if (b.length() == 0) {
bb = -1;

@@ -71,7 +71,7 @@ public class TestOpenBitSet extends LuceneTestCase {
// aa = a.prevSetBit(aa-1);
aa--;
while ((aa >= 0) && (! a.get(aa))) {
  aa--;
}
bb = b.prevSetBit(bb-1);
assertEquals(aa,bb);

@@ -85,7 +85,7 @@ public class TestOpenBitSet extends LuceneTestCase {
// aa = a.prevSetBit(aa-1);
aa--;
while ((aa >= 0) && (! a.get(aa))) {
  aa--;
}
bb = (int) b.prevSetBit((long) (bb-1));
assertEquals(aa,bb);

@@ -356,7 +356,7 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
// only possible writer, and it is "synchronized" to avoid this case).
DirectoryReader r2 = DirectoryReader.openIfChanged(indexReader);
if (r2 == null) {
  return false; // no changes, nothing to do
}

// validate that a refresh is valid at this point, i.e. that the taxonomy

@@ -364,13 +364,13 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
if (t1==null) {
  if (t2!=null) {
    r2.close();
    throw new InconsistentTaxonomyException("Taxonomy was recreated at: "+t2);
  }
} else if (!t1.equals(t2)) {
  r2.close();
  throw new InconsistentTaxonomyException("Taxonomy was recreated at: "+t2+" != "+t1);
}

IndexReader oldreader = indexReader;

@@ -21,12 +21,12 @@ package org.apache.lucene.search.highlight;
 */
public class DefaultEncoder implements Encoder
{
  public DefaultEncoder()
  {
  }

  public String encodeText(String originalText)
  {
    return originalText;
  }
}

@@ -22,8 +22,8 @@ package org.apache.lucene.search.highlight;
 */
public interface Encoder
{
  /**
   * @param originalText The section of text being output
   */
  String encodeText(String originalText);
}
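A minimal illustrative implementation (hypothetical; Lucene ships its own HTML encoder) showing what an Encoder is for: making the non-highlighted text safe for the output markup:

public class HtmlEscapeEncoder implements Encoder
{
  public String encodeText(String originalText)
  {
    // escape characters that would otherwise be parsed as markup
    return originalText.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
  }
}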

@@ -24,10 +24,10 @@ package org.apache.lucene.search.highlight;
 */
public interface Formatter
{
  /**
   * @param originalText The section of text being considered for markup
   * @param tokenGroup contains one or several overlapping Tokens along with
   * their scores and positions.
   */
  String highlightTerm(String originalText, TokenGroup tokenGroup);
}
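Likewise, a minimal illustrative Formatter (hypothetical; the shipped SimpleHTMLFormatter does the same with <B> tags) that marks up scored token groups:

public class EmFormatter implements Formatter
{
  public String highlightTerm(String originalText, TokenGroup tokenGroup)
  {
    if (tokenGroup.getTotalScore() <= 0)
    {
      return originalText; // not a query match: return the text unchanged
    }
    return "<em>" + originalText + "</em>";
  }
}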

@@ -42,7 +42,7 @@ public class GradientFormatter implements Formatter
 *
 * @param maxScore
 *          The score (and above) displayed as maxColor (See QueryScorer.getMaxWeight
 *          which can be used to calibrate scoring scale)
* @param minForegroundColor
* The hex color used for representing IDF scores of zero eg
* #FFFFFF (white) or null if no foreground color required

@@ -38,445 +38,445 @@ public class Highlighter
public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;

private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
private Formatter formatter;
private Encoder encoder;
private Fragmenter textFragmenter = new SimpleFragmenter();
private Scorer fragmentScorer = null;

public Highlighter(Scorer fragmentScorer)
{
  this(new SimpleHTMLFormatter(), fragmentScorer);
}

public Highlighter(Formatter formatter, Scorer fragmentScorer)
{
  this(formatter, new DefaultEncoder(), fragmentScorer);
}

public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
{
  this.formatter = formatter;
  this.encoder = encoder;
  this.fragmentScorer = fragmentScorer;
}

/**
 * Highlights chosen terms in a text, extracting the most relevant section.
 * This is a convenience method that calls
 * {@link #getBestFragment(TokenStream, String)}
 *
 * @param analyzer the analyzer that will be used to split <code>text</code>
 *   into chunks
 * @param text text to highlight terms in
 * @param fieldName Name of field used to influence analyzer's tokenization policy
 *
 * @return highlighted text fragment or null if no terms found
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final String getBestFragment(Analyzer analyzer, String fieldName, String text)
  throws IOException, InvalidTokenOffsetsException
{
  TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
  return getBestFragment(tokenStream, text);
}
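A hedged usage sketch of the convenience method above; the query, analyzer, field name, and document text are illustrative assumptions, not part of this file:

// illustrative only; assumes a Query `query`, an Analyzer `analyzer`, and a String `docText`
Highlighter highlighter = new Highlighter(new QueryScorer(query)); // default <B>...</B> markup
String fragment = highlighter.getBestFragment(analyzer, "body", docText);
if (fragment != null) { // null means no query terms were found in the text
  System.out.println(fragment);
}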
/**
 * Highlights chosen terms in a text, extracting the most relevant section.
 * The document text is analysed in chunks to record hit statistics
 * across the document. After accumulating stats, the fragment with the highest score
 * is returned
 *
 * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
 *   This is typically produced by an analyzer re-parsing a document's
 *   text. Some work may be done on retrieving TokenStreams more efficiently
 *   by adding support for storing original text position data in the Lucene
 *   index but this support is not currently available (as of Lucene 1.4 rc2).
 * @param text text to highlight terms in
 *
 * @return highlighted text fragment or null if no terms found
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final String getBestFragment(TokenStream tokenStream, String text)
  throws IOException, InvalidTokenOffsetsException
{
  String[] results = getBestFragments(tokenStream, text, 1);
  if (results.length > 0)
  {
    return results[0];
  }
  return null;
}

/**
 * Highlights chosen terms in a text, extracting the most relevant sections.
 * This is a convenience method that calls
 * {@link #getBestFragments(TokenStream, String, int)}
 *
 * @param analyzer the analyzer that will be used to split <code>text</code>
 *   into chunks
 * @param fieldName the name of the field being highlighted (used by analyzer)
 * @param text text to highlight terms in
 * @param maxNumFragments the maximum number of fragments.
 *
 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final String[] getBestFragments(
  Analyzer analyzer,
  String fieldName,
  String text,
  int maxNumFragments)
  throws IOException, InvalidTokenOffsetsException
{
  TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
  return getBestFragments(tokenStream, text, maxNumFragments);
}

/**
 * Highlights chosen terms in a text, extracting the most relevant sections.
 * The document text is analysed in chunks to record hit statistics
 * across the document. After accumulating stats, the fragments with the highest scores
 * are returned as an array of strings in order of score (contiguous fragments are merged into
 * one in their original order to improve readability)
 *
 * @param text text to highlight terms in
 * @param maxNumFragments the maximum number of fragments.
 *
 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final String[] getBestFragments(
  TokenStream tokenStream,
  String text,
  int maxNumFragments)
  throws IOException, InvalidTokenOffsetsException
{
  maxNumFragments = Math.max(1, maxNumFragments); //sanity check

  TextFragment[] frag = getBestTextFragments(tokenStream, text, true, maxNumFragments);

  //Get text
  ArrayList<String> fragTexts = new ArrayList<String>();
  for (int i = 0; i < frag.length; i++)
  {
    if ((frag[i] != null) && (frag[i].getScore() > 0))
    {
      fragTexts.add(frag[i].toString());
    }
  }
  return fragTexts.toArray(new String[0]);
}

/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(
  TokenStream tokenStream,
  String text,
  boolean mergeContiguousFragments,
  int maxNumFragments)
  throws IOException, InvalidTokenOffsetsException
{
  ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
  StringBuilder newText = new StringBuilder();

  CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(PositionIncrementAttribute.class);
  tokenStream.reset();

  TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

  if (fragmentScorer instanceof QueryScorer) {
    ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
  }

  TokenStream newStream = fragmentScorer.init(tokenStream);
  if (newStream != null) {
    tokenStream = newStream;
  }
  fragmentScorer.startFragment(currentFrag);
  docFrags.add(currentFrag);

  FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

  try
  {
    String tokenText;
    int startOffset;
    int endOffset;
    int lastEndOffset = 0;
    textFragmenter.start(text, tokenStream);

    TokenGroup tokenGroup = new TokenGroup(tokenStream);

    for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
         next = tokenStream.incrementToken())
    {
      if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length()))
      {
        throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
            + " exceeds length of provided text sized " + text.length());
      }
      if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct()))
      {
        //the current token is distinct from previous tokens -
        // markup the cached token group info
        startOffset = tokenGroup.matchStartOffset;
        endOffset = tokenGroup.matchEndOffset;
        tokenText = text.substring(startOffset, endOffset);
        String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
        //store any whitespace etc from between this and last group
        if (startOffset > lastEndOffset)
          newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
        newText.append(markedUpText);
        lastEndOffset = Math.max(endOffset, lastEndOffset);
        tokenGroup.clear();

        //check if current token marks the start of a new fragment
        if (textFragmenter.isNewFragment())
        {
          currentFrag.setScore(fragmentScorer.getFragmentScore());
          //record stats for a new fragment
          currentFrag.textEndPos = newText.length();
          currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
          fragmentScorer.startFragment(currentFrag);
          docFrags.add(currentFrag);
        }
      }

      tokenGroup.addToken(fragmentScorer.getTokenScore());

      // if(lastEndOffset>maxDocBytesToAnalyze)
      // {
      //   break;
      // }
    }
    currentFrag.setScore(fragmentScorer.getFragmentScore());

    if (tokenGroup.numTokens > 0)
    {
      //flush the accumulated text (same code as in above loop)
      startOffset = tokenGroup.matchStartOffset;
      endOffset = tokenGroup.matchEndOffset;
      tokenText = text.substring(startOffset, endOffset);
      String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
      //store any whitespace etc from between this and last group
      if (startOffset > lastEndOffset)
        newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
      newText.append(markedUpText);
      lastEndOffset = Math.max(lastEndOffset, endOffset);
    }

    //Test what remains of the original text beyond the point where we stopped analyzing
    if (
        // if there is text beyond the last token considered..
        (lastEndOffset < text.length())
        &&
        // and that text is not too large...
        (text.length() <= maxDocCharsToAnalyze)
       )
    {
      //append it to the last fragment
      newText.append(encoder.encodeText(text.substring(lastEndOffset)));
    }

    currentFrag.textEndPos = newText.length();

    //sort the most relevant sections of the text
    for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
    {
      currentFrag = i.next();

      //If you are running with a version of Lucene before 11th Sept 03
      // you do not have PriorityQueue.insert() - so uncomment the code below
      /*
      if (currentFrag.getScore() >= minScore)
      {
        fragQueue.put(currentFrag);
        if (fragQueue.size() > maxNumFragments)
        { // if hit queue overfull
          fragQueue.pop(); // remove lowest in hit queue
          minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
        }
      }
      */
      //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
      //fix to PriorityQueue. The correct method to use here is the new "insert" method
      // USE ABOVE CODE IF THIS DOES NOT COMPILE!
      fragQueue.insertWithOverflow(currentFrag);
    }

    //return the most relevant fragments
    TextFragment frag[] = new TextFragment[fragQueue.size()];
    for (int i = frag.length - 1; i >= 0; i--)
    {
      frag[i] = fragQueue.pop();
    }

    //merge any contiguous fragments to improve readability
    if (mergeContiguousFragments)
    {
      mergeContiguousFragments(frag);
      ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
      for (int i = 0; i < frag.length; i++)
      {
        if ((frag[i] != null) && (frag[i].getScore() > 0))
        {
          fragTexts.add(frag[i]);
        }
      }
      frag = fragTexts.toArray(new TextFragment[0]);
    }

    return frag;
  }
  finally
  {
    if (tokenStream != null)
    {
      try
      {
        tokenStream.end();
        tokenStream.close();
      }
      catch (Exception e)
      {
      }
    }
  }
}
/** Improves readability of a score-sorted list of TextFragments by merging any fragments
 * that were contiguous in the original text into one larger fragment with the correct order.
 * This will leave a "null" in the array entry for the lesser scored fragment.
 *
 * @param frag An array of document fragments in descending score
 */
private void mergeContiguousFragments(TextFragment[] frag)
{
  boolean mergingStillBeingDone;
  if (frag.length > 1)
    do
    {
      mergingStillBeingDone = false; //initialise loop control flag
      //for each fragment, scan other frags looking for contiguous blocks
      for (int i = 0; i < frag.length; i++)
      {
        if (frag[i] == null)
        {
          continue;
        }
        //merge any contiguous blocks
        for (int x = 0; x < frag.length; x++)
        {
          if (frag[x] == null)
          {
            continue;
          }
          if (frag[i] == null)
          {
            break;
          }
          TextFragment frag1 = null;
          TextFragment frag2 = null;
          int frag1Num = 0;
          int frag2Num = 0;
          int bestScoringFragNum;
          int worstScoringFragNum;
          //if blocks are contiguous....
          if (frag[i].follows(frag[x]))
          {
            frag1 = frag[x];
            frag1Num = x;
            frag2 = frag[i];
            frag2Num = i;
          }
          else if (frag[x].follows(frag[i]))
          {
            frag1 = frag[i];
            frag1Num = i;
            frag2 = frag[x];
            frag2Num = x;
          }
          //merging required..
          if (frag1 != null)
          {
            if (frag1.getScore() > frag2.getScore())
            {
              bestScoringFragNum = frag1Num;
              worstScoringFragNum = frag2Num;
            }
            else
            {
              bestScoringFragNum = frag2Num;
              worstScoringFragNum = frag1Num;
            }
            frag1.merge(frag2);
            frag[worstScoringFragNum] = null;
            mergingStillBeingDone = true;
            frag[bestScoringFragNum] = frag1;
          }
        }
      }
    }
    while (mergingStillBeingDone);
}
/**
 * Highlights terms in the text, extracting the most relevant sections
 * and concatenating the chosen fragments with a separator (typically "...").
 * The document text is analysed in chunks to record hit statistics
 * across the document. After accumulating stats, the fragments with the highest scores
 * are returned in order as "separator" delimited strings.
 *
 * @param text text to highlight terms in
 * @param maxNumFragments the maximum number of fragments.
 * @param separator the separator used to intersperse the document fragments (typically "...")
 *
 * @return highlighted text
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final String getBestFragments(
  TokenStream tokenStream,
  String text,
  int maxNumFragments,
  String separator)
  throws IOException, InvalidTokenOffsetsException
{
  String sections[] = getBestFragments(tokenStream, text, maxNumFragments);
  StringBuilder result = new StringBuilder();
  for (int i = 0; i < sections.length; i++)
  {
    if (i > 0)
    {
      result.append(separator);
    }
    result.append(sections[i]);
  }
  return result.toString();
}
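A short hedged sketch of the separator variant (tokenStream and text as in the other overloads; values illustrative):

// joins up to 3 top-scoring fragments with "..." between them
String snippet = highlighter.getBestFragments(tokenStream, text, 3, "...");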
public int getMaxDocCharsToAnalyze() {
return maxDocCharsToAnalyze;
@@ -487,35 +487,35 @@ public class Highlighter
}

public Fragmenter getTextFragmenter()
{
  return textFragmenter;
}

/**
 * @param fragmenter
 */
public void setTextFragmenter(Fragmenter fragmenter)
{
  textFragmenter = fragmenter;
}

/**
 * @return Object used to score each text fragment
 */
public Scorer getFragmentScorer()
{
  return fragmentScorer;
}

/**
 * @param scorer
 */
public void setFragmentScorer(Scorer scorer)
{
  fragmentScorer = scorer;
}

public Encoder getEncoder()
{

@@ -528,17 +528,17 @@ public class Highlighter
}

class FragmentQueue extends PriorityQueue<TextFragment>
{
  public FragmentQueue(int size)
  {
    super(size);
  }

  @Override
  public final boolean lessThan(TextFragment fragA, TextFragment fragB)
  {
    if (fragA.getScore() == fragB.getScore())
      return fragA.fragNum > fragB.fragNum;
    else
      return fragA.getScore() < fragB.getScore();
  }
}

@ -23,9 +23,9 @@ package org.apache.lucene.search.highlight;
public class InvalidTokenOffsetsException extends Exception
{
public InvalidTokenOffsetsException(String message)
{
super(message);
}
}


@ -37,126 +37,118 @@ import org.apache.lucene.search.Query;
public final class QueryTermExtractor
{
/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
* @param query Query to extract term texts from
* @return an array of the terms used in a query, plus their weights.
*/
public static final WeightedTerm[] getTerms(Query query)
{
return getTerms(query,false);
}
/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
* @param query Query to extract term texts from
* @param reader used to compute IDF which can be used to a) score selected fragments better
* b) use graded highlights eg changing intensity of font color
* @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
* @return an array of the terms used in a query, plus their weights.
*/
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
{
WeightedTerm[] terms=getTerms(query,false, fieldName);
int totalNumDocs=reader.maxDoc();
for (int i = 0; i < terms.length; i++)
{
try
{
int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
//IDF algorithm taken from DefaultSimilarity class
float idf=(float)(Math.log(totalNumDocs/(double)(docFreq+1)) + 1.0);
terms[i].weight*=idf;
}
catch (IOException e)
{
//ignore
}
}
return terms;
}
/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
* @param query Query to extract term texts from
* @param prohibited <code>true</code> to extract "prohibited" terms, too
* @param fieldName The fieldName used to filter query terms
* @return an array of the terms used in a query, plus their weights.
*/
public static final WeightedTerm[] getTerms(Query query, boolean prohibited, String fieldName)
{
HashSet<WeightedTerm> terms=new HashSet<WeightedTerm>();
getTerms(query,terms,prohibited,fieldName);
return terms.toArray(new WeightedTerm[0]);
}
/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
* @param query Query to extract term texts from
* @param prohibited <code>true</code> to extract "prohibited" terms, too
* @return an array of the terms used in a query, plus their weights.
*/
public static final WeightedTerm[] getTerms(Query query, boolean prohibited)
{
return getTerms(query,prohibited,null);
}
private static final void getTerms(Query query, HashSet<WeightedTerm> terms, boolean prohibited, String fieldName) {
try {
if (query instanceof BooleanQuery)
getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited, fieldName);
else if (query instanceof FilteredQuery)
getTermsFromFilteredQuery((FilteredQuery) query, terms, prohibited, fieldName);
else {
HashSet<Term> nonWeightedTerms = new HashSet<Term>();
query.extractTerms(nonWeightedTerms);
for (Iterator<Term> iter = nonWeightedTerms.iterator(); iter.hasNext(); ) {
Term term = iter.next();
if ((fieldName == null) || (term.field().equals(fieldName))) {
terms.add(new WeightedTerm(query.getBoost(), term.text()));
}
}
}
} catch (UnsupportedOperationException ignore) {
//this is non-fatal for our purposes
}
}
/**
* extractTerms is currently the only query-independent means of introspecting queries but it only reveals
* a list of terms for that query - not the boosts each individual term in that query may or may not have.
* "Container" queries such as BooleanQuery should be unwrapped to get at the boost info held
* in each child element.
* Some discussion around this topic here:
* http://www.gossamer-threads.com/lists/lucene/java-dev/34208?search_string=introspection;#34208
* Unfortunately there seemed to be limited interest in requiring all Query objects to implement
* something common which would allow access to child queries so what follows here are query-specific
* implementations for accessing embedded query elements.
*/
private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet<WeightedTerm> terms, boolean prohibited, String fieldName)
{
BooleanClause[] queryClauses = query.getClauses();
for (int i = 0; i < queryClauses.length; i++)
{
if (prohibited || queryClauses[i].getOccur()!=BooleanClause.Occur.MUST_NOT)
getTerms(queryClauses[i].getQuery(), terms, prohibited, fieldName);
}
}
private static void getTermsFromFilteredQuery(FilteredQuery query, HashSet<WeightedTerm> terms, boolean prohibited, String fieldName)
{
getTerms(query.getQuery(),terms,prohibited,fieldName);
}
}
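A usage sketch for the extractor above (not part of this patch; a Query named query and an open IndexReader named reader are assumed):

    WeightedTerm[] plain = QueryTermExtractor.getTerms(query);
    WeightedTerm[] idfWeighted =
        QueryTermExtractor.getIdfWeightedTerms(query, reader, "content");
    for (WeightedTerm wt : idfWeighted) {
      System.out.println(wt.getTerm() + " -> " + wt.getWeight());
    }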


@ -21,61 +21,61 @@ package org.apache.lucene.search.highlight;
*/
public class SimpleHTMLEncoder implements Encoder
{
public SimpleHTMLEncoder()
{
}
public String encodeText(String originalText)
{
return htmlEncode(originalText);
}
/**
* Encode string into HTML
*/
public final static String htmlEncode(String plainText)
{
if (plainText == null || plainText.length() == 0)
{
return "";
}
StringBuilder result = new StringBuilder(plainText.length());
for (int index=0; index<plainText.length(); index++)
{
char ch = plainText.charAt(index);
switch (ch)
{
case '"':
result.append("&quot;");
break;
case '&':
result.append("&amp;");
break;
case '<':
result.append("&lt;");
break;
case '>':
result.append("&gt;");
break;
default:
if (ch < 128)
{
result.append(ch);
}
else
{
result.append("&#").append((int)ch).append(";");
}
}
}
return result.toString();
}
}
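As a sketch of the encoder's behavior (expected output derived from the switch above; the non-ASCII character comes out as a numeric entity):

    String encoded = SimpleHTMLEncoder.htmlEncode("score < 1 & \"quoted\" é");
    // expected: score &lt; 1 &amp; &quot;quoted&quot; &#233;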


@ -26,34 +26,34 @@ public class SimpleHTMLFormatter implements Formatter {
private static final String DEFAULT_PRE_TAG = "<B>";
private static final String DEFAULT_POST_TAG = "</B>";
private String preTag;
private String postTag;
public SimpleHTMLFormatter(String preTag, String postTag) {
this.preTag = preTag;
this.postTag = postTag;
}
/** Default constructor uses HTML: &lt;B&gt; tags to markup terms. */
public SimpleHTMLFormatter() {
this(DEFAULT_PRE_TAG, DEFAULT_POST_TAG);
}
/* (non-Javadoc)
* @see org.apache.lucene.search.highlight.Formatter#highlightTerm(java.lang.String, org.apache.lucene.search.highlight.TokenGroup)
*/
public String highlightTerm(String originalText, TokenGroup tokenGroup) {
if (tokenGroup.getTotalScore() <= 0) {
return originalText;
}
// Allocate StringBuilder with the right number of characters from the
// beginning, to avoid char[] allocations in the middle of appends.
StringBuilder returnBuffer = new StringBuilder(preTag.length() + originalText.length() + postTag.length());
returnBuffer.append(preTag);
returnBuffer.append(originalText);
returnBuffer.append(postTag);
return returnBuffer.toString();
}
}
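A hedged usage sketch (not part of this patch; query is assumed): swapping the default <B> markup for a CSS-friendly tag.

    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em class=\"hit\">", "</em>");
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));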


@ -22,57 +22,49 @@ package org.apache.lucene.search.highlight;
* doesn't work in Mozilla, thus this class.
*
* @see GradientFormatter
*
*/
public class SpanGradientFormatter
extends GradientFormatter {
public SpanGradientFormatter(float maxScore, String minForegroundColor,
String maxForegroundColor, String minBackgroundColor,
String maxBackgroundColor) {
super(maxScore, minForegroundColor,
maxForegroundColor, minBackgroundColor,
maxBackgroundColor);
}
@Override
public String highlightTerm(String originalText, TokenGroup tokenGroup) {
if (tokenGroup.getTotalScore() == 0)
return originalText;
float score = tokenGroup.getTotalScore();
if (score == 0) {
return originalText;
}
// try to size sb correctly
StringBuilder sb = new StringBuilder(originalText.length() + EXTRA);
sb.append("<span style=\"");
if (highlightForeground) {
sb.append("color: ");
sb.append(getForegroundColorString(score));
sb.append("; ");
}
if (highlightBackground) {
sb.append("background: ");
sb.append(getBackgroundColorString(score));
sb.append("; ");
}
sb.append("\">");
sb.append(originalText);
sb.append("</span>");
return sb.toString();
}
// guess how much extra text we'll add to the text we're highlighting to try to avoid a StringBuilder resize
private static final String TEMPLATE = "<span style=\"background: #EEEEEE; color: #000000;\">...</span>";
private static final int EXTRA = TEMPLATE.length();
}
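A hedged construction sketch (colors invented for illustration; passing null for the foreground pair is assumed to disable foreground highlighting, following GradientFormatter):

    SpanGradientFormatter formatter = new SpanGradientFormatter(
        1.0f,          // assumed maximum fragment score
        null, null,    // no foreground gradient (assumption)
        "#FFFFFF",     // background color at score 0
        "#FFFF00");    // background color at max score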


@ -25,57 +25,57 @@ package org.apache.lucene.search.highlight;
*/
public class TextFragment
{
CharSequence markedUpText;
int fragNum;
int textStartPos;
int textEndPos;
float score;
public TextFragment(CharSequence markedUpText,int textStartPos, int fragNum)
{
this.markedUpText=markedUpText;
this.textStartPos = textStartPos;
this.fragNum = fragNum;
}
void setScore(float score)
{
this.score=score;
}
public float getScore()
{
return score;
}
/**
* @param frag2 Fragment to be merged into this one
*/
public void merge(TextFragment frag2)
{
textEndPos = frag2.textEndPos;
score=Math.max(score,frag2.score);
}
/**
* @param fragment
* @return true if this fragment follows the one passed
*/
public boolean follows(TextFragment fragment)
{
return textStartPos == fragment.textEndPos;
}
/**
* @return the fragment sequence number
*/
public int getFragNum()
{
return fragNum;
}
/* Returns the marked-up text for this text fragment
*/
@Override
public String toString() {
return markedUpText.subSequence(textStartPos, textEndPos).toString();
}
}
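Illustration only (the fields are package-private, so this sketch assumes code inside org.apache.lucene.search.highlight; the positions are invented): coalescing two adjacent fragments with follows() and merge().

    StringBuilder markedUp = new StringBuilder("...marked-up document text...");
    TextFragment first = new TextFragment(markedUp, 0, 0);
    first.textEndPos = 10;                    // set directly, purely for the example
    TextFragment second = new TextFragment(markedUp, 10, 1);
    second.textEndPos = 20;
    if (second.follows(first)) {
      first.merge(second);                    // first now covers [0, 20)
    }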


@ -20,45 +20,45 @@ package org.apache.lucene.search.highlight;
*/
public class WeightedTerm
{
float weight; // multiplier
String term; //stemmed form
public WeightedTerm (float weight,String term)
{
this.weight=weight;
this.term=term;
}
/**
* @return the term value (stemmed)
*/
public String getTerm()
{
return term;
}
/**
* @return the weight associated with this term
*/
public float getWeight()
{
return weight;
}
/**
* @param term the term value (stemmed)
*/
public void setTerm(String term)
{
this.term = term;
}
/**
* @param weight the weight associated with this term
*/
public void setWeight(float weight)
{
this.weight = weight;
}
}


@ -54,11 +54,12 @@ public abstract class DualFloatFunction extends ValueSource {
return new FloatDocValues(this) {
@Override
public float floatVal(int doc) {
return func(doc, aVals, bVals);
}
@Override
public String toString(int doc) {
return name() + '(' + aVals.toString(doc) + ',' + bVals.toString(doc) + ')';
}
};
}
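A hedged sketch of the subclassing pattern implied above (the class name is invented): a per-document sum of two ValueSources.

    import org.apache.lucene.queries.function.FunctionValues;
    import org.apache.lucene.queries.function.ValueSource;
    import org.apache.lucene.queries.function.valuesource.DualFloatFunction;

    public class AddFloatFunction extends DualFloatFunction {
      public AddFloatFunction(ValueSource a, ValueSource b) {
        super(a, b);
      }
      @Override
      protected String name() {
        return "add";
      }
      @Override
      protected float func(int doc, FunctionValues aVals, FunctionValues bVals) {
        return aVals.floatVal(doc) + bVals.floatVal(doc);
      }
    }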


@ -50,7 +50,7 @@ public abstract class MultiBoolFunction extends BoolFunction {
return new BoolDocValues(this) {
@Override
public boolean boolVal(int doc) {
return func(doc, vals);
}
@Override


@ -115,11 +115,11 @@ public class ScaleFloatFunction extends ValueSource {
return new FloatDocValues(this) {
@Override
public float floatVal(int doc) {
return (vals.floatVal(doc) - minSource) * scale + min;
}
@Override
public String toString(int doc) {
return "scale(" + vals.toString(doc) + ",toMin=" + min + ",toMax=" + max
return "scale(" + vals.toString(doc) + ",toMin=" + min + ",toMax=" + max
+ ",fromMin=" + minSource
+ ",fromMax=" + maxSource
+ ")";


@ -44,11 +44,11 @@ public abstract class SimpleBoolFunction extends BoolFunction {
return new BoolDocValues(this) {
@Override
public boolean boolVal(int doc) {
return func(doc, vals);
}
@Override
public String toString(int doc) {
return name() + '(' + vals.toString(doc) + ')';
}
};
}


@ -40,11 +40,11 @@ import java.util.Map;
return new FloatDocValues(this) {
@Override
public float floatVal(int doc) {
return func(doc, vals);
}
@Override
public String toString(int doc) {
return name() + '(' + vals.toString(doc) + ')';
}
};
}


@ -29,13 +29,13 @@ import java.io.*;
public final class FastCharStream implements CharStream {
char[] buffer = null;
int bufferLength = 0; // end of valid chars
int bufferPosition = 0; // next char to read
int tokenStart = 0; // offset in buffer
int bufferStart = 0; // position in file of buffer
Reader input; // source of chars
/** Constructs from a Reader. */
public FastCharStream(Reader r) {
@ -51,24 +51,24 @@ public final class FastCharStream implements CharStream {
private final void refill() throws IOException {
int newPosition = bufferLength - tokenStart;
if (tokenStart == 0) { // token won't fit in buffer
if (buffer == null) { // first time: alloc buffer
buffer = new char[2048];
} else if (bufferLength == buffer.length) { // grow buffer
char[] newBuffer = new char[buffer.length*2];
System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
buffer = newBuffer;
}
} else { // shift token to front
System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
}
bufferLength = newPosition; // update state
bufferPosition = newPosition;
bufferStart += tokenStart;
tokenStart = 0;
int charsRead = // fill space in buffer
input.read(buffer, newPosition, buffer.length-newPosition);
if (charsRead == -1)
throw new IOException("read past eof");


@ -166,10 +166,10 @@ public class QueryParser extends QueryParserBase implements QueryParserConstants
// This makes sure that there is no garbage after the query string
final public Query TopLevelQuery(String field) throws ParseException {
Query q;
q = Query(field);
jj_consume_token(0);
{if (true) return q;}
throw new Error("Missing return statement in function");
}
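A hedged usage sketch (the field name, analyzer and Version constant are assumptions): TopLevelQuery is reached through parse().

    QueryParser parser = new QueryParser(Version.LUCENE_40, "content", analyzer);
    Query parsed = parser.parse("+lucene highlight*"); // may throw ParseException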


@ -211,13 +211,13 @@ int Modifiers() : {
// This makes sure that there is no garbage after the query string
Query TopLevelQuery(String field) :
{
Query q;
}
{
q=Query(field) <EOF>
{
return q;
}
}
Query Query(String field) :


@ -179,12 +179,12 @@ public class FieldQueryNode extends QueryNodeImpl implements FieldValuePairQuery
}
public CharSequence getValue() {
return getText();
}
public void setValue(CharSequence value) {
setText(value);
}
}


@ -29,13 +29,13 @@ import java.io.*;
public final class FastCharStream implements CharStream {
char[] buffer = null;
int bufferLength = 0; // end of valid chars
int bufferPosition = 0; // next char to read
int tokenStart = 0; // offset in buffer
int bufferStart = 0; // position in file of buffer
Reader input; // source of chars
/** Constructs from a Reader. */
public FastCharStream(Reader r) {
@ -51,24 +51,24 @@ public final class FastCharStream implements CharStream {
private final void refill() throws IOException {
int newPosition = bufferLength - tokenStart;
if (tokenStart == 0) { // token won't fit in buffer
if (buffer == null) { // first time: alloc buffer
buffer = new char[2048];
} else if (bufferLength == buffer.length) { // grow buffer
char[] newBuffer = new char[buffer.length * 2];
System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
buffer = newBuffer;
}
} else { // shift token to front
System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
}
bufferLength = newPosition; // update state
bufferPosition = newPosition;
bufferStart += tokenStart;
tokenStart = 0;
int charsRead = // fill space in buffer
input.read(buffer, newPosition, buffer.length-newPosition);
if (charsRead == -1)
throw new IOException("read past eof");


@ -45,14 +45,14 @@ import org.apache.lucene.queryparser.flexible.standard.nodes.TermRangeQueryNode;
*/
public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserConstants {
private static final int CONJ_NONE =0;
private static final int CONJ_AND =2;
private static final int CONJ_OR =2;
// syntax parser constructor
public StandardSyntaxParser() {
this(new FastCharStream(new StringReader("")));
this(new FastCharStream(new StringReader("")));
}
/** Parses a query string, returning a {@link org.apache.lucene.queryparser.flexible.core.nodes.QueryNode}.
* @param query the query string to be parsed.
@ -143,10 +143,10 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
// This makes sure that there is no garbage after the query string
final public QueryNode TopLevelQuery(CharSequence field) throws ParseException {
QueryNode q;
q = Query(field);
jj_consume_token(0);
{if (true) return q;}
throw new Error("Missing return statement in function");
}
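A hedged usage sketch (the query string and default field are invented): TopLevelQuery is the entry production reached via parse(), which returns the QueryNode tree.

    StandardSyntaxParser parser = new StandardSyntaxParser();
    QueryNode tree = parser.parse("title:(+lucene -solr)", "defaultField");
    // may throw QueryNodeParseException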
@ -184,23 +184,23 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
break label_1;
}
c = DisjQuery(field);
if (clauses == null) {
clauses = new Vector<QueryNode>();
clauses.addElement(first);
}
clauses.addElement(c);
}
if (clauses != null) {
{if (true) return new BooleanQueryNode(clauses);}
} else {
{if (true) return first;}
}
throw new Error("Missing return statement in function");
}
final public QueryNode DisjQuery(CharSequence field) throws ParseException {
QueryNode first, c;
Vector<QueryNode> clauses = null;
first = ConjQuery(field);
label_2:
while (true) {
@ -221,7 +221,7 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
clauses.addElement(c);
}
if (clauses != null) {
{if (true) return new OrQueryNode(clauses);}
} else {
{if (true) return first;}
}
@ -229,8 +229,8 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
}
final public QueryNode ConjQuery(CharSequence field) throws ParseException {
QueryNode first, c;
Vector<QueryNode> clauses = null;
first = ModClause(field);
label_3:
while (true) {
@ -251,7 +251,7 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
clauses.addElement(c);
}
if (clauses != null) {
{if (true) return new AndQueryNode(clauses);}
} else {
{if (true) return first;}
}
@ -272,27 +272,27 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
// if (mods == ModifierQueryNode.Modifier.MOD_NONE) firstQuery=q;
//
// // do not create modifier nodes with MOD_NONE
// if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
// q = new ModifierQueryNode(q, mods);
// }
// clauses.add(q);
// }
// (
// conj=Conjunction() mods=Modifiers() q=Clause(field)
// {
// // do not create modifier nodes with MOD_NONE
// if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
// q = new ModifierQueryNode(q, mods);
// }
// clauses.add(q);
// //TODO: figure out what to do with AND and ORs
// }
// )*
// {
// if (clauses.size() == 1 && firstQuery != null)
// return firstQuery;
// else {
// return new BooleanQueryNode(clauses);
// }
// }
// }
@ -301,10 +301,10 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
ModifierQueryNode.Modifier mods;
mods = Modifiers();
q = Clause(field);
if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
q = new ModifierQueryNode(q, mods);
}
{if (true) return q;}
throw new Error("Missing return statement in function");
}
@ -378,18 +378,18 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
}
switch (operator.kind) {
case OP_LESSTHAN:
lowerInclusive = true;
upperInclusive = false;
qLower = new FieldQueryNode(field,
"*", term.beginColumn, term.endColumn);
qUpper = new FieldQueryNode(field,
EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn);
break;
case OP_LESSTHANEQ:
lowerInclusive = true;
upperInclusive = true;
qLower = new FieldQueryNode(field,
"*", term.beginColumn, term.endColumn);
@ -397,8 +397,8 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn);
break;
case OP_MORETHAN:
lowerInclusive = false;
upperInclusive = true;
qLower = new FieldQueryNode(field,
EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn);
@ -406,8 +406,8 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
"*", term.beginColumn, term.endColumn);
break;
case OP_MORETHANEQ:
lowerInclusive = true;
upperInclusive = true;
qLower = new FieldQueryNode(field,
EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn);
@ -488,18 +488,18 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
}
}
if (boost != null) {
float f = (float)1.0;
try {
f = Float.valueOf(boost.image).floatValue();
// avoid boosting null queries, such as those caused by stop words
if (q != null) {
q = new BoostQueryNode(q, f);
}
} catch (Exception ignored) {
/* Should this be handled somehow? (defaults to "no boost", if
* boost number is invalid)
*/
}
}
if (group) { q = new GroupQueryNode(q);}
{if (true) return q;}
@ -522,7 +522,7 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
case TERM:
term = jj_consume_token(TERM);
q = new FieldQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn);
break;
case REGEXPTERM:
term = jj_consume_token(REGEXPTERM);
@ -564,16 +564,16 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
;
}
if (fuzzy) {
float fms = defaultMinSimilarity;
try {
fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue();
} catch (Exception ignored) { }
if(fms < 0.0f){
{if (true) throw new ParseException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX_FUZZY_LIMITS));}
} else if (fms >= 1.0f && fms != (int) fms) {
{if (true) throw new ParseException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX_FUZZY_EDITS));}
}
q = new FuzzyQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), fms, term.beginColumn, term.endColumn);
} else if (regexp) {
String re = term.image.substring(1, term.image.length()-1);
q = new RegexpQueryNode(field, re, 0, re.length());
@ -656,9 +656,9 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
}
qLower = new FieldQueryNode(field,
EscapeQuerySyntaxImpl.discardEscapeChar(goop1.image), goop1.beginColumn, goop1.endColumn);
qUpper = new FieldQueryNode(field,
EscapeQuerySyntaxImpl.discardEscapeChar(goop2.image), goop2.beginColumn, goop2.endColumn);
q = new TermRangeQueryNode(qLower, qUpper, startInc ? true : false, endInc ? true : false);
break;
case QUOTED:
@ -690,8 +690,8 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
}
catch (Exception ignored) {
/* Should this be handled somehow? (defaults to "no PhraseSlop", if
* slop number is invalid)
*/
}
}
break;
@ -700,20 +700,20 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
jj_consume_token(-1);
throw new ParseException();
}
if (boost != null) {
float f = (float)1.0;
try {
f = Float.valueOf(boost.image).floatValue();
// avoid boosting null queries, such as those caused by stop words
if (q != null) {
q = new BoostQueryNode(q, f);
}
} catch (Exception ignored) {
/* Should this be handled somehow? (defaults to "no boost", if
* boost number is invalid)
*/
}
}
{if (true) return q;}
throw new Error("Missing return statement in function");
}
@ -748,11 +748,6 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
return false;
}
private boolean jj_3R_10() {
if (jj_scan_token(TERM)) return true;
return false;
}
private boolean jj_3R_11() {
if (jj_scan_token(REGEXPTERM)) return true;
return false;
@ -779,6 +774,11 @@ public class StandardSyntaxParser implements SyntaxParser, StandardSyntaxParserC
return false;
}
private boolean jj_3R_10() {
if (jj_scan_token(TERM)) return true;
return false;
}
private boolean jj_3R_7() {
Token xsp;
xsp = jj_scanpos;


@ -57,14 +57,14 @@ import org.apache.lucene.queryparser.flexible.standard.nodes.TermRangeQueryNode;
*/
public class StandardSyntaxParser implements SyntaxParser {
private static final int CONJ_NONE =0;
private static final int CONJ_AND =2;
private static final int CONJ_OR =2;
// syntax parser constructor
public StandardSyntaxParser() {
this(new FastCharStream(new StringReader("")));
this(new FastCharStream(new StringReader("")));
}
/** Parses a query string, returning a {@link org.apache.lucene.queryparser.flexible.core.nodes.QueryNode}.
* @param query the query string to be parsed.
@ -178,13 +178,13 @@ ModifierQueryNode.Modifier Modifiers() : {
// This makes sure that there is no garbage after the query string
QueryNode TopLevelQuery(CharSequence field) :
{
QueryNode q;
}
{
q=Query(field) <EOF>
{
return q;
}
}
// These changes were made to introduce operator precedence:
@ -209,25 +209,25 @@ QueryNode Query(CharSequence field) :
(
c=DisjQuery(field)
{
if (clauses == null) {
clauses = new Vector<QueryNode>();
clauses.addElement(first);
}
clauses.addElement(c);
}
)*
{
if (clauses != null) {
return new BooleanQueryNode(clauses);
} else {
return first;
}
}
}
QueryNode DisjQuery(CharSequence field) : {
QueryNode first, c;
Vector<QueryNode> clauses = null;
}
{
first = ConjQuery(field)
@ -243,7 +243,7 @@ QueryNode DisjQuery(CharSequence field) : {
)*
{
if (clauses != null) {
return new OrQueryNode(clauses);
} else {
return first;
}
@ -251,8 +251,8 @@ QueryNode DisjQuery(CharSequence field) : {
}
QueryNode ConjQuery(CharSequence field) : {
QueryNode first, c;
Vector<QueryNode> clauses = null;
}
{
first = ModClause(field)
@ -268,7 +268,7 @@ QueryNode ConjQuery(CharSequence field) : {
)*
{
if (clauses != null) {
return new AndQueryNode(clauses);
} else {
return first;
}
@ -289,27 +289,27 @@ QueryNode ConjQuery(CharSequence field) : {
// if (mods == ModifierQueryNode.Modifier.MOD_NONE) firstQuery=q;
//
// // do not create modifier nodes with MOD_NONE
// if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
// q = new ModifierQueryNode(q, mods);
// }
// clauses.add(q);
// }
// (
// conj=Conjunction() mods=Modifiers() q=Clause(field)
// {
// // do not create modifier nodes with MOD_NONE
// if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
// q = new ModifierQueryNode(q, mods);
// }
// clauses.add(q);
// //TODO: figure out what to do with AND and ORs
// }
// )*
// {
// if (clauses.size() == 1 && firstQuery != null)
// return firstQuery;
// else {
// return new BooleanQueryNode(clauses);
// }
// }
// }
@ -320,10 +320,10 @@ QueryNode ModClause(CharSequence field) : {
}
{
mods=Modifiers() q= Clause(field) {
if (mods != ModifierQueryNode.Modifier.MOD_NONE) {
q = new ModifierQueryNode(q, mods);
}
return q;
}
}
@ -347,18 +347,18 @@ QueryNode Clause(CharSequence field) : {
}
switch (operator.kind) {
case OP_LESSTHAN:
lowerInclusive = true;
upperInclusive = false;
qLower = new FieldQueryNode(field,
"*", term.beginColumn, term.endColumn);
qUpper = new FieldQueryNode(field,
EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn);
break;
case OP_LESSTHANEQ:
lowerInclusive = true;
upperInclusive = true;
qLower = new FieldQueryNode(field,
"*", term.beginColumn, term.endColumn);
@ -366,8 +366,8 @@ QueryNode Clause(CharSequence field) : {
EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn);
break;
case OP_MORETHAN:
lowerInclusive = false;
upperInclusive = true;
qLower = new FieldQueryNode(field,
EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn);
@ -375,8 +375,8 @@ QueryNode Clause(CharSequence field) : {
"*", term.beginColumn, term.endColumn);
break;
case OP_MORETHANEQ:
lowerInclusive = true;
upperInclusive = true;
qLower = new FieldQueryNode(field,
EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn);
@ -401,18 +401,18 @@ QueryNode Clause(CharSequence field) : {
)
{
if (boost != null) {
float f = (float)1.0;
try {
f = Float.valueOf(boost.image).floatValue();
// avoid boosting null queries, such as those caused by stop words
if (q != null) {
q = new BoostQueryNode(q, f);
}
} catch (Exception ignored) {
/* Should this be handled somehow? (defaults to "no boost", if
* boost number is invalid)
*/
}
}
if (group) { q = new GroupQueryNode(q);}
return q;
@ -433,7 +433,7 @@ QueryNode Term(CharSequence field) : {
{
(
(
term=<TERM> { q = new FieldQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), term.beginColumn, term.endColumn); }
| term=<REGEXPTERM> { regexp=true; }
| term=<NUMBER>
)
@ -441,16 +441,16 @@ QueryNode Term(CharSequence field) : {
[ <CARAT> boost=<NUMBER> [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ] ]
{
if (fuzzy) {
float fms = defaultMinSimilarity;
try {
fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue();
} catch (Exception ignored) { }
if(fms < 0.0f){
throw new ParseException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX_FUZZY_LIMITS));
} else if (fms >= 1.0f && fms != (int) fms) {
throw new ParseException(new MessageImpl(QueryParserMessages.INVALID_SYNTAX_FUZZY_EDITS));
}
q = new FuzzyQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image), fms, term.beginColumn, term.endColumn);
} else if (regexp) {
String re = term.image.substring(1, term.image.length()-1);
q = new RegexpQueryNode(field, re, 0, re.length());
@ -471,9 +471,9 @@ QueryNode Term(CharSequence field) : {
}
qLower = new FieldQueryNode(field,
EscapeQuerySyntaxImpl.discardEscapeChar(goop1.image), goop1.beginColumn, goop1.endColumn);
qUpper = new FieldQueryNode(field,
EscapeQuerySyntaxImpl.discardEscapeChar(goop2.image), goop2.beginColumn, goop2.endColumn);
q = new TermRangeQueryNode(qLower, qUpper, startInc ? true : false, endInc ? true : false);
}
| term=<QUOTED> {q = new QuotedFieldQueryNode(field, EscapeQuerySyntaxImpl.discardEscapeChar(term.image.substring(1, term.image.length()-1)), term.beginColumn + 1, term.endColumn - 1);}
@ -489,28 +489,28 @@ QueryNode Term(CharSequence field) : {
}
catch (Exception ignored) {
/* Should this be handled somehow? (defaults to "no PhraseSlop", if
* slop number is invalid)
*/
}
}
}
)
{
if (boost != null) {
float f = (float)1.0;
try {
f = Float.valueOf(boost.image).floatValue();
// avoid boosting null queries, such as those caused by stop words
if (q != null) {
q = new BoostQueryNode(q, f);
}
} catch (Exception ignored) {
/* Should this be handled somehow? (defaults to "no boost", if
* boost number is invalid)
*/
}
}
return q;
}
}


@ -26,13 +26,13 @@ import java.io.*;
public final class FastCharStream implements CharStream {
char[] buffer = null;
int bufferLength = 0; // end of valid chars
int bufferPosition = 0; // next char to read
int tokenStart = 0; // offset in buffer
int bufferStart = 0; // position in file of buffer
Reader input; // source of chars
/** Constructs from a Reader. */
public FastCharStream(Reader r) {
@ -48,24 +48,24 @@ public final class FastCharStream implements CharStream {
private final void refill() throws IOException {
int newPosition = bufferLength - tokenStart;
if (tokenStart == 0) { // token won't fit in buffer
if (buffer == null) { // first time: alloc buffer
buffer = new char[2048];
} else if (bufferLength == buffer.length) { // grow buffer
char[] newBuffer = new char[buffer.length * 2];
System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
buffer = newBuffer;
}
} else { // shift token to front
System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
}
bufferLength = newPosition; // update state
bufferPosition = newPosition;
bufferStart += tokenStart;
tokenStart = 0;
int charsRead = // fill space in buffer
input.read(buffer, newPosition, buffer.length-newPosition);
if (charsRead == -1)
throw new IOException("read past eof");


@ -47,9 +47,9 @@ public class BasicQueryFactory {
public String toString() {
return getClass().getName()
+ "(maxBasicQueries: " + maxBasicQueries
+ ", queriesMade: " + queriesMade
+ ")";
+ "(maxBasicQueries: " + maxBasicQueries
+ ", queriesMade: " + queriesMade
+ ")";
}
private boolean atMax() {


@ -45,19 +45,19 @@ abstract class RewriteQuery<SQ extends SrndQuery> extends Query {
@Override
public String toString(String field) {
return getClass().getName()
+ (field == null ? "" : "(unused: " + field + ")")
+ "(" + fieldName
+ ", " + srndQuery.toString()
+ ", " + qf.toString()
+ ")";
+ (field == null ? "" : "(unused: " + field + ")")
+ "(" + fieldName
+ ", " + srndQuery.toString()
+ ", " + qf.toString()
+ ")";
}
@Override
public int hashCode() {
return getClass().hashCode()
^ fieldName.hashCode()
^ qf.hashCode()
^ srndQuery.hashCode();
}
@Override
@ -68,8 +68,8 @@ abstract class RewriteQuery<SQ extends SrndQuery> extends Query {
return false;
RewriteQuery other = (RewriteQuery)obj;
return fieldName.equals(other.fieldName)
&& qf.equals(other.qf)
&& srndQuery.equals(other.srndQuery);
}
/** @throws UnsupportedOperationException */


@ -37,16 +37,16 @@ class SimpleTermRewriteQuery extends RewriteQuery<SimpleTerm> {
public Query rewrite(IndexReader reader) throws IOException {
final List<Query> luceneSubQueries = new ArrayList<Query>();
srndQuery.visitMatchingTerms(reader, fieldName,
new SimpleTerm.MatchingTermVisitor() {
public void visitMatchingTerm(Term term) throws IOException {
luceneSubQueries.add(qf.newTermQuery(term));
}
});
return (luceneSubQueries.size() == 0) ? SrndQuery.theEmptyLcnQuery
: (luceneSubQueries.size() == 1) ? luceneSubQueries.get(0)
: SrndBooleanQuery.makeBooleanQuery(
/* luceneSubQueries all have default weight */
luceneSubQueries, BooleanClause.Occur.SHOULD); /* OR the subquery terms */
}
}

Some files were not shown because too many files have changed in this diff.