mirror of https://github.com/apache/lucene.git
Fix for LUCENE-4362, ban tabs-indent
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1386681 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 310eb39792
commit ded01621a4
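The change itself is mechanical: tab-indented lines in the touched sources are re-indented with spaces. As a rough illustration of what "ban tabs-indent" means (a hypothetical checker, not the project's actual build validation), flagging tab indentation takes only a few lines of Java:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

/** Hypothetical checker: reports any line whose leading whitespace contains a tab. */
public class TabIndentCheck {
  public static void main(String[] args) throws IOException {
    for (String file : args) {
      List<String> lines = Files.readAllLines(Paths.get(file), StandardCharsets.UTF_8);
      for (int i = 0; i < lines.size(); i++) {
        String line = lines.get(i);
        for (int j = 0; j < line.length(); j++) {
          char c = line.charAt(j);
          if (c == '\t') {
            System.out.println(file + ":" + (i + 1) + ": tab-indented line");
            break;
          }
          if (c != ' ') {
            break; // indentation ended without a tab
          }
        }
      }
    }
  }
}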
@@ -50,7 +50,7 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
  /** File containing default Brazilian Portuguese stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /**
   * Returns an unmodifiable instance of the default stop-words set.
   * @return an unmodifiable instance of the default stop-words set.
   */
@@ -74,19 +74,19 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
  }


  /**
   * Contains words that should be indexed but not stemmed.
   */
  private CharArraySet excltable = CharArraySet.EMPTY_SET;

  /**
   * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
   */
  public BrazilianAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words
   *
   * @param matchVersion
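For orientation, the class above is used like any other Lucene Analyzer. A minimal sketch against the 4.x-era API this commit targets (illustrative only, not part of the diff):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class BrazilianAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    BrazilianAnalyzer analyzer = new BrazilianAnalyzer(Version.LUCENE_40);
    TokenStream ts = analyzer.tokenStream("body", new StringReader("as meninas cantavam"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // stemmed, stopword-filtered terms
    }
    ts.end();
    ts.close();
  }
}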
@@ -25,37 +25,37 @@ import java.util.Locale;
 public class BrazilianStemmer {
  private static final Locale locale = new Locale("pt", "BR");

  /**
   * Changed term
   */
  private String TERM ;
  private String CT ;
  private String R1 ;
  private String R2 ;
  private String RV ;


  public BrazilianStemmer() {
  }

  /**
   * Stems the given term to a unique <tt>discriminator</tt>.
   *
   * @param term The term that should be stemmed.
   * @return Discriminator for <tt>term</tt>
   */
  protected String stem( String term ) {
    boolean altered = false ; // altered the term

    // creates CT
    createCT(term) ;

    if ( !isIndexable( CT ) ) {
      return null;
    }
    if ( !isStemmable( CT ) ) {
      return CT ;
    }

    R1 = getR1(CT) ;
    R2 = getR1(R1) ;
@@ -76,38 +76,38 @@ public class BrazilianStemmer {
    step5() ;

    return CT ;
  }

  /**
   * Checks a term if it can be processed correctly.
   *
   * @return true if, and only if, the given term consists in letters.
   */
  private boolean isStemmable( String term ) {
    for ( int c = 0; c < term.length(); c++ ) {
      // Discard terms that contain non-letter characters.
      if ( !Character.isLetter(term.charAt(c))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Checks a term if it can be processed indexed.
   *
   * @return true if it can be indexed
   */
  private boolean isIndexable( String term ) {
    return (term.length() < 30) && (term.length() > 2) ;
  }

  /**
   * See if string is 'a','e','i','o','u'
   *
   * @return true if is vowel
   */
  private boolean isVowel( char value ) {
    return (value == 'a') ||
           (value == 'e') ||
           (value == 'i') ||
@@ -115,16 +115,16 @@ public class BrazilianStemmer {
           (value == 'u') ;
  }

  /**
   * Gets R1
   *
   * R1 - is the region after the first non-vowel following a vowel,
   *      or is the null region at the end of the word if there is
   *      no such non-vowel.
   *
   * @return null or a string representing R1
   */
  private String getR1( String value ) {
    int i;
    int j;

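A standalone restatement of the R1 rule just described (my sketch; it assumes, as isVowel() above does, that the vowels are exactly 'a', 'e', 'i', 'o', 'u'):

/** Sketch of getR1: the region after the first non-vowel that follows a vowel. */
public class R1Demo {
  static String r1(String word) {
    for (int i = 1; i < word.length(); i++) {
      boolean prevIsVowel = "aeiou".indexOf(word.charAt(i - 1)) >= 0;
      boolean curIsVowel = "aeiou".indexOf(word.charAt(i)) >= 0;
      if (prevIsVowel && !curIsVowel) {
        return word.substring(i + 1);
      }
    }
    return null; // the null region: no non-vowel follows a vowel
  }

  public static void main(String[] args) {
    System.out.println(r1("animais"));     // "imais"
    System.out.println(r1(r1("animais"))); // "ais" -- this is how R2 = getR1(R1) is derived
  }
}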
@@ -159,8 +159,8 @@ public class BrazilianStemmer {
    return value.substring(j+1) ;
  }

  /**
   * Gets RV
   *
   * RV - IF the second letter is a consonant, RV is the region after
   *      the next following vowel,
@@ -175,8 +175,8 @@ public class BrazilianStemmer {
   * found.
   *
   * @return null or a string representing RV
   */
  private String getRV( String value ) {
    int i;
    int j;

@@ -229,15 +229,15 @@ public class BrazilianStemmer {
    return null ;
  }

  /**
   * 1) Turn to lowercase
   * 2) Remove accents
   * 3) ã -> a ; õ -> o
   * 4) ç -> c
   *
   * @return null or a string transformed
   */
  private String changeTerm( String value ) {
    int j;
    String r = "" ;

@@ -282,12 +282,12 @@ public class BrazilianStemmer {
    return r ;
  }

  /**
   * Check if a string ends with a suffix
   *
   * @return true if the string ends with the specified suffix
   */
  private boolean suffix( String value, String suffix ) {

    // be-safe !!!
    if ((value == null) || (suffix == null)) {
@@ -301,12 +301,12 @@ public class BrazilianStemmer {
    return value.substring(value.length()-suffix.length()).equals(suffix);
  }

  /**
   * Replace a string suffix by another
   *
   * @return the replaced String
   */
  private String replaceSuffix( String value, String toReplace, String changeTo ) {
    String vvalue ;

    // be-safe !!!
@@ -325,12 +325,12 @@ public class BrazilianStemmer {
    }
  }

  /**
   * Remove a string suffix
   *
   * @return the String without the suffix
   */
  private String removeSuffix( String value, String toRemove ) {
    // be-safe !!!
    if ((value == null) ||
        (toRemove == null) ||
@@ -341,12 +341,12 @@ public class BrazilianStemmer {
    return value.substring(0,value.length()-toRemove.length()) ;
  }

  /**
   * See if a suffix is preceded by a String
   *
   * @return true if the suffix is preceded
   */
  private boolean suffixPreceded( String value, String suffix, String preceded ) {
    // be-safe !!!
    if ((value == null) ||
        (suffix == null) ||
@@ -358,10 +358,10 @@ public class BrazilianStemmer {
    return suffix(removeSuffix(value,suffix),preceded) ;
  }

  /**
   * Creates CT (changed term), substituting 'ã' and 'õ' for 'a~' and 'o~'.
   */
  private void createCT( String term ) {
    CT = changeTerm(term) ;

    if (CT.length() < 2) return ;
@@ -396,14 +396,14 @@ public class BrazilianStemmer {
  }


  /**
   * Standard suffix removal.
   * Search for the longest among the following suffixes, and perform
   * the following actions:
   *
   * @return false if no ending was removed
   */
  private boolean step1() {
    if (CT == null) return false ;

    // suffix length = 7
@@ -559,15 +559,15 @@ public class BrazilianStemmer {
  }


  /**
   * Verb suffixes.
   *
   * Search for the longest among the following suffixes in RV,
   * and if found, delete.
   *
   * @return false if no ending was removed
   */
  private boolean step2() {
    if (RV == null) return false ;

    // suffix length = 7
@@ -941,11 +941,11 @@ public class BrazilianStemmer {
    return false ;
  }

  /**
   * Delete suffix 'i' if in RV and preceded by 'c'
   *
   */
  private void step3() {
    if (RV == null) return ;

    if (suffix(RV,"i") && suffixPreceded(RV,"i","c")) {
@@ -954,14 +954,14 @@ public class BrazilianStemmer {

  }

  /**
   * Residual suffix
   *
   * If the word ends with one of the suffixes (os a i o á í ó)
   * in RV, delete it
   *
   */
  private void step4() {
    if (RV == null) return ;

    if (suffix(RV,"os")) {
@@ -979,15 +979,15 @@ public class BrazilianStemmer {

  }

  /**
   * If the word ends with one of (e é ê) in RV, delete it,
   * and if preceded by 'gu' (or 'ci') with the 'u' (or 'i') in RV,
   * delete the 'u' (or 'i')
   *
   * Or if the word ends in 'ç', remove the cedilla
   *
   */
  private void step5() {
    if (RV == null) return ;

    if (suffix(RV,"e")) {
@@ -1007,18 +1007,18 @@ public class BrazilianStemmer {
    }
  }

  /**
   * For log and debug purposes
   *
   * @return TERM, CT, RV, R1 and R2
   */
  public String log() {
    return " (TERM = " + TERM + ")" +
           " (CT = " + CT +")" +
           " (RV = " + RV +")" +
           " (R1 = " + R1 +")" +
           " (R2 = " + R2 +")" ;
  }
}

@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/17/12 9:29 AM */

 package org.apache.lucene.analysis.charfilter;

@@ -40,8 +40,8 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
 /**
  * This class is a scanner generated by
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 8/6/12 11:57 AM from the specification file
- * <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
+ * on 9/17/12 9:29 AM from the specification file
+ * <tt>/Users/Erick/apache/trunk_4326/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
  */
 public final class HTMLStripCharFilter extends BaseCharFilter {

|
|||
private static final int ZZ_BUFFERSIZE = 16384;
|
||||
|
||||
/** lexical states */
|
||||
private static final int DOUBLE_QUOTED_STRING = 28;
|
||||
private static final int CHARACTER_REFERENCE_TAIL = 6;
|
||||
private static final int START_TAG_TAIL_EXCLUDE = 38;
|
||||
private static final int SCRIPT = 14;
|
||||
private static final int CDATA = 22;
|
||||
private static final int LEFT_ANGLE_BRACKET = 8;
|
||||
private static final int END_TAG_TAIL_EXCLUDE = 32;
|
||||
private static final int SERVER_SIDE_INCLUDE = 24;
|
||||
private static final int END_TAG_TAIL_SUBSTITUTE = 34;
|
||||
private static final int SINGLE_QUOTED_STRING = 26;
|
||||
private static final int YYINITIAL = 0;
|
||||
private static final int STYLE = 42;
|
||||
private static final int START_TAG_TAIL_INCLUDE = 36;
|
||||
private static final int AMPERSAND = 2;
|
||||
private static final int BANG = 10;
|
||||
private static final int LEFT_ANGLE_BRACKET_SLASH = 18;
|
||||
private static final int START_TAG_TAIL_SUBSTITUTE = 40;
|
||||
private static final int COMMENT = 12;
|
||||
private static final int SCRIPT_COMMENT = 16;
|
||||
private static final int LEFT_ANGLE_BRACKET_SPACE = 20;
|
||||
private static final int STYLE_COMMENT = 44;
|
||||
private static final int NUMERIC_CHARACTER = 4;
|
||||
private static final int CHARACTER_REFERENCE_TAIL = 6;
|
||||
private static final int LEFT_ANGLE_BRACKET = 8;
|
||||
private static final int BANG = 10;
|
||||
private static final int COMMENT = 12;
|
||||
private static final int SCRIPT = 14;
|
||||
private static final int SCRIPT_COMMENT = 16;
|
||||
private static final int LEFT_ANGLE_BRACKET_SLASH = 18;
|
||||
private static final int LEFT_ANGLE_BRACKET_SPACE = 20;
|
||||
private static final int CDATA = 22;
|
||||
private static final int SERVER_SIDE_INCLUDE = 24;
|
||||
private static final int SINGLE_QUOTED_STRING = 26;
|
||||
private static final int DOUBLE_QUOTED_STRING = 28;
|
||||
private static final int END_TAG_TAIL_INCLUDE = 30;
|
||||
private static final int END_TAG_TAIL_EXCLUDE = 32;
|
||||
private static final int END_TAG_TAIL_SUBSTITUTE = 34;
|
||||
private static final int START_TAG_TAIL_INCLUDE = 36;
|
||||
private static final int START_TAG_TAIL_EXCLUDE = 38;
|
||||
private static final int START_TAG_TAIL_SUBSTITUTE = 40;
|
||||
private static final int STYLE = 42;
|
||||
private static final int STYLE_COMMENT = 44;
|
||||
|
||||
/**
|
||||
* ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
|
||||
|
@@ -30967,7 +30967,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
      }
    }

    // numRead < 0
    return true;
  }

@@ -31247,135 +31247,24 @@
      zzMarkedPos = zzMarkedPosL;

      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
        case 1:
          { return zzBuffer[zzStartRead];
          }
        case 54: break;
        case 2:
          { inputStart = yychar;
            inputSegment.clear();
            inputSegment.append('<');
            yybegin(LEFT_ANGLE_BRACKET);
          }
        case 55: break;
        case 3:
          { inputStart = yychar;
            inputSegment.clear();
            inputSegment.append('&');
            yybegin(AMPERSAND);
          }
        case 56: break;
        case 4:
          { yypushback(1);
            outputSegment = inputSegment;
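The bookkeeping comments repeated through these actions ("add (previously matched input length) + (this match length) - (substitution length)") are easier to follow with concrete numbers. A hypothetical walk-through, not taken from the diff:

/** Hypothetical walk-through of the offset bookkeeping in a case-27-style action. */
public class OffsetCorrectionDemo {
  public static void main(String[] args) {
    int outputCharCount = 5;    // chars already emitted downstream
    int cumulativeDiff = 0;     // input-minus-output length difference so far
    int inputSegmentLength = 0; // previously matched, still-buffered input
    int matchLength = 3;        // yylength() for a match like "<p>"
    int substitutionLength = 1; // replaced by a single '\n'
    cumulativeDiff += inputSegmentLength + matchLength - substitutionLength; // now 2
    int correctionPos = outputCharCount + substitutionLength; // correction at 6
    // addOffCorrectMap(correctionPos, cumulativeDiff) would record that, from
    // output position 6 onward, original-input offsets are 2 chars ahead.
    System.out.println("addOffCorrectMap(" + correctionPos + ", " + cumulativeDiff + ")");
  }
}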
@@ -31383,166 +31272,11 @@
            yybegin(YYINITIAL);
            return outputSegment.nextChar();
          }
        case 57: break;
        case 5:
          { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
          }
        case 58: break;
        case 6:
          { int matchLength = yylength();
            inputSegment.write(zzBuffer, zzStartRead, matchLength);
@@ -31576,50 +31310,26 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
              return outputSegment.nextChar();
            }
          }
        case 59: break;
        case 7:
          { // add (previously matched input length) + (this match length) - (substitution length)
            cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
            // position the correction at (already output length) + (substitution length)
            addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
            yybegin(YYINITIAL);
            return outputSegment.nextChar();
          }
        case 60: break;
        case 8:
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
            if (null != escapedTags
                && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
              yybegin(START_TAG_TAIL_INCLUDE);
            } else {
              yybegin(START_TAG_TAIL_SUBSTITUTE);
            }
          }
        case 61: break;
        case 9:
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
            if (null != escapedTags
@@ -31629,57 +31339,55 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
              yybegin(START_TAG_TAIL_EXCLUDE);
            }
          }
        case 62: break;
        case 10:
          { inputSegment.append('!'); yybegin(BANG);
          }
        case 63: break;
        case 11:
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
            yybegin(LEFT_ANGLE_BRACKET_SPACE);
          }
        case 64: break;
        case 12:
          { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
          }
        case 65: break;
        case 13:
          { inputSegment.append(zzBuffer[zzStartRead]);
          }
        case 66: break;
        case 14:
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
            cumulativeDiff += inputSegment.length() + yylength();
            // position the correction at (already output length) [ + (substitution length) = 0 ]
            addOffCorrectMap(outputCharCount, cumulativeDiff);
            inputSegment.clear();
            yybegin(YYINITIAL);
          }
        case 67: break;
        case 15:
          {
          }
        case 68: break;
        case 16:
          { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
          }
        case 69: break;
        case 17:
          { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
          }
        case 70: break;
        case 18:
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
            if (null != escapedTags
                && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
              yybegin(END_TAG_TAIL_INCLUDE);
            } else {
              yybegin(END_TAG_TAIL_SUBSTITUTE);
            }
          }
        case 71: break;
        case 19:
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
            if (null != escapedTags
@@ -31689,7 +31397,34 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
              yybegin(END_TAG_TAIL_EXCLUDE);
            }
          }
        case 72: break;
        case 20:
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
          }
        case 73: break;
        case 21:
          { previousRestoreState = restoreState;
            restoreState = SERVER_SIDE_INCLUDE;
            yybegin(SINGLE_QUOTED_STRING);
          }
        case 74: break;
        case 22:
          { previousRestoreState = restoreState;
            restoreState = SERVER_SIDE_INCLUDE;
            yybegin(DOUBLE_QUOTED_STRING);
          }
        case 75: break;
        case 23:
          { yybegin(restoreState); restoreState = previousRestoreState;
          }
        case 76: break;
        case 24:
          { inputSegment.write(zzBuffer, zzStartRead, yylength());
            outputSegment = inputSegment;
            yybegin(YYINITIAL);
            return outputSegment.nextChar();
          }
        case 77: break;
        case 25:
          { // add (previously matched input length) + (this match length) - (substitution length)
            cumulativeDiff += inputSegment.length() + yylength() - 1;
@@ -31699,7 +31434,45 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
            yybegin(YYINITIAL);
            return BLOCK_LEVEL_END_TAG_REPLACEMENT;
          }
        case 78: break;
        case 26:
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
            cumulativeDiff += inputSegment.length() + yylength();
            // position the correction at (already output length) [ + (substitution length) = 0 ]
            addOffCorrectMap(outputCharCount, cumulativeDiff);
            inputSegment.clear();
            outputSegment = inputSegment;
            yybegin(YYINITIAL);
          }
        case 79: break;
        case 27:
          { // add (previously matched input length) + (this match length) - (substitution length)
            cumulativeDiff += inputSegment.length() + yylength() - 1;
            // position the correction at (already output length) + (substitution length)
            addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
            inputSegment.clear();
            yybegin(YYINITIAL);
            return BLOCK_LEVEL_START_TAG_REPLACEMENT;
          }
        case 80: break;
        case 28:
          { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
          }
        case 81: break;
        case 29:
          { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
          }
        case 82: break;
        case 30:
          { int length = yylength();
            inputSegment.write(zzBuffer, zzStartRead, length);
            entitySegment.clear();
            char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
            entitySegment.append(ch);
            outputSegment = entitySegment;
            yybegin(CHARACTER_REFERENCE_TAIL);
          }
        case 83: break;
        case 31:
          { int matchLength = yylength();
            inputSegment.write(zzBuffer, zzStartRead, matchLength);
@@ -31734,7 +31507,262 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
            return outputSegment.nextChar();
          }
        }
        case 84: break;
        case 32:
          { yybegin(COMMENT);
          }
        case 85: break;
        case 33:
          { yybegin(YYINITIAL);
            if (escapeBR) {
              inputSegment.write(zzBuffer, zzStartRead, yylength());
              outputSegment = inputSegment;
              return outputSegment.nextChar();
            } else {
              // add (previously matched input length) + (this match length) - (substitution length)
              cumulativeDiff += inputSegment.length() + yylength() - 1;
              // position the correction at (already output length) + (substitution length)
              addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
              inputSegment.reset();
              return BR_START_TAG_REPLACEMENT;
            }
          }
        case 86: break;
        case 34:
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
            cumulativeDiff += yychar - inputStart + yylength();
            // position the correction at (already output length) [ + (substitution length) = 0]
            addOffCorrectMap(outputCharCount, cumulativeDiff);
            inputSegment.clear();
            yybegin(YYINITIAL);
          }
        case 87: break;
        case 35:
          { yybegin(SCRIPT);
          }
        case 88: break;
        case 36:
          { yybegin(YYINITIAL);
            if (escapeBR) {
              inputSegment.write(zzBuffer, zzStartRead, yylength());
              outputSegment = inputSegment;
              return outputSegment.nextChar();
            } else {
              // add (previously matched input length) + (this match length) - (substitution length)
              cumulativeDiff += inputSegment.length() + yylength() - 1;
              // position the correction at (already output length) + (substitution length)
              addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
              inputSegment.reset();
              return BR_END_TAG_REPLACEMENT;
            }
          }
        case 89: break;
        case 37:
          { // add (this match length) [ - (substitution length) = 0 ]
            cumulativeDiff += yylength();
            // position the correction at (already output length) [ + (substitution length) = 0 ]
            addOffCorrectMap(outputCharCount, cumulativeDiff);
            yybegin(YYINITIAL);
          }
        case 90: break;
        case 38:
          { yybegin(restoreState);
          }
        case 91: break;
        case 39:
          { yybegin(STYLE);
          }
        case 92: break;
        case 40:
          { yybegin(SCRIPT_COMMENT);
          }
        case 93: break;
        case 41:
          { yybegin(STYLE_COMMENT);
          }
        case 94: break;
        case 42:
          { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
        case 95: break;
        case 43:
          { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
        case 96: break;
        case 44:
          { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
          }
        case 97: break;
        case 45:
          { yybegin(STYLE);
            if (escapeSTYLE) {
              inputSegment.write(zzBuffer, zzStartRead, yylength());
              outputSegment = inputSegment;
              inputStart += 1 + yylength();
              return outputSegment.nextChar();
            }
          }
        case 98: break;
        case 46:
          { yybegin(SCRIPT);
            if (escapeSCRIPT) {
              inputSegment.write(zzBuffer, zzStartRead, yylength());
              outputSegment = inputSegment;
              inputStart += 1 + yylength();
              return outputSegment.nextChar();
            }
          }
        case 99: break;
        case 47:
          { // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
            cumulativeDiff += inputSegment.length() + yylength();
            // position the correction at (already output length) [ + (substitution length) = 0 ]
            addOffCorrectMap(outputCharCount, cumulativeDiff);
            inputSegment.clear();
            yybegin(CDATA);
          }
        case 100: break;
        case 48:
          { inputSegment.clear();
            yybegin(YYINITIAL);
            // add (previously matched input length) -- current match and substitution handled below
            cumulativeDiff += yychar - inputStart;
            // position the offset correction at (already output length) -- substitution handled below
            int offsetCorrectionPos = outputCharCount;
            int returnValue;
            if (escapeSTYLE) {
              inputSegment.write(zzBuffer, zzStartRead, yylength());
              outputSegment = inputSegment;
              returnValue = outputSegment.nextChar();
            } else {
              // add (this match length) - (substitution length)
              cumulativeDiff += yylength() - 1;
              // add (substitution length)
              ++offsetCorrectionPos;
              returnValue = STYLE_REPLACEMENT;
            }
            addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
            return returnValue;
          }
        case 101: break;
        case 49:
          { inputSegment.clear();
            yybegin(YYINITIAL);
            // add (previously matched input length) -- current match and substitution handled below
            cumulativeDiff += yychar - inputStart;
            // position at (already output length) -- substitution handled below
            int offsetCorrectionPos = outputCharCount;
            int returnValue;
            if (escapeSCRIPT) {
              inputSegment.write(zzBuffer, zzStartRead, yylength());
              outputSegment = inputSegment;
              returnValue = outputSegment.nextChar();
            } else {
              // add (this match length) - (substitution length)
              cumulativeDiff += yylength() - 1;
              // add (substitution length)
              ++offsetCorrectionPos;
              returnValue = SCRIPT_REPLACEMENT;
            }
            addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
            return returnValue;
          }
        case 102: break;
        case 50:
          { // Handle paired UTF-16 surrogates.
            outputSegment = entitySegment;
            outputSegment.clear();
            String surrogatePair = yytext();
            char highSurrogate = '\u0000';
            try {
              highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
            } catch(Exception e) { // should never happen
              assert false: "Exception parsing high surrogate '"
                          + surrogatePair.substring(2, 6) + "'";
            }
            try {
              outputSegment.unsafeWrite
                  ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
            } catch(Exception e) { // should never happen
              assert false: "Exception parsing low surrogate '"
                          + surrogatePair.substring(10, 14) + "'";
            }
            // add (previously matched input length) + (this match length) - (substitution length)
            cumulativeDiff += inputSegment.length() + yylength() - 2;
            // position the correction at (already output length) + (substitution length)
            addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
            inputSegment.clear();
            yybegin(YYINITIAL);
            return highSurrogate;
          }
        case 103: break;
        case 51:
          { // Handle paired UTF-16 surrogates.
            String surrogatePair = yytext();
            char highSurrogate = '\u0000';
            char lowSurrogate = '\u0000';
            try {
              highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
            } catch(Exception e) { // should never happen
              assert false: "Exception parsing high surrogate '"
                          + surrogatePair.substring(2, 6) + "'";
            }
            try { // Low surrogates are in decimal range [56320, 57343]
              lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
            } catch(Exception e) { // should never happen
              assert false: "Exception parsing low surrogate '"
                          + surrogatePair.substring(9, 14) + "'";
            }
            if (Character.isLowSurrogate(lowSurrogate)) {
              outputSegment = entitySegment;
              outputSegment.clear();
              outputSegment.unsafeWrite(lowSurrogate);
              // add (previously matched input length) + (this match length) - (substitution length)
              cumulativeDiff += inputSegment.length() + yylength() - 2;
              // position the correction at (already output length) + (substitution length)
              addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
              inputSegment.clear();
              yybegin(YYINITIAL);
              return highSurrogate;
            }
            yypushback(surrogatePair.length() - 1); // Consume only '#'
            inputSegment.append('#');
            yybegin(NUMERIC_CHARACTER);
          }
        case 104: break;
        case 52:
          { // Handle paired UTF-16 surrogates.
            String surrogatePair = yytext();
            char highSurrogate = '\u0000';
            try { // High surrogates are in decimal range [55296, 56319]
              highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
            } catch(Exception e) { // should never happen
              assert false: "Exception parsing high surrogate '"
                          + surrogatePair.substring(1, 6) + "'";
            }
            if (Character.isHighSurrogate(highSurrogate)) {
              outputSegment = entitySegment;
              outputSegment.clear();
              try {
                outputSegment.unsafeWrite
                    ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
              } catch(Exception e) { // should never happen
                assert false: "Exception parsing low surrogate '"
                            + surrogatePair.substring(10, 14) + "'";
              }
              // add (previously matched input length) + (this match length) - (substitution length)
              cumulativeDiff += inputSegment.length() + yylength() - 2;
              // position the correction at (already output length) + (substitution length)
              addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
              inputSegment.clear();
              yybegin(YYINITIAL);
              return highSurrogate;
            }
            yypushback(surrogatePair.length() - 1); // Consume only '#'
            inputSegment.append('#');
            yybegin(NUMERIC_CHARACTER);
          }
        case 105: break;
        case 53:
          { // Handle paired UTF-16 surrogates.
            String surrogatePair = yytext();
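Cases 50-53 above all deal with numeric character references that spell out a UTF-16 surrogate pair. A self-contained illustration of the arithmetic they rely on (my example values):

/** Illustrative check of the surrogate ranges handled by cases 50-53 above. */
public class SurrogateDemo {
  public static void main(String[] args) {
    // "&#55357;&#56842;" is a high/low surrogate pair written as two
    // decimal character references; together they encode one code point.
    char high = (char) Integer.parseInt("55357"); // in [55296, 56319]
    char low  = (char) Integer.parseInt("56842"); // in [56320, 57343]
    System.out.println(Character.isHighSurrogate(high)); // true
    System.out.println(Character.isLowSurrogate(low));   // true
    System.out.println(Character.toCodePoint(high, low)); // 128522 (U+1F60A)
  }
}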
@@ -31770,34 +31798,6 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
            inputSegment.append('#');
            yybegin(NUMERIC_CHARACTER);
          }
        case 106: break;
        default:
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {

@@ -90,18 +90,18 @@ EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
                           [bB][lL][uU][rR] |
                           [cC][hH][aA][nN][gG][eE] |
                           [cC][lL][iI][cC][kK] |
                           [dD][bB][lL][cC][lL][iI][cC][kK] |
                           [eE][rR][rR][oO][rR] |
                           [fF][oO][cC][uU][sS] |
                           [kK][eE][yY][dD][oO][wW][nN] |
                           [kK][eE][yY][pP][rR][eE][sS][sS] |
                           [kK][eE][yY][uU][pP] |
                           [lL][oO][aA][dD] |
                           [mM][oO][uU][sS][eE][dD][oO][wW][nN] |
                           [mM][oO][uU][sS][eE][mM][oO][vV][eE] |
                           [mM][oO][uU][sS][eE][oO][uU][tT] |
                           [mM][oO][uU][sS][eE][oO][vV][eE][rR] |
                           [mM][oO][uU][sS][eE][uU][pP] |
                           [rR][eE][sS][eE][tT] |
                           [sS][eE][lL][eE][cC][tT] |
                           [sS][uU][bB][mM][iI][tT] |

@@ -30,7 +30,7 @@ import java.io.IOException;
 * <analyzer>
 *   <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *   <filter class="solr.DictionaryCompoundWordTokenFilterFactory" dictionary="dictionary.txt"
 *       minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="true"/>
 * </analyzer>
 * </fieldType></pre>
 *

@@ -50,7 +50,7 @@ import org.xml.sax.InputSource;
 * <analyzer>
 *   <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *   <filter class="solr.HyphenationCompoundWordTokenFilterFactory" hyphenator="hyphenator.xml" encoding="UTF-8"
 *       dictionary="dictionary.txt" minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="false"/>
 * </analyzer>
 * </fieldType></pre>
 *

@@ -50,24 +50,24 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
   *
   * @return a set of default Czech-stopwords
   */
  public static final CharArraySet getDefaultStopSet(){
    return DefaultSetHolder.DEFAULT_SET;
  }

  private static class DefaultSetHolder {
    private static final CharArraySet DEFAULT_SET;

    static {
      try {
        DEFAULT_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(CzechAnalyzer.class,
            DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set");
      }
    }
  }


  private final CharArraySet stemExclusionTable;
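The DefaultSetHolder nested class above is the initialization-on-demand holder idiom: the stopword set is loaded only when the holder class is first referenced, and the JVM guarantees that class initialization happens exactly once, thread-safely. A generic sketch of the pattern (hypothetical names, not from the diff):

/** Minimal sketch of the initialization-on-demand holder idiom used above. */
public class Config {
  private Config() {}

  private static class Holder {
    // Loaded only when Holder is first referenced; the JVM guarantees
    // thread-safe, once-only class initialization.
    static final String DEFAULT = load();

    private static String load() {
      return "expensive-to-load value";
    }
  }

  public static String getDefault() {
    return Holder.DEFAULT;
  }
}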
@@ -77,9 +77,9 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
   *
   * @param matchVersion Lucene version to match
   */
  public CzechAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_SET);
  }

  /**
   * Builds an analyzer with the given stop words.

@@ -49,8 +49,8 @@ import java.util.StringTokenizer;
 * <analyzer>
 *   <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *   <filter class="solr.CapitalizationFilterFactory" onlyFirstWord="true"
 *       keep="java solr lucene" keepIgnoreCase="false"
 *       okPrefix="McK McD McA"/>
 * </analyzer>
 * </fieldType></pre>
 *

@@ -31,8 +31,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 * Example field definition in schema.xml:
 * <pre class="prettyprint">
 * <fieldtype name="text" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer type="index">
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
 *     <filter class="solr.StopFilterFactory" ignoreCase="true"/>
 *     <filter class="solr.HyphenatedWordsFilterFactory"/>

@@ -33,7 +33,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
 *
 */
public class HyphenatedWordsFilterFactory extends TokenFilterFactory {
  public HyphenatedWordsFilter create(TokenStream input) {
    return new HyphenatedWordsFilter(input);
  }
}

@@ -43,10 +43,10 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
 * <pre class="prettyprint" >
 * <fieldType name="descendent_path" class="solr.TextField">
 *   <analyzer type="index">
 *     <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
 *   </analyzer>
 *   <analyzer type="query">
 *     <tokenizer class="solr.KeywordTokenizerFactory" />
 *   </analyzer>
 * </fieldType>
 * </pre>
|
@ -61,10 +61,10 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
|
|||
* <pre class="prettyprint" >
|
||||
* <fieldType name="descendent_path" class="solr.TextField">
|
||||
* <analyzer type="index">
|
||||
* <tokenizer class="solr.KeywordTokenizerFactory" />
|
||||
* <tokenizer class="solr.KeywordTokenizerFactory" />
|
||||
* </analyzer>
|
||||
* <analyzer type="query">
|
||||
* <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
|
||||
* <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
|
||||
* </analyzer>
|
||||
* </fieldType>
|
||||
* </pre>
|
||||
|
|
|
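The two configurations differ only in which side tokenizes the path: PathHierarchyTokenizer expands a path into all of its prefixes, so putting it on the index side matches descendants and putting it on the query side matches ancestors. A sketch of the expansion (assuming the tokenizer's default '/' delimiter; illustrative, not from the diff):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PathTokensDemo {
  public static void main(String[] args) throws IOException {
    // Each prefix of the path is emitted as a token.
    PathHierarchyTokenizer tok = new PathHierarchyTokenizer(new StringReader("/usr/local/bin"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      System.out.println(term.toString()); // /usr, /usr/local, /usr/local/bin
    }
    tok.end();
    tok.close();
  }
}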
@@ -211,6 +211,6 @@ public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {
      }
    }
    return allStopWords.toArray(new Term[allStopWords.size()]);
  }

}

@@ -395,7 +395,7 @@ public final class ShingleFilter extends TokenFilter {
      exhausted = true;
    }
    return newTarget;
  }

  /**
   * <p>Fills {@link #inputWindow} with input stream tokens, if available,

@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/17/12 9:28 AM */

 package org.apache.lucene.analysis.standard;

@@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
  * This class is a scanner generated by
  * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 8/6/12 11:57 AM from the specification file
- * <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
+ * on 9/17/12 9:28 AM from the specification file
+ * <tt>/Users/Erick/apache/trunk_4326/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
  */
 class ClassicTokenizerImpl implements StandardTokenizerInterface {

@@ -453,7 +453,7 @@ public final void getText(CharTermAttribute t) {
      }
    }

    // numRead < 0
    return true;
  }

@@ -674,44 +674,44 @@ public final void getText(CharTermAttribute t) {
      zzMarkedPos = zzMarkedPosL;

      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
        case 1:
          { /* Break so we don't hit fall-through warning: */ break;/* ignore */
          }
        case 11: break;
        case 2:
          { return ALPHANUM;
          }
        case 12: break;
        case 3:
          { return CJ;
          }
        case 13: break;
        case 4:
          { return HOST;
          }
        case 14: break;
        case 5:
          { return NUM;
          }
        case 15: break;
        case 6:
          { return APOSTROPHE;
          }
        case 16: break;
        case 7:
          { return COMPANY;
          }
        case 17: break;
        case 8:
          { return ACRONYM_DEP;
          }
        case 18: break;
        case 9:
          { return ACRONYM;
          }
        case 19: break;
        case 10:
          { return EMAIL;
          }
        case 20: break;
        default:

@@ -79,7 +79,7 @@ APOSTROPHE = {ALPHA} ("'" {ALPHA})+
// use a post-filter to remove dots
ACRONYM = {LETTER} "." ({LETTER} ".")+

ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+

// company names like AT&T and Excite@Home.
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
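For illustration (not part of the diff): ACRONYM matches dotted single letters such as "U.S.A.", the deprecated ACRONYM_DEP also admits digits between the dots (e.g. "R.2.D.2.", since {ALPHANUM} covers both), and COMPANY matches names joined by "&" or "@", such as the "AT&T" and "Excite@Home" examples named in the comment above.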
@@ -100,7 +100,7 @@ NUM = ({ALPHANUM} {P} {HAS_DIGIT}
    | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)

// punctuation
P = ("_"|"-"|"/"|"."|",")
P = ("_"|"-"|"/"|"."|",")

// at least one digit
HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*

@@ -14,7 +14,7 @@
 * limitations under the License.
 */

// Generated using ICU4J 49.1.0.0 on Monday, August 6, 2012 3:57:23 PM UTC
// Generated using ICU4J 49.1.0.0 on Monday, September 17, 2012 1:28:46 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros

@@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/17/12 9:28 AM */

package org.apache.lucene.analysis.standard;
@@ -936,7 +936,7 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
      }
    }

    // numRead < 0
    // numRead < 0
    return true;
  }

@@ -1157,36 +1157,36 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
      zzMarkedPos = zzMarkedPosL;

      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
        case 1:
          { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
          }
        case 9: break;
        case 2:
          { return WORD_TYPE;
          }
        case 9: break;
        case 5:
          { return SOUTH_EAST_ASIAN_TYPE;
          }
        case 10: break;
        case 3:
          { return NUMERIC_TYPE;
          }
        case 11: break;
        case 4:
          { return KATAKANA_TYPE;
          }
        case 11: break;
        case 6:
          { return IDEOGRAPHIC_TYPE;
          }
        case 12: break;
        case 8:
          { return HANGUL_TYPE;
        case 5:
          { return SOUTH_EAST_ASIAN_TYPE;
          }
        case 13: break;
        case 3:
          { return NUMERIC_TYPE;
        case 6:
          { return IDEOGRAPHIC_TYPE;
          }
        case 14: break;
        case 7:
          { return HIRAGANA_TYPE;
          }
        case 15: break;
        case 1:
          { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
        case 8:
          { return HANGUL_TYPE;
          }
        case 16: break;
        default:
@@ -115,8 +115,8 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*

%%

// UAX#29 WB1. sot ÷
// WB2. ÷ eot
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }

@@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/17/12 9:29 AM */

package org.apache.lucene.analysis.standard;

@@ -4126,7 +4126,7 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
      }
    }

    // numRead < 0
    // numRead < 0
    return true;
  }
@@ -4347,51 +4347,51 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
      zzMarkedPos = zzMarkedPosL;

      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
        case 11:
          // lookahead expression with fixed base length
          zzMarkedPos = zzStartRead + 6;
          { return WORD_TYPE;
        case 1:
          { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
          }
        case 12: break;
        case 2:
          { return WORD_TYPE;
          }
        case 13: break;
        case 5:
          { return SOUTH_EAST_ASIAN_TYPE;
          }
        case 14: break;
        case 1:
          { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */
          }
        case 15: break;
        case 10:
          { return URL_TYPE;
          }
        case 16: break;
        case 9:
          { return EMAIL_TYPE;
          }
        case 17: break;
        case 4:
          { return KATAKANA_TYPE;
          }
        case 18: break;
        case 6:
          { return IDEOGRAPHIC_TYPE;
          }
        case 19: break;
        case 8:
          { return HANGUL_TYPE;
          }
        case 20: break;
        case 3:
          { return NUMERIC_TYPE;
          }
        case 21: break;
        case 14: break;
        case 4:
          { return KATAKANA_TYPE;
          }
        case 15: break;
        case 5:
          { return SOUTH_EAST_ASIAN_TYPE;
          }
        case 16: break;
        case 6:
          { return IDEOGRAPHIC_TYPE;
          }
        case 17: break;
        case 7:
          { return HIRAGANA_TYPE;
          }
        case 18: break;
        case 8:
          { return HANGUL_TYPE;
          }
        case 19: break;
        case 9:
          { return EMAIL_TYPE;
          }
        case 20: break;
        case 10:
          { return URL_TYPE;
          }
        case 21: break;
        case 11:
          // lookahead expression with fixed base length
          zzMarkedPos = zzStartRead + 6;
          { return WORD_TYPE;
          }
        case 22: break;
        default:
          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
@@ -200,8 +200,8 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})

%%

// UAX#29 WB1. sot ÷
// WB2. ÷ eot
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
@@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 9/17/12 9:29 AM */

package org.apache.lucene.analysis.wikipedia;

@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * This class is a scanner generated by
 * <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
 * on 8/6/12 11:57 AM from the specification file
 * <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
 * on 9/17/12 9:29 AM from the specification file
 * <tt>/Users/Erick/apache/trunk_4326/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
 */
class WikipediaTokenizerImpl {
@@ -37,16 +37,16 @@ class WikipediaTokenizerImpl {
  private static final int ZZ_BUFFERSIZE = 4096;

  /** lexical states */
  public static final int THREE_SINGLE_QUOTES_STATE = 10;
  public static final int EXTERNAL_LINK_STATE = 6;
  public static final int DOUBLE_EQUALS_STATE = 14;
  public static final int INTERNAL_LINK_STATE = 4;
  public static final int DOUBLE_BRACE_STATE = 16;
  public static final int CATEGORY_STATE = 2;
  public static final int YYINITIAL = 0;
  public static final int STRING = 18;
  public static final int FIVE_SINGLE_QUOTES_STATE = 12;
  public static final int CATEGORY_STATE = 2;
  public static final int INTERNAL_LINK_STATE = 4;
  public static final int EXTERNAL_LINK_STATE = 6;
  public static final int TWO_SINGLE_QUOTES_STATE = 8;
  public static final int THREE_SINGLE_QUOTES_STATE = 10;
  public static final int FIVE_SINGLE_QUOTES_STATE = 12;
  public static final int DOUBLE_EQUALS_STATE = 14;
  public static final int DOUBLE_BRACE_STATE = 16;
  public static final int STRING = 18;

  /**
   * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
@@ -589,7 +589,7 @@ final void reset() {
      }
    }

    // numRead < 0
    // numRead < 0
    return true;
  }
@@ -810,188 +810,188 @@ final void reset() {
      zzMarkedPos = zzMarkedPosL;

      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
        case 44:
          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
        case 1:
          { numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
          }
        case 47: break;
        case 37:
          { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 48: break;
        case 16:
          { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
          }
        case 49: break;
        case 20:
          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 50: break;
        case 40:
          { positionInc = 1; return ACRONYM;
          }
        case 51: break;
        case 5:
          { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
          }
        case 52: break;
        case 36:
          { positionInc = 1; return COMPANY;
          }
        case 53: break;
        case 10:
          { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
          }
        case 54: break;
        case 15:
          { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
          }
        case 55: break;
        case 22:
          { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
          }
        case 56: break;
        case 35:
          { positionInc = 1; return NUM;
          }
        case 57: break;
        case 33:
          { positionInc = 1; return APOSTROPHE;
          }
        case 58: break;
        case 21:
          { yybegin(STRING); return currentTokType;/*pipe*/
          }
        case 59: break;
        case 18:
          { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
          }
        case 60: break;
        case 2:
          { positionInc = 1; return ALPHANUM;
          }
        case 48: break;
        case 3:
          { positionInc = 1; return CJ;
          }
        case 49: break;
        case 4:
          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
          }
        case 50: break;
        case 5:
          { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
          }
        case 51: break;
        case 6:
          { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
          }
        case 52: break;
        case 7:
          { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
          }
        case 53: break;
        case 8:
          { /* Break so we don't hit fall-through warning: */ break;/* ignore */
          }
        case 54: break;
        case 9:
          { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
          }
        case 55: break;
        case 10:
          { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
          }
        case 56: break;
        case 11:
          { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 57: break;
        case 12:
          { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
          }
        case 58: break;
        case 13:
          { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 59: break;
        case 14:
          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
          }
        case 60: break;
        case 15:
          { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;
          }
        case 61: break;
        case 1:
          { numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;
        case 16:
          { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
          }
        case 62: break;
        case 17:
          { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
          }
        case 63: break;
        case 39:
          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
        case 18:
          { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */
          }
        case 64: break;
        case 29:
          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 65: break;
        case 46:
          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 66: break;
        case 27:
          { numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
          }
        case 67: break;
        case 4:
          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
          }
        case 68: break;
        case 38:
          { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
          }
        case 69: break;
        case 13:
          { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 70: break;
        case 3:
          { positionInc = 1; return CJ;
          }
        case 71: break;
        case 45:
          { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 72: break;
        case 6:
          { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
          }
        case 73: break;
        case 11:
          { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 74: break;
        case 25:
          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
          }
        case 75: break;
        case 8:
          { /* Break so we don't hit fall-through warning: */ break;/* ignore */
          }
        case 76: break;
        case 19:
          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
          }
        case 77: break;
        case 43:
          { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
        case 65: break;
        case 20:
          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 78: break;
        case 42:
          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
        case 66: break;
        case 21:
          { yybegin(STRING); return currentTokType;/*pipe*/
          }
        case 79: break;
        case 30:
          { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
        case 67: break;
        case 22:
          { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;
          }
        case 80: break;
        case 14:
          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
          }
        case 81: break;
        case 9:
          { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
          }
        case 82: break;
        case 7:
          { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
          }
        case 83: break;
        case 41:
          { positionInc = 1; return EMAIL;
          }
        case 84: break;
        case 28:
          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 85: break;
        case 68: break;
        case 23:
          { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;
          }
        case 86: break;
        case 34:
          { positionInc = 1; return HOST;
          }
        case 87: break;
        case 32:
          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 88: break;
        case 12:
          { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
          }
        case 89: break;
        case 69: break;
        case 24:
          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;
          }
        case 90: break;
        case 70: break;
        case 25:
          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;
          }
        case 71: break;
        case 26:
          { yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
          }
        case 72: break;
        case 27:
          { numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
          }
        case 73: break;
        case 28:
          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 74: break;
        case 29:
          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 75: break;
        case 30:
          { yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;
          }
        case 76: break;
        case 31:
          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/
          }
        case 77: break;
        case 32:
          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 78: break;
        case 33:
          { positionInc = 1; return APOSTROPHE;
          }
        case 79: break;
        case 34:
          { positionInc = 1; return HOST;
          }
        case 80: break;
        case 35:
          { positionInc = 1; return NUM;
          }
        case 81: break;
        case 36:
          { positionInc = 1; return COMPANY;
          }
        case 82: break;
        case 37:
          { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 83: break;
        case 38:
          { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/
          }
        case 84: break;
        case 39:
          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/
          }
        case 85: break;
        case 40:
          { positionInc = 1; return ACRONYM;
          }
        case 86: break;
        case 41:
          { positionInc = 1; return EMAIL;
          }
        case 87: break;
        case 42:
          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/
          }
        case 88: break;
        case 43:
          { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
          }
        case 89: break;
        case 44:
          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;
          }
        case 90: break;
        case 45:
          { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 91: break;
        case 26:
          { yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;
        case 46:
          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;
          }
        case 92: break;
        default:
@@ -136,7 +136,7 @@ NUM = ({ALPHANUM} {P} {HAS_DIGIT}
TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"

// punctuation
P = ("_"|"-"|"/"|"."|",")
P = ("_"|"-"|"/"|"."|",")

// at least one digit
HAS_DIGIT =
@@ -43,25 +43,26 @@ import java.lang.reflect.Method;
 * reflection calls (Lovins, etc) use EMPTY_ARGS/EMPTY_PARAMS
 */
public class Among {
  private static final Class<?>[] EMPTY_PARAMS = new Class[0];
  public Among (String s, int substring_i, int result,
                String methodname, SnowballProgram methodobject) {
    this.s_size = s.length();
    this.s = s.toCharArray();
    this.substring_i = substring_i;
    this.result = result;
    this.methodobject = methodobject;
    if (methodname.length() == 0) {
      this.method = null;
    } else {
      try {
        this.method = methodobject.getClass().
            getDeclaredMethod(methodname, EMPTY_PARAMS);
      } catch (NoSuchMethodException e) {
        throw new RuntimeException(e);
      }
    }
  private static final Class<?>[] EMPTY_PARAMS = new Class[0];

  public Among(String s, int substring_i, int result,
               String methodname, SnowballProgram methodobject) {
    this.s_size = s.length();
    this.s = s.toCharArray();
    this.substring_i = substring_i;
    this.result = result;
    this.methodobject = methodobject;
    if (methodname.length() == 0) {
      this.method = null;
    } else {
      try {
        this.method = methodobject.getClass().
            getDeclaredMethod(methodname, EMPTY_PARAMS);
      } catch (NoSuchMethodException e) {
        throw new RuntimeException(e);
      }
    }
  }

  public final int s_size; /* search string */
  public final char[] s; /* search string */
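Each Among entry packs a suffix, a back-pointer to an entry sharing a shorter form (substring_i), a result code, and an optional reflective method hook; an empty method name means no hook. A sketch of the kind of table a Snowball-generated stemmer builds (the suffixes and codes here are invented for illustration):

    // Hypothetical excerpt from a Snowball-generated stemmer class:
    private final Among[] a_0 = {
        new Among("s",   -1, 1, "", this),  // no shorter form (-1), result code 1
        new Among("es",   0, 2, "", this),  // chains to entry 0 if this one fails
        new Among("ies",  1, 3, "", this)   // chains to entry 1
    };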
@@ -51,8 +51,8 @@ public abstract class SnowballProgram {

  protected SnowballProgram()
  {
    current = new char[8];
    setCurrent("");
    current = new char[8];
    setCurrent("");
  }

  public abstract boolean stem();

@@ -62,12 +62,12 @@ public abstract class SnowballProgram {
   */
  public void setCurrent(String value)
  {
    current = value.toCharArray();
    cursor = 0;
    limit = value.length();
    limit_backward = 0;
    bra = cursor;
    ket = limit;
    current = value.toCharArray();
    cursor = 0;
    limit = value.length();
    limit_backward = 0;
    bra = cursor;
    ket = limit;
  }

  /**
@@ -130,354 +130,350 @@ public abstract class SnowballProgram {

  protected void copy_from(SnowballProgram other)
  {
    current = other.current;
    cursor = other.cursor;
    limit = other.limit;
    limit_backward = other.limit_backward;
    bra = other.bra;
    ket = other.ket;
    current = other.current;
    cursor = other.cursor;
    limit = other.limit;
    limit_backward = other.limit_backward;
    bra = other.bra;
    ket = other.ket;
  }

  protected boolean in_grouping(char [] s, int min, int max)
  {
    if (cursor >= limit) return false;
    char ch = current[cursor];
    if (ch > max || ch < min) return false;
    ch -= min;
    if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
    cursor++;
    return true;
    if (cursor >= limit) return false;
    char ch = current[cursor];
    if (ch > max || ch < min) return false;
    ch -= min;
    if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
    cursor++;
    return true;
  }

  protected boolean in_grouping_b(char [] s, int min, int max)
  {
    if (cursor <= limit_backward) return false;
    char ch = current[cursor - 1];
    if (ch > max || ch < min) return false;
    ch -= min;
    if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
    cursor--;
    return true;
    if (cursor <= limit_backward) return false;
    char ch = current[cursor - 1];
    if (ch > max || ch < min) return false;
    ch -= min;
    if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
    cursor--;
    return true;
  }

  protected boolean out_grouping(char [] s, int min, int max)
  {
    if (cursor >= limit) return false;
    char ch = current[cursor];
    if (ch > max || ch < min) {
      cursor++;
      return true;
    }
    ch -= min;
    if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
      cursor ++;
      return true;
    }
    return false;
    if (cursor >= limit) return false;
    char ch = current[cursor];
    if (ch > max || ch < min) {
      cursor++;
      return true;
    }
    ch -= min;
    if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
      cursor ++;
      return true;
    }
    return false;
  }

  protected boolean out_grouping_b(char [] s, int min, int max)
  {
    if (cursor <= limit_backward) return false;
    char ch = current[cursor - 1];
    if (ch > max || ch < min) {
      cursor--;
      return true;
    }
    ch -= min;
    if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
      cursor--;
      return true;
    }
    return false;
    if (cursor <= limit_backward) return false;
    char ch = current[cursor - 1];
    if (ch > max || ch < min) {
      cursor--;
      return true;
    }
    ch -= min;
    if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
      cursor--;
      return true;
    }
    return false;
  }
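The grouping tests above consult a packed bit set: the character is first shifted by min, then s[ch >> 3] selects an 8-bit chunk and (0X1 << (ch & 0X7)) the bit within it. A worked check, assuming an illustrative grouping where min is 'a' and vowels are marked:

    // For ch = 'i':
    //   ch -= min          -> 'i' - 'a' = 8
    //   s[8 >> 3]          -> s[1], the second 8-bit chunk
    //   0X1 << (8 & 0X7)   -> 1 << 0, the lowest bit of that chunk
    // in_grouping succeeds iff that bit is set, i.e. 'i' belongs to the grouping.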

  protected boolean in_range(int min, int max)
  {
    if (cursor >= limit) return false;
    char ch = current[cursor];
    if (ch > max || ch < min) return false;
    cursor++;
    return true;
    if (cursor >= limit) return false;
    char ch = current[cursor];
    if (ch > max || ch < min) return false;
    cursor++;
    return true;
  }

  protected boolean in_range_b(int min, int max)
  {
    if (cursor <= limit_backward) return false;
    char ch = current[cursor - 1];
    if (ch > max || ch < min) return false;
    cursor--;
    return true;
    if (cursor <= limit_backward) return false;
    char ch = current[cursor - 1];
    if (ch > max || ch < min) return false;
    cursor--;
    return true;
  }

  protected boolean out_range(int min, int max)
  {
    if (cursor >= limit) return false;
    char ch = current[cursor];
    if (!(ch > max || ch < min)) return false;
    cursor++;
    return true;
    if (cursor >= limit) return false;
    char ch = current[cursor];
    if (!(ch > max || ch < min)) return false;
    cursor++;
    return true;
  }

  protected boolean out_range_b(int min, int max)
  {
    if (cursor <= limit_backward) return false;
    char ch = current[cursor - 1];
    if(!(ch > max || ch < min)) return false;
    cursor--;
    return true;
    if (cursor <= limit_backward) return false;
    char ch = current[cursor - 1];
    if(!(ch > max || ch < min)) return false;
    cursor--;
    return true;
  }

  protected boolean eq_s(int s_size, CharSequence s)
  {
    if (limit - cursor < s_size) return false;
    int i;
    for (i = 0; i != s_size; i++) {
      if (current[cursor + i] != s.charAt(i)) return false;
    }
    cursor += s_size;
    return true;
    if (limit - cursor < s_size) return false;
    int i;
    for (i = 0; i != s_size; i++) {
      if (current[cursor + i] != s.charAt(i)) return false;
    }
    cursor += s_size;
    return true;
  }

  protected boolean eq_s_b(int s_size, CharSequence s)
  {
    if (cursor - limit_backward < s_size) return false;
    int i;
    for (i = 0; i != s_size; i++) {
      if (current[cursor - s_size + i] != s.charAt(i)) return false;
    }
    cursor -= s_size;
    return true;
    if (cursor - limit_backward < s_size) return false;
    int i;
    for (i = 0; i != s_size; i++) {
      if (current[cursor - s_size + i] != s.charAt(i)) return false;
    }
    cursor -= s_size;
    return true;
  }

  protected boolean eq_v(CharSequence s)
  {
    return eq_s(s.length(), s);
    return eq_s(s.length(), s);
  }

  protected boolean eq_v_b(CharSequence s)
  { return eq_s_b(s.length(), s);
  {
    return eq_s_b(s.length(), s);
  }

  protected int find_among(Among v[], int v_size)
  {
    int i = 0;
    int j = v_size;
    int i = 0;
    int j = v_size;

    int c = cursor;
    int l = limit;
    int c = cursor;
    int l = limit;

    int common_i = 0;
    int common_j = 0;
    int common_i = 0;
    int common_j = 0;

    boolean first_key_inspected = false;
    boolean first_key_inspected = false;

    while(true) {
      int k = i + ((j - i) >> 1);
      int diff = 0;
      int common = common_i < common_j ? common_i : common_j; // smaller
      Among w = v[k];
      int i2;
      for (i2 = common; i2 < w.s_size; i2++) {
        if (c + common == l) {
          diff = -1;
          break;
        }
        diff = current[c + common] - w.s[i2];
        if (diff != 0) break;
        common++;
      }
      if (diff < 0) {
        j = k;
        common_j = common;
      } else {
        i = k;
        common_i = common;
      }
      if (j - i <= 1) {
        if (i > 0) break; // v->s has been inspected
        if (j == i) break; // only one item in v
    while (true) {
      int k = i + ((j - i) >> 1);
      int diff = 0;
      int common = common_i < common_j ? common_i : common_j; // smaller
      Among w = v[k];
      int i2;
      for (i2 = common; i2 < w.s_size; i2++) {
        if (c + common == l) {
          diff = -1;
          break;
        }
        diff = current[c + common] - w.s[i2];
        if (diff != 0) break;
        common++;
      }
      if (diff < 0) {
        j = k;
        common_j = common;
      } else {
        i = k;
        common_i = common;
      }
      if (j - i <= 1) {
        if (i > 0) break; // v->s has been inspected
        if (j == i) break; // only one item in v

        // - but now we need to go round once more to get
        // v->s inspected. This looks messy, but is actually
        // the optimal approach.
        // - but now we need to go round once more to get
        // v->s inspected. This looks messy, but is actually
        // the optimal approach.

        if (first_key_inspected) break;
        first_key_inspected = true;
      }
    }
    while(true) {
      Among w = v[i];
      if (common_i >= w.s_size) {
        cursor = c + w.s_size;
        if (w.method == null) return w.result;
        boolean res;
        try {
          Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
          res = resobj.toString().equals("true");
        } catch (InvocationTargetException e) {
          res = false;
          // FIXME - debug message
        } catch (IllegalAccessException e) {
          res = false;
          // FIXME - debug message
        }
        cursor = c + w.s_size;
        if (res) return w.result;
      }
      i = w.substring_i;
      if (i < 0) return 0;
    }
        if (first_key_inspected) break;
        first_key_inspected = true;
      }
    }
    while (true) {
      Among w = v[i];
      if (common_i >= w.s_size) {
        cursor = c + w.s_size;
        if (w.method == null) return w.result;
        boolean res;
        try {
          Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
          res = resobj.toString().equals("true");
        } catch (InvocationTargetException e) {
          res = false;
          // FIXME - debug message
        } catch (IllegalAccessException e) {
          res = false;
          // FIXME - debug message
        }
        cursor = c + w.s_size;
        if (res) return w.result;
      }
      i = w.substring_i;
      if (i < 0) return 0;
    }
  }
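In short, find_among is a binary search over the sorted Among table that reuses already-matched leading characters (common_i/common_j) instead of re-comparing them, then follows substring_i links to the longest fully matched entry and, if present, fires that entry's reflective predicate. A hedged sketch of how generated stemmers typically call it (names illustrative):

    // Inside a generated stem() method, after positioning the cursor:
    // int among_var = find_among(a_0, a_0.length);  // 0 == no entry matched
    // switch (among_var) {
    //   case 1: slice_del(); break;   // strip the matched suffix
    //   // ...
    // }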

  // find_among_b is for backwards processing. Same comments apply
  // find_among_b is for backwards processing. Same comments apply
  protected int find_among_b(Among v[], int v_size)
  {
    int i = 0;
    int j = v_size;
    int i = 0;
    int j = v_size;

    int c = cursor;
    int lb = limit_backward;
    int c = cursor;
    int lb = limit_backward;

    int common_i = 0;
    int common_j = 0;
    int common_i = 0;
    int common_j = 0;

    boolean first_key_inspected = false;
    boolean first_key_inspected = false;

    while(true) {
      int k = i + ((j - i) >> 1);
      int diff = 0;
      int common = common_i < common_j ? common_i : common_j;
      Among w = v[k];
      int i2;
      for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
        if (c - common == lb) {
          diff = -1;
          break;
        }
        diff = current[c - 1 - common] - w.s[i2];
        if (diff != 0) break;
        common++;
      }
      if (diff < 0) {
        j = k;
        common_j = common;
      } else {
        i = k;
        common_i = common;
      }
      if (j - i <= 1) {
        if (i > 0) break;
        if (j == i) break;
        if (first_key_inspected) break;
        first_key_inspected = true;
      }
    }
    while(true) {
      Among w = v[i];
      if (common_i >= w.s_size) {
        cursor = c - w.s_size;
        if (w.method == null) return w.result;
    while (true) {
      int k = i + ((j - i) >> 1);
      int diff = 0;
      int common = common_i < common_j ? common_i : common_j;
      Among w = v[k];
      int i2;
      for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
        if (c - common == lb) {
          diff = -1;
          break;
        }
        diff = current[c - 1 - common] - w.s[i2];
        if (diff != 0) break;
        common++;
      }
      if (diff < 0) {
        j = k;
        common_j = common;
      } else {
        i = k;
        common_i = common;
      }
      if (j - i <= 1) {
        if (i > 0) break;
        if (j == i) break;
        if (first_key_inspected) break;
        first_key_inspected = true;
      }
    }
    while (true) {
      Among w = v[i];
      if (common_i >= w.s_size) {
        cursor = c - w.s_size;
        if (w.method == null) return w.result;

        boolean res;
        try {
          Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
          res = resobj.toString().equals("true");
        } catch (InvocationTargetException e) {
          res = false;
          // FIXME - debug message
        } catch (IllegalAccessException e) {
          res = false;
          // FIXME - debug message
        }
        cursor = c - w.s_size;
        if (res) return w.result;
      }
      i = w.substring_i;
      if (i < 0) return 0;
    }
        boolean res;
        try {
          Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
          res = resobj.toString().equals("true");
        } catch (InvocationTargetException e) {
          res = false;
          // FIXME - debug message
        } catch (IllegalAccessException e) {
          res = false;
          // FIXME - debug message
        }
        cursor = c - w.s_size;
        if (res) return w.result;
      }
      i = w.substring_i;
      if (i < 0) return 0;
    }
  }

  /* to replace chars between c_bra and c_ket in current by the
  /* to replace chars between c_bra and c_ket in current by the
   * chars in s.
   */
  protected int replace_s(int c_bra, int c_ket, CharSequence s)
  {
    final int adjustment = s.length() - (c_ket - c_bra);
    final int newLength = limit + adjustment;
    //resize if necessary
    if (newLength > current.length) {
      char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
      System.arraycopy(current, 0, newBuffer, 0, limit);
      current = newBuffer;
    }
    // if the substring being replaced is longer or shorter than the
    // replacement, need to shift things around
    if (adjustment != 0 && c_ket < limit) {
      System.arraycopy(current, c_ket, current, c_bra + s.length(),
          limit - c_ket);
    }
    // insert the replacement text
    // Note, faster is s.getChars(0, s.length(), current, c_bra);
    // but would have to duplicate this method for both String and StringBuilder
    for (int i = 0; i < s.length(); i++)
      current[c_bra + i] = s.charAt(i);

    limit += adjustment;
    if (cursor >= c_ket) cursor += adjustment;
    else if (cursor > c_bra) cursor = c_bra;
    return adjustment;
  protected int replace_s(int c_bra, int c_ket, CharSequence s) {
    final int adjustment = s.length() - (c_ket - c_bra);
    final int newLength = limit + adjustment;
    //resize if necessary
    if (newLength > current.length) {
      char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
      System.arraycopy(current, 0, newBuffer, 0, limit);
      current = newBuffer;
    }
    // if the substring being replaced is longer or shorter than the
    // replacement, need to shift things around
    if (adjustment != 0 && c_ket < limit) {
      System.arraycopy(current, c_ket, current, c_bra + s.length(),
          limit - c_ket);
    }
    // insert the replacement text
    // Note, faster is s.getChars(0, s.length(), current, c_bra);
    // but would have to duplicate this method for both String and StringBuilder
    for (int i = 0; i < s.length(); i++)
      current[c_bra + i] = s.charAt(i);

  protected void slice_check()
  {
    if (bra < 0 ||
        bra > ket ||
        ket > limit)
    {
      throw new IllegalArgumentException("faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
      // FIXME: report error somehow.
      /*
      fprintf(stderr, "faulty slice operation:\n");
      debug(z, -1, 0);
      exit(1);
      */
    }
  }
    limit += adjustment;
    if (cursor >= c_ket) cursor += adjustment;
    else if (cursor > c_bra) cursor = c_bra;
    return adjustment;
  }
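A quick worked example of the adjustment arithmetic in replace_s (buffer values invented for illustration):

    // limit = 10, c_bra = 3, c_ket = 6, s = "ab"
    // adjustment = s.length() - (c_ket - c_bra) = 2 - 3 = -1
    // the tail current[6..10) shifts to current[5..9), "ab" lands at [3..5),
    // limit becomes 9, and a cursor at or beyond c_ket moves left with the tail.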

  protected void slice_from(CharSequence s)
  {
    slice_check();
    replace_s(bra, ket, s);
  }

  protected void slice_del()
  {
    slice_from((CharSequence)"");
  protected void slice_check() {
    if (bra < 0 ||
        bra > ket ||
        ket > limit) {
      throw new IllegalArgumentException("faulty slice operation: bra=" + bra + ",ket=" + ket + ",limit=" + limit);
      // FIXME: report error somehow.
      /*
      fprintf(stderr, "faulty slice operation:\n");
      debug(z, -1, 0);
      exit(1);
      */
    }
  }

  protected void insert(int c_bra, int c_ket, CharSequence s)
  protected void slice_from(CharSequence s) {
    slice_check();
    replace_s(bra, ket, s);
  }

  protected void slice_del() {
    slice_from((CharSequence) "");
  }

  protected void insert(int c_bra, int c_ket, CharSequence s)
  {
    int adjustment = replace_s(c_bra, c_ket, s);
    if (c_bra <= bra) bra += adjustment;
    if (c_bra <= ket) ket += adjustment;
    int adjustment = replace_s(c_bra, c_ket, s);
    if (c_bra <= bra) bra += adjustment;
    if (c_bra <= ket) ket += adjustment;
  }

  /* Copy the slice into the supplied StringBuffer */
  protected StringBuilder slice_to(StringBuilder s)
  {
    slice_check();
    int len = ket - bra;
    s.setLength(0);
    s.append(current, bra, len);
    return s;
    slice_check();
    int len = ket - bra;
    s.setLength(0);
    s.append(current, bra, len);
    return s;
  }

  protected StringBuilder assign_to(StringBuilder s)
  {
    s.setLength(0);
    s.append(current, 0, limit);
    return s;
    s.setLength(0);
    s.append(current, 0, limit);
    return s;
  }

  /*
@@ -38,87 +38,87 @@ import org.apache.lucene.analysis.util.CharArraySet;
public class TestBrazilianStemmer extends BaseTokenStreamTestCase {

  public void testWithSnowballExamples() throws Exception {
    check("boa", "boa");
    check("boainain", "boainain");
    check("boas", "boas");
    check("bôas", "boas"); // removes diacritic: different from snowball portuguese
    check("boassu", "boassu");
    check("boataria", "boat");
    check("boate", "boat");
    check("boates", "boat");
    check("boatos", "boat");
    check("bob", "bob");
    check("boba", "bob");
    check("bobagem", "bobag");
    check("bobagens", "bobagens");
    check("bobalhões", "bobalho"); // removes diacritic: different from snowball portuguese
    check("bobear", "bob");
    check("bobeira", "bobeir");
    check("bobinho", "bobinh");
    check("bobinhos", "bobinh");
    check("bobo", "bob");
    check("bobs", "bobs");
    check("boca", "boc");
    check("bocadas", "boc");
    check("bocadinho", "bocadinh");
    check("bocado", "boc");
    check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
    check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
    check("bocarra", "bocarr");
    check("bocas", "boc");
    check("bode", "bod");
    check("bodoque", "bodoqu");
    check("body", "body");
    check("boeing", "boeing");
    check("boem", "boem");
    check("boemia", "boem");
    check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
    check("bogotá", "bogot");
    check("boi", "boi");
    check("bóia", "boi"); // removes diacritic: different from snowball portuguese
    check("boiando", "boi");
    check("quiabo", "quiab");
    check("quicaram", "quic");
    check("quickly", "quickly");
    check("quieto", "quiet");
    check("quietos", "quiet");
    check("quilate", "quilat");
    check("quilates", "quilat");
    check("quilinhos", "quilinh");
    check("quilo", "quil");
    check("quilombo", "quilomb");
    check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
    check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
    check("quilômetro", "quilometr"); // removes diacritic: different from snowball portuguese
    check("quilômetros", "quilometr"); // removes diacritic: different from snowball portuguese
    check("quilos", "quil");
    check("quimica", "quimic");
    check("quilos", "quil");
    check("quimica", "quimic");
    check("quimicas", "quimic");
    check("quimico", "quimic");
    check("quimicos", "quimic");
    check("quimioterapia", "quimioterap");
    check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portuguese
    check("quimono", "quimon");
    check("quincas", "quinc");
    check("quinhão", "quinha"); // removes diacritic: different from snowball portuguese
    check("quinhentos", "quinhent");
    check("quinn", "quinn");
    check("quino", "quin");
    check("quinta", "quint");
    check("quintal", "quintal");
    check("quintana", "quintan");
    check("quintanilha", "quintanilh");
    check("quintão", "quinta"); // removes diacritic: different from snowball portuguese
    check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
    check("quintino", "quintin");
    check("quinto", "quint");
    check("quintos", "quint");
    check("quintuplicou", "quintuplic");
    check("quinze", "quinz");
    check("quinzena", "quinzen");
    check("quiosque", "quiosqu");
    check("boa", "boa");
    check("boainain", "boainain");
    check("boas", "boas");
    check("bôas", "boas"); // removes diacritic: different from snowball portuguese
    check("boassu", "boassu");
    check("boataria", "boat");
    check("boate", "boat");
    check("boates", "boat");
    check("boatos", "boat");
    check("bob", "bob");
    check("boba", "bob");
    check("bobagem", "bobag");
    check("bobagens", "bobagens");
    check("bobalhões", "bobalho"); // removes diacritic: different from snowball portuguese
    check("bobear", "bob");
    check("bobeira", "bobeir");
    check("bobinho", "bobinh");
    check("bobinhos", "bobinh");
    check("bobo", "bob");
    check("bobs", "bobs");
    check("boca", "boc");
    check("bocadas", "boc");
    check("bocadinho", "bocadinh");
    check("bocado", "boc");
    check("bocaiúva", "bocaiuv"); // removes diacritic: different from snowball portuguese
    check("boçal", "bocal"); // removes diacritic: different from snowball portuguese
    check("bocarra", "bocarr");
    check("bocas", "boc");
    check("bode", "bod");
    check("bodoque", "bodoqu");
    check("body", "body");
    check("boeing", "boeing");
    check("boem", "boem");
    check("boemia", "boem");
    check("boêmio", "boemi"); // removes diacritic: different from snowball portuguese
    check("bogotá", "bogot");
    check("boi", "boi");
    check("bóia", "boi"); // removes diacritic: different from snowball portuguese
    check("boiando", "boi");
    check("quiabo", "quiab");
    check("quicaram", "quic");
    check("quickly", "quickly");
    check("quieto", "quiet");
    check("quietos", "quiet");
    check("quilate", "quilat");
    check("quilates", "quilat");
    check("quilinhos", "quilinh");
    check("quilo", "quil");
    check("quilombo", "quilomb");
    check("quilométricas", "quilometr"); // removes diacritic: different from snowball portuguese
    check("quilométricos", "quilometr"); // removes diacritic: different from snowball portuguese
    check("quilômetro", "quilometr"); // removes diacritic: different from snowball portuguese
    check("quilômetros", "quilometr"); // removes diacritic: different from snowball portuguese
    check("quilos", "quil");
    check("quimica", "quimic");
    check("quilos", "quil");
    check("quimica", "quimic");
    check("quimicas", "quimic");
    check("quimico", "quimic");
    check("quimicos", "quimic");
    check("quimioterapia", "quimioterap");
    check("quimioterápicos", "quimioterap"); // removes diacritic: different from snowball portuguese
    check("quimono", "quimon");
    check("quincas", "quinc");
    check("quinhão", "quinha"); // removes diacritic: different from snowball portuguese
    check("quinhentos", "quinhent");
    check("quinn", "quinn");
    check("quino", "quin");
    check("quinta", "quint");
    check("quintal", "quintal");
    check("quintana", "quintan");
    check("quintanilha", "quintanilh");
    check("quintão", "quinta"); // removes diacritic: different from snowball portuguese
    check("quintessência", "quintessente"); // versus snowball portuguese 'quintessent'
    check("quintino", "quintin");
    check("quinto", "quint");
    check("quintos", "quint");
    check("quintuplicou", "quintuplic");
    check("quinze", "quinz");
    check("quinzena", "quinzen");
    check("quiosque", "quiosqu");
  }

  public void testNormalization() throws Exception {

@@ -175,4 +175,4 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
    };
    checkOneTermReuse(a, "", "");
  }
}
}
File diff suppressed because it is too large
@@ -46,7 +46,7 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
    assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ Άψογος, ο μεστός και οι άλλοι",
        new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" });
  }

  public void testReusableTokenStream() throws Exception {
    Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
    // Verify the correct analysis of capitals and small accented letters, and
@@ -31,93 +31,92 @@ import org.apache.lucene.util.Version;

public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {

  public void testAnalyzer() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);

    assertAnalyzesTo(fa, "", new String[] {
    });
  public void testAnalyzer() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);

    assertAnalyzesTo(fa, "", new String[] {
    });

    assertAnalyzesTo(
        fa,
        "chien chat cheval",
        new String[] { "chien", "chat", "cheval" });
    assertAnalyzesTo(
        fa,
        "chien chat cheval",
        new String[] { "chien", "chat", "cheval" });

    assertAnalyzesTo(
        fa,
        "chien CHAT CHEVAL",
        new String[] { "chien", "chat", "cheval" });
    assertAnalyzesTo(
        fa,
        "chien CHAT CHEVAL",
        new String[] { "chien", "chat", "cheval" });

    assertAnalyzesTo(
        fa,
        " chien ,? + = - CHAT /: > CHEVAL",
        new String[] { "chien", "chat", "cheval" });
    assertAnalyzesTo(
        fa,
        " chien ,? + = - CHAT /: > CHEVAL",
        new String[] { "chien", "chat", "cheval" });

    assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
    assertAnalyzesTo(fa, "chien++", new String[] { "chien" });

    assertAnalyzesTo(
        fa,
        "mot \"entreguillemet\"",
        new String[] { "mot", "entreguilemet" });
    assertAnalyzesTo(
        fa,
        "mot \"entreguillemet\"",
        new String[] { "mot", "entreguilemet" });

    // let's do some french specific tests now
    // let's do some french specific tests now
    /* 1. couldn't resist
     I would expect this to stay one term as in French the minus
     sign is often used for composing words */
    assertAnalyzesTo(
        fa,
        "Jean-François",
        new String[] { "jean", "francoi" });

    /* 1. couldn't resist
     I would expect this to stay one term as in French the minus
     sign is often used for composing words */
    assertAnalyzesTo(
        fa,
        "Jean-François",
        new String[] { "jean", "francoi" });
    // 2. stopwords
    assertAnalyzesTo(
        fa,
        "le la chien les aux chat du des à cheval",
        new String[] { "chien", "chat", "cheval" });

    // 2. stopwords
    assertAnalyzesTo(
        fa,
        "le la chien les aux chat du des à cheval",
        new String[] { "chien", "chat", "cheval" });
    // some nouns and adjectives
    assertAnalyzesTo(
        fa,
        "lances chismes habitable chiste éléments captifs",
        new String[] {
            "lanc",
            "chism",
            "habitabl",
            "chist",
            "element",
            "captif" });

    // some nouns and adjectives
    assertAnalyzesTo(
        fa,
        "lances chismes habitable chiste éléments captifs",
        new String[] {
            "lanc",
            "chism",
            "habitabl",
            "chist",
            "element",
            "captif" });
    // some verbs
    assertAnalyzesTo(
        fa,
        "finissions souffrirent rugissante",
        new String[] { "finision", "soufrirent", "rugisant" });

    // some verbs
    assertAnalyzesTo(
        fa,
        "finissions souffrirent rugissante",
        new String[] { "finision", "soufrirent", "rugisant" });
    // some everything else
    // aujourd'hui stays one term which is OK
    assertAnalyzesTo(
        fa,
        "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
        new String[] {
            "c3po",
            "aujourd'hui",
            "oeuf",
            "ïaöuaä",
            "anticonstitutionel",
            "java" });

    // some everything else
    // aujourd'hui stays one term which is OK
    assertAnalyzesTo(
        fa,
        "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
        new String[] {
            "c3po",
            "aujourd'hui",
            "oeuf",
            "ïaöuaä",
            "anticonstitutionel",
            "java" });
    // some more everything else
    // here 1940-1945 stays as one term, 1940:1945 not ?
    assertAnalyzesTo(
        fa,
        "33Bis 1940-1945 1940:1945 (---i+++)*",
        new String[] { "33bi", "1940", "1945", "1940", "1945", "i" });

    // some more everything else
    // here 1940-1945 stays as one term, 1940:1945 not ?
    assertAnalyzesTo(
        fa,
        "33Bis 1940-1945 1940:1945 (---i+++)*",
        new String[] { "33bi", "1940", "1945", "1940", "1945", "i" });

  }

  public void testReusableTokenStream() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
    // stopwords
  }

  public void testReusableTokenStream() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
    // stopwords
    assertAnalyzesToReuse(
        fa,
        "le la chien les aux chat du des à cheval",

@@ -134,7 +133,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
            "chist",
            "element",
            "captif" });
  }
}

  public void testExclusionTableViaCtor() throws Exception {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
@@ -32,37 +32,37 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
 * HyphenatedWordsFilter test
 */
public class TestHyphenatedWordsFilter extends BaseTokenStreamTestCase {
  public void testHyphenatedWords() throws Exception {
    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
    // first test
    TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    ts = new HyphenatedWordsFilter(ts);
    assertTokenStreamContents(ts,
        new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
  }

  /**
   * Test that HyphenatedWordsFilter behaves correctly with a final hyphen
   */
  public void testHyphenAtEnd() throws Exception {
    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
    // first test
    TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    ts = new HyphenatedWordsFilter(ts);
    assertTokenStreamContents(ts,
        new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
  }

  public void testOffsets() throws Exception {
    String input = "abc- def geh 1234- 5678-";
  public void testHyphenatedWords() throws Exception {
    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
    // first test
    TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    ts = new HyphenatedWordsFilter(ts);
    assertTokenStreamContents(ts,
        new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
  }

  /**
   * Test that HyphenatedWordsFilter behaves correctly with a final hyphen
   */
  public void testHyphenAtEnd() throws Exception {
    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
    // first test
    TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    ts = new HyphenatedWordsFilter(ts);
    assertTokenStreamContents(ts,
        new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
  }

  public void testOffsets() throws Exception {
    String input = "abc- def geh 1234- 5678-";
    TokenStream ts = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    ts = new HyphenatedWordsFilter(ts);
    assertTokenStreamContents(ts,
        new String[] { "abcdef", "geh", "12345678-" },
        new int[] { 0, 9, 13 },
        new int[] { 8, 12, 24 });
  }

}

  /** blast some random strings through the analyzer */
  public void testRandomString() throws Exception {
    Analyzer a = new Analyzer() {
@@ -34,83 +34,83 @@ import org.apache.lucene.util.Version;
public class TestDutchStemmer extends BaseTokenStreamTestCase {

public void testWithSnowballExamples() throws Exception {
check("lichaamsziek", "lichaamsziek");
check("lichamelijk", "licham");
check("lichamelijke", "licham");
check("lichamelijkheden", "licham");
check("lichamen", "licham");
check("lichere", "licher");
check("licht", "licht");
check("lichtbeeld", "lichtbeeld");
check("lichtbruin", "lichtbruin");
check("lichtdoorlatende", "lichtdoorlat");
check("lichte", "licht");
check("lichten", "licht");
check("lichtende", "lichtend");
check("lichtenvoorde", "lichtenvoord");
check("lichter", "lichter");
check("lichtere", "lichter");
check("lichters", "lichter");
check("lichtgevoeligheid", "lichtgevoel");
check("lichtgewicht", "lichtgewicht");
check("lichtgrijs", "lichtgrijs");
check("lichthoeveelheid", "lichthoevel");
check("lichtintensiteit", "lichtintensiteit");
check("lichtje", "lichtj");
check("lichtjes", "lichtjes");
check("lichtkranten", "lichtkrant");
check("lichtkring", "lichtkring");
check("lichtkringen", "lichtkring");
check("lichtregelsystemen", "lichtregelsystem");
check("lichtste", "lichtst");
check("lichtstromende", "lichtstrom");
check("lichtte", "licht");
check("lichtten", "licht");
check("lichttoetreding", "lichttoetred");
check("lichtverontreinigde", "lichtverontreinigd");
check("lichtzinnige", "lichtzinn");
check("lid", "lid");
check("lidia", "lidia");
check("lidmaatschap", "lidmaatschap");
check("lidstaten", "lidstat");
check("lidvereniging", "lidveren");
check("opgingen", "opging");
check("opglanzing", "opglanz");
check("opglanzingen", "opglanz");
check("opglimlachten", "opglimlacht");
check("opglimpen", "opglimp");
check("opglimpende", "opglimp");
check("opglimping", "opglimp");
check("opglimpingen", "opglimp");
check("opgraven", "opgrav");
check("opgrijnzen", "opgrijnz");
check("opgrijzende", "opgrijz");
check("opgroeien", "opgroei");
check("opgroeiende", "opgroei");
check("opgroeiplaats", "opgroeiplat");
check("ophaal", "ophal");
check("ophaaldienst", "ophaaldienst");
check("ophaalkosten", "ophaalkost");
check("ophaalsystemen", "ophaalsystem");
check("ophaalt", "ophaalt");
check("ophaaltruck", "ophaaltruck");
check("ophalen", "ophal");
check("ophalend", "ophal");
check("ophalers", "ophaler");
check("ophef", "ophef");
check("opheldering", "ophelder");
check("ophemelde", "ophemeld");
check("ophemelen", "ophemel");
check("opheusden", "opheusd");
check("ophief", "ophief");
check("ophield", "ophield");
check("ophieven", "ophiev");
check("ophoepelt", "ophoepelt");
check("ophoog", "ophog");
check("ophoogzand", "ophoogzand");
check("ophopen", "ophop");
check("ophoping", "ophop");
check("ophouden", "ophoud");
}

public void testSnowballCorrectness() throws Exception {

@@ -171,4 +171,4 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
checkRandomData(random(), new DutchAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}

}

@@ -37,7 +37,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TestPatternTokenizer extends BaseTokenStreamTestCase
{
public void testSplitting() throws Exception
{
String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
String[][] tests = {

@@ -71,8 +71,8 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
}
}*/
}

}

public void testOffsetCorrection() throws Exception {
final String INPUT = "Günther Günther is here";

@@ -38,31 +38,31 @@ import org.apache.lucene.util.Version;
*/

public class TestThaiAnalyzer extends BaseTokenStreamTestCase {

@Override
public void setUp() throws Exception {
super.setUp();
assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE);
}
/*
* testcase for offsets
*/
public void testOffsets() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "การที่ได้ต้องแสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
}

public void testStopWords() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "การที่ได้ต้องแสดงว่างานดี",
new String[] { "แสดง", "งาน", "ดี" },
new int[] { 13, 20, 23 },
new int[] { 17, 23, 25 },
new int[] { 5, 2, 1 });
}

public void testTokenType() throws Exception {
assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
new String[] { "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",

@@ -70,31 +70,31 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
"<NUM>" });
}

/*
* Test that position increments are adjusted correctly for stopwords.
*/
// note this test uses stopfilter's stopset
public void testPositionIncrements() throws Exception {
final ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
assertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });

// case that a stopword is adjacent to thai text, with no whitespace
assertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
}

public void testReusableTokenStream() throws Exception {
ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET);
assertAnalyzesToReuse(analyzer, "", new String[] {});

assertAnalyzesToReuse(
analyzer,

@@ -105,8 +105,8 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
analyzer,
"บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
}

/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new ThaiAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);

@@ -46,8 +46,8 @@ public class TestWordlistLoader extends LuceneTestCase {

private void checkSet(CharArraySet wordset) {
assertEquals(3, wordset.size());
assertTrue(wordset.contains("ONE")); // case is not modified
assertTrue(wordset.contains("two")); // surrounding whitespace is removed
assertTrue(wordset.contains("three"));
assertFalse(wordset.contains("four"));
}

@@ -292,12 +292,12 @@ public final class JapaneseTokenizer extends Tokenizer {
if (!characterDefinition.isKanji((char) buffer.get(pos2))) {
allKanji = false;
break;
}
}
if (allKanji) { // Process only Kanji keywords
return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
} else if (length > SEARCH_MODE_OTHER_LENGTH) {
return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
}
}
return 0;

@@ -807,7 +807,7 @@ public final class JapaneseTokenizer extends Tokenizer {
}
if (characterId == characterDefinition.getCharacterClass((char) ch) &&
isPunctuation((char) ch) == isPunct) {
unknownWordLength++;
} else {
break;
}

@@ -150,7 +150,7 @@ public abstract class BinaryDictionary implements Dictionary {
ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
}

@Override
public int getLeftId(int wordId) {
return buffer.getShort(wordId) >>> 3;
}

@@ -162,7 +162,7 @@ public abstract class BinaryDictionary implements Dictionary {

@Override
public int getWordCost(int wordId) {
return buffer.getShort(wordId + 2); // Skip id
}

@Override

@@ -28,21 +28,21 @@ public interface Dictionary {
/**
* Get left id of specified word
* @param wordId
* @return left id
*/
public int getLeftId(int wordId);

/**
* Get right id of specified word
* @param wordId
* @return right id
*/
public int getRightId(int wordId);

/**
* Get word cost of specified word
* @param wordId
* @return word cost
*/
public int getWordCost(int wordId);

@@ -40,7 +40,7 @@ public final class UnknownDictionary extends BinaryDictionary {
int length = 1;
for (int i = 1; i < len; i++) {
if (characterIdOfFirstCharacter == characterDefinition.getCharacterClass(text[offset+i])){
length++;
} else {
break;
}

@@ -246,7 +246,7 @@ public final class UserDictionary implements Dictionary {
return null;
}

return allFeatures.split(INTERNAL_SEPARATOR);
}

@@ -261,7 +261,7 @@ public final class UserDictionary implements Dictionary {
sb.append(CSVUtil.quoteEscape(feature)).append(",");
}
} else if (fields.length == 1) { // One feature doesn't need to escape value
sb.append(allFeatures[fields[0]]).append(",");
} else {
for (int field : fields){
sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");

@@ -42,7 +42,7 @@ public final class CSVUtil {
*/
public static String[] parse(String line) {
boolean insideQuote = false;
ArrayList<String> result = new ArrayList<String>();
int quoteCount = 0;
StringBuilder sb = new StringBuilder();
for(int i = 0; i < line.length(); i++) {

@@ -326,12 +326,12 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {

public void testSegmentation() throws Exception {
// Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
// String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
// String[] surfaceForms = {
// "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
// "スペース", "ステーション", "に", "行き", "ます", "。",
// "うたがわしい", "。"
// };
String input = "スペースステーションに行きます。うたがわしい。";
String[] surfaceForms = {
"スペース", "ステーション", "に", "行き", "ます", "。",

@@ -75,6 +75,6 @@ public class UserDictionaryTest extends LuceneTestCase {
@Test
public void testRead() throws IOException {
UserDictionary dictionary = TestJapaneseTokenizer.readDict();
assertNotNull(dictionary);
}
}

@@ -174,26 +174,26 @@ public class TokenInfoDictionaryBuilder {
/*
* IPADIC features
*
* 0 - surface
* 1 - left cost
* 2 - right cost
* 3 - word cost
* 4-9 - pos
* 10 - base form
* 11 - reading
* 12 - pronunciation
*
* UniDic features
*
* 0 - surface
* 1 - left cost
* 2 - right cost
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
* 11 - base form
* 12 - surface form
* 13 - surface reading
*/

public String[] formatEntry(String[] features) {

@@ -221,7 +221,7 @@ public class TokenInfoDictionaryBuilder {
} else {
features2[11] = features[13];
features2[12] = features[13];
}
}
return features2;
}

@@ -107,22 +107,22 @@ public class UnknownDictionaryBuilder {
continue;
}

if(line.startsWith("0x")) { // Category mapping
String[] values = line.split(" ", 2); // Split only first space

if(!values[0].contains("..")) {
int cp = Integer.decode(values[0]).intValue();
dictionary.putCharacterCategory(cp, values[1]);
} else {
String[] codePoints = values[0].split("\\.\\.");
int cpFrom = Integer.decode(codePoints[0]).intValue();
int cpTo = Integer.decode(codePoints[1]).intValue();

for(int i = cpFrom; i <= cpTo; i++){
dictionary.putCharacterCategory(i, values[1]);
}
}
} else { // Invoke definition
String[] values = line.split(" "); // Consecutive space is merged above
String characterClassName = values[0];
int invoke = Integer.parseInt(values[1]);
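
The category-mapping branch above accepts either a single code point ("0x30A2 KATAKANA") or an inclusive range ("0x4E00..0x9FFF KANJI"). A minimal standalone sketch of that parse, using a made-up char.def-style sample line rather than a real dictionary entry:

// Sketch of the "0x..." category-mapping parse above; the sample line is hypothetical.
public class CategoryLineSketch {
  public static void main(String[] args) {
    String line = "0x4E00..0x9FFF KANJI";
    String[] values = line.split(" ", 2); // split only on the first space
    if (values[0].contains("..")) { // inclusive code-point range
      String[] codePoints = values[0].split("\\.\\.");
      int cpFrom = Integer.decode(codePoints[0]).intValue();
      int cpTo = Integer.decode(codePoints[1]).intValue();
      System.out.println(values[1] + ": " + cpFrom + ".." + cpTo); // KANJI: 19968..40959
    } else {
      System.out.println(values[1] + ": " + Integer.decode(values[0]).intValue());
    }
  }
}
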
@@ -175,23 +175,23 @@ public class TestPhoneticFilterFactory extends BaseTokenStreamTestCase {
}

public void testSpeed() throws Exception {
checkSpeedEncoding("Metaphone", "easgasg", "ESKS");
checkSpeedEncoding("DoubleMetaphone", "easgasg", "ASKS");
checkSpeedEncoding("Soundex", "easgasg", "E220");
checkSpeedEncoding("RefinedSoundex", "easgasg", "E034034");
checkSpeedEncoding("Caverphone", "Carlene", "KLN1111111");
checkSpeedEncoding("ColognePhonetic", "Schmitt", "862");
}

private void checkSpeedEncoding(String encoder, String toBeEncoded, String estimated) throws Exception {
long start = System.currentTimeMillis();
for ( int i=0; i<REPEATS; i++) {
assertAlgorithm(encoder, "false", toBeEncoded,
new String[] { estimated });
}
long duration = System.currentTimeMillis()-start;
if (VERBOSE)
System.out.println(encoder + " encodings per msec: "+(REPEATS/duration));
}

}

@@ -115,7 +115,7 @@ abstract class AbstractDictionary {
}
int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161
int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
// Therefore, each code page only has 16*6-2=94 characters.
return (short) (b0 * 94 + b1);
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
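
For reference, the arithmetic above maps a two-byte GB2312 code to a dense cell index: rows and cells both start at 0xA1, and each row holds 94 usable cells. A tiny self-contained sketch, with an assumed input code (0xB0A1, the first cell of the first hanzi row) for illustration:

// Standalone sketch of the GB2312 cell-index arithmetic above.
// The example bytes (0xB0, 0xA1) are assumptions chosen for illustration.
public class Gb2312IndexSketch {
  static short cellIndex(byte[] buffer) {
    int b0 = (buffer[0] & 0xFF) - 161; // row byte, rows start at 0xA1
    int b1 = (buffer[1] & 0xFF) - 161; // cell byte, cells start at 0xA1
    return (short) (b0 * 94 + b1);     // 94 usable cells per row
  }
  public static void main(String[] args) {
    // 0xB0 -> b0 = 15, 0xA1 -> b1 = 0, index = 15 * 94 + 0 = 1410
    System.out.println(cellIndex(new byte[] { (byte) 0xB0, (byte) 0xA1 }));
  }
}
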
@@ -37,43 +37,43 @@ import org.apache.commons.compress.compressors.CompressorStreamFactory;
*/
public class StreamUtils {

/** Buffer size used across the benchmark package */
public static final int BUFFER_SIZE = 1 << 16; // 64K

/** File format type */
public enum Type {
/** BZIP2 is automatically used for <b>.bz2</b> and <b>.bzip2</b> extensions. */
BZIP2(CompressorStreamFactory.BZIP2),
/** GZIP is automatically used for <b>.gz</b> and <b>.gzip</b> extensions. */
GZIP(CompressorStreamFactory.GZIP),
/** Plain text is used for anything which is not GZIP or BZIP. */
PLAIN(null);
private final String csfType;
Type(String csfType) {
this.csfType = csfType;
}
private InputStream inputStream(InputStream in) throws IOException {
try {
return csfType==null ? in : new CompressorStreamFactory().createCompressorInputStream(csfType, in);
} catch (CompressorException e) {
IOException ioe = new IOException(e.getMessage());
ioe.initCause(e);
throw ioe;
}
}
private OutputStream outputStream(OutputStream os) throws IOException {
try {
return csfType==null ? os : new CompressorStreamFactory().createCompressorOutputStream(csfType, os);
} catch (CompressorException e) {
IOException ioe = new IOException(e.getMessage());
ioe.initCause(e);
throw ioe;
}
}
}

private static final Map<String,Type> extensionToType = new HashMap<String,Type>();
static {
// these are in lower case, we will lower case at the test as well
extensionToType.put(".bz2", Type.BZIP2);
extensionToType.put(".bzip", Type.BZIP2);
extensionToType.put(".gz", Type.GZIP);

@@ -95,14 +95,14 @@ public class StreamUtils {

/** Return the type of the file, or null if unknown */
private static Type fileType(File file) {
Type type = null;
String fileName = file.getName();
int idx = fileName.lastIndexOf('.');
if (idx != -1) {
type = extensionToType.get(fileName.substring(idx).toLowerCase(Locale.ROOT));
}
return type==null ? Type.PLAIN : type;
}

/**
* Returns an {@link OutputStream} over the requested file, identifying
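
The extension map drives StreamUtils' format detection: the lower-cased file extension selects BZIP2, GZIP, or PLAIN, with PLAIN as the fallback. A rough self-contained sketch of the same lookup (the file names are invented, and StreamUtils itself keeps this logic private):

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

// Sketch of extension-based type detection, mirroring fileType() above.
public class FileTypeSketch {
  enum Type { BZIP2, GZIP, PLAIN }
  private static final Map<String, Type> EXT = new HashMap<String, Type>();
  static {
    EXT.put(".bz2", Type.BZIP2);
    EXT.put(".bzip", Type.BZIP2);
    EXT.put(".gz", Type.GZIP);
    EXT.put(".gzip", Type.GZIP);
  }
  static Type fileType(String fileName) {
    int idx = fileName.lastIndexOf('.');
    Type t = idx == -1 ? null : EXT.get(fileName.substring(idx).toLowerCase(Locale.ROOT));
    return t == null ? Type.PLAIN : t; // anything unrecognized is treated as plain text
  }
  public static void main(String[] args) {
    System.out.println(fileType("docs.wiki.BZ2")); // BZIP2
    System.out.println(fileType("docs.txt"));      // PLAIN
  }
}
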
@@ -157,16 +157,16 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
String expDate, String expBody) throws Exception {
InputStream in = new FileInputStream(file);
switch(fileType) {
case BZIP2:
in = csFactory.createCompressorInputStream(CompressorStreamFactory.BZIP2, in);
break;
case GZIP:
in = csFactory.createCompressorInputStream(CompressorStreamFactory.GZIP, in);
break;
case PLAIN:
break; // nothing to do
default:
assertFalse("Unknown file type!",true); //fail, should not happen
}
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
try {

@@ -57,38 +57,38 @@ public class StreamUtilsTest extends BenchmarkTestCase {

@Test
public void testGetInputStreamBzip2() throws Exception {
assertReadText(rawBzip2File("bz2"));
assertReadText(rawBzip2File("bzip"));
assertReadText(rawBzip2File("BZ2"));
assertReadText(rawBzip2File("BZIP"));
}

@Test
public void testGetOutputStreamBzip2() throws Exception {
assertReadText(autoOutFile("bz2"));
assertReadText(autoOutFile("bzip"));
assertReadText(autoOutFile("BZ2"));
assertReadText(autoOutFile("BZIP"));
}

@Test
public void testGetOutputStreamGzip() throws Exception {
assertReadText(autoOutFile("gz"));
assertReadText(autoOutFile("gzip"));
assertReadText(autoOutFile("GZ"));
assertReadText(autoOutFile("GZIP"));
}

@Test
public void testGetOutputStreamPlain() throws Exception {
assertReadText(autoOutFile("txt"));
assertReadText(autoOutFile("text"));
assertReadText(autoOutFile("TXT"));
assertReadText(autoOutFile("TEXT"));
}

private File rawTextFile(String ext) throws Exception {
File f = new File(testDir,"testfile." + ext);
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f), IOUtils.CHARSET_UTF_8));
w.write(TEXT);
w.newLine();

@@ -97,32 +97,32 @@ public class StreamUtilsTest extends BenchmarkTestCase {
}

private File rawGzipFile(String ext) throws Exception {
File f = new File(testDir,"testfile." + ext);
OutputStream os = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, new FileOutputStream(f));
writeText(os);
return f;
}

private File rawBzip2File(String ext) throws Exception {
File f = new File(testDir,"testfile." + ext);
OutputStream os = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(f));
writeText(os);
return f;
}

private File autoOutFile(String ext) throws Exception {
File f = new File(testDir,"testfile." + ext);
OutputStream os = StreamUtils.outputStream(f);
writeText(os);
return f;
}

private void writeText(OutputStream os) throws IOException {
BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os, IOUtils.CHARSET_UTF_8));
w.write(TEXT);
w.newLine();
w.close();
}

private void assertReadText(File f) throws Exception {
InputStream ir = StreamUtils.inputStream(f);

@@ -170,7 +170,7 @@ public class FixedGapTermsIndexReader extends TermsIndexReaderBase {

@Override
public long seek(BytesRef target) {
int lo = 0; // binary search
int hi = fieldIndex.numIndexTerms - 1;
assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;

@@ -163,7 +163,7 @@ final class BitVector implements Cloneable, MutableBits {
int c = 0;
int end = bits.length;
for (int i = 0; i < end; i++) {
c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
}
count = c;
}

@@ -176,12 +176,12 @@ final class BitVector implements Cloneable, MutableBits {
int c = 0;
int end = bits.length;
for (int i = 0; i < end; i++) {
c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
}
return c;
}

private static final byte[] BYTE_COUNTS = { // table of bits/byte
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
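
BYTE_COUNTS is a classic byte-wide popcount table: entry i holds the number of set bits in byte value i, so counting the bits of a whole bit vector costs one array lookup per byte. A minimal sketch of the same technique, building the table programmatically instead of hard-coding it:

// Sketch of table-driven popcount, the technique behind BYTE_COUNTS above.
public class PopcountSketch {
  private static final byte[] COUNTS = new byte[256];
  static {
    for (int i = 0; i < 256; i++) {
      COUNTS[i] = (byte) Integer.bitCount(i); // same values the hard-coded table stores
    }
  }
  static int count(byte[] bits) {
    int c = 0;
    for (byte b : bits) {
      c += COUNTS[b & 0xFF]; // sum bits per byte
    }
    return c;
  }
  public static void main(String[] args) {
    System.out.println(count(new byte[] { (byte) 0xFF, 0x0B })); // 8 + 3 = 11
  }
}
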
@@ -1672,7 +1672,7 @@ public class CheckIndex {
" times, to check more than one segment, eg '-segment _2 -segment _a'.\n" +
" You can't use this with the -fix option\n" +
" -dir-impl X: use a specific " + FSDirectory.class.getSimpleName() + " implementation. " +
"If no package is specified the " + FSDirectory.class.getPackage().getName() + " package will be used.\n" +
"\n" +
"**WARNING**: -fix should only be used on an emergency basis as it will cause\n" +
"documents (perhaps many) to be permanently removed from the index. Always make\n" +

@@ -396,11 +396,11 @@ final class DocumentsWriterFlushControl {
return flushingWriters.size();
}

public boolean doApplyAllDeletes() {
return flushDeletes.getAndSet(false);
}

public void setApplyAllDeletes() {
flushDeletes.set(true);
}

@@ -571,7 +571,7 @@ final class IndexFileDeleter {
infoStream.message("IFD", "delete \"" + fileName + "\"");
}
directory.deleteFile(fileName);
} catch (IOException e) { // if delete fails
if (directory.fileExists(fileName)) {

// Some operating systems (e.g. Windows) don't

@@ -2847,7 +2847,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
final boolean anySegmentFlushed;

synchronized (fullFlushLock) {
boolean flushSuccess = false;
try {
anySegmentFlushed = docWriter.flushAllThreads();
flushSuccess = true;

@@ -42,9 +42,9 @@ public final class SegmentInfo {
public static final int NO = -1; // e.g. no norms; no deletes;
public static final int YES = 1; // e.g. have norms; have deletes;

public final String name; // unique name in dir
private int docCount; // number of docs in seg
public final Directory dir; // where segment resides

private boolean isCompoundFile;

@@ -404,7 +404,7 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
public Query rewrite(IndexReader reader) throws IOException {
if (minNrShouldMatch == 0 && clauses.size() == 1) { // optimize 1-clause queries
BooleanClause c = clauses.get(0);
if (!c.isProhibited()) { // just return clause

Query query = c.getQuery().rewrite(reader); // rewrite first

@@ -475,7 +475,7 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {

Query subQuery = c.getQuery();
if (subQuery != null) {
if (subQuery instanceof BooleanQuery) { // wrap sub-bools in parens
buffer.append("(");
buffer.append(subQuery.toString(field));
buffer.append(")");

@@ -508,7 +508,7 @@ public interface FieldCache {
// this special case is the reason that Arrays.binarySearch() isn't useful.
if (key == null)
return 0;

int low = 1;
int high = numOrd()-1;

@@ -52,7 +52,7 @@ import org.apache.lucene.util.packed.PackedInts;
* @since   lucene 1.4
*/
class FieldCacheImpl implements FieldCache {

private Map<Class<?>,Cache> caches;
FieldCacheImpl() {
init();

@@ -173,7 +173,7 @@ class FieldCacheImpl implements FieldCache {
((AtomicReader)key).addReaderClosedListener(purgeReader);
} else {
// last chance
reader.addReaderClosedListener(purgeReader);
}
}
}

@@ -28,7 +28,7 @@ import java.io.IOException;
public abstract class FilteredDocIdSetIterator extends DocIdSetIterator {
protected DocIdSetIterator _innerIter;
private int doc;

/**
* Constructor.
* @param innerIter Underlying DocIdSetIterator.

@@ -40,7 +40,7 @@ public abstract class FilteredDocIdSetIterator extends DocIdSetIterator {
_innerIter = innerIter;
doc = -1;
}

/**
* Validation method to determine whether a docid should be in the result set.
* @param doc docid to be tested

@@ -48,7 +48,7 @@ public abstract class FilteredDocIdSetIterator extends DocIdSetIterator {
* @see #FilteredDocIdSetIterator(DocIdSetIterator)
*/
protected abstract boolean match(int doc);

@Override
public int docID() {
return doc;

@@ -110,7 +110,7 @@ public class MultiPhraseQuery extends Query {
* Do not modify the List or its contents.
*/
public List<Term[]> getTermArrays() {
return Collections.unmodifiableList(termArrays);
}

/**

@@ -24,13 +24,13 @@ import org.apache.lucene.index.*;
* Position of a term in a document that takes into account the term offset within the phrase.
*/
final class PhrasePositions {
int doc; // current doc
int position; // position in doc
int count; // remaining pos in this doc
int offset; // position in phrase
final int ord; // unique across all PhrasePositions instances
final DocsAndPositionsEnum postings; // stream of docs & positions
PhrasePositions next; // used to make lists
int rptGroup = -1; // >=0 indicates that this is a repeating PP
int rptInd; // index in the rptGroup
final Term[] terms; // for repetitions initialization

@@ -42,7 +42,7 @@ final class PhrasePositions {
this.terms = terms;
}

final boolean next() throws IOException { // increments to next doc
doc = postings.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
return false;

@@ -59,7 +59,7 @@ final class PhrasePositions {
}

final void firstPosition() throws IOException {
count = postings.freq(); // read first pos
nextPosition();
}

@@ -70,7 +70,7 @@ final class PhrasePositions {
* have exactly the same <code>position</code>.
*/
final boolean nextPosition() throws IOException {
if (count-- > 0) { // read subsequent pos's
position = postings.nextPosition() - offset;
return true;
} else
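
The offset field above is what aligns a multi-term phrase: each term's document position is normalized by subtracting the term's position within the phrase (position = postings.nextPosition() - offset), so an exact match is simply all terms agreeing on the normalized position. A toy illustration with assumed positions:

// Toy illustration of the position-minus-offset trick used by PhrasePositions.
public class PhraseAlignSketch {
  // true if every term's (document position - phrase offset) agrees
  static boolean exactMatch(int[] docPositions, int[] phraseOffsets) {
    int base = docPositions[0] - phraseOffsets[0];
    for (int i = 1; i < docPositions.length; i++) {
      if (docPositions[i] - phraseOffsets[i] != base) return false;
    }
    return true;
  }
  public static void main(String[] args) {
    // "new york": "new" at doc position 7 (offset 0), "york" at 8 (offset 1)
    System.out.println(exactMatch(new int[] { 7, 8 }, new int[] { 0, 1 })); // true
    System.out.println(exactMatch(new int[] { 7, 9 }, new int[] { 0, 1 })); // false
  }
}
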
@@ -281,7 +281,7 @@ public class PhraseQuery extends Query {
ArrayUtil.mergeSort(postingsFreqs);
}

if (slop == 0) { // optimize exact case
ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactSimScorer(stats, context));
if (s.noDocs) {
return null;

@@ -24,12 +24,12 @@ package org.apache.lucene.search.payloads;
public class MinPayloadFunction extends PayloadFunction {

@Override
public float currentScore(int docId, String field, int start, int end, int numPayloadsSeen, float currentScore, float currentPayloadScore) {
if (numPayloadsSeen == 0) {
return currentPayloadScore;
} else {
return Math.min(currentPayloadScore, currentScore);
}
}

@Override

@@ -56,10 +56,10 @@ public abstract class PayloadFunction {
public abstract float docScore(int docId, String field, int numPayloadsSeen, float payloadScore);

public Explanation explain(int docId, String field, int numPayloadsSeen, float payloadScore){
Explanation result = new Explanation();
result.setDescription(getClass().getSimpleName() + ".docScore()");
result.setValue(docScore(docId, field, numPayloadsSeen, payloadScore));
return result;
};

@Override

@@ -257,7 +257,7 @@ public class PayloadNearQuery extends SpanNearQuery {
getPayloads(spansArr);
more = spans.next();
} while (more && (doc == spans.doc()));
return true;
}

@Override

@@ -117,7 +117,7 @@ public class NearSpansOrdered extends Spans {
public int end() { return matchEnd; }

public Spans[] getSubSpans() {
return subSpans;
}

// TODO: Remove warning after API has been finalized

@@ -151,7 +151,7 @@ public class NearSpansUnordered extends Spans {
}
}
public Spans[] getSubSpans() {
return subSpans;
}
@Override
public boolean next() throws IOException {

@@ -286,7 +286,7 @@ public class NearSpansUnordered extends Spans {
}

private void addToList(SpansCell cell) {
if (last != null) { // add next to end of list
last.next = cell;
} else
first = cell;

@@ -295,7 +295,7 @@ public class NearSpansUnordered extends Spans {
}

private void firstToLast() {
last.next = first; // move first to end of list
last = first;
first = first.next;
last.next = null;

@@ -92,9 +92,9 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {

@Override
public void extractTerms(Set<Term> terms) {
for (final SpanQuery clause : clauses) {
clause.extractTerms(terms);
}
}

@@ -57,7 +57,7 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea

@Override
public void extractTerms(Set<Term> terms) {
match.extractTerms(terms);
}

/**

@@ -186,4 +186,4 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
}

}
}

@@ -34,7 +34,7 @@ public abstract class Spans {
*   boolean skipTo(int target) {
*     do {
*       if (!next())
*         return false;
*     } while (target > doc());
*     return true;
*   }

@@ -41,9 +41,9 @@ public abstract class BufferedIndexInput extends IndexInput {

protected byte[] buffer;

private long bufferStart = 0; // position in file of buffer
private int bufferLength = 0; // end of valid bytes
private int bufferPosition = 0; // next byte to read

@Override
public final byte readByte() throws IOException {

@@ -259,7 +259,7 @@ public abstract class BufferedIndexInput extends IndexInput {
private void refill() throws IOException {
long start = bufferStart + bufferPosition;
long end = start + bufferSize;
if (end > length()) // don't read past EOF
end = length();
int newLength = (int)(end - start);
if (newLength <= 0)

@@ -294,7 +294,7 @@ public abstract class BufferedIndexInput extends IndexInput {
else {
bufferStart = pos;
bufferPosition = 0;
bufferLength = 0; // trigger refill() on read()
seekInternal(pos);
}
}

@@ -135,7 +135,7 @@ public abstract class Lock {
return doBody();
} finally {
if (locked)
lock.release();
}
}
}

@@ -26,7 +26,7 @@ import org.apache.lucene.LucenePackage;
**/

public final class Constants {
private Constants() {} // can't construct

/** JVM vendor info. */
public static final String JVM_VENDOR = System.getProperty("java.vm.vendor");

@@ -177,11 +177,11 @@ public abstract class PriorityQueue<T> {
time. */
public final T pop() {
if (size > 0) {
T result = heap[1]; // save first value
heap[1] = heap[size]; // move last to first
heap[size] = null; // permit GC of objects
size--;
downHeap(); // adjust heap
return result;
} else
return null;

@@ -226,26 +226,26 @@ public abstract class PriorityQueue<T> {

private final void upHeap() {
int i = size;
T node = heap[i]; // save bottom node
int j = i >>> 1;
while (j > 0 && lessThan(node, heap[j])) {
heap[i] = heap[j]; // shift parents down
i = j;
j = j >>> 1;
}
heap[i] = node; // install saved node
}

private final void downHeap() {
int i = 1;
T node = heap[i]; // save top node
int j = i << 1; // find smaller child
int k = j + 1;
if (k <= size && lessThan(heap[k], heap[j])) {
j = k;
}
while (j <= size && lessThan(heap[j], node)) {
heap[i] = heap[j]; // shift up child
i = j;
j = i << 1;
k = j + 1;

@@ -253,7 +253,7 @@ public abstract class PriorityQueue<T> {
j = k;
}
}
heap[i] = node; // install saved node
}

/** This method returns the internal heap array as Object[].
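
pop(), upHeap() and downHeap() above are the textbook 1-based binary-heap operations: the parent of slot i is i >>> 1 and its children are i << 1 and (i << 1) + 1. A compact sketch of the same sift-up invariant over a plain int heap, with min-heap ordering assumed in place of the abstract lessThan():

// Sketch of the 1-based sift-up used by upHeap() above (slot 0 unused).
public class HeapSketch {
  static boolean lessThan(int a, int b) { return a < b; } // min-heap ordering
  static void upHeap(int[] heap, int size) {
    int i = size;
    int node = heap[i];  // save bottom node
    int j = i >>> 1;     // parent slot
    while (j > 0 && lessThan(node, heap[j])) {
      heap[i] = heap[j]; // shift parent down
      i = j;
      j = j >>> 1;
    }
    heap[i] = node;      // install saved node
  }
  public static void main(String[] args) {
    int[] heap = { 0, 3, 5, 4, 2 }; // element 2 was just appended at slot 4
    upHeap(heap, 4);
    System.out.println(heap[1]);    // 2 -- the new minimum has bubbled to the top
  }
}
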
@@ -111,26 +111,26 @@ public class TestLongPostings extends LuceneTestCase {
}

final IndexReader r;
final IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
.setMergePolicy(newLogMergePolicy());
iwc.setRAMBufferSizeMB(16.0 + 16.0 * random().nextDouble());
iwc.setMaxBufferedDocs(-1);
final RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);

for(int idx=0;idx<NUM_DOCS;idx++) {
final Document doc = new Document();
String s = isS1.get(idx) ? s1 : s2;
final Field f = newTextField("field", s, Field.Store.NO);
final int count = _TestUtil.nextInt(random(), 1, 4);
for(int ct=0;ct<count;ct++) {
doc.add(f);
}
riw.addDocument(doc);
}

r = riw.getReader();
riw.close();

/*
if (VERBOSE) {

@@ -152,7 +152,7 @@ public class TestParallelReaderEmptyIndex extends LuceneTestCase {

rd1.close();
rd2.close();

iwOut.forceMerge(1);
iwOut.close();

@@ -78,7 +78,7 @@ final class BugReproTokenStream extends TokenStream {
offsetAtt.setOffset(starts[nextTokenIndex], ends[nextTokenIndex]);
posIncAtt.setPositionIncrement(incs[nextTokenIndex]);
nextTokenIndex++;
return true;
} else {
return false;
}

@@ -41,13 +41,13 @@ import org.apache.lucene.util.Bits;
*/

public class TestTransactionRollback extends LuceneTestCase {

private static final String FIELD_RECORD_ID = "record_id";
private Directory dir;

//Rolls back index to a chosen ID
private void rollBackLast(int id) throws Exception {

// System.out.println("Attempting to rollback to "+id);
String ids="-"+id;
IndexCommit last=null;

@@ -62,7 +62,7 @@ public class TestTransactionRollback extends LuceneTestCase {

if (last==null)
throw new RuntimeException("Couldn't find commit point "+id);

IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer(random())).setIndexDeletionPolicy(
new RollbackDeletionPolicy(id)).setIndexCommit(last));

@@ -72,22 +72,22 @@ public class TestTransactionRollback extends LuceneTestCase {
w.close();
}

public void testRepeatedRollBacks() throws Exception {

int expectedLastRecordId=100;
while (expectedLastRecordId>10) {
expectedLastRecordId -=10;
rollBackLast(expectedLastRecordId);

BitSet expecteds = new BitSet(100);
expecteds.set(1,(expectedLastRecordId+1),true);
checkExpecteds(expecteds);
}
}

private void checkExpecteds(BitSet expecteds) throws Exception {
IndexReader r = DirectoryReader.open(dir);

//Perhaps not the most efficient approach but meets our
//needs here.
final Bits liveDocs = MultiFields.getLiveDocs(r);

@@ -114,7 +114,7 @@ public class TestTransactionRollback extends LuceneTestCase {
Collection files = comm.getFileNames();
for (Iterator iterator2 = files.iterator(); iterator2.hasNext();) {
String filename = (String) iterator2.next();
System.out.print(filename+", ");
}
System.out.println();
}

@@ -133,7 +133,7 @@ public class TestTransactionRollback extends LuceneTestCase {
Document doc=new Document();
doc.add(newTextField(FIELD_RECORD_ID, ""+currentRecordId, Field.Store.YES));
w.addDocument(doc);

if (currentRecordId%10 == 0) {
Map<String,String> data = new HashMap<String,String>();
data.put("index", "records 1-"+currentRecordId);

@@ -177,16 +177,16 @@ public class TestTransactionRollback extends LuceneTestCase {
" UserData="+commit.getUserData() +") ("+(commits.size()-1)+" commit points left) files=");
Collection files = commit.getFileNames();
for (Iterator iterator2 = files.iterator(); iterator2.hasNext();) {
System.out.print(" "+iterator2.next());
}
System.out.println();
*/

commit.delete();
}
}
}
}
}
}

class DeleteLastCommitPolicy implements IndexDeletionPolicy {

@@ -198,7 +198,7 @@ public class TestTransactionRollback extends LuceneTestCase {
}
}

public void testRollbackDeletionPolicy() throws Exception {
for(int i=0;i<2;i++) {
// Unless you specify a prior commit point, rollback
// should not work:

@@ -209,7 +209,7 @@ public class TestTransactionRollback extends LuceneTestCase {
r.close();
}
}

// Keeps all commit points (used to build index)
class KeepAllDeletionPolicy implements IndexDeletionPolicy {
public void onCommit(List<? extends IndexCommit> commits) throws IOException {}

@@ -129,7 +129,7 @@ public class TestTransactions extends LuceneTestCase {
}
try {
writer2.prepareCommit();
} catch (Throwable t) {
writer1.rollback();
writer2.rollback();
return;

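This hunk is the failure half of a two-phase commit across two indexes. A sketch of the full pattern, assuming writer1 and writer2 are open IndexWriters on the two indexes:

try {
  writer1.prepareCommit();
  writer2.prepareCommit();
} catch (Throwable t) {
  // either prepare failed: abort both so neither index keeps half a transaction
  writer1.rollback();
  writer2.rollback();
  return;
}
// both prepares succeeded, so both commits are now safe
writer1.commit();
writer2.commit();
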
@@ -145,7 +145,7 @@ public class TestCachingCollector extends LuceneTestCase {
try {
cc.replay(new NoOpCollector(false)); // this call should fail
fail("should have failed if an in-order Collector was given to replay(), " +
"while CachingCollector was initialized with out-of-order collection");
} catch (IllegalArgumentException e) {
// ok
}

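The expected IllegalArgumentException comes from replaying an out-of-order cache into an in-order collector. A hedged sketch of the surrounding setup, assuming the Lucene 4.x CachingCollector.create(acceptDocsOutOfOrder, cacheScores, maxRAMMB) factory; searcher, query and target are placeholders:

// cache docs (and scores) during the first search pass
CachingCollector cc = CachingCollector.create(true /* out-of-order ok */,
    true /* cacheScores */, 16.0 /* maxRAMMB */);
searcher.search(query, cc);
// second pass: replay the cached hits into another collector; the target
// must accept out-of-order delivery or replay() throws IllegalArgumentException
cc.replay(target);
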
@@ -62,15 +62,15 @@ public class TestDocIdSet extends LuceneTestCase {
};
}
};

DocIdSet filteredSet = new FilteredDocIdSet(innerSet){
@Override
protected boolean match(int docid) {
return docid%2 == 0; //validate only even docids
}
};

DocIdSetIterator iter = filteredSet.iterator();
ArrayList<Integer> list = new ArrayList<Integer>();
int doc = iter.advance(3);
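
FilteredDocIdSet keeps only the docids for which match() returns true, as the even-docid filter above shows. A small variant, assuming innerSet is any DocIdSet:

DocIdSet oddOnly = new FilteredDocIdSet(innerSet) {
  @Override
  protected boolean match(int docid) {
    return docid % 2 == 1; // keep odd docids this time
  }
};
// iterate the filtered set like any other DocIdSet
DocIdSetIterator it = oddOnly.iterator();
for (int d = it.nextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = it.nextDoc()) {
  System.out.println(d);
}
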
@@ -80,7 +80,7 @@ public class TestDocIdSet extends LuceneTestCase {
list.add(Integer.valueOf(doc));
}

int[] docs = new int[list.size()];
int c=0;
Iterator<Integer> intIter = list.iterator();

@@ -151,7 +151,7 @@ public class TestDocIdSet extends LuceneTestCase {
@Override
protected boolean match(int docid) {
return true;
}
}
};

@@ -101,7 +101,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
}

// not similar enough:
query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length);
query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMaxEdits, 0); // edit distance to "aaaaa" = 3

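FuzzyQuery's third argument is the required common prefix length, and defaultMaxEdits (2 in this era) caps the edit distance, which is why "xxxxx" cannot reach any "aaaaa"-like term above. A sketch, assuming a searcher over the same test index:

// matches terms within 2 edits of "aaaab" and with no required shared prefix
FuzzyQuery q = new FuzzyQuery(new Term("field", "aaaab"), FuzzyQuery.defaultMaxEdits, 0);
TopDocs td = searcher.search(q, 10);
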
@@ -140,7 +140,7 @@ public class TestPayloadNearQuery extends LuceneTestCase {

query = newPhraseQuery("field", "twenty two", true, new AveragePayloadFunction());
QueryUtils.check(query);

// all 10 hits should have score = 3 because adjacent terms have payloads of 2,4
// and all the similarity factors are set to 1
hits = searcher.search(query, null, 100);

@@ -162,8 +162,8 @@ public class TestPayloadNearQuery extends LuceneTestCase {
assertEquals("should be 100 hits", 100, hits.totalHits);
for (int j = 0; j < hits.scoreDocs.length; j++) {
ScoreDoc doc = hits.scoreDocs[j];
// System.out.println("Doc: " + doc.toString());
// System.out.println("Explain: " + searcher.explain(query, doc.doc));
assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
}
}

@@ -192,71 +192,71 @@ public class TestPayloadNearQuery extends LuceneTestCase {
}

public void testAverageFunction() throws IOException {
PayloadNearQuery query;
TopDocs hits;

query = newPhraseQuery("field", "twenty two", true, new AveragePayloadFunction());
QueryUtils.check(query);
// all 10 hits should have score = 3 because adjacent terms have payloads of 2,4
// and all the similarity factors are set to 1
hits = searcher.search(query, null, 100);
assertTrue("hits is null and it shouldn't be", hits != null);
assertTrue("should be 10 hits", hits.totalHits == 10);
for (int j = 0; j < hits.scoreDocs.length; j++) {
ScoreDoc doc = hits.scoreDocs[j];
assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
Explanation explain = searcher.explain(query, hits.scoreDocs[j].doc);
String exp = explain.toString();
assertTrue(exp, exp.indexOf("AveragePayloadFunction") > -1);
assertTrue(hits.scoreDocs[j].score + " explain value does not equal: " + 3, explain.getValue() == 3f);
}
}
public void testMaxFunction() throws IOException {
PayloadNearQuery query;
TopDocs hits;

query = newPhraseQuery("field", "twenty two", true, new MaxPayloadFunction());
QueryUtils.check(query);
// all 10 hits should have score = 4 (max payload value)
hits = searcher.search(query, null, 100);
assertTrue("hits is null and it shouldn't be", hits != null);
assertTrue("should be 10 hits", hits.totalHits == 10);
for (int j = 0; j < hits.scoreDocs.length; j++) {
ScoreDoc doc = hits.scoreDocs[j];
assertTrue(doc.score + " does not equal: " + 4, doc.score == 4);
Explanation explain = searcher.explain(query, hits.scoreDocs[j].doc);
String exp = explain.toString();
assertTrue(exp, exp.indexOf("MaxPayloadFunction") > -1);
assertTrue(hits.scoreDocs[j].score + " explain value does not equal: " + 4, explain.getValue() == 4f);
}
}
public void testMinFunction() throws IOException {
PayloadNearQuery query;
TopDocs hits;

query = newPhraseQuery("field", "twenty two", true, new MinPayloadFunction());
QueryUtils.check(query);
// all 10 hits should have score = 2 (min payload value)
hits = searcher.search(query, null, 100);
assertTrue("hits is null and it shouldn't be", hits != null);
assertTrue("should be 10 hits", hits.totalHits == 10);
for (int j = 0; j < hits.scoreDocs.length; j++) {
ScoreDoc doc = hits.scoreDocs[j];
assertTrue(doc.score + " does not equal: " + 2, doc.score == 2);
Explanation explain = searcher.explain(query, hits.scoreDocs[j].doc);
String exp = explain.toString();
assertTrue(exp, exp.indexOf("MinPayloadFunction") > -1);
assertTrue(hits.scoreDocs[j].score + " explain value does not equal: " + 2, explain.getValue() == 2f);
}
}
private SpanQuery[] getClauses() {
SpanNearQuery q1, q2;
q1 = spanNearQuery("field2", "twenty two");
q2 = spanNearQuery("field2", "twenty three");
SpanQuery[] clauses = new SpanQuery[2];
clauses[0] = q1;
clauses[1] = q2;
return clauses;
}
private SpanNearQuery spanNearQuery(String fieldName, String words) {
String[] wordList = words.split("[\\s]+");

@@ -274,8 +274,8 @@ public class TestPayloadNearQuery extends LuceneTestCase {
hits = searcher.search(query, null, 100);
assertTrue("hits is null and it shouldn't be", hits != null);
ScoreDoc doc = hits.scoreDocs[0];
// System.out.println("Doc: " + doc.toString());
// System.out.println("Explain: " + searcher.explain(query, doc.doc));
assertTrue("there should only be one hit", hits.totalHits == 1);
// should have score = 3 because adjacent terms have payloads of 2,4
assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);

@@ -299,8 +299,8 @@ public class TestPayloadNearQuery extends LuceneTestCase {
assertTrue("should only be one hit", hits.scoreDocs.length == 1);
// the score should be 3 - the average of all the underlying payloads
ScoreDoc doc = hits.scoreDocs[0];
// System.out.println("Doc: " + doc.toString());
// System.out.println("Explain: " + searcher.explain(query, doc.doc));
assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
}

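The newPhraseQuery helper used throughout these tests builds a PayloadNearQuery whose score is a payload function over the matched terms' payloads (2 and 4 here, hence average 3, max 4, min 2). A sketch of what it constructs:

SpanQuery[] clauses = new SpanQuery[] {
  new SpanTermQuery(new Term("field", "twenty")),
  new SpanTermQuery(new Term("field", "two"))
};
// slop 0 and inOrder=true make this an exact adjacent phrase;
// swap in MaxPayloadFunction or MinPayloadFunction to change the scoring
PayloadNearQuery q = new PayloadNearQuery(clauses, 0, true, new AveragePayloadFunction());
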
@@ -582,21 +582,21 @@ public class TestBasics extends LuceneTestCase {

@Test
public void testSpansSkipTo() throws Exception {
SpanTermQuery t1 = new SpanTermQuery(new Term("field", "seventy"));
SpanTermQuery t2 = new SpanTermQuery(new Term("field", "seventy"));
Spans s1 = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), t1);
Spans s2 = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), t2);

assertTrue(s1.next());
assertTrue(s2.next());

boolean hasMore = true;

do {
hasMore = skipToAccoringToJavaDocs(s1, s1.doc());
assertEquals(hasMore, s2.skipTo(s2.doc()));
assertEquals(s1.doc(), s2.doc());
} while (hasMore);
}

/** Skips to the first match beyond the current, whose document number is

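skipToAccoringToJavaDocs re-implements Spans.skipTo() exactly as the Javadocs describe it, so the test can assert the real implementation agrees. A hedged sketch of such a reference implementation in terms of next():

static boolean skipToPerJavadocs(Spans s, int target) throws IOException {
  // "skips to the first match beyond the current, whose document number is
  // greater than or equal to target" -- so always advance at least once
  if (!s.next()) {
    return false;
  }
  while (target > s.doc()) {
    if (!s.next()) {
      return false;
    }
  }
  return true;
}
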
@@ -84,7 +84,7 @@ public class TestWindowsMMap extends LuceneTestCase {
for(int dx = 0; dx < num; dx ++) {
String f = randomField();
Document doc = new Document();
doc.add(newTextField("data", f, Field.Store.YES));
writer.addDocument(doc);
}

@@ -71,12 +71,12 @@ public class TestBitUtil extends LuceneTestCase {
long sumRes = 0;
while (iters-- >= 0) {
for (int i = 1; i <= 63; i++) {
- long a = testArg(i);
- sumRes += BitUtil.nlz(a);
- sumRes += BitUtil.nlz(a+1);
- sumRes += BitUtil.nlz(a-1);
- sumRes += BitUtil.nlz(a+10);
- sumRes += BitUtil.nlz(a-10);
+ long a = testArg(i);
+ sumRes += BitUtil.nlz(a);
+ sumRes += BitUtil.nlz(a + 1);
+ sumRes += BitUtil.nlz(a - 1);
+ sumRes += BitUtil.nlz(a + 10);
+ sumRes += BitUtil.nlz(a - 10);
}
}
return sumRes;

@@ -86,12 +86,12 @@ public class TestBitUtil extends LuceneTestCase {
long sumRes = 0;
while (iters-- >= 0) {
for (int i = 1; i <= 63; i++) {
- long a = testArg(i);
- sumRes += Long.numberOfLeadingZeros(a);
- sumRes += Long.numberOfLeadingZeros(a+1);
- sumRes += Long.numberOfLeadingZeros(a-1);
- sumRes += Long.numberOfLeadingZeros(a+10);
- sumRes += Long.numberOfLeadingZeros(a-10);
+ long a = testArg(i);
+ sumRes += Long.numberOfLeadingZeros(a);
+ sumRes += Long.numberOfLeadingZeros(a + 1);
+ sumRes += Long.numberOfLeadingZeros(a - 1);
+ sumRes += Long.numberOfLeadingZeros(a + 10);
+ sumRes += Long.numberOfLeadingZeros(a - 10);
}
}
return sumRes;

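Both loops above should produce the same sum, since BitUtil.nlz(x) is expected to agree with the JDK intrinsic Long.numberOfLeadingZeros(x); the benchmark exists to compare their speed, not their results. For instance:

long a = 1L << 20;  // a single bit at position 20
// both count 43 leading zero bits in a 64-bit long
assert BitUtil.nlz(a) == Long.numberOfLeadingZeros(a);
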
@@ -49,7 +49,7 @@ public class TestFixedBitSet extends LuceneTestCase {
// aa = a.prevSetBit(aa-1);
aa--;
while ((aa >= 0) && (! a.get(aa))) {
aa--;
}
if (b.length() == 0) {
bb = -1;

@@ -71,7 +71,7 @@ public class TestOpenBitSet extends LuceneTestCase {
// aa = a.prevSetBit(aa-1);
aa--;
while ((aa >= 0) && (! a.get(aa))) {
aa--;
}
bb = b.prevSetBit(bb-1);
assertEquals(aa,bb);

@@ -85,7 +85,7 @@ public class TestOpenBitSet extends LuceneTestCase {
// aa = a.prevSetBit(aa-1);
aa--;
while ((aa >= 0) && (! a.get(aa))) {
aa--;
}
bb = (int) b.prevSetBit((long) (bb-1));
assertEquals(aa,bb);

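The aa-- loop in these hunks is a linear-time reference for prevSetBit: the largest set bit at or below a position, or -1 if there is none. Extracted as a helper:

static int prevSetBitLinear(OpenBitSet a, int i) {
  int aa = i;
  // walk down until a set bit is found or the set is exhausted
  while ((aa >= 0) && (!a.get(aa))) {
    aa--;
  }
  return aa; // -1 when no set bit exists at or below i
}
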
@@ -356,7 +356,7 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
// only possible writer, and it is "synchronized" to avoid this case).
DirectoryReader r2 = DirectoryReader.openIfChanged(indexReader);
if (r2 == null) {
return false; // no changes, nothing to do
}

// validate that a refresh is valid at this point, i.e. that the taxonomy

@@ -364,13 +364,13 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
if (t1==null) {
if (t2!=null) {
r2.close();
throw new InconsistentTaxonomyException("Taxonomy was recreated at: "+t2);
}
} else if (!t1.equals(t2)) {
r2.close();
throw new InconsistentTaxonomyException("Taxonomy was recreated at: "+t2+" != "+t1);
}

IndexReader oldreader = indexReader;

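The refresh logic above hinges on DirectoryReader.openIfChanged returning null when nothing changed. A sketch of the swap that follows once the create-time check passes (indexReader is the field being refreshed):

DirectoryReader r2 = DirectoryReader.openIfChanged(indexReader);
if (r2 != null) {
  IndexReader old = indexReader;
  indexReader = r2; // publish the fresh reader
  old.close();      // then release the stale one
}
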
@@ -21,12 +21,12 @@ package org.apache.lucene.search.highlight;
*/
public class DefaultEncoder implements Encoder
{
public DefaultEncoder()
{
}

public String encodeText(String originalText)
{
return originalText;
}
}

@@ -22,8 +22,8 @@ package org.apache.lucene.search.highlight;
*/
public interface Encoder
{
/**
* @param originalText The section of text being output
*/
String encodeText(String originalText);
}

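An Encoder escapes fragment text before highlight markup is wrapped around it; DefaultEncoder above passes text through untouched. A minimal HTML-escaping variant (the bundled SimpleHTMLEncoder plays a similar role):

public class BareHtmlEncoder implements Encoder {
  public String encodeText(String originalText) {
    // escape the three characters that would break surrounding HTML markup
    return originalText.replace("&", "&amp;")
                       .replace("<", "&lt;")
                       .replace(">", "&gt;");
  }
}
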
@@ -24,10 +24,10 @@ package org.apache.lucene.search.highlight;
*/
public interface Formatter
{
/**
* @param originalText The section of text being considered for markup
* @param tokenGroup contains one or several overlapping Tokens along with
* their scores and positions.
*/
String highlightTerm(String originalText, TokenGroup tokenGroup);
}

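A Formatter decides how a scored token group is marked up. A minimal implementation in the spirit of the bundled SimpleHTMLFormatter:

public class BoldFormatter implements Formatter {
  public String highlightTerm(String originalText, TokenGroup tokenGroup) {
    if (tokenGroup.getTotalScore() <= 0) {
      return originalText; // not a hit: leave the text untouched
    }
    return "<b>" + originalText + "</b>";
  }
}
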
@@ -42,7 +42,7 @@ public class GradientFormatter implements Formatter
*
* @param maxScore
* The score (and above) displayed as maxColor (See QueryScorer.getMaxWeight
* which can be used to calibrate scoring scale)
* @param minForegroundColor
* The hex color used for representing IDF scores of zero eg
* #FFFFFF (white) or null if no foreground color required

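GradientFormatter scales its colors with fragment score up to maxScore, which is why the Javadoc points at QueryScorer.getMaxWeight for calibration. A hedged construction sketch, assuming the five-argument constructor of this era:

Formatter gradient = new GradientFormatter(
    2.0f,        // maxScore: scores at or above this get the strongest color
    null, null,  // min/max foreground color: null means no foreground styling
    "#FFFFFF",   // background color for scores near zero
    "#FF0000");  // background color at maxScore and above
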
@@ -38,445 +38,445 @@ public class Highlighter
public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;

private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
private Formatter formatter;
private Encoder encoder;
private Fragmenter textFragmenter=new SimpleFragmenter();
private Scorer fragmentScorer=null;

public Highlighter(Scorer fragmentScorer)
{
this(new SimpleHTMLFormatter(),fragmentScorer);
}

public Highlighter(Formatter formatter, Scorer fragmentScorer)
{
this(formatter,new DefaultEncoder(),fragmentScorer);
}

public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
{
this.formatter = formatter;
this.encoder = encoder;
this.fragmentScorer = fragmentScorer;
}

/**
* Highlights chosen terms in a text, extracting the most relevant section.
* This is a convenience method that calls
* {@link #getBestFragment(TokenStream, String)}
*
* @param analyzer the analyzer that will be used to split <code>text</code>
* into chunks
* @param text text to highlight terms in
* @param fieldName Name of field used to influence analyzer's tokenization policy
*
* @return highlighted text fragment or null if no terms found
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
throws IOException, InvalidTokenOffsetsException
{
TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
return getBestFragment(tokenStream, text);
}

/**
* Highlights chosen terms in a text, extracting the most relevant section.
* The document text is analysed in chunks to record hit statistics
* across the document. After accumulating stats, the fragment with the highest score
* is returned
*
* @param tokenStream a stream of tokens identified in the text parameter, including offset information.
* This is typically produced by an analyzer re-parsing a document's
* text. Some work may be done on retrieving TokenStreams more efficiently
* by adding support for storing original text position data in the Lucene
* index but this support is not currently available (as of Lucene 1.4 rc2).
* @param text text to highlight terms in
*
* @return highlighted text fragment or null if no terms found
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String getBestFragment(TokenStream tokenStream, String text)
throws IOException, InvalidTokenOffsetsException
{
String[] results = getBestFragments(tokenStream,text, 1);
if (results.length > 0)
{
return results[0];
}
return null;
}

/**
* Highlights chosen terms in a text, extracting the most relevant sections.
* This is a convenience method that calls
* {@link #getBestFragments(TokenStream, String, int)}
*
* @param analyzer the analyzer that will be used to split <code>text</code>
* into chunks
* @param fieldName the name of the field being highlighted (used by analyzer)
* @param text text to highlight terms in
* @param maxNumFragments the maximum number of fragments.
*
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String[] getBestFragments(
Analyzer analyzer,
String fieldName,
String text,
int maxNumFragments)
throws IOException, InvalidTokenOffsetsException
{
TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text));
return getBestFragments(tokenStream, text, maxNumFragments);
}

/**
* Highlights chosen terms in a text, extracting the most relevant sections.
* The document text is analysed in chunks to record hit statistics
* across the document. After accumulating stats, the fragments with the highest scores
* are returned as an array of strings in order of score (contiguous fragments are merged into
* one in their original order to improve readability)
*
* @param text text to highlight terms in
* @param maxNumFragments the maximum number of fragments.
*
* @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String[] getBestFragments(
TokenStream tokenStream,
String text,
int maxNumFragments)
throws IOException, InvalidTokenOffsetsException
{
maxNumFragments = Math.max(1, maxNumFragments); //sanity check

TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);

//Get text
ArrayList<String> fragTexts = new ArrayList<String>();
for (int i = 0; i < frag.length; i++)
{
if ((frag[i] != null) && (frag[i].getScore() > 0))
{
fragTexts.add(frag[i].toString());
}
}
return fragTexts.toArray(new String[0]);
}

/**
* Low level api to get the most relevant (formatted) sections of the document.
* This method has been made public to allow visibility of score information held in TextFragment objects.
* Thanks to Jason Calabrese for help in redefining the interface.
* @param tokenStream
* @param text
* @param maxNumFragments
* @param mergeContiguousFragments
* @throws IOException
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final TextFragment[] getBestTextFragments(
TokenStream tokenStream,
String text,
boolean mergeContiguousFragments,
int maxNumFragments)
throws IOException, InvalidTokenOffsetsException
{
ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
StringBuilder newText=new StringBuilder();

CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
tokenStream.addAttribute(PositionIncrementAttribute.class);
tokenStream.reset();

TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());

if (fragmentScorer instanceof QueryScorer) {
((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
}

TokenStream newStream = fragmentScorer.init(tokenStream);
if(newStream != null) {
tokenStream = newStream;
}
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);

FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

try
{

String tokenText;
int startOffset;
int endOffset;
int lastEndOffset = 0;
textFragmenter.start(text, tokenStream);

TokenGroup tokenGroup=new TokenGroup(tokenStream);

for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
next = tokenStream.incrementToken())
{
if( (offsetAtt.endOffset()>text.length())
||
(offsetAtt.startOffset()>text.length())
)
{
throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
+" exceeds length of provided text sized "+text.length());
}
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
{
//the current token is distinct from previous tokens -
// markup the cached token group info
startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.matchEndOffset;
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset=Math.max(endOffset, lastEndOffset);
tokenGroup.clear();

//check if current token marks the start of a new fragment
if(textFragmenter.isNewFragment())
{
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
currentFrag.textEndPos = newText.length();
currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
}

tokenGroup.addToken(fragmentScorer.getTokenScore());

// if(lastEndOffset>maxDocBytesToAnalyze)
// {
// break;
// }
}
currentFrag.setScore(fragmentScorer.getFragmentScore());

if(tokenGroup.numTokens>0)
{
//flush the accumulated text (same code as in above loop)
startOffset = tokenGroup.matchStartOffset;
endOffset = tokenGroup.matchEndOffset;
tokenText = text.substring(startOffset, endOffset);
String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset=Math.max(lastEndOffset,endOffset);
}

//Test what remains of the original text beyond the point where we stopped analyzing
if (
// if there is text beyond the last token considered..
(lastEndOffset < text.length())
&&
// and that text is not too large...
(text.length()<= maxDocCharsToAnalyze)
)
{
//append it to the last fragment
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
}

currentFrag.textEndPos = newText.length();

//sort the most relevant sections of the text
for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
{
currentFrag = i.next();

//If you are running with a version of Lucene before 11th Sept 03
// you do not have PriorityQueue.insert() - so uncomment the code below
/*
if (currentFrag.getScore() >= minScore)
{
fragQueue.put(currentFrag);
if (fragQueue.size() > maxNumFragments)
{ // if hit queue overfull
fragQueue.pop(); // remove lowest in hit queue
minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
}

}
*/
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
//fix to PriorityQueue. The correct method to use here is the new "insert" method
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
fragQueue.insertWithOverflow(currentFrag);
}

//return the most relevant fragments
TextFragment frag[] = new TextFragment[fragQueue.size()];
for (int i = frag.length - 1; i >= 0; i--)
{
frag[i] = fragQueue.pop();
}

//merge any contiguous fragments to improve readability
if(mergeContiguousFragments)
{
mergeContiguousFragments(frag);
ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
for (int i = 0; i < frag.length; i++)
{
if ((frag[i] != null) && (frag[i].getScore() > 0))
{
fragTexts.add(frag[i]);
}
}
frag= fragTexts.toArray(new TextFragment[0]);
}

return frag;

}
finally
{
if (tokenStream != null)
{
try
{
tokenStream.end();
tokenStream.close();
}
catch (Exception e)
{
}
}
}
}

/** Improves readability of a score-sorted list of TextFragments by merging any fragments
* that were contiguous in the original text into one larger fragment with the correct order.
* This will leave a "null" in the array entry for the lesser scored fragment.
*
* @param frag An array of document fragments in descending score
*/
private void mergeContiguousFragments(TextFragment[] frag)
{
boolean mergingStillBeingDone;
if (frag.length > 1)
do
{
mergingStillBeingDone = false; //initialise loop control flag
//for each fragment, scan other frags looking for contiguous blocks
for (int i = 0; i < frag.length; i++)
{
if (frag[i] == null)
{
continue;
}
//merge any contiguous blocks
for (int x = 0; x < frag.length; x++)
{
if (frag[x] == null)
{
continue;
}
if (frag[i] == null)
{
break;
}
TextFragment frag1 = null;
TextFragment frag2 = null;
int frag1Num = 0;
int frag2Num = 0;
int bestScoringFragNum;
int worstScoringFragNum;
//if blocks are contiguous....
if (frag[i].follows(frag[x]))
{
frag1 = frag[x];
frag1Num = x;
frag2 = frag[i];
frag2Num = i;
}
else
if (frag[x].follows(frag[i]))
{
frag1 = frag[i];
frag1Num = i;
frag2 = frag[x];
frag2Num = x;
}
//merging required..
if (frag1 != null)
{
if (frag1.getScore() > frag2.getScore())
{
bestScoringFragNum = frag1Num;
worstScoringFragNum = frag2Num;
}
else
{
bestScoringFragNum = frag2Num;
worstScoringFragNum = frag1Num;
}
frag1.merge(frag2);
frag[worstScoringFragNum] = null;
mergingStillBeingDone = true;
frag[bestScoringFragNum] = frag1;
}
}
}
}
while (mergingStillBeingDone);
}

/**
* Highlights terms in the text, extracting the most relevant sections
* and concatenating the chosen fragments with a separator (typically "...").
* The document text is analysed in chunks to record hit statistics
* across the document. After accumulating stats, the fragments with the highest scores
* are returned in order as "separator" delimited strings.
*
* @param text text to highlight terms in
* @param maxNumFragments the maximum number of fragments.
* @param separator the separator used to intersperse the document fragments (typically "...")
*
* @return highlighted text
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final String getBestFragments(
TokenStream tokenStream,
String text,
int maxNumFragments,
String separator)
throws IOException, InvalidTokenOffsetsException
{
String sections[] = getBestFragments(tokenStream,text, maxNumFragments);
StringBuilder result = new StringBuilder();
for (int i = 0; i < sections.length; i++)
{
if (i > 0)
{
result.append(separator);
}
result.append(sections[i]);
}
return result.toString();
}

public int getMaxDocCharsToAnalyze() {
return maxDocCharsToAnalyze;

@@ -487,35 +487,35 @@ public class Highlighter
}

public Fragmenter getTextFragmenter()
{
return textFragmenter;
}

/**
* @param fragmenter
*/
public void setTextFragmenter(Fragmenter fragmenter)
{
textFragmenter = fragmenter;
}

/**
* @return Object used to score each text fragment
*/
public Scorer getFragmentScorer()
{
return fragmentScorer;
}

/**
* @param scorer
*/
public void setFragmentScorer(Scorer scorer)
{
fragmentScorer = scorer;
}

public Encoder getEncoder()
{
@@ -528,17 +528,17 @@ public class Highlighter
}
class FragmentQueue extends PriorityQueue<TextFragment>
{
public FragmentQueue(int size)
{
super(size);
}

@Override
public final boolean lessThan(TextFragment fragA, TextFragment fragB)
{
if (fragA.getScore() == fragB.getScore())
return fragA.fragNum > fragB.fragNum;
else
return fragA.getScore() < fragB.getScore();
}
}

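Putting the Highlighter API from the large hunk above together end to end, assuming query, analyzer and text are placeholders and using the separator variant of getBestFragments:

QueryScorer scorer = new QueryScorer(query);
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
highlighter.setTextFragmenter(new SimpleFragmenter(50)); // ~50-char fragments
TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
// top 3 fragments, stitched together with "..." between them
String snippet = highlighter.getBestFragments(ts, text, 3, "...");
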
@@ -23,9 +23,9 @@ package org.apache.lucene.search.highlight;
public class InvalidTokenOffsetsException extends Exception
{

public InvalidTokenOffsetsException(String message)
{
super(message);
}

}

@@ -37,126 +37,118 @@ import org.apache.lucene.search.Query;
public final class QueryTermExtractor
{

/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
* @param query Query to extract term texts from
* @return an array of the terms used in a query, plus their weights.
*/
public static final WeightedTerm[] getTerms(Query query)
{
return getTerms(query,false);
}

/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
* @param query Query to extract term texts from
* @param reader used to compute IDF which can be used to a) score selected fragments better
* b) use graded highlights eg changing intensity of font color
* @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
* @return an array of the terms used in a query, plus their weights.
*/
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
{
WeightedTerm[] terms=getTerms(query,false, fieldName);
int totalNumDocs=reader.maxDoc();
for (int i = 0; i < terms.length; i++)
{
try
{
int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
//IDF algorithm taken from DefaultSimilarity class
float idf=(float)(Math.log(totalNumDocs/(double)(docFreq+1)) + 1.0);
terms[i].weight*=idf;
}
catch (IOException e)
{
//ignore
}
}
return terms;
}

/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
* @param query Query to extract term texts from
* @param prohibited <code>true</code> to extract "prohibited" terms, too
* @param fieldName The fieldName used to filter query terms
* @return an array of the terms used in a query, plus their weights.
*/
public static final WeightedTerm[] getTerms(Query query, boolean prohibited, String fieldName)
{
HashSet<WeightedTerm> terms=new HashSet<WeightedTerm>();
getTerms(query,terms,prohibited,fieldName);
return terms.toArray(new WeightedTerm[0]);
}

/**
* Extracts all terms texts of a given Query into an array of WeightedTerms
*
* @param query Query to extract term texts from
* @param prohibited <code>true</code> to extract "prohibited" terms, too
* @return an array of the terms used in a query, plus their weights.
*/
public static final WeightedTerm[] getTerms(Query query, boolean prohibited)
{
return getTerms(query,prohibited,null);
}

- private static final void getTerms(Query query, HashSet<WeightedTerm> terms,boolean prohibited, String fieldName)
- {
- try
- {
- if (query instanceof BooleanQuery)
- getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited, fieldName);
- else
- if(query instanceof FilteredQuery)
- getTermsFromFilteredQuery((FilteredQuery)query, terms,prohibited, fieldName);
- else
- {
- HashSet<Term> nonWeightedTerms=new HashSet<Term>();
- query.extractTerms(nonWeightedTerms);
- for (Iterator<Term> iter = nonWeightedTerms.iterator(); iter.hasNext();)
- {
- Term term = iter.next();
- if((fieldName==null)||(term.field().equals(fieldName)))
- {
- terms.add(new WeightedTerm(query.getBoost(),term.text()));
- }
- }
- }
- }
- catch(UnsupportedOperationException ignore)
- {
- //this is non-fatal for our purposes
- }
- }
+ private static final void getTerms(Query query, HashSet<WeightedTerm> terms, boolean prohibited, String fieldName) {
+ try {
+ if (query instanceof BooleanQuery)
+ getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited, fieldName);
+ else if (query instanceof FilteredQuery)
+ getTermsFromFilteredQuery((FilteredQuery) query, terms, prohibited, fieldName);
+ else {
+ HashSet<Term> nonWeightedTerms = new HashSet<Term>();
+ query.extractTerms(nonWeightedTerms);
+ for (Iterator<Term> iter = nonWeightedTerms.iterator(); iter.hasNext(); ) {
+ Term term = iter.next();
+ if ((fieldName == null) || (term.field().equals(fieldName))) {
+ terms.add(new WeightedTerm(query.getBoost(), term.text()));
+ }
+ }
+ }
+ } catch (UnsupportedOperationException ignore) {
+ //this is non-fatal for our purposes
+ }
+ }
||||
|
||||
/**
|
||||
* extractTerms is currently the only query-independent means of introspecting queries but it only reveals
|
||||
* a list of terms for that query - not the boosts each individual term in that query may or may not have.
|
||||
* "Container" queries such as BooleanQuery should be unwrapped to get at the boost info held
|
||||
* in each child element.
|
||||
* Some discussion around this topic here:
|
||||
* http://www.gossamer-threads.com/lists/lucene/java-dev/34208?search_string=introspection;#34208
|
||||
* Unfortunately there seemed to be limited interest in requiring all Query objects to implement
|
||||
* something common which would allow access to child queries so what follows here are query-specific
|
||||
* implementations for accessing embedded query elements.
|
||||
*/
|
||||
private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet<WeightedTerm> terms, boolean prohibited, String fieldName)
|
||||
{
|
||||
BooleanClause[] queryClauses = query.getClauses();
|
||||
for (int i = 0; i < queryClauses.length; i++)
|
||||
{
|
||||
if (prohibited || queryClauses[i].getOccur()!=BooleanClause.Occur.MUST_NOT)
|
||||
getTerms(queryClauses[i].getQuery(), terms, prohibited, fieldName);
|
||||
}
|
||||
}
|
||||
private static void getTermsFromFilteredQuery(FilteredQuery query, HashSet<WeightedTerm> terms, boolean prohibited, String fieldName)
|
||||
{
|
||||
getTerms(query.getQuery(),terms,prohibited,fieldName);
|
||||
}
|
||||
|
||||
/**
|
||||
* extractTerms is currently the only query-independent means of introspecting queries but it only reveals
|
||||
* a list of terms for that query - not the boosts each individual term in that query may or may not have.
|
||||
* "Container" queries such as BooleanQuery should be unwrapped to get at the boost info held
|
||||
* in each child element.
|
||||
* Some discussion around this topic here:
|
||||
* http://www.gossamer-threads.com/lists/lucene/java-dev/34208?search_string=introspection;#34208
|
||||
* Unfortunately there seemed to be limited interest in requiring all Query objects to implement
|
||||
* something common which would allow access to child queries so what follows here are query-specific
|
||||
* implementations for accessing embedded query elements.
|
||||
*/
|
||||
private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet<WeightedTerm> terms, boolean prohibited, String fieldName)
|
||||
{
|
||||
BooleanClause[] queryClauses = query.getClauses();
|
||||
for (int i = 0; i < queryClauses.length; i++)
|
||||
{
|
||||
if (prohibited || queryClauses[i].getOccur()!=BooleanClause.Occur.MUST_NOT)
|
||||
getTerms(queryClauses[i].getQuery(), terms, prohibited, fieldName);
|
||||
}
|
||||
}
|
||||
private static void getTermsFromFilteredQuery(FilteredQuery query, HashSet<WeightedTerm> terms, boolean prohibited, String fieldName)
|
||||
{
|
||||
getTerms(query.getQuery(),terms,prohibited,fieldName);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
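For reference, a minimal usage sketch of the class above (illustrative only, not part of this diff; the "body" field name and the already-open IndexReader are assumptions):

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.QueryTermExtractor;
import org.apache.lucene.search.highlight.WeightedTerm;

class DumpQueryTerms {
  // Prints each term the query touches, with its boost-derived weight.
  static void dump(Query query) {
    for (WeightedTerm wt : QueryTermExtractor.getTerms(query)) {
      System.out.println(wt.getTerm() + " -> " + wt.getWeight());
    }
  }

  // IDF-weighted variant: each weight is scaled by the term's rarity in "body".
  static void dumpIdf(Query query, IndexReader reader) {
    for (WeightedTerm wt : QueryTermExtractor.getIdfWeightedTerms(query, reader, "body")) {
      System.out.println(wt.getTerm() + " -> " + wt.getWeight());
    }
  }
}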
@@ -21,61 +21,61 @@ package org.apache.lucene.search.highlight;
 */
public class SimpleHTMLEncoder implements Encoder
{
  public SimpleHTMLEncoder()
  {
  }

  public String encodeText(String originalText)
  {
    return htmlEncode(originalText);
  }

  /**
   * Encode string into HTML
   */
  public final static String htmlEncode(String plainText)
  {
    if (plainText == null || plainText.length() == 0)
    {
      return "";
    }

    StringBuilder result = new StringBuilder(plainText.length());

    for (int index = 0; index < plainText.length(); index++)
    {
      char ch = plainText.charAt(index);

      switch (ch)
      {
      case '"':
        result.append("&quot;");
        break;

      case '&':
        result.append("&amp;");
        break;

      case '<':
        result.append("&lt;");
        break;

      case '>':
        result.append("&gt;");
        break;

      default:
        if (ch < 128)
        {
          result.append(ch);
        }
        else
        {
          result.append("&#").append((int) ch).append(";");
        }
      }
    }

    return result.toString();
  }
}
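A quick illustration of the encoding rules above (not part of this diff): the four special characters map to named entities, and anything outside ASCII becomes a numeric character reference:

import org.apache.lucene.search.highlight.SimpleHTMLEncoder;

class EncodeDemo {
  public static void main(String[] args) {
    // Prints: a &lt; b &amp; &quot;c&quot; &gt; d &#233;
    System.out.println(SimpleHTMLEncoder.htmlEncode("a < b & \"c\" > d é"));
  }
}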
@@ -26,34 +26,34 @@ public class SimpleHTMLFormatter implements Formatter {
  private static final String DEFAULT_PRE_TAG = "<B>";
  private static final String DEFAULT_POST_TAG = "</B>";

  private String preTag;
  private String postTag;

  public SimpleHTMLFormatter(String preTag, String postTag) {
    this.preTag = preTag;
    this.postTag = postTag;
  }

  /** Default constructor uses HTML: &lt;B&gt; tags to markup terms. */
  public SimpleHTMLFormatter() {
    this(DEFAULT_PRE_TAG, DEFAULT_POST_TAG);
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.search.highlight.Formatter#highlightTerm(java.lang.String, org.apache.lucene.search.highlight.TokenGroup)
   */
  public String highlightTerm(String originalText, TokenGroup tokenGroup) {
    if (tokenGroup.getTotalScore() <= 0) {
      return originalText;
    }

    // Allocate StringBuilder with the right number of characters from the
    // beginning, to avoid char[] allocations in the middle of appends.
    StringBuilder returnBuffer = new StringBuilder(preTag.length() + originalText.length() + postTag.length());
    returnBuffer.append(preTag);
    returnBuffer.append(originalText);
    returnBuffer.append(postTag);
    return returnBuffer.toString();
  }

}
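As a usage note (illustrative, not from this commit): the two-argument constructor lets callers swap the default <B> markup for their own tags when wiring a Highlighter; the CSS class name below is a made-up example:

import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

class FormatterSetup {
  // Wraps scored terms in <em class="hit">...</em> instead of <B>...</B>.
  static Highlighter emphasisHighlighter(Query query) {
    SimpleHTMLFormatter formatter =
        new SimpleHTMLFormatter("<em class=\"hit\">", "</em>");
    return new Highlighter(formatter, new QueryScorer(query));
  }
}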
@@ -22,57 +22,49 @@ package org.apache.lucene.search.highlight;
 * doesn't work in Mozilla, thus this class.
 *
 * @see GradientFormatter
 */
public class SpanGradientFormatter
    extends GradientFormatter {
  public SpanGradientFormatter(float maxScore, String minForegroundColor,
      String maxForegroundColor, String minBackgroundColor,
      String maxBackgroundColor) {
    super(maxScore, minForegroundColor,
        maxForegroundColor, minBackgroundColor,
        maxBackgroundColor);
  }

  @Override
  public String highlightTerm(String originalText, TokenGroup tokenGroup) {
    if (tokenGroup.getTotalScore() == 0)
      return originalText;
    float score = tokenGroup.getTotalScore();
    if (score == 0) {
      return originalText;
    }

    // try to size sb correctly
    StringBuilder sb = new StringBuilder(originalText.length() + EXTRA);

    sb.append("<span style=\"");
    if (highlightForeground) {
      sb.append("color: ");
      sb.append(getForegroundColorString(score));
      sb.append("; ");
    }
    if (highlightBackground) {
      sb.append("background: ");
      sb.append(getBackgroundColorString(score));
      sb.append("; ");
    }
    sb.append("\">");
    sb.append(originalText);
    sb.append("</span>");
    return sb.toString();
  }

  // guess how much extra text we'll add to the text we're highlighting to try to avoid a StringBuilder resize
  private static final String TEMPLATE = "<span style=\"background: #EEEEEE; color: #000000;\">...</span>";
  private static final int EXTRA = TEMPLATE.length();
}
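For reference, a construction sketch (not part of this diff); the color values are arbitrary examples, and it relies on the base GradientFormatter treating a null color pair as "that kind of highlighting disabled":

import org.apache.lucene.search.highlight.SpanGradientFormatter;

class GradientSetup {
  // Foreground only: scores near maxScore render close to red, low scores
  // near black; null background arguments disable background highlighting.
  static SpanGradientFormatter redGradient(float maxScore) {
    return new SpanGradientFormatter(maxScore, "#000000", "#FF0000", null, null);
  }
}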
@@ -25,57 +25,57 @@ package org.apache.lucene.search.highlight;
 */
public class TextFragment
{
  CharSequence markedUpText;
  int fragNum;
  int textStartPos;
  int textEndPos;
  float score;

  public TextFragment(CharSequence markedUpText, int textStartPos, int fragNum)
  {
    this.markedUpText = markedUpText;
    this.textStartPos = textStartPos;
    this.fragNum = fragNum;
  }

  void setScore(float score)
  {
    this.score = score;
  }

  public float getScore()
  {
    return score;
  }

  /**
   * @param frag2 Fragment to be merged into this one
   */
  public void merge(TextFragment frag2)
  {
    textEndPos = frag2.textEndPos;
    score = Math.max(score, frag2.score);
  }

  /**
   * @param fragment
   * @return true if this fragment follows the one passed
   */
  public boolean follows(TextFragment fragment)
  {
    return textStartPos == fragment.textEndPos;
  }

  /**
   * @return the fragment sequence number
   */
  public int getFragNum()
  {
    return fragNum;
  }

  /* Returns the marked-up text for this text fragment
   */
  @Override
  public String toString() {
    return markedUpText.subSequence(textStartPos, textEndPos).toString();
  }

}
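A small sketch of the merge/follows contract above (illustrative, not part of this diff); it is placed in the same package because textEndPos and setScore are package-private, and all positions and scores below are made up:

package org.apache.lucene.search.highlight;

class FragmentMergeDemo {
  public static void main(String[] args) {
    CharSequence text = "the <B>quick</B> brown <B>fox</B>";

    TextFragment first = new TextFragment(text, 0, 0);
    first.textEndPos = 17;          // fragment covers [0, 17)
    first.setScore(1.0f);

    TextFragment second = new TextFragment(text, 17, 1);
    second.textEndPos = text.length();
    second.setScore(2.5f);

    if (second.follows(first)) {    // contiguous, so they can be coalesced
      first.merge(second);          // first now spans the whole text, score 2.5
    }
    System.out.println(first.getScore() + ": " + first);
  }
}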
Some files were not shown because too many files have changed in this diff.