LUCENE-3983: HTMLStripCharFilter: Stop upper-casing HTML character entity names at class initialization time; instead, hard-code the upper-case variants for the small set of entities that accept them.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1340169 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2012-05-18 16:58:38 +00:00
parent 0b1d814c94
commit 5efed3447e
4 changed files with 46 additions and 17 deletions

View File

@ -62,8 +62,16 @@ CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
| "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
| "zwj" | "zwnj" )
%{
private static final Set<String> upperCaseVariantsAccepted
= new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));
private static final Map<String,String> upperCaseVariantsAccepted
= new HashMap<String,String>();
static {
upperCaseVariantsAccepted.put("quot", "QUOT");
upperCaseVariantsAccepted.put("copy", "COPY");
upperCaseVariantsAccepted.put("gt", "GT");
upperCaseVariantsAccepted.put("lt", "LT");
upperCaseVariantsAccepted.put("reg", "REG");
upperCaseVariantsAccepted.put("amp", "AMP");
}
private static final CharArrayMap<Character> entityValues
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
static {
@ -145,8 +153,9 @@ CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
for (int i = 0 ; i < entities.length ; i += 2) {
Character value = entities[i + 1].charAt(0);
entityValues.put(entities[i], value);
if (upperCaseVariantsAccepted.contains(entities[i])) {
entityValues.put(entities[i].toUpperCase(), value);
String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);
if (upperCaseVariant != null) {
entityValues.put(upperCaseVariant, value);
}
}
}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 3/24/12 4:50 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 5/18/12 12:24 PM */
package org.apache.lucene.analysis.charfilter;
@ -21,7 +21,8 @@ package org.apache.lucene.analysis.charfilter;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.util.Version;
@ -39,8 +40,8 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 3/24/12 4:50 PM from the specification file
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
* on 5/18/12 12:24 PM from the specification file
* <tt>C:/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
*/
public final class HTMLStripCharFilter extends BaseCharFilter {
@ -30522,8 +30523,16 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
private boolean zzEOFDone;
/* user code: */
private static final Set<String> upperCaseVariantsAccepted
= new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));
private static final Map<String,String> upperCaseVariantsAccepted
= new HashMap<String,String>();
static {
upperCaseVariantsAccepted.put("quot", "QUOT");
upperCaseVariantsAccepted.put("copy", "COPY");
upperCaseVariantsAccepted.put("gt", "GT");
upperCaseVariantsAccepted.put("lt", "LT");
upperCaseVariantsAccepted.put("reg", "REG");
upperCaseVariantsAccepted.put("amp", "AMP");
}
private static final CharArrayMap<Character> entityValues
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
static {
@ -30605,8 +30614,9 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
for (int i = 0 ; i < entities.length ; i += 2) {
Character value = entities[i + 1].charAt(0);
entityValues.put(entities[i], value);
if (upperCaseVariantsAccepted.contains(entities[i])) {
entityValues.put(entities[i].toUpperCase(), value);
String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);
if (upperCaseVariant != null) {
entityValues.put(upperCaseVariant, value);
}
}
}

View File

@ -19,7 +19,8 @@ package org.apache.lucene.analysis.charfilter;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.util.Version;

View File

@ -50,8 +50,16 @@ def main():
print output_line, ')'
print '%{'
print ' private static final Set<String> upperCaseVariantsAccepted'
print ' = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));'
print ' private static final Map<String,String> upperCaseVariantsAccepted'
print ' = new HashMap<String,String>();'
print ' static {'
print ' upperCaseVariantsAccepted.put("quot", "QUOT");'
print ' upperCaseVariantsAccepted.put("copy", "COPY");'
print ' upperCaseVariantsAccepted.put("gt", "GT");'
print ' upperCaseVariantsAccepted.put("lt", "LT");'
print ' upperCaseVariantsAccepted.put("reg", "REG");'
print ' upperCaseVariantsAccepted.put("amp", "AMP");'
print ' }'
print ' private static final CharArrayMap<Character> entityValues'
print ' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys)
print ' static {'
@ -68,8 +76,9 @@ def main():
print ' for (int i = 0 ; i < entities.length ; i += 2) {'
print ' Character value = entities[i + 1].charAt(0);'
print ' entityValues.put(entities[i], value);'
print ' if (upperCaseVariantsAccepted.contains(entities[i])) {'
print ' entityValues.put(entities[i].toUpperCase(), value);'
print ' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);'
print ' if (upperCaseVariant != null) {'
print ' entityValues.put(upperCaseVariant, value);'
print ' }'
print ' }'
print " }"