LUCENE-3690: Re-implemented HTMLStripCharFilter as a JFlex-generated scanner. Fixes LUCENE-2208, SOLR-882, and SOLR-42.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234452 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2012-01-22 05:20:46 +00:00
parent 17fe719bb5
commit f3a363708f
22 changed files with 36700 additions and 1358 deletions

View File

@ -793,6 +793,9 @@ New Features
* LUCENE-3121: Add TypeTokenFilter that filters tokens based on
their TypeAttribute. (Tommaso Teofili via Uwe Schindler)
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
markup. (Steve Rowe)
Bug fixes
* LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter

View File

@ -249,7 +249,42 @@ public class _TestUtil {
}
}
// TODO: make this more evil
/**
 * Names of HTML character entities, written without the leading {@code &}
 * and without a trailing {@code ;}. randomHtmlishString picks an element of
 * this array at random when it emits a named character reference.
 */
private static final String[] HTML_CHAR_ENTITIES = {
"AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
"Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
"Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT",
"Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT",
"Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
"Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi",
"QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta",
"Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml",
"Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym",
"alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde",
"auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil",
"cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup",
"curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide",
"eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
"equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall",
"frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr",
"harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave",
"image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa",
"lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
"lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
"mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash",
"ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
"oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf",
"ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
"perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop",
"psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil",
"rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo",
"sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim",
"spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe",
"szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn",
"tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
"uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
"yuml", "zeta", "zwj", "zwnj"
};
public static String randomHtmlishString(Random random, int numElements) {
final int end = random.nextInt(numElements);
if (end == 0) {
@ -258,17 +293,80 @@ public class _TestUtil {
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < end; i++) {
int val = random.nextInt(10);
int val = random.nextInt(25);
switch(val) {
case 0: sb.append("<p>"); break;
case 1: sb.append("</p>"); break;
case 2: sb.append("<!--"); break;
case 3: sb.append("-->"); break;
case 4: sb.append("&#"); break;
case 5: sb.append(";"); break;
case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
default:
sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
case 1: {
sb.append("<");
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append(randomSimpleString(random));
for (int j = 0 ; j < nextInt(random, 0, 10) ; ++j) {
sb.append(' ');
sb.append(randomSimpleString(random));
sb.append(" ".substring(nextInt(random, 0, 1)));
sb.append('=');
sb.append(" ".substring(nextInt(random, 0, 1)));
sb.append("\"".substring(nextInt(random, 0, 1)));
sb.append(randomSimpleString(random));
sb.append("\"".substring(nextInt(random, 0, 1)));
}
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append("/".substring(nextInt(random, 0, 1)));
sb.append(">".substring(nextInt(random, 0, 1)));
break;
}
case 2: {
sb.append("</");
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append(randomSimpleString(random));
sb.append(" ".substring(nextInt(random, 0, 4)));
sb.append(">".substring(nextInt(random, 0, 1)));
break;
}
case 3: sb.append(">"); break;
case 4: sb.append("</p>"); break;
case 5: sb.append("<!--"); break;
case 6: sb.append("<!--#"); break;
case 7: sb.append("<script><!-- f('"); break;
case 8: sb.append("</script>"); break;
case 9: sb.append("<?"); break;
case 10: sb.append("?>"); break;
case 11: sb.append("\""); break;
case 12: sb.append("\\\""); break;
case 13: sb.append("'"); break;
case 14: sb.append("\\'"); break;
case 15: sb.append("-->"); break;
case 16: {
sb.append("&");
switch(nextInt(random, 0, 2)) {
case 0: sb.append(randomSimpleString(random)); break;
case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break;
}
sb.append(";".substring(nextInt(random, 0, 1)));
break;
}
case 17: {
sb.append("&#");
if (0 == nextInt(random, 0, 1)) {
sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
sb.append(";".substring(nextInt(random, 0, 1)));
}
break;
}
case 18: {
sb.append("&#x");
if (0 == nextInt(random, 0, 1)) {
sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
sb.append(";".substring(nextInt(random, 0, 1)));
}
break;
}
case 19: sb.append(";"); break;
case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
// Every case contributes exactly one kind of fragment. Without the breaks a
// newline was always followed by padding and a random word, and padding was
// always followed by a random word.
case 21: sb.append("\n"); break;
// substring(k) drops the first k chars, so the literal must be at least as
// long as the largest k requested (10) to yield between 0 and 10 spaces.
case 22: sb.append("          ".substring(nextInt(random, 0, 10))); break;
default: sb.append(randomSimpleString(random));
}
}
return sb.toString();

View File

@ -31,7 +31,8 @@
<target name="compile-core" depends="jflex-notice, common.compile-core"/>
<target name="jflex" depends="jflex-check,clean-jflex,gen-uax29-supp-macros,
jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,
jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>
<target name="gen-uax29-supp-macros">
<subant target="gen-uax29-supp-macros">
@ -39,6 +40,29 @@
</subant>
</target>
<!-- Generates HTMLStripCharFilter.java from HTMLStripCharFilter.jflex.
     Runs only when the jflex.present property is set, and only after the
     generate-jflex-html-char-entities target has run. -->
<target name="jflex-HTMLStripCharFilter"
depends="init,jflex-check,generate-jflex-html-char-entities"
if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
outdir="src/java/org/apache/lucene/analysis/charfilter"
nobak="on"/>
<!-- Remove the inappropriate JFlex-generated constructors: the span from a
     "Creates a new scanner" Javadoc through the closing brace of the
     constructor that wraps its argument in a java.io.InputStreamReader is
     replaced with nothing. Flag "s" lets "." match line terminators, flag
     "g" replaces every match. -->
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
replace="" flags="sg"/>
</target>
<!-- Regenerates HTMLCharacterEntities.jflex: runs htmlentity.py with the
     interpreter named by the python.exe property, using the charfilter
     source directory as working directory, and writes the script's output
     to HTMLCharacterEntities.jflex. Error output goes to the Ant log
     (logerror) and a failing script aborts the build (failonerror). -->
<target name="generate-jflex-html-char-entities">
<exec dir="src/java/org/apache/lucene/analysis/charfilter"
output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
executable="${python.exe}" failonerror="true" logerror="true">
<arg value="htmlentity.py"/>
</exec>
</target>
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>

View File

@ -20,6 +20,8 @@ package org.apache.lucene.analysis.charfilter;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.util.ArrayUtil;
import java.util.Arrays;
/**
* Base utility class for implementing a {@link CharFilter}.
* You subclass this, and then record mappings by calling
@ -71,6 +73,19 @@ public abstract class BaseCharFilter extends CharFilter {
0 : diffs[size-1];
}
/**
* <p>
* Adds an offset correction mapping at the given output stream offset.
* </p>
* <p>
* Assumption: the offset given with each successive call to this method
* will not be smaller than the offset given at the previous invocation.
* </p>
*
* @param off The output stream offset at which to apply the correction
* @param cumulativeDiff The input offset is given by adding this
* to the output offset
*/
protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (offsets == null) {
offsets = new int[64];
@ -80,7 +95,15 @@ public abstract class BaseCharFilter extends CharFilter {
diffs = ArrayUtil.grow(diffs);
}
// Enforce the documented precondition that offsets never decrease. The last
// recorded offset lives at index size - 1; offsets[size] is the slot that is
// about to be written and holds no recorded value, so comparing against it
// would let an out-of-order offset slip through.
assert (size == 0 || off >= offsets[size - 1])
    : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
      + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
if (size == 0 || off != offsets[size - 1]) {
  // New output offset: append a fresh (offset, diff) pair.
  offsets[size] = off;
  diffs[size++] = cumulativeDiff;
} else { // Overwrite the diff at the last recorded offset
  diffs[size - 1] = cumulativeDiff;
}
}
}

View File

@ -0,0 +1,153 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
| "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
| "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
| "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
| "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
| "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
| "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
| "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
| "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
| "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
| "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
| "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
| "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
| "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
| "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
| "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
| "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
| "divide" | "eacute" | "ecirc" | "egrave" | "empty"
| "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
| "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
| "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
| "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
| "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
| "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
| "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
| "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
| "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
| "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
| "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
| "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
| "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
| "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
| "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
| "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
| "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
| "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
| "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
| "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
| "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
| "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
| "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
| "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
| "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
| "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
| "zwj" | "zwnj" )
%{
private static final Set<String> upperCaseVariantsAccepted
= new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));
private static final CharArrayMap<Character> entityValues
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
static {
String[] entities = {
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
"Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
"Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
"Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
"Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
"Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
"Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
"Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
"Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
"Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
"Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
"Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
"Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
"Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
"Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
"Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
"Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
"Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
"aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
"aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
"alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
"apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
"atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
"beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
"ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
"circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
"crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
"dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
"diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
"ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
"emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
"equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
"euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
"forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
"frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
"gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
"hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
"iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
"infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
"isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
"lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
"larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
"lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
"lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
"mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
"minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", " ",
"ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
"notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
"oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
"ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
"omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
"ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
"otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
"permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
"piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
"prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
"quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
"raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
"rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
"rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
"sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
"sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
"sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
"sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
"sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
"there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
"thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
"times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
"uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
"ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
"upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
"xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
"zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
};
for (int i = 0 ; i < entities.length ; i += 2) {
Character value = entities[i + 1].charAt(0);
entityValues.put(entities[i], value);
if (upperCaseVariantsAccepted.contains(entities[i])) {
entityValues.put(entities[i].toUpperCase(), value);
}
}
}
%}

View File

@ -0,0 +1,58 @@
/*
* Copyright 2010 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 4.8.1.1 on Friday, January 13, 2012 6:20:39 PM UTC
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
ID_Start_Supp = (
[\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD82C][\uDC00\uDC01]
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD87E][\uDC00-\uDE1D]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD80D][\uDC00-\uDC2E]
| [\uD86E][\uDC00-\uDC1D]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD801][\uDC00-\uDC9D]
)
ID_Continue_Supp = (
[\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA]
| [\uD82C][\uDC00\uDC01]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD87E][\uDC00-\uDE1D]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD80D][\uDC00-\uDC2E]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD86E][\uDC00-\uDC1D]
| [\uDB40][\uDD00-\uDDEF]
| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
)

View File

@ -0,0 +1,737 @@
package org.apache.lucene.analysis.charfilter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.util.Version;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
* A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
*/
%%
%unicode 6.0
%apiprivate
%type int
%final
%public
%char
%function nextChar
%class HTMLStripCharFilter
%extends BaseCharFilter
%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
%xstate STYLE, STYLE_COMMENT
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | [...]
// [5] Name ::= NameStartChar (NameChar)*
//
// From UAX #31: Unicode Identifier and Pattern Syntax
// <http://unicode.org/reports/tr31/>:
//
// D1. Default Identifier Syntax
//
// <identifier> := <ID_Start> <ID_Continue>*
//
Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
// From Apache httpd mod_include documentation
// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
//
// Basic Elements
//
// The document is parsed as an HTML document, with special commands
// embedded as SGML comments. A command has the syntax:
//
// <!--#element attribute=value attribute=value ... -->
//
// The value will often be enclosed in double quotes, but single quotes (')
// and backticks (`) are also possible. Many commands only allow a single
// attribute-value pair. Note that the comment terminator (-->) should be
// preceded by whitespace to ensure that it isn't considered part of an SSI
// token. Note that the leading <!--# is one token and may not contain any
// whitespaces.
//
EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
[bB][lL][uU][rR] |
[cC][hH][aA][nN][gG][eE] |
[cC][lL][iI][cC][kK] |
[dD][bB][lL][cC][lL][iI][cC][kK] |
[eE][rR][rR][oO][rR] |
[fF][oO][cC][uU][sS] |
[kK][eE][yY][dD][oO][wW][nN] |
[kK][eE][yY][pP][rR][eE][sS][sS] |
[kK][eE][yY][uU][pP] |
[lL][oO][aA][dD] |
[mM][oO][uU][sS][eE][dD][oO][wW][nN] |
[mM][oO][uU][sS][eE][mM][oO][vV][eE] |
[mM][oO][uU][sS][eE][oO][uU][tT] |
[mM][oO][uU][sS][eE][oO][vV][eE][rR] |
[mM][oO][uU][sS][eE][uU][pP] |
[rR][eE][sS][eE][tT] |
[sS][eE][lL][eE][cC][tT] |
[sS][uU][bB][mM][iI][tT] |
[uU][nN][lL][oO][aA][dD] )
SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
InlineElment = ( [aAbBiIqQsSuU] |
[aA][bB][bB][rR] |
[aA][cC][rR][oO][nN][yY][mM] |
[bB][aA][sS][eE][fF][oO][nN][tT] |
[bB][dD][oO] |
[bB][iI][gG] |
[cC][iI][tT][eE] |
[cC][oO][dD][eE] |
[dD][fF][nN] |
[eE][mM] |
[fF][oO][nN][tT] |
[iI][mM][gG] |
[iI][nN][pP][uU][tT] |
[kK][bB][dD] |
[lL][aA][bB][eE][lL] |
[sS][aA][mM][pP] |
[sS][eE][lL][eE][cC][tT] |
[sS][mM][aA][lL][lL] |
[sS][pP][aA][nN] |
[sS][tT][rR][iI][kK][eE] |
[sS][tT][rR][oO][nN][gG] |
[sS][uU][bB] |
[sS][uU][pP] |
[tT][eE][xX][tT][aA][rR][eE][aA] |
[tT][tT] |
[vV][aA][rR] )
%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
%{
private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
private static final char BR_START_TAG_REPLACEMENT = '\n';
private static final char BR_END_TAG_REPLACEMENT = '\n';
private static final char SCRIPT_REPLACEMENT = '\n';
private static final char STYLE_REPLACEMENT = '\n';
private CharArraySet escapedTags = null;
private int inputStart;
private int cumulativeDiff;
private boolean escapeBR = false;
private boolean escapeSCRIPT = false;
private boolean escapeSTYLE = false;
private int restoreState;
private int previousRestoreState;
private int outputCharCount;
private int eofReturnValue;
private TextSegment inputSegment
= new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
private TextSegment outputSegment = inputSegment;
private TextSegment entitySegment = new TextSegment(2);
/**
 * Creates a filter over the given stream with no tags exempted from
 * filtering.
 *
 * @param source the character stream to read from; it is also installed as
 *               the reader of the generated scanner
 */
public HTMLStripCharFilter(CharStream source) {
  super(source);
  zzReader = source;
}
/**
 * Creates a filter over the given stream that leaves the listed tags in
 * place.
 *
 * @param source the character stream to read from
 * @param escapedTags Tags in this set (both start and end tags)
 *                    will not be filtered out. May be null, in which case
 *                    no tag is exempted. BR, SCRIPT and STYLE (in any
 *                    letter case) are tracked through dedicated flags; all
 *                    other names are collected in a CharArraySet.
 */
public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
  super(source);
  zzReader = source;
  if (escapedTags == null) {
    return;
  }
  for (String tagName : escapedTags) {
    if (tagName.equalsIgnoreCase("BR")) {
      escapeBR = true;
      continue;
    }
    if (tagName.equalsIgnoreCase("SCRIPT")) {
      escapeSCRIPT = true;
      continue;
    }
    if (tagName.equalsIgnoreCase("STYLE")) {
      escapeSTYLE = true;
      continue;
    }
    // Any other tag: create the lookup set on first use, then register it.
    if (this.escapedTags == null) {
      this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
    }
    this.escapedTags.add(tagName);
  }
}
/**
 * Returns the next output character: the next pending char of the current
 * output segment if it has any left, otherwise whatever the scanner
 * produces next.
 *
 * @return the next character, or -1 once the segment is drained and the
 *         scanner has already reached end of input
 */
@Override
public int read() throws IOException {
  final int next;
  if (!outputSegment.isRead()) {
    next = outputSegment.nextChar();
  } else if (zzAtEOF) {
    return -1;
  } else {
    next = nextChar();
  }
  ++outputCharCount;
  return next;
}
/**
 * Fills {@code cbuf} one character at a time via {@link #read()}.
 *
 * @param cbuf destination buffer
 * @param off  index in {@code cbuf} of the first char to store
 * @param len  maximum number of chars to store
 * @return the number of chars stored; 0 if {@code len} is 0; -1 if nothing
 *         could be stored because {@link #read()} returned -1 immediately
 */
@Override
public int read(char cbuf[], int off, int len) throws IOException {
  int count = 0;
  while (count < len) {
    int ch = read();
    if (ch == -1) {
      break;
    }
    cbuf[off + count] = (char) ch;
    ++count;
  }
  if (count > 0) {
    return count;
  }
  return len == 0 ? 0 : -1;
}
/** Closes this filter by delegating to the generated scanner's yyclose(). */
@Override
public void close() throws IOException {
yyclose();
}
/** Returns the scanner's ZZ_BUFFERSIZE constant. */
static int getInitialBufferSize() { // Package private, for testing purposes
return ZZ_BUFFERSIZE;
}
private class TextSegment extends OpenStringBuilder {
/** The position from which the next char will be read. */
int pos = 0;
/** Wraps the given buffer and sets this.len to the given length. */
TextSegment(char[] buffer, int length) {
super(buffer, length);
}
/** Allocates an internal buffer of the given size. */
TextSegment(int size) {
super(size);
}
/** Sets len = 0 and pos = 0. */
void clear() {
reset();
restart();
}
/** Sets pos = 0 */
void restart() {
pos = 0;
}
/** Returns the next char in the segment. */
int nextChar() {
assert (! isRead()): "Attempting to read past the end of a segment.";
return buf[pos++];
}
/** Returns true when all characters in the text segment have been read */
boolean isRead() {
return pos >= len;
}
}
%}
%eofval{
return eofReturnValue;
%eofval}
%eof{
// End-of-input handling: based on the lexical state the scanner is in,
// decide what value the %eofval section will return (eofReturnValue) and
// what, if anything, remains pending in outputSegment.
switch (zzLexicalState) {
case SCRIPT:
case COMMENT:
case SCRIPT_COMMENT:
case STYLE:
case STYLE_COMMENT:
case SINGLE_QUOTED_STRING:
case DOUBLE_QUOTED_STRING:
case END_TAG_TAIL_EXCLUDE:
case END_TAG_TAIL_SUBSTITUTE:
case START_TAG_TAIL_EXCLUDE:
case SERVER_SIDE_INCLUDE:
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
// Everything from inputStart up to the current position is dropped: add
// its length to the running diff, record the correction at the current
// output position, discard any pending output and report end of input.
cumulativeDiff += yychar - inputStart;
addOffCorrectMap(outputCharCount, cumulativeDiff);
outputSegment.clear();
eofReturnValue = -1;
break;
}
case CHARACTER_REFERENCE_TAIL: { // Substitute
// At end of file, allow char refs without semicolons
// The diff is the raw reference text length minus the decoded length; the
// correction applies just past the decoded chars, the first of which is
// returned now.
cumulativeDiff += inputSegment.length() - outputSegment.length();
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar();
break;
}
case BANG:
case CDATA:
case AMPERSAND:
case NUMERIC_CHARACTER:
case END_TAG_TAIL_INCLUDE:
case START_TAG_TAIL_INCLUDE:
case LEFT_ANGLE_BRACKET:
case LEFT_ANGLE_BRACKET_SLASH:
case LEFT_ANGLE_BRACKET_SPACE: { // Include
// Emit the buffered input text as-is: make inputSegment the pending output
// and return its first char.
outputSegment = inputSegment;
eofReturnValue = outputSegment.nextChar();
break;
}
default: {
// Any other state: nothing is buffered, simply report end of input.
eofReturnValue = -1;
}
}
%eof}
%%
"&" {
// Possible start of a character reference: remember where it begins in the
// input, start a fresh inputSegment holding the '&', and switch to AMPERSAND.
inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
}
"<" {
// Possible start of markup: remember where it begins in the input, start a
// fresh inputSegment holding the '<', and switch to LEFT_ANGLE_BRACKET.
inputStart = yychar;
inputSegment.clear();
inputSegment.append('<');
yybegin(LEFT_ANGLE_BRACKET);
}
// After '&': either a named character entity or '#' introducing a numeric
// character reference.
<AMPERSAND> {
{CharacterEntities} {
// Keep the matched entity name in inputSegment, look up its replacement char
// in entityValues, stage that char in entitySegment as the pending output,
// and continue in CHARACTER_REFERENCE_TAIL.
int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
entitySegment.clear();
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
entitySegment.append(ch);
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
}
"#" { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER); }
}
<NUMERIC_CHARACTER> {
[xX] [0-9A-Fa-f]+ {
  // Hexadecimal numeric character reference. The match includes the leading
  // x/X, so the digits themselves are the match minus its first char.
  int matchLength = yylength();
  inputSegment.write(zzBuffer, zzStartRead, matchLength);
  // 10FFFF is the largest code point: at most 6 hex digits, i.e. at most
  // 7 matched chars once the leading x/X is counted. Longer matches are
  // passed through as plain text without being parsed.
  if (matchLength <= 7) {
    String hexCharRef
        = new String(zzBuffer, zzStartRead + 1, matchLength - 1);
    try {
      int codePoint = Integer.parseInt(hexCharRef, 16);
      if (codePoint <= 0x10FFFF) {
        // Stage the decoded char(s) as pending output and go look for the
        // terminating ';'.
        outputSegment = entitySegment;
        outputSegment.clear();
        outputSegment.setLength
            (Character.toChars(codePoint, outputSegment.getArray(), 0));
        yybegin(CHARACTER_REFERENCE_TAIL);
      } else {
        // Six digits but beyond U+10FFFF: emit the raw text unchanged.
        outputSegment = inputSegment;
        yybegin(YYINITIAL);
        return outputSegment.nextChar();
      }
    } catch(NumberFormatException e) {
      assert false: "NumberFormatException parsing hex code point '"
                    + hexCharRef + "'";
    } catch(IllegalArgumentException e) {
      assert false: "IllegalArgumentException getting chars "
                    + "for hex code point '" + hexCharRef + "'";
    }
  } else {
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
  }
}
[0-9]+ {
// Decimal numeric character reference: the whole match is the digit string.
int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
String decimalCharRef = yytext();
try {
int codePoint = Integer.parseInt(decimalCharRef);
if (codePoint <= 0x10FFFF) {
// Stage the decoded char(s) as pending output and continue in
// CHARACTER_REFERENCE_TAIL.
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.setLength
(Character.toChars(codePoint, outputSegment.getArray(), 0));
yybegin(CHARACTER_REFERENCE_TAIL);
} else {
// Value beyond U+10FFFF: emit the raw text unchanged.
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
} catch(NumberFormatException e) {
assert false: "NumberFormatException parsing code point '"
+ decimalCharRef + "'";
} catch(IllegalArgumentException e) {
assert false: "IllegalArgumentException getting chars for code point '"
+ decimalCharRef + "'";
}
} else {
// Too many digits to be a valid code point: emit the raw text unchanged.
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
}
<CHARACTER_REFERENCE_TAIL> {
  // The terminating ';' completes the character reference. The raw reference
  // (inputSegment plus this ';') is replaced by the staged outputSegment, so
  // the length difference is added to cumulativeDiff and recorded at the
  // output position just past the replacement.
  ";" {
    final int replacementLength = outputSegment.length();
    cumulativeDiff += inputSegment.length() + yylength() - replacementLength;
    addOffCorrectMap(outputCharCount + replacementLength, cumulativeDiff);
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
  }
}
<LEFT_ANGLE_BRACKET_SLASH> {
  // Whitespace between "</" and the tag name is buffered as part of the tag.
  \s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
  // "</br>" end tag.
  [bB][rR] \s* ">" {
    yybegin(YYINITIAL);
    if (escapeBR) {
      // BR is escaped: emit the whole tag text unchanged.
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // The whole tag (buffered "</" etc. plus this match) is replaced by the
      // single char BR_END_TAG_REPLACEMENT, so exactly one output char is
      // produced. outputSegment is not assigned in this branch and may still
      // hold an earlier, unrelated segment, so its length must not be used.
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_END_TAG_REPLACEMENT;
    }
  }
  // End tag of an element matched by the InlineElment macro: kept verbatim if
  // its name is in escapedTags, otherwise removed without a replacement char.
  {InlineElment} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_EXCLUDE);
    }
  }
  // Any other end tag name: kept verbatim if its name is in escapedTags,
  // otherwise replaced by a single char (see END_TAG_TAIL_SUBSTITUTE).
  {Name} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_SUBSTITUTE);
    }
  }
}
<END_TAG_TAIL_INCLUDE> {
  // Closing '>' of an end tag that is to be kept: append it to the buffered
  // tag text and emit the whole tag unchanged, starting with its first char.
  \s* ">" {
    yybegin(YYINITIAL);
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    outputSegment = inputSegment;
    return outputSegment.nextChar();
  }
}
<END_TAG_TAIL_EXCLUDE> {
  // Closing '>' of an end tag that is removed without any replacement: the
  // buffered tag text plus this match is dropped, and the dropped length is
  // recorded for offset correction at the current output position.
  \s* ">" {
    final int droppedLength = inputSegment.length() + yylength();
    cumulativeDiff += droppedLength;
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
}
<END_TAG_TAIL_SUBSTITUTE> {
  // Closing '>' of an end tag that is replaced by one char: the buffered tag
  // text plus this match shrinks to a single output char, so the recorded
  // difference is the tag length minus one, keyed just past that char.
  \s* ">" {
    final int tagLength = inputSegment.length() + yylength();
    cumulativeDiff += tagLength - 1;
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
  }
}
<LEFT_ANGLE_BRACKET> {
  // "<!" : comment, CDATA section or other declaration (see BANG).
  "!" { inputSegment.append('!'); yybegin(BANG); }
  // "</" : end tag.
  "/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
  // Whitespace after '<' is buffered; a tag name may still follow.
  \s+ {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    yybegin(LEFT_ANGLE_BRACKET_SPACE);
  }
  // "<?" ... "?>" or "<?" ... "/>" (e.g. an XML processing instruction):
  // dropped entirely, with the dropped length recorded for offset correction.
  "?" [^>]* [/?] ">" {
    cumulativeDiff += inputSegment.length() + yylength();
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
  // "<br>" start tag, optionally with attributes and/or a trailing '/'.
  \s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    yybegin(YYINITIAL);
    if (escapeBR) {
      // BR is escaped: emit the whole tag text unchanged.
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // The whole tag is replaced by the single char BR_START_TAG_REPLACEMENT,
      // so exactly one output char is produced. outputSegment is not assigned
      // in this branch and may still hold an earlier, unrelated segment, so
      // its length must not be used for the difference.
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_START_TAG_REPLACEMENT;
    }
  }
  // "<script ...>" start tag: the element content is consumed by the SCRIPT state.
  \s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s* ">" {
    yybegin(SCRIPT);
    if (escapeSCRIPT) {
      // SCRIPT is escaped: emit the start tag unchanged. inputStart is moved
      // past the '<' and this match so that the SCRIPT state's closing-tag
      // rule, which uses yychar - inputStart, does not count the emitted tag.
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
  }
  // "<style ...>" start tag: handled like "<script ...>", via the STYLE state.
  \s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
    yybegin(STYLE);
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
  }
}
<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
  // Start tag of an element matched by the InlineElment macro. The name is
  // buffered; the tag is kept verbatim when the name is in escapedTags and
  // otherwise removed without a replacement char.
  {InlineElment} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    final boolean keepTag = null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength());
    yybegin(keepTag ? START_TAG_TAIL_INCLUDE : START_TAG_TAIL_EXCLUDE);
  }
  // Any other start tag name. The name is buffered; the tag is kept verbatim
  // when the name is in escapedTags and otherwise replaced by a single char.
  {Name} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    final boolean keepTag = null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength());
    yybegin(keepTag ? START_TAG_TAIL_INCLUDE : START_TAG_TAIL_SUBSTITUTE);
  }
}
<START_TAG_TAIL_INCLUDE> {
  // Remainder of a start tag that is to be kept (optional attributes,
  // optional '/', closing '>'): append it to the buffered tag text and emit
  // the whole tag unchanged, starting with its first char.
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    yybegin(YYINITIAL);
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    outputSegment = inputSegment;
    return outputSegment.nextChar();
  }
}
<START_TAG_TAIL_EXCLUDE> {
  // Remainder of a start tag that is removed without any replacement: the
  // buffered tag text plus this match is dropped, and the dropped length is
  // recorded for offset correction at the current output position.
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    final int droppedLength = inputSegment.length() + yylength();
    cumulativeDiff += droppedLength;
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
  }
}
<START_TAG_TAIL_SUBSTITUTE> {
  // Remainder of a start tag that is replaced by one char: the buffered tag
  // text plus this match shrinks to a single output char, so the recorded
  // difference is the tag length minus one, keyed just past that char.
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    final int tagLength = inputSegment.length() + yylength();
    cumulativeDiff += tagLength - 1;
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
  }
}
// State entered after "<!" has been seen.
<BANG> {
// "<!--" opens a comment. Nothing is appended to inputSegment here; the
// COMMENT state computes the removed length from input positions instead.
"--" { yybegin(COMMENT); }
// '>' closes a "<!...>" construct: everything buffered in inputSegment plus
// this '>' is dropped, and the dropped length is recorded for offset correction.
">" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [18] CDSect ::= CDStart CData CDEnd
// [19] CDStart ::= '<![CDATA['
// [20] CData ::= (Char* - (Char* ']]>' Char*))
// [21] CDEnd ::= ']]>'
//
// The "<![CDATA[" opener itself is dropped (buffered "<!" plus this match);
// the section content is then handled by the CDATA state.
"[CDATA[" {
cumulativeDiff += inputSegment.length() + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
// Any other single char is buffered and the scanner stays in BANG.
// NOTE(review): this rule precedes the shared single-char fallback rule that
// also lists BANG, so for equal-length matches this one presumably wins under
// JFlex's first-rule-wins tie-break -- confirm against the JFlex manual.
[^] {
inputSegment.append(zzBuffer[zzStartRead]);
}
}
// State entered after "<![CDATA[" has been seen and dropped.
<CDATA> {
// "]]>" ends the section: the terminator is dropped and its length is
// recorded for offset correction at the current output position.
"]]>" {
cumulativeDiff += yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
// Every other char inside the section is returned unchanged.
[^] { return zzBuffer[zzStartRead]; }
}
// State entered after "<!--" has been seen; comment content produces no output.
<COMMENT> {
// "<!--#" inside the comment: switch to SERVER_SIDE_INCLUDE, which returns
// to COMMENT (via restoreState) when it sees "-->".
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
// End of the comment. The removed length is computed from input positions:
// from inputStart (set when the opening '<' was matched) up to the start of
// this match (yychar), plus the length of "-->" itself.
"-->" {
cumulativeDiff += yychar - inputStart + yylength();
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
// Any other char inside the comment is consumed and discarded.
[^] { }
}
// State entered on "<!--#" from COMMENT, SCRIPT_COMMENT or STYLE_COMMENT;
// restoreState holds the state to go back to.
<SERVER_SIDE_INCLUDE> {
// "-->" ends the include: resume the state recorded in restoreState.
"-->" { yybegin(restoreState); }
// A quote opens a quoted string. The current restoreState is saved in
// previousRestoreState because the quoted-string states use restoreState
// to find their way back here and then reinstate the saved value.
"'" {
previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(SINGLE_QUOTED_STRING);
}
"\"" {
previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(DOUBLE_QUOTED_STRING);
}
// Any other char is consumed and discarded.
[^] { }
}
// State entered on "<!--" from SCRIPT; content produces no output.
<SCRIPT_COMMENT> {
// Nested constructs record SCRIPT_COMMENT in restoreState so that the
// SERVER_SIDE_INCLUDE / quoted-string states can return here.
"<!--#" { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"'" { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
"\"" { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
// "-->" ends the comment and resumes the SCRIPT state.
"-->" { yybegin(SCRIPT); }
// Any other char is consumed and discarded.
[^] { }
}
// State entered on "<!--" from STYLE; content produces no output.
<STYLE_COMMENT> {
// Nested constructs record STYLE_COMMENT in restoreState so that the
// SERVER_SIDE_INCLUDE / quoted-string states can return here.
"<!--#" { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"'" { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
"\"" { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
// "-->" ends the comment and resumes the STYLE state.
"-->" { yybegin(STYLE); }
// Any other char is consumed and discarded.
[^] { }
}
// Inside a single-quoted string; all content is consumed and discarded.
<SINGLE_QUOTED_STRING> {
// A backslash and the char after it are consumed together, so a
// backslash-escaped quote does not terminate the string.
"\\" [^] { }
// Closing quote: return to the state recorded in restoreState, then
// reinstate the value saved in previousRestoreState.
"'" { yybegin(restoreState); restoreState = previousRestoreState; }
[^] { }
}
// Inside a double-quoted string; all content is consumed and discarded.
<DOUBLE_QUOTED_STRING> {
// A backslash and the char after it are consumed together, so a
// backslash-escaped quote does not terminate the string.
"\\" [^] { }
// Closing quote: return to the state recorded in restoreState, then
// reinstate the value saved in previousRestoreState.
"\"" { yybegin(restoreState); restoreState = previousRestoreState; }
[^] { }
}
<SCRIPT> {
  // "<!--" inside the element opens a comment handled by SCRIPT_COMMENT.
  "<!--" { yybegin(SCRIPT_COMMENT); }
  // "</script>" ends the element. Everything consumed since inputStart is
  // accounted for as removed input; the end tag itself is then either emitted
  // verbatim (escapeSCRIPT) or replaced by the single char SCRIPT_REPLACEMENT.
  "</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
    inputSegment.clear();
    yybegin(YYINITIAL);
    cumulativeDiff += yychar - inputStart;
    // Captured up front so the offset-map key is the output position as it
    // stood before any char of the end tag is taken from the segment.
    final int offsetKey = outputCharCount;
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      final int firstChar = outputSegment.nextChar();
      addOffCorrectMap(offsetKey, cumulativeDiff);
      return firstChar;
    }
    // The end tag shrinks to one replacement char.
    cumulativeDiff += yylength() - 1;
    addOffCorrectMap(offsetKey + 1, cumulativeDiff);
    return SCRIPT_REPLACEMENT;
  }
  // Any other char of the element content is consumed and discarded.
  [^] { }
}
<STYLE> {
  // "<!--" inside the element opens a comment handled by STYLE_COMMENT.
  "<!--" { yybegin(STYLE_COMMENT); }
  // "</style>" ends the element. Everything consumed since inputStart is
  // accounted for as removed input; the end tag itself is then either emitted
  // verbatim (escapeSTYLE) or replaced by the single char STYLE_REPLACEMENT.
  "</" \s* [sS][tT][yY][lL][eE] \s* ">" {
    inputSegment.clear();
    yybegin(YYINITIAL);
    cumulativeDiff += yychar - inputStart;
    // Captured up front so the offset-map key is the output position as it
    // stood before any char of the end tag is taken from the segment.
    final int offsetKey = outputCharCount;
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      final int firstChar = outputSegment.nextChar();
      addOffCorrectMap(offsetKey, cumulativeDiff);
      return firstChar;
    }
    // The end tag shrinks to one replacement char.
    cumulativeDiff += yylength() - 1;
    addOffCorrectMap(offsetKey + 1, cumulativeDiff);
    return STYLE_REPLACEMENT;
  }
  // Any other char of the element content is consumed and discarded.
  [^] { }
}
<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
  // Fallback for the listed states: the char just read does not continue the
  // pending entity or tag. It is pushed back for rescanning from YYINITIAL,
  // and the text buffered so far is emitted unchanged from its beginning.
  [^] {
    yypushback(1);
    yybegin(YYINITIAL);
    inputSegment.restart();
    outputSegment = inputSegment;
    return outputSegment.nextChar();
  }
}
// Rule with no explicit state list: a single char not claimed by any rule
// above is returned to the caller unchanged.
[^] { return zzBuffer[zzStartRead]; }

View File

@ -0,0 +1,530 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
# A simple python script to generate an HTML entity map and a regex alternation
# for inclusion in HTMLStripCharFilter.jflex.
def _append_wrapped(line, entry, indent):
  """Returns line with entry appended, wrapping first if necessary.

  When appending entry would bring line to 80 characters or more, the current
  line is printed and a new line beginning with indent is started instead.
  """
  if len(line) + len(entry) >= 80:
    print(line)
    line = indent
  return line + entry


def main():
  """Prints the generated JFlex source fragment to standard output.

  The output consists of the license text from get_apache_license(), a
  CharacterEntities macro (an alternation of every entity name found in
  get_entity_text(), plus upper-case variants of a few of them), and a
  %{ ... %} block of Java code that populates the entityValues map.

  Only single-argument parenthesized print is used, which produces the same
  output under Python 2 and is also valid Python 3.
  """
  print(get_apache_license())

  # Map each entity name to the Java string-literal text for its character.
  codes = {}
  regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
  for line in get_entity_text().split('\n'):
    match = regex.match(line)
    if not match:
      continue
    key = match.group(1)
    if key == 'quot':
      # A bare double quote would terminate the generated Java string literal.
      codes[key] = r'\"'
    elif key == 'nbsp':
      # nbsp is deliberately emitted as this literal instead of its code point.
      codes[key] = ' '
    else:
      codes[key] = r'\u%04X' % int(match.group(2))

  keys = sorted(codes)

  # Entities for which an all-upper-case spelling is also accepted.
  upper_case_variants = ('quot','copy','gt','lt','reg','amp')

  # Emit the CharacterEntities macro, wrapped before column 80.
  output_line = 'CharacterEntities = ( '
  for index, key in enumerate(keys):
    separator = '' if index == 0 else ' | '
    output_line = _append_wrapped(output_line, '%s"%s"' % (separator, key), ' ')
    if key in upper_case_variants:
      output_line = _append_wrapped(output_line, ' | "%s"' % key.upper(), ' ')
  print(output_line + ' )')

  # Emit the Java code block that builds the entity-name -> char map.
  print('%{')
  print(' private static final Set<String> upperCaseVariantsAccepted')
  print(' = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));')
  print(' private static final CharArrayMap<Character> entityValues')
  print(' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys))
  print(' static {')
  print(' String[] entities = {')
  output_line = ' '
  for key in keys:
    output_line = _append_wrapped(
        output_line, ' "%s", "%s",' % (key, codes[key]), ' ')
  # Drop the trailing comma of the final array entry.
  print(output_line[:-1])
  print(' };')
  print(' for (int i = 0 ; i < entities.length ; i += 2) {')
  print(' Character value = entities[i + 1].charAt(0);')
  print(' entityValues.put(entities[i], value);')
  print(' if (upperCaseVariantsAccepted.contains(entities[i])) {')
  print(' entityValues.put(entities[i].toUpperCase(), value);')
  print(' }')
  print(' }')
  print(' }')
  print('%}')
def get_entity_text():
# The text below is taken verbatim from
# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
text = r"""
F.1. XHTML Character Entities
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
F.1.1. XHTML Latin 1 Character Entities
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
<!-- file: xhtml-lat1.ent
Typical invocation:
<!ENTITY % xhtml-lat1
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"xhtml-lat1.ent" >
%xhtml-lat1;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
<!ENTITY iexcl "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
<!ENTITY cent "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
<!ENTITY pound "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
<!ENTITY yen "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
<!ENTITY sect "&#167;" ><!-- section sign, U+00A7 ISOnum -->
<!ENTITY uml "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
<!ENTITY copy "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
<!ENTITY ordf "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
<!ENTITY laquo "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
<!ENTITY not "&#172;" ><!-- not sign, U+00AC ISOnum -->
<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
<!ENTITY reg "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
<!ENTITY macr "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
<!ENTITY deg "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
<!ENTITY sup2 "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
<!ENTITY sup3 "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
<!ENTITY acute "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
<!ENTITY micro "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
<!ENTITY para "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
<!ENTITY cedil "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
<!ENTITY sup1 "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
<!ENTITY ordm "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
<!ENTITY raquo "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
<!ENTITY Acirc "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
<!ENTITY Auml "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
<!ENTITY Aring "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
<!ENTITY AElig "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
<!ENTITY Ecirc "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
<!ENTITY Euml "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
<!ENTITY Icirc "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
<!ENTITY Iuml "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
<!ENTITY ETH "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
<!ENTITY Ocirc "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
<!ENTITY Ouml "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
<!ENTITY times "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
<!ENTITY Ucirc "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
<!ENTITY Uuml "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
<!ENTITY THORN "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
<!ENTITY szlig "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
<!ENTITY acirc "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
<!ENTITY auml "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
<!ENTITY aring "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
<!ENTITY aelig "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
<!ENTITY ecirc "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
<!ENTITY euml "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
<!ENTITY icirc "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
<!ENTITY iuml "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
<!ENTITY eth "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
<!ENTITY ocirc "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
<!ENTITY ouml "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
<!ENTITY ucirc "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
<!ENTITY uuml "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
<!ENTITY thorn "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
<!ENTITY yuml "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
<!-- end of xhtml-lat1.ent -->
F.1.2. XHTML Special Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
<!-- file: xhtml-special.ent
Typical invocation:
<!ENTITY % xhtml-special
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
"xhtml-special.ent" >
%xhtml-special;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
Revisions:
2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- C0 Controls and Basic Latin -->
<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
<!-- Latin Extended-A -->
<!ENTITY OElig "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
<!ENTITY oelig "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
<!-- ligature is a misnomer, this is a separate character in some languages -->
<!ENTITY Scaron "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
<!ENTITY scaron "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
<!ENTITY Yuml "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
<!-- Spacing Modifier Letters -->
<!ENTITY circ "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
<!ENTITY tilde "&#732;" ><!-- small tilde, U+02DC ISOdia -->
<!-- General Punctuation -->
<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
<!ENTITY ndash "&#8211;" ><!-- en dash, U+2013 ISOpub -->
<!ENTITY mdash "&#8212;" ><!-- em dash, U+2014 ISOpub -->
<!ENTITY lsquo "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
<!ENTITY rsquo "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
<!ENTITY sbquo "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
<!ENTITY ldquo "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
<!ENTITY rdquo "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
<!ENTITY bdquo "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
<!ENTITY dagger "&#8224;" ><!-- dagger, U+2020 ISOpub -->
<!ENTITY Dagger "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
<!ENTITY permil "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->
<!-- lsaquo is proposed but not yet ISO standardized -->
<!ENTITY lsaquo "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
<!-- rsaquo is proposed but not yet ISO standardized -->
<!ENTITY rsaquo "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
<!ENTITY euro "&#8364;" ><!-- euro sign, U+20AC NEW -->
<!-- end of xhtml-special.ent -->
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
<!-- ...................................................................... -->
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
<!-- file: xhtml-symbol.ent
Typical invocation:
<!ENTITY % xhtml-symbol
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
"xhtml-symbol.ent" >
%xhtml-symbol;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- Latin Extended-B -->
<!ENTITY fnof "&#402;" ><!-- latin small f with hook = function
= florin, U+0192 ISOtech -->
<!-- Greek -->
<!ENTITY Alpha "&#913;" ><!-- greek capital letter alpha, U+0391 -->
<!ENTITY Beta "&#914;" ><!-- greek capital letter beta, U+0392 -->
<!ENTITY Gamma "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
<!ENTITY Delta "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
<!ENTITY Epsilon "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
<!ENTITY Zeta "&#918;" ><!-- greek capital letter zeta, U+0396 -->
<!ENTITY Eta "&#919;" ><!-- greek capital letter eta, U+0397 -->
<!ENTITY Theta "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
<!ENTITY Iota "&#921;" ><!-- greek capital letter iota, U+0399 -->
<!ENTITY Kappa "&#922;" ><!-- greek capital letter kappa, U+039A -->
<!ENTITY Lambda "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
<!ENTITY Mu "&#924;" ><!-- greek capital letter mu, U+039C -->
<!ENTITY Nu "&#925;" ><!-- greek capital letter nu, U+039D -->
<!ENTITY Xi "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
<!ENTITY Omicron "&#927;" ><!-- greek capital letter omicron, U+039F -->
<!ENTITY Pi "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
<!ENTITY Rho "&#929;" ><!-- greek capital letter rho, U+03A1 -->
<!-- there is no Sigmaf, and no U+03A2 character either -->
<!ENTITY Sigma "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
<!ENTITY Tau "&#932;" ><!-- greek capital letter tau, U+03A4 -->
<!ENTITY Upsilon "&#933;" ><!-- greek capital letter upsilon,
U+03A5 ISOgrk3 -->
<!ENTITY Phi "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
<!ENTITY Chi "&#935;" ><!-- greek capital letter chi, U+03A7 -->
<!ENTITY Psi "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
<!ENTITY Omega "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
<!ENTITY alpha "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
<!ENTITY beta "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
<!ENTITY gamma "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
<!ENTITY delta "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
<!ENTITY epsilon "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
<!ENTITY zeta "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
<!ENTITY eta "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
<!ENTITY theta "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
<!ENTITY iota "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
<!ENTITY kappa "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
<!ENTITY lambda "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
<!ENTITY mu "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
<!ENTITY nu "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
<!ENTITY xi "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
<!ENTITY omicron "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
<!ENTITY pi "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
<!ENTITY rho "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
<!ENTITY sigmaf "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
<!ENTITY sigma "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
<!ENTITY tau "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
<!ENTITY upsilon "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
<!ENTITY phi "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
<!ENTITY chi "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
<!ENTITY psi "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
<!ENTITY omega "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
<!ENTITY upsih "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
<!ENTITY piv "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
<!-- General Punctuation -->
<!ENTITY bull "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub -->
<!-- bullet is NOT the same as bullet operator, U+2219 -->
<!ENTITY hellip "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
<!ENTITY prime "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
<!ENTITY Prime "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
<!ENTITY oline "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
<!ENTITY frasl "&#8260;" ><!-- fraction slash, U+2044 NEW -->
<!-- Letterlike Symbols -->
<!ENTITY weierp "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
<!ENTITY image "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
<!ENTITY real "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
<!ENTITY trade "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
<!ENTITY alefsym "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
the same glyph could be used to depict both characters -->
<!-- Arrows -->
<!ENTITY larr "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
<!ENTITY uarr "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
<!ENTITY rarr "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
<!ENTITY darr "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
<!ENTITY harr "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
<!ENTITY crarr "&#8629;" ><!-- downwards arrow with corner leftwards
= carriage return, U+21B5 NEW -->
<!ENTITY lArr "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
but also does not have any other character for that function. So ? lArr can
be used for 'is implied by' as ISOtech suggests -->
<!ENTITY uArr "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
<!ENTITY rArr "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
<!-- Unicode does not say this is the 'implies' character but does not have
another character with this function so ?
rArr can be used for 'implies' as ISOtech suggests -->
<!ENTITY dArr "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
<!ENTITY hArr "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->
<!-- Mathematical Operators -->
<!ENTITY forall "&#8704;" ><!-- for all, U+2200 ISOtech -->
<!ENTITY part "&#8706;" ><!-- partial differential, U+2202 ISOtech -->
<!ENTITY exist "&#8707;" ><!-- there exists, U+2203 ISOtech -->
<!ENTITY empty "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
<!ENTITY nabla "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
<!ENTITY isin "&#8712;" ><!-- element of, U+2208 ISOtech -->
<!ENTITY notin "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
<!ENTITY ni "&#8715;" ><!-- contains as member, U+220B ISOtech -->
<!-- should there be a more memorable name than 'ni'? -->
<!ENTITY prod "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
the same glyph might be used for both -->
<!ENTITY sum "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
though the same glyph might be used for both -->
<!ENTITY minus "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
<!ENTITY lowast "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
<!ENTITY radic "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
<!ENTITY prop "&#8733;" ><!-- proportional to, U+221D ISOtech -->
<!ENTITY infin "&#8734;" ><!-- infinity, U+221E ISOtech -->
<!ENTITY ang "&#8736;" ><!-- angle, U+2220 ISOamso -->
<!ENTITY and "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
<!ENTITY or "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
<!ENTITY cap "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
<!ENTITY cup "&#8746;" ><!-- union = cup, U+222A ISOtech -->
<!ENTITY int "&#8747;" ><!-- integral, U+222B ISOtech -->
<!ENTITY there4 "&#8756;" ><!-- therefore, U+2234 ISOtech -->
<!ENTITY sim "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
<!-- tilde operator is NOT the same character as the tilde, U+007E,
although the same glyph might be used to represent both -->
<!ENTITY cong "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
<!ENTITY asymp "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
<!ENTITY ne "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
<!ENTITY equiv "&#8801;" ><!-- identical to, U+2261 ISOtech -->
<!ENTITY le "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
<!ENTITY ge "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
<!ENTITY sub "&#8834;" ><!-- subset of, U+2282 ISOtech -->
<!ENTITY sup "&#8835;" ><!-- superset of, U+2283 ISOtech -->
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
font encoding and is not included. Should it be, for symmetry?
It is in ISOamsn -->
<!ENTITY nsub "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
<!ENTITY sube "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
<!ENTITY supe "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
<!ENTITY oplus "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
<!ENTITY otimes "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
<!ENTITY perp "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
<!ENTITY sdot "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
<!-- dot operator is NOT the same character as U+00B7 middle dot -->
<!-- Miscellaneous Technical -->
<!ENTITY lceil "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
<!ENTITY rceil "&#8969;" ><!-- right ceiling, U+2309 ISOamsc -->
<!ENTITY lfloor "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc -->
<!ENTITY rfloor "&#8971;" ><!-- right floor, U+230B ISOamsc -->
<!ENTITY lang "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
<!-- lang is NOT the same character as U+003C 'less than'
or U+2039 'single left-pointing angle quotation mark' -->
<!ENTITY rang "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
<!-- rang is NOT the same character as U+003E 'greater than'
or U+203A 'single right-pointing angle quotation mark' -->
<!-- Geometric Shapes -->
<!ENTITY loz "&#9674;" ><!-- lozenge, U+25CA ISOpub -->
<!-- Miscellaneous Symbols -->
<!ENTITY spades "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
<!-- black here seems to mean filled as opposed to hollow -->
<!ENTITY clubs "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
<!ENTITY hearts "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
<!ENTITY diams "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->
<!-- end of xhtml-symbol.ent -->
"""
return text
# Returns the Apache License 2.0 notice, formatted as a Java-style block
# comment ("/** ... */") held in a raw string.
def get_apache_license():
license = r"""/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"""
return license
# Invoke the script's entry point; main() is defined outside this view.
main()

View File

@ -17,6 +17,42 @@
-->
<html><head></head>
<body>
Filters that normalize text before tokenization.
<p>
Chainable filters that normalize text before tokenization and provide
mappings between normalized text offsets and the corresponding offset
in the original text.
</p>
<H2>CharFilter offset mappings</H2>
<p>
CharFilters modify an input stream via a series of substring
replacements (including deletions and insertions) to produce an output
stream. There are three possible replacement cases: the replacement
string has the same length as the original substring; the replacement
is shorter; and the replacement is longer. In the latter two cases
(when the replacement has a different length than the original),
one or more offset correction mappings are required.
</p>
<p>
When the replacement is shorter than the original (e.g. when the
replacement is the empty string), a single offset correction mapping
should be added at the replacement's end offset in the output stream.
The <code>cumulativeDiff</code> parameter to the
<code>addOffCorrectMap()</code> method will be the sum of all
previous replacement offset adjustments, with the addition of the
difference between the lengths of the original substring and the
replacement string (a positive value).
</p>
<p>
When the replacement is longer than the original (e.g. when the
original is the empty string), you should add as many offset
correction mappings as the difference between the lengths of the
replacement string and the original substring, starting at the
end offset the original substring would have had in the output stream.
The <code>cumulativeDiff</code> parameter to the
<code>addOffCorrectMap()</code> method will be the sum of all
previous replacement offset adjustments, with the addition of the
difference between the lengths of the original substring and the
replacement string so far (a negative value).
</p>
</body>
</html>

View File

@ -23,6 +23,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
@ -31,7 +32,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.junit.Ignore;
import org.apache.lucene.util._TestUtil;
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
@ -41,8 +42,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
"This is an entity: &amp; plus a &lt;. Here is an &. <!-- is a comment -->";
String gold = " this is some text here is a link and " +
"another link . " +
String gold = "\nthis is some text\n here is a link and " +
"another link. " +
"This is an entity: & plus a <. Here is an &. ";
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
StringBuilder builder = new StringBuilder();
@ -56,7 +57,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
position++;
}
assertEquals(gold, builder.toString());
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
//Some sanity checks, but not a full-fledged check
@ -77,6 +79,24 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
/**
 * Strips the bundled "MS-Word 14 generated.htm" resource and checks that,
 * after trimming surrounding whitespace, only the text "This is a test" is left.
 */
public void testMSWord14GeneratedHTML() throws Exception {
  InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
  // Report a missing resource explicitly rather than via an NPE from InputStreamReader.
  assertTrue("test resource 'MS-Word 14 generated.htm' was not found next to the test class",
             stream != null);
  String gold = "This is a test";
  StringBuilder builder = new StringBuilder();
  try {
    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
    int ch = 0;
    while ((ch = reader.read()) != -1){
      builder.append((char)ch);
    }
  } finally {
    // Always release the resource stream, even when reading or stripping fails.
    stream.close();
  }
  String result = builder.toString().trim();
  assertEquals("'" + result + "' is not equal to '" + gold + "'", gold, result);
}
public void testGamma() throws Exception {
String test = "&Gamma;";
String gold = "\u0393";
@ -89,9 +109,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
builder.append((char)ch);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testEntities() throws Exception {
@ -106,9 +124,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
builder.append((char)ch);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testMoreEntities() throws Exception {
@ -123,9 +139,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
builder.append((char)ch);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
}
public void testReserved() throws Exception {
@ -147,8 +161,176 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testMalformedHTML() throws Exception {
String test = "a <a hr<ef=aa<a>> </close</a>";
String gold = "a <a hr<ef=aa > </close ";
String[] testGold = {
"a <a hr<ef=aa<a>> </close</a>",
"a <a hr<ef=aa> </close",
"<a href=http://dmoz.org/cgi-bin/add.cgi?where=/arts/\" class=lu style=\"font-size: 9px\" target=dmoz>Submit a Site</a>",
"Submit a Site",
"<a href=javascript:ioSwitch('p8','http://www.csmonitor.com/') title=expand id=e8 class=expanded rel=http://www.csmonitor.com/>Christian Science",
"Christian Science",
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"San Francisco \" 2008 RSS Feed\" href=\"http://2008.sf.wordcamp.org/feed/\" />",
"\n",
// "<" before ">" inhibits tag recognition
"<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
"<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
"<a href=\"http://ucblibraries.colorado.edu/how/index.htm\"class=\"pageNavAreaText\">",
"",
"<link title=\"^\\\" 21Sta's Blog\" rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"http://21sta.com/blog/inc/opensearch.php\" />",
"\n",
"<a href=\"#postcomment\" title=\"\"Leave a comment\";\">?",
"?",
"<a href='/modern-furniture' ' id='21txt' class='offtab' onMouseout=\"this.className='offtab'; return true;\" onMouseover=\"this.className='ontab'; return true;\">",
"",
"<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
"",
"The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
"The <a href=medical\">http://www.advancedmd.com>medical practice software",
"<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
"Levi.com/BMX 2008 Clip of the Week 29...",
"<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
"Printer Friendly",
"<a href=#\" ondragstart=\"return false\" onclick=\"window.external.AddFavorite('http://www.amazingtextures.com', 'Amazing Textures');return false\" onmouseover=\"window.status='Add to Favorites';return true\">Add to Favorites",
"Add to Favorites",
"<a href=\"../at_home/at_home_search.html\"../_home/at_home_search.html\">At",
"At",
"E-mail: <a href=\"\"mailto:XXXXXX@example.com\" \">XXXXXX@example.com </a>",
"E-mail: XXXXXX@example.com ",
"<li class=\"farsi\"><a title=\"A'13?\" alt=\"A'13?\" href=\"http://www.america.gov/persian\" alt=\"\" name=\"A'13?\"A'13? title=\"A'13?\">A'13?</a></li>",
"\nA'13?\n",
"<li><a href=\"#28\" title=\"Hubert \"Geese\" Ausby\">Hubert \"Geese\" Ausby</a></li>",
"\nHubert \"Geese\" Ausby\n",
"<href=\"http://anbportal.com/mms/login.asp\">",
"\n",
"<a href=\"",
"<a href=\"",
"<a href=\">",
"",
"<a rel=\"nofollow\" href=\"http://anissanina31.skyrock.com/1895039493-Hi-tout-le-monde.html\" title=\" Hi, tout le monde !>#</a>",
"#",
"<a href=\"http://annunciharleydavidsonusate.myblog.it/\" title=\"Annunci Moto e Accessori Harley Davidson\" target=\"_blank\"><img src=\"http://annunciharleydavidsonusate.myblog.it/images/Antipixel.gif\" /></a>",
"",
"<a href=\"video/addvideo&v=120838887181\" onClick=\"return confirm('Are you sure you want add this video to your profile? If it exists some video in your profile will be overlapped by this video!!')\" \" onmouseover=\"this.className='border2'\" onmouseout=\"this.className=''\">",
"",
"<a href=#Services & Support>",
"",
// "<" and ">" chars are accepted in on[Event] attribute values
"<input type=\"image\" src=\"http://apologyindex.com/ThemeFiles/83401-72905/images/btn_search.gif\"value=\"Search\" name=\"Search\" alt=\"Search\" class=\"searchimage\" onclick=\"incom ='&sc=' + document.getElementById('sel').value ; var dt ='&dt=' + document.getElementById('dt').value; var searchKeyword = document.getElementById('q').value ; searchKeyword = searchKeyword.replace(/\\s/g,''); if (searchKeyword.length < 3){alert('Nothing to search. Search keyword should contain atleast 3 chars.'); return false; } var al='&al=' + document.getElementById('advancedlink').style.display ; document.location.href='http://apologyindex.com/search.aspx?q=' + document.getElementById('q').value + incom + dt + al;\" />",
"",
"<input type=\"image\" src=\"images/afbe.gif\" width=\"22\" height=\"22\" hspace=\"4\" title=\"Add to Favorite\" alt=\"Add to Favorite\"onClick=\" if(window.sidebar){ window.sidebar.addPanel(document.title,location.href,''); }else if(window.external){ window.external.AddFavorite(location.href,document.title); }else if(window.opera&&window.print) { return true; }\">",
"",
"<area shape=\"rect\" coords=\"12,153,115,305\" href=\"http://statenislandtalk.com/v-web/gallery/Osmundsen-family\"Art's Norwegian Roots in Rogaland\">",
"\n",
"<a rel=\"nofollow\" href=\"http://arth26.skyrock.com/660188240-bonzai.html\" title=\"bonza>#",
"#",
"<a href= >",
"",
"<ahref=http:..",
"<ahref=http:..",
"<ahref=http:..>",
"\n",
"<ahref=\"http://aseigo.bddf.ca/cms/1025\">A",
"\nA",
"<a href=\"javascript:calendar_window=window.open('/calendar.aspx?formname=frmCalendar.txtDate','calendar_window','width=154,height=188');calendar_window.focus()\">",
"",
"<a href=\"/applications/defenseaerospace/19+rackmounts\" title=\"19\" Rackmounts\">",
"",
"<a href=http://www.azimprimerie.fr/flash/backup/lewes-zip-code/savage-model-110-manual.html title=savage model 110 manual rel=dofollow>",
"",
"<a class=\"at\" name=\"Lamborghini href=\"http://lamborghini.coolbegin.com\">Lamborghini /a>",
"Lamborghini /a>",
"<A href='newslink.php?news_link=http%3A%2F%2Fwww.worldnetdaily.com%2Findex.php%3Ffa%3DPAGE.view%26pageId%3D85729&news_title=Florida QB makes 'John 3:16' hottest Google search Tebow inscribed Bible reference on eye black for championship game' TARGET=_blank>",
"",
"<a href=/myspace !style='color:#993333'>",
"",
"<meta name=3DProgId content=3DExcel.Sheet>",
"\n",
"<link id=3D\"shLink\" href=3D\"PSABrKelly-BADMINTONCupResults08FINAL2008_09_19=_files/sheet004.htm\">",
"\n",
"<td bgcolor=3D\"#FFFFFF\" nowrap>",
"\n",
"<a href=\"http://basnect.info/usersearch/\"predicciones-mundiales-2009\".html\">\"predicciones mundiales 2009\"</a>",
"\"predicciones mundiales 2009\"",
"<a class=\"comment-link\" href=\"https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588\"location.href=https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588;>",
"",
"<a href = \"/videos/Bishop\"/\" title = \"click to see more Bishop\" videos\">Bishop\"</a>",
"Bishop\"",
"<a href=\"http://bhaa.ie/calendar/event.php?eid=20081203150127531\"\">BHAA Eircom 2 &amp; 5 miles CC combined start</a>",
"BHAA Eircom 2 & 5 miles CC combined start",
"<a href=\"http://people.tribe.net/wolfmana\" onClick='setClick(\"Application[tribe].Person[bb7df210-9dc0-478c-917f-436b896bcb79]\")'\" title=\"Mana\">",
"",
"<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
"",
// "<" before ">" inhibits tag recognition
"<input type=\"text\" value=\"<search here>\">",
"<input type=\"text\" value=\"\n\">",
"<input type=\"text\" value=\"<search here\">",
"<input type=\"text\" value=\"\n",
"<input type=\"text\" value=\"search here>\">",
"\">",
// "<" and ">" chars are accepted in on[Event] attribute values
"<input type=\"text\" value=\"&lt;search here&gt;\" onFocus=\"this.value='<search here>'\">",
"",
"<![if ! IE]>\n<link href=\"http://i.deviantart.com/icons/favicon.png\" rel=\"shortcut icon\"/>\n<![endif]>",
"\n\n\n",
"<![if supportMisalignedColumns]>\n<tr height=0 style='display:none'>\n<td width=64 style='width:48pt'></td>\n</tr>\n<![endif]>",
"\n\n\n\n\n\n\n\n",
};
for (int i = 0 ; i < testGold.length ; i += 2) {
String test = testGold[i];
String gold = testGold[i + 1];
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
StringBuilder builder = new StringBuilder();
int ch = 0;
@ -156,36 +338,71 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
builder.append((char)ch);
}
String result = builder.toString();
// System.out.println("Resu: " + result + "<EOL>");
// System.out.println("Gold: " + gold + "<EOL>");
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
assertEquals("Test: '" + test + "'", gold, result);
}
}
public void testBufferOverflow() throws Exception {
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
testBuilder.append("ah<?> ??????");
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
testBuilder.setLength(0);
testBuilder.append("<!--");//comments
appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
appendChars(testBuilder, 3 * HTMLStripCharFilter.getInitialBufferSize() + 500);//comments have two lookaheads
testBuilder.append("-->foo");
processBuffer(testBuilder.toString(), "Failed w/ comment");
String gold = "foo";
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
int ch = 0;
StringBuilder builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
testBuilder.setLength(0);
testBuilder.append("<?");
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("?>");
processBuffer(testBuilder.toString(), "Failed with proc. instr.");
gold = "";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
testBuilder.setLength(0);
testBuilder.append("<b ");
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
testBuilder.append("/>");
processBuffer(testBuilder.toString(), "Failed on tag");
gold = "";
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
ch = 0;
builder = new StringBuilder();
try {
while ((ch = reader.read()) != -1){
builder.append((char)ch);
}
} finally {
// System.out.println("String: " + builder.toString());
}
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
private void appendChars(StringBuilder testBuilder, int numChars) {
@ -208,7 +425,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
} finally {
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
}
assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
test, builder.toString());
}
public void testComment() throws Exception {
@ -225,7 +443,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
} finally {
// System.out.println("String: " + builder.toString());
}
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
gold, builder.toString());
}
@ -247,7 +466,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
}
public void testOffsets() throws Exception {
doTestOffsets("hello X how X are you");
// doTestOffsets("hello X how X are you");
doTestOffsets("hello <p> X<p> how <p>X are you");
doTestOffsets("X &amp; X &#40; X &lt; &gt; X");
@ -255,7 +474,24 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
}
@Ignore("broken offsets: see LUCENE-2208")
/**
 * Reads {@code in} through an HTMLStripCharFilter one char at a time and, for
 * every output offset reached, asserts that the corrected offset does not
 * exceed the length of the input.
 */
static void assertLegalOffsets(String in) throws Exception {
  final int docLength = in.length();
  HTMLStripCharFilter filter = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
  for (int outputOffset = 0; filter.read() != -1; outputOffset++) {
    int corrected = filter.correctOffset(outputOffset);
    assertTrue("invalid offset correction: " + outputOffset + "->" + corrected + " for doc of length: " + docLength,
               corrected <= docLength);
  }
}
/** Checks offset corrections for plain text and for text with an unterminated numeric entity. */
public void testLegalOffsets() throws Exception {
  String[] inputs = { "hello world", "hello &#x world" };
  for (String input : inputs) {
    assertLegalOffsets(input);
  }
}
public void testRandom() throws Exception {
Analyzer analyzer = new Analyzer() {
@ -274,4 +510,311 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
int numRounds = RANDOM_MULTIPLIER * 10000;
checkRandomData(random, analyzer, numRounds);
}
/**
 * Server-side include directives nested inside attribute values, or inside a
 * script body, are expected to disappear together with the enclosing markup.
 * Inputs and expected outputs alternate in {@code testGold}.
 */
public void testServerSideIncludes() throws Exception {
  String[] testGold = {
    "one<img src=\"image.png\"\n"
    + " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
    + " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two",
    "onetwo",

    "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two",
    "one\ntwo",
  };
  for (int i = 0 ; i < testGold.length ; i += 2) {
    String test = testGold[i];
    String gold = testGold[i + 1];
    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
    StringBuilder builder = new StringBuilder();
    int ch = 0;
    while ((ch = reader.read()) != -1){
      builder.append((char)ch);
    }
    String result = builder.toString();
    // assertEquals (rather than assertTrue on equals()) so a failure reports both values.
    assertEquals("'" + result + "' is not equal to '" + gold + "'", gold, result);
  }
}
/**
 * Script bodies containing quoted comment delimiters and a quoted closing
 * script tag; each script element is expected to be replaced by a newline.
 */
public void testScriptQuotes() throws Exception {
  String[][] cases = {
    { "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two",
      "one\ntwo" },
    { "hello<script><!-- f('<!--internal--></script>'); --></script>",
      "hello\n" },
  };
  for (String[] testCase : cases) {
    String gold = testCase[1];
    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(testCase[0])));
    StringBuilder stripped = new StringBuilder();
    for (int c = filter.read(); c != -1; c = filter.read()) {
      stripped.append((char) c);
    }
    String result = stripped.toString();
    assertEquals("'" + result + "' is not equal to '" + gold + "'", gold, result);
  }
}
/**
 * With SCRIPT listed as an escaped tag, the script tags are expected in the
 * output while the script body is not.
 */
public void testEscapeScript() throws Exception {
  final String input = "one<script no-value-attr>callSomeMethod();</script>two";
  final String gold = "one<script no-value-attr></script>two";
  Set<String> keptTags = new HashSet<String>(Arrays.asList("SCRIPT"));
  Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)), keptTags);
  StringBuilder stripped = new StringBuilder();
  for (int c = filter.read(); c != -1; c = filter.read()) {
    stripped.append((char) c);
  }
  String result = stripped.toString();
  assertEquals("'" + result + "' is not equal to '" + gold + "'", gold, result);
}
/** A style element whose body is wrapped in an HTML comment is expected to become a single newline. */
public void testStyle() throws Exception {
  final String input = "one<style type=\"text/css\">\n"
                     + "<!--\n"
                     + "@import url('http://www.lasletrasdecanciones.com/css.css');\n"
                     + "-->\n"
                     + "</style>two";
  final String gold = "one\ntwo";
  Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)));
  StringBuilder stripped = new StringBuilder();
  for (int c = filter.read(); c != -1; c = filter.read()) {
    stripped.append((char) c);
  }
  String result = stripped.toString();
  assertEquals("'" + result + "' is not equal to '" + gold + "'", gold, result);
}
/**
 * With STYLE listed as an escaped tag, the style tags are expected in the
 * output while the style sheet between them is not.
 */
public void testEscapeStyle() throws Exception {
  final String input = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
  final String gold = "one<style type=\"text/css\"></style>two";
  Set<String> keptTags = new HashSet<String>(Arrays.asList("STYLE"));
  Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)), keptTags);
  StringBuilder stripped = new StringBuilder();
  for (int c = filter.read(); c != -1; c = filter.read()) {
    stripped.append((char) c);
  }
  String result = stripped.toString();
  assertEquals("'" + result + "' is not equal to '" + gold + "'", gold, result);
}
/** BR tags in opening, self-closing, attribute-bearing and closing forms are each expected to become a newline. */
public void testBR() throws Exception {
  String[][] cases = {
    { "one<BR />two<br>three",
      "one\ntwo\nthree" },
    { "one<BR some stuff here too>two</BR>",
      "one\ntwo\n" },
  };
  for (String[] testCase : cases) {
    String input = testCase[0];
    String expected = testCase[1];
    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)));
    StringBuilder stripped = new StringBuilder();
    for (int c = filter.read(); c != -1; c = filter.read()) {
      stripped.append((char) c);
    }
    assertEquals("Test: '" + input + "'", expected, stripped.toString());
  }
}
/** With BR listed as an escaped tag, the input is expected to pass through unchanged. */
public void testEscapeBR() throws Exception {
  final String input = "one<BR class='whatever'>two</\nBR\n>";
  final String gold = "one<BR class='whatever'>two</\nBR\n>";
  Set<String> keptTags = new HashSet<String>(Arrays.asList("BR"));
  Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)), keptTags);
  StringBuilder stripped = new StringBuilder();
  for (int c = filter.read(); c != -1; c = filter.read()) {
    stripped.append((char) c);
  }
  String result = stripped.toString();
  assertEquals("'" + result + "' is not equal to '" + gold + "'", gold, result);
}
/** Inline tags (span, sup) in mixed case are expected to be removed without any replacement character. */
public void testInlineTagsNoSpace() throws Exception {
  final String input = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
  final String gold = "onetwo2e.three";
  Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(input)));
  StringBuilder stripped = new StringBuilder();
  for (int c = filter.read(); c != -1; c = filter.read()) {
    stripped.append((char) c);
  }
  String result = stripped.toString();
  assertEquals("'" + result + "' is not equal to '" + gold + "'", gold, result);
}
/**
 * CDATA sections: the section delimiters are expected to be dropped and the
 * enclosed text, including markup-like text, emitted verbatim.
 */
public void testCDATA() throws Exception {
  String[][] cases = {
    { "one<![CDATA[<one><two>three<four></four></two></one>]]>two",
      "one<one><two>three<four></four></two></one>two" },
    { "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five",
      "onetwo<![CDATA[three]]>fourfive" },
  };
  for (String[] testCase : cases) {
    String gold = testCase[1];
    Reader filter = new HTMLStripCharFilter(CharReader.get(new StringReader(testCase[0])));
    StringBuilder stripped = new StringBuilder();
    for (int c = filter.read(); c != -1; c = filter.read()) {
      stripped.append((char) c);
    }
    String result = stripped.toString();
    assertEquals("'" + result + "' is not equal to '" + gold + "'", gold, result);
  }
}
/**
 * Checks that the all-uppercase entity spellings used in the input
 * (QUOT, COPY, GT, LT, REG, AMP) are expected to decode to their characters,
 * while literal '&gt;' and '&lt;' characters pass through unchanged.
 */
public void testUppercaseCharacterEntityVariants() throws Exception {
  final String gold = " \"-\u00A9>><<\u00AE&";
  final String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";

  Reader filtered = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));

  // Drain the filter one char at a time until end of stream.
  StringBuilder stripped = new StringBuilder();
  for (int c = filtered.read(); c != -1; c = filtered.read()) {
    stripped.append((char) c);
  }

  String actual = stripped.toString();
  assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
}
/**
 * Checks that a processing instruction closed with "/&gt;" rather than
 * "?&gt;" (the input is an "xml:namespace" instruction of that shape) is
 * expected to be removed entirely, leaving the surrounding text adjacent.
 */
public void testMSWordMalformedProcessingInstruction() throws Exception {
  final String gold = "onetwo";
  final String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";

  Reader filtered = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));

  // Drain the filter one char at a time until end of stream.
  StringBuilder stripped = new StringBuilder();
  for (int c = filtered.read(); c != -1; c = filtered.read()) {
    stripped.append((char) c);
  }

  String actual = stripped.toString();
  assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
}
/**
 * Checks tags whose names contain CJK and supplementary (non-BMP) characters:
 * the gold string has a newline in place of each such tag, and the same
 * characters appearing as plain text are kept.
 */
public void testSupplementaryCharsInTags() throws Exception {
  final String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
  final String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";

  Reader filtered = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));

  // Drain the filter one char at a time until end of stream.
  StringBuilder stripped = new StringBuilder();
  for (int c = filtered.read(); c != -1; c = filtered.read()) {
    stripped.append((char) c);
  }

  String actual = stripped.toString();
  assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
}
/**
 * Smoke test: feeds a randomly generated HTML-ish string through the filter
 * and reads it to the end. There are no assertions on the output; the test
 * only requires that reading completes without throwing.
 */
public void testRandomBrokenHTML() throws Exception {
  final int maxNumElements = 10000;
  final String html = _TestUtil.randomHtmlishString(random, maxNumElements);

  Reader filtered = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));

  // Consume and discard everything the filter produces.
  int c;
  do {
    c = filtered.read();
  } while (c != -1);
}
/**
 * Smoke test: builds a space-separated sequence of random words using one of
 * three randomly chosen word generators, pushes it through the filter and
 * reads it to the end. There are no assertions on the output; the test only
 * requires that reading completes without throwing.
 */
public void testRandomText() throws Exception {
  final int minNumWords = 10;
  final int maxNumWords = 10000;
  final int minWordLength = 3;
  final int maxWordLength = 20;

  // Order of the two draws below matters for reproducibility with a fixed seed.
  final int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
  final int flavor = _TestUtil.nextInt(random, 0, 4);

  StringBuilder text = new StringBuilder();
  for (int wordNum = 0; wordNum < numWords; ++wordNum) {
    String word;
    if (flavor == 0) {
      // arbitrary Unicode
      word = _TestUtil.randomUnicodeString(random, maxWordLength);
    } else if (flavor == 1) {
      // "realistic" Unicode with bounded word length
      word = _TestUtil.randomRealisticUnicodeString(random, minWordLength, maxWordLength);
    } else {
      // every other draw: simple strings
      word = _TestUtil.randomSimpleString(random);
    }
    text.append(word);
    text.append(' ');
  }

  Reader filtered = new HTMLStripCharFilter(CharReader.get(new StringReader(text.toString())));

  // Consume and discard everything the filter produces.
  int c;
  do {
    c = filtered.read();
  } while (c != -1);
}
}

View File

@ -0,0 +1,653 @@
<html xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:w="urn:schemas-microsoft-com:office:word"
xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
<meta name=ProgId content=Word.Document>
<meta name=Generator content="Microsoft Word 14">
<meta name=Originator content="Microsoft Word 14">
<link rel=File-List href="This%20is%20a%20test_files/filelist.xml">
<!--[if gte mso 9]><xml>
<o:DocumentProperties>
<o:Author>s</o:Author>
<o:LastAuthor>s</o:LastAuthor>
<o:Revision>1</o:Revision>
<o:TotalTime>1</o:TotalTime>
<o:Created>2012-01-13T03:36:00Z</o:Created>
<o:LastSaved>2012-01-13T03:37:00Z</o:LastSaved>
<o:Pages>1</o:Pages>
<o:Words>8</o:Words>
<o:Characters>48</o:Characters>
<o:Lines>1</o:Lines>
<o:Paragraphs>1</o:Paragraphs>
<o:CharactersWithSpaces>55</o:CharactersWithSpaces>
<o:Version>14.00</o:Version>
</o:DocumentProperties>
<o:OfficeDocumentSettings>
<o:AllowPNG/>
</o:OfficeDocumentSettings>
</xml><![endif]-->
<link rel=themeData href="This%20is%20a%20test_files/themedata.thmx">
<link rel=colorSchemeMapping
href="This%20is%20a%20test_files/colorschememapping.xml">
<!--[if gte mso 9]><xml>
<w:WordDocument>
<w:SpellingState>Clean</w:SpellingState>
<w:GrammarState>Clean</w:GrammarState>
<w:TrackMoves>false</w:TrackMoves>
<w:TrackFormatting/>
<w:PunctuationKerning/>
<w:ValidateAgainstSchemas/>
<w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
<w:IgnoreMixedContent>false</w:IgnoreMixedContent>
<w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
<w:DoNotPromoteQF/>
<w:LidThemeOther>EN-US</w:LidThemeOther>
<w:LidThemeAsian>X-NONE</w:LidThemeAsian>
<w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript>
<w:Compatibility>
<w:BreakWrappedTables/>
<w:SnapToGridInCell/>
<w:WrapTextWithPunct/>
<w:UseAsianBreakRules/>
<w:DontGrowAutofit/>
<w:SplitPgBreakAndParaMark/>
<w:EnableOpenTypeKerning/>
<w:DontFlipMirrorIndents/>
<w:OverrideTableStyleHps/>
</w:Compatibility>
<m:mathPr>
<m:mathFont m:val="Cambria Math"/>
<m:brkBin m:val="before"/>
<m:brkBinSub m:val="&#45;-"/>
<m:smallFrac m:val="off"/>
<m:dispDef/>
<m:lMargin m:val="0"/>
<m:rMargin m:val="0"/>
<m:defJc m:val="centerGroup"/>
<m:wrapIndent m:val="1440"/>
<m:intLim m:val="subSup"/>
<m:naryLim m:val="undOvr"/>
</m:mathPr></w:WordDocument>
</xml><![endif]--><!--[if gte mso 9]><xml>
<w:LatentStyles DefLockedState="false" DefUnhideWhenUsed="true"
DefSemiHidden="true" DefQFormat="false" DefPriority="99"
LatentStyleCount="267">
<w:LsdException Locked="false" Priority="0" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Normal"/>
<w:LsdException Locked="false" Priority="9" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="heading 1"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 2"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 3"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 4"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 5"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 6"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 7"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 8"/>
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 9"/>
<w:LsdException Locked="false" Priority="39" Name="toc 1"/>
<w:LsdException Locked="false" Priority="39" Name="toc 2"/>
<w:LsdException Locked="false" Priority="39" Name="toc 3"/>
<w:LsdException Locked="false" Priority="39" Name="toc 4"/>
<w:LsdException Locked="false" Priority="39" Name="toc 5"/>
<w:LsdException Locked="false" Priority="39" Name="toc 6"/>
<w:LsdException Locked="false" Priority="39" Name="toc 7"/>
<w:LsdException Locked="false" Priority="39" Name="toc 8"/>
<w:LsdException Locked="false" Priority="39" Name="toc 9"/>
<w:LsdException Locked="false" Priority="35" QFormat="true" Name="caption"/>
<w:LsdException Locked="false" Priority="10" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Title"/>
<w:LsdException Locked="false" Priority="1" Name="Default Paragraph Font"/>
<w:LsdException Locked="false" Priority="11" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Subtitle"/>
<w:LsdException Locked="false" Priority="22" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Strong"/>
<w:LsdException Locked="false" Priority="20" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Emphasis"/>
<w:LsdException Locked="false" Priority="59" SemiHidden="false"
UnhideWhenUsed="false" Name="Table Grid"/>
<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Placeholder Text"/>
<w:LsdException Locked="false" Priority="1" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="No Spacing"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 1"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 1"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 1"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 1"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 1"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 1"/>
<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Revision"/>
<w:LsdException Locked="false" Priority="34" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="List Paragraph"/>
<w:LsdException Locked="false" Priority="29" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Quote"/>
<w:LsdException Locked="false" Priority="30" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Intense Quote"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 1"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 1"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 1"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 1"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 1"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 1"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 1"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 1"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 2"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 2"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 2"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 2"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 2"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 2"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 2"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 2"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 2"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 2"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 2"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 2"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 2"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 2"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 3"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 3"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 3"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 3"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 3"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 3"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 3"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 3"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 3"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 3"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 3"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 3"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 3"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 3"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 4"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 4"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 4"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 4"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 4"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 4"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 4"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 4"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 4"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 4"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 4"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 4"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 4"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 4"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 5"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 5"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 5"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 5"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 5"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 5"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 5"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 5"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 5"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 5"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 5"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 5"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 5"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 5"/>
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Shading Accent 6"/>
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
UnhideWhenUsed="false" Name="Light List Accent 6"/>
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
UnhideWhenUsed="false" Name="Light Grid Accent 6"/>
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 6"/>
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 6"/>
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 1 Accent 6"/>
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium List 2 Accent 6"/>
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 6"/>
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 6"/>
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 6"/>
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
UnhideWhenUsed="false" Name="Dark List Accent 6"/>
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Shading Accent 6"/>
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful List Accent 6"/>
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
UnhideWhenUsed="false" Name="Colorful Grid Accent 6"/>
<w:LsdException Locked="false" Priority="19" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Subtle Emphasis"/>
<w:LsdException Locked="false" Priority="21" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Intense Emphasis"/>
<w:LsdException Locked="false" Priority="31" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Subtle Reference"/>
<w:LsdException Locked="false" Priority="32" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Intense Reference"/>
<w:LsdException Locked="false" Priority="33" SemiHidden="false"
UnhideWhenUsed="false" QFormat="true" Name="Book Title"/>
<w:LsdException Locked="false" Priority="37" Name="Bibliography"/>
<w:LsdException Locked="false" Priority="39" QFormat="true" Name="TOC Heading"/>
</w:LatentStyles>
</xml><![endif]-->
<style>
<!--
/* Font Definitions */
@font-face
{font-family:"Cambria Math";
panose-1:2 4 5 3 5 4 6 3 2 4;
mso-font-charset:1;
mso-generic-font-family:roman;
mso-font-format:other;
mso-font-pitch:variable;
mso-font-signature:0 0 0 0 0 0;}
@font-face
{font-family:Cambria;
panose-1:2 4 5 3 5 4 6 3 2 4;
mso-font-charset:0;
mso-generic-font-family:roman;
mso-font-pitch:variable;
mso-font-signature:-536870145 1073743103 0 0 415 0;}
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;
mso-font-charset:0;
mso-generic-font-family:swiss;
mso-font-pitch:variable;
mso-font-signature:-520092929 1073786111 9 0 415 0;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-parent:"";
margin-top:0in;
margin-right:0in;
margin-bottom:10.0pt;
margin-left:0in;
line-height:115%;
mso-pagination:widow-orphan;
font-size:11.0pt;
font-family:"Calibri","sans-serif";
mso-ascii-font-family:Calibri;
mso-ascii-theme-font:minor-latin;
mso-fareast-font-family:Calibri;
mso-fareast-theme-font:minor-latin;
mso-hansi-font-family:Calibri;
mso-hansi-theme-font:minor-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:minor-bidi;}
h1
{mso-style-priority:9;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Heading 1 Char";
mso-style-next:Normal;
margin-top:24.0pt;
margin-right:0in;
margin-bottom:0in;
margin-left:0in;
margin-bottom:.0001pt;
line-height:115%;
mso-pagination:widow-orphan lines-together;
page-break-after:avoid;
mso-outline-level:1;
font-size:14.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#365F91;
mso-themecolor:accent1;
mso-themeshade:191;
mso-font-kerning:0pt;}
p.MsoTitle, li.MsoTitle, div.MsoTitle
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
margin-top:0in;
margin-right:0in;
margin-bottom:15.0pt;
margin-left:0in;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
p.MsoTitleCxSpFirst, li.MsoTitleCxSpFirst, div.MsoTitleCxSpFirst
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
mso-style-type:export-only;
margin:0in;
margin-bottom:.0001pt;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
p.MsoTitleCxSpMiddle, li.MsoTitleCxSpMiddle, div.MsoTitleCxSpMiddle
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
mso-style-type:export-only;
margin:0in;
margin-bottom:.0001pt;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
p.MsoTitleCxSpLast, li.MsoTitleCxSpLast, div.MsoTitleCxSpLast
{mso-style-priority:10;
mso-style-unhide:no;
mso-style-qformat:yes;
mso-style-link:"Title Char";
mso-style-next:Normal;
mso-style-type:export-only;
margin-top:0in;
margin-right:0in;
margin-bottom:15.0pt;
margin-left:0in;
mso-add-space:auto;
mso-pagination:widow-orphan;
border:none;
mso-border-bottom-alt:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;
padding:0in;
mso-padding-alt:0in 0in 4.0pt 0in;
font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
span.TitleChar
{mso-style-name:"Title Char";
mso-style-priority:10;
mso-style-unhide:no;
mso-style-locked:yes;
mso-style-link:Title;
mso-ansi-font-size:26.0pt;
mso-bidi-font-size:26.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#17365D;
mso-themecolor:text2;
mso-themeshade:191;
letter-spacing:.25pt;
mso-font-kerning:14.0pt;}
span.Heading1Char
{mso-style-name:"Heading 1 Char";
mso-style-priority:9;
mso-style-unhide:no;
mso-style-locked:yes;
mso-style-link:"Heading 1";
mso-ansi-font-size:14.0pt;
mso-bidi-font-size:14.0pt;
font-family:"Cambria","serif";
mso-ascii-font-family:Cambria;
mso-ascii-theme-font:major-latin;
mso-fareast-font-family:"Times New Roman";
mso-fareast-theme-font:major-fareast;
mso-hansi-font-family:Cambria;
mso-hansi-theme-font:major-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:major-bidi;
color:#365F91;
mso-themecolor:accent1;
mso-themeshade:191;
font-weight:bold;}
.MsoChpDefault
{mso-style-type:export-only;
mso-default-props:yes;
font-family:"Calibri","sans-serif";
mso-ascii-font-family:Calibri;
mso-ascii-theme-font:minor-latin;
mso-fareast-font-family:Calibri;
mso-fareast-theme-font:minor-latin;
mso-hansi-font-family:Calibri;
mso-hansi-theme-font:minor-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:minor-bidi;}
.MsoPapDefault
{mso-style-type:export-only;
margin-bottom:10.0pt;
line-height:115%;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;
mso-header-margin:.5in;
mso-footer-margin:.5in;
mso-paper-source:0;}
div.WordSection1
{page:WordSection1;}
-->
</style>
<!--[if gte mso 10]>
<style>
/* Style Definitions */
table.MsoNormalTable
{mso-style-name:"Table Normal";
mso-tstyle-rowband-size:0;
mso-tstyle-colband-size:0;
mso-style-noshow:yes;
mso-style-priority:99;
mso-style-parent:"";
mso-padding-alt:0in 5.4pt 0in 5.4pt;
mso-para-margin-top:0in;
mso-para-margin-right:0in;
mso-para-margin-bottom:10.0pt;
mso-para-margin-left:0in;
line-height:115%;
mso-pagination:widow-orphan;
font-size:11.0pt;
font-family:"Calibri","sans-serif";
mso-ascii-font-family:Calibri;
mso-ascii-theme-font:minor-latin;
mso-hansi-font-family:Calibri;
mso-hansi-theme-font:minor-latin;
mso-bidi-font-family:"Times New Roman";
mso-bidi-theme-font:minor-bidi;}
</style>
<![endif]--><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026"/>
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1"/>
</o:shapelayout></xml><![endif]-->
</head>
<body lang=EN-US style='tab-interval:.5in'>
<div class=WordSection1>
<div style='mso-element:para-border-div;border:none;border-bottom:solid #4F81BD 1.0pt;
mso-border-bottom-themecolor:accent1;padding:0in 0in 4.0pt 0in'>
<p class=MsoTitle>This is a test</p>
</div>
</div>
</body>
</html>

View File

@ -113,6 +113,23 @@ are part of the ICU4C package. See http://site.icu-project.org/ </echo>
</java>
</target>
<!--
  Regenerates the supplementary-character JFlex macro file for HTMLStripCharFilter.
  The property below names the destination file; the target runs
  GenerateHTMLStripCharFilterSupplementaryMacros in a forked JVM (after
  compile-tools has run) and redirects the program's output into that file.
  The build fails if the generator exits with an error.
-->
<property name="html.strip.charfilter.supp.macros.output.file"
location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
<target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
<java
classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
dir="."
fork="true"
failonerror="true"
output="${html.strip.charfilter.supp.macros.output.file}">
<classpath>
<path refid="additional.dependencies"/>
<pathelement location="${build.dir}/classes/tools"/>
</classpath>
</java>
</target>
<target name="compile-tools" depends="common.compile-tools">
<compile
srcdir="src/tools/java"

View File

@ -0,0 +1,110 @@
package org.apache.lucene.analysis.icu;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.DateFormat;
import java.util.*;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.VersionInfo;
/**
 * Command-line generator that prints JFlex macro definitions covering the
 * supplementary (above-BMP) members of the ID_Start and ID_Continue Unicode
 * sets, to augment JFlex's Unicode support for code points beyond the BMP.
 * Everything is written to standard output.
 */
public class GenerateHTMLStripCharFilterSupplementaryMacros {

  /** All BMP code points; subtracted from each pattern so only supplementary ones remain. */
  private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");

  /** Platform line separator used when assembling multi-line output. */
  private static final String NL = System.getProperty("line.separator");

  /** Timestamp format for the generated header: full date and time, US locale, UTC. */
  private static final DateFormat DATE_FORMAT = newUtcDateFormat();

  /** License comment emitted at the top of the generated output, followed by a blank line. */
  private static final String APACHE_LICENSE = buildLicenseHeader();

  /** Creates the header timestamp formatter and pins it to UTC. */
  private static DateFormat newUtcDateFormat() {
    DateFormat format
        = DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.FULL, Locale.US);
    format.setTimeZone(TimeZone.getTimeZone("UTC"));
    return format;
  }

  /** Joins the license comment lines with {@link #NL} and appends one extra blank line. */
  private static String buildLicenseHeader() {
    final String[] lines = {
      "/*",
      " * Copyright 2010 The Apache Software Foundation.",
      " *",
      " * Licensed under the Apache License, Version 2.0 (the \"License\");",
      " * you may not use this file except in compliance with the License.",
      " * You may obtain a copy of the License at",
      " *",
      " * http://www.apache.org/licenses/LICENSE-2.0",
      " *",
      " * Unless required by applicable law or agreed to in writing, software",
      " * distributed under the License is distributed on an \"AS IS\" BASIS,",
      " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.",
      " * See the License for the specific language governing permissions and",
      " * limitations under the License.",
      " */",
    };
    StringBuilder header = new StringBuilder();
    for (String line : lines) {
      header.append(line).append(NL);
    }
    return header.append(NL).toString();
  }

  /** Prints the header followed by the ID_Start_Supp and ID_Continue_Supp macros. */
  public static void main(String[] args) throws Exception {
    outputHeader();
    outputMacro("ID_Start_Supp", "[:ID_Start:]");
    outputMacro("ID_Continue_Supp", "[:ID_Continue:]");
  }

  /**
   * Prints the license comment, a "generated using" line carrying the ICU4J
   * version and the current time, the generator class name, and two line breaks.
   */
  static void outputHeader() {
    final String generatedWith
        = "// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString() + " on ";
    final String generatedBy
        = "// by " + GenerateHTMLStripCharFilterSupplementaryMacros.class.getName();

    System.out.print(APACHE_LICENSE);
    System.out.println(generatedWith + DATE_FORMAT.format(new Date()));
    System.out.println(generatedBy);
    System.out.print(NL + NL);
  }

  /**
   * Prints one macro definition of the form <code>name = ( alt | alt | ... )</code>
   * where each alternative is a lead-surrogate class followed by a
   * trail-surrogate class.
   *
   * The alternatives must be emitted as compact UTF-16 range expressions,
   * otherwise JFlex runs out of memory; lead surrogates that share exactly the
   * same set of trail surrogates are therefore merged into one alternative.
   *
   * @param name    macro name to define
   * @param pattern ICU UnicodeSet pattern; its BMP members are discarded
   */
  static void outputMacro(String name, String pattern) {
    UnicodeSet supplementary = new UnicodeSet(pattern);
    supplementary.removeAll(BMP);

    System.out.println(name + " = (");
    if (supplementary.isEmpty()) {
      // JFlex rejects an empty group, so emit an empty character class instead.
      System.out.println("\t []");
    }

    // Pass 1: lead surrogate -> set of trail surrogates seen with it.
    HashMap<Character,UnicodeSet> trailsByLead = new HashMap<Character,UnicodeSet>();
    UnicodeSetIterator cursor = new UnicodeSetIterator(supplementary);
    while (cursor.next()) {
      char[] surrogates = Character.toChars(cursor.codepoint);
      UnicodeSet trails = trailsByLead.get(surrogates[0]);
      if (trails == null) {
        trails = new UnicodeSet();
        trailsByLead.put(surrogates[0], trails);
      }
      trails.add(surrogates[1]);
    }

    // Pass 2: invert, keyed by the trail set's regex text, so that leads
    // with identical trail sets collapse into a single lead class.
    Map<String,UnicodeSet> leadsByTrailRegex = new HashMap<String,UnicodeSet>();
    for (Map.Entry<Character,UnicodeSet> byLead : trailsByLead.entrySet()) {
      String trailRegex = byLead.getValue().getRegexEquivalent();
      UnicodeSet leads = leadsByTrailRegex.get(trailRegex);
      if (leads == null) {
        leads = new UnicodeSet();
        leadsByTrailRegex.put(trailRegex, leads);
      }
      leads.add(byLead.getKey());
    }

    // Emit one alternative per distinct trail set; every one after the first gets a '|'.
    String separator = "\t ";
    for (Map.Entry<String,UnicodeSet> alternative : leadsByTrailRegex.entrySet()) {
      System.out.print(separator);
      separator = "\t| ";
      System.out.println(alternative.getValue().getRegexEquivalent() + alternative.getKey());
    }
    System.out.println(")");
  }
}

View File

@ -401,6 +401,14 @@ Upgrading from Solr 3.5
* As doGet() methods in SimplePostTool were changed to static, the client applications of this
class need to be recompiled.
* In Solr version 3.5 and earlier, HTMLStripCharFilter had known bugs in the
character offsets it provided, triggering e.g. exceptions in highlighting.
HTMLStripCharFilter has been re-implemented, addressing this and other
issues. See the entry for LUCENE-3690 in the Bug Fixes section below for a
detailed list of changes. For people who depend on the behavior of
HTMLStripCharFilter in Solr version 3.5 and earlier: the old implementation
(bugs and all) is preserved as LegacyHTMLStripCharFilter.
New Features
----------------------
* SOLR-2904: BinaryUpdateRequestHandler should be able to accept multiple update requests from
@ -483,6 +491,41 @@ Bug Fixes
* SOLR-2970: CSV ResponseWriter returns fields defined as stored=false in schema (janhoy)
* LUCENE-3690, LUCENE-2208, SOLR-882, SOLR-42: Re-implemented
HTMLStripCharFilter as a JFlex-generated scanner. See below for a list
of bug fixes and other changes. To get the same behavior as
HTMLStripCharFilter in Solr version 3.5 and earlier (including the bugs),
use LegacyHTMLStripCharFilter, which is the previous implementation.
Behavior changes from the previous version:
- Known offset bugs are fixed.
- The "Mark invalid" exceptions reported in SOLR-1283 are no longer
triggered (the bug is still present in LegacyHTMLStripCharFilter).
- The character entity "&apos;" is now always properly decoded.
- More cases of <script> tags are now properly stripped.
- CDATA sections are now handled properly.
- Valid tag name characters now include the supplementary Unicode characters
from Unicode character classes [:ID_Start:] and [:ID_Continue:].
- Uppercase character entities "&QUOT;", "&COPY;", "&GT;", "&LT;", "&REG;",
and "&AMP;" are now recognized and handled as if they were in lowercase.
- Opening tags with unbalanced quotation marks are now properly stripped.
- Literal "<" and ">" characters in opening tags, regardless of whether they
appear inside quotation marks, now inhibit recognition (and stripping) of
the tags. The only exception to this is for values of event-handler
attributes, e.g. "onClick", "onLoad", "onSelect".
- A newline '\n' is substituted instead of a space for stripped HTML markup.
- Nothing is substituted for opening and closing inline tags - they are
simply removed. The list of inline tags is (case insensitively): <a>,
<abbr>, <acronym>, <b>, <basefont>, <bdo>, <big>, <cite>, <code>, <dfn>,
<em>, <font>, <i>, <img>, <input>, <kbd>, <label>, <q>, <s>, <samp>,
<select>, <small>, <span>, <strike>, <strong>, <sub>, <sup>, <textarea>,
<tt>, <u>, and <var>.
- HTMLStripCharFilterFactory now handles HTMLStripCharFilter's "escapedTags"
feature: opening and closing tags with the given names, including any
attributes and their values, are left intact in the output.
(Steve Rowe)
Other Changes
----------------------
* SOLR-2922: Upgrade commons-io and commons-lang to 2.1 and 2.6, respectively. (koji)

View File

@ -21,12 +21,18 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Factory for {@link HTMLStripCharFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_html" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;charFilter class="solr.HTMLStripCharFilterFactory"/&gt;
* &lt;charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" /&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre
@ -34,8 +40,31 @@ import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
*/
public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {

  /** Tag names parsed from the "escapedTags" argument; stays null when no name was found. */
  Set<String> escapedTags = null;

  /** Matches one tag name: a maximal run of characters that are neither whitespace nor commas. */
  static final Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+");

  /**
   * Wraps {@code input} in a new {@link HTMLStripCharFilter}, passing along the
   * configured escaped tags when any were supplied to {@link #init(Map)}.
   *
   * @param input the character stream to filter
   * @return the filter wrapping {@code input}
   */
  public HTMLStripCharFilter create(CharStream input) {
    HTMLStripCharFilter charFilter;
    if (null == escapedTags) {
      charFilter = new HTMLStripCharFilter(input);
    } else {
      charFilter = new HTMLStripCharFilter(input, escapedTags);
    }
    return charFilter;
  }

  /**
   * Reads the optional "escapedTags" argument: a list of tag names separated
   * by commas and/or whitespace. A missing argument, or one holding only
   * separators, leaves {@link #escapedTags} untouched.
   *
   * @param args the factory arguments
   */
  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    String escapedTagsArg = args.get("escapedTags");
    if (null != escapedTagsArg) {
      Matcher matcher = TAG_NAME_PATTERN.matcher(escapedTagsArg);
      while (matcher.find()) {
        // allocate lazily so that a separator-only argument behaves like no argument at all
        if (null == escapedTags) {
          escapedTags = new HashSet<String>();
        }
        escapedTags.add(matcher.group(0));
      }
    }
  }
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,58 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.CharStream;
/**
 * Factory for {@link LegacyHTMLStripCharFilter}.
 * <pre class="prettyprint" >
 * &lt;fieldType name="text_html_legacy" class="solr.TextField" positionIncrementGap="100"&gt;
 * &lt;analyzer&gt;
 * &lt;charFilter class="solr.LegacyHTMLStripCharFilterFactory"/&gt;
 * &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 * &lt;/analyzer&gt;
 * &lt;/fieldType&gt;
 * </pre>
 * <p>
 * New users should <b>NOT</b> use this factory; treat it as
 * <b>UNSUPPORTED</b>.
 * </p>
 * <p>
 * Up to and including Solr version 3.5, <tt>HTMLStripCharFilter(Factory)</tt>
 * produced offsets with known bugs, which could trigger e.g. exceptions in
 * highlighting.
 * </p>
 * <p>
 * This class exists as a possible alternative for people who rely on the
 * "broken" behavior of <tt>HTMLStripCharFilter</tt> in Solr version 3.5 and
 * earlier, and/or who don't like the changes introduced by the Solr 3.6+
 * version of <tt>HTMLStripCharFilterFactory</tt>. (See the 3.6.0 release
 * section of lucene/CHANGES.txt for a list of differences in behavior.)
 * </p>
 * @deprecated use {@link HTMLStripCharFilterFactory}
 */
@Deprecated
public class LegacyHTMLStripCharFilterFactory extends BaseCharFilterFactory {

  /**
   * Wraps the given stream in a new {@link LegacyHTMLStripCharFilter}.
   *
   * @param input the character stream to filter
   * @return the legacy filter wrapping {@code input}
   */
  public LegacyHTMLStripCharFilter create(CharStream input) {
    final LegacyHTMLStripCharFilter legacyFilter = new LegacyHTMLStripCharFilter(input);
    return legacyFilter;
  }
}

View File

@ -0,0 +1,321 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util._TestUtil;
import org.junit.Ignore;
/**
 * Tests for {@link LegacyHTMLStripCharFilter}: stripping of markup, entity
 * decoding, escaped ("reserved") tags, read-ahead buffer limits, offset
 * correction, and robustness against random input.
 */
public class LegacyHTMLStripCharFilterTest extends BaseTokenStreamTestCase {

  /** Drains {@code reader} one character at a time and returns everything it produced. */
  private static String readAll(Reader reader) throws IOException {
    StringBuilder builder = new StringBuilder();
    int ch;
    while ((ch = reader.read()) != -1) {
      builder.append((char) ch);
    }
    return builder.toString();
  }

  /** Returns a fresh escaped-tags set holding only the tag name "reserved". */
  private static Set<String> reservedTags() {
    Set<String> set = new HashSet<String>();
    set.add("reserved");
    return set;
  }

  //this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
  //
  /** Checks the stripped output character by character so a mismatch reports its position. */
  public void test() throws IOException {
    String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
        "another <a href=\"http://lucene.apache.org/\">link</a>. " +
        "This is an entity: &amp; plus a &lt;. Here is an &. <!-- is a comment -->";
    String gold = " this is some text here is a link and " +
        "another link . " +
        "This is an entity: & plus a <. Here is an &. ";
    LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(html)));
    StringBuilder builder = new StringBuilder();
    int ch = -1;
    char [] goldArray = gold.toCharArray();
    int position = 0;
    while ((ch = reader.read()) != -1){
      char theChar = (char) ch;
      builder.append(theChar);
      // fail readably, rather than with an index exception, when the output outgrows the expectation
      assertTrue("Output is longer than expected at position: " + position
          + " Buffer so far: " + builder + "<EOB>", position < goldArray.length);
      assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal: " + goldArray[position]
          + " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
      position++;
    }
    assertEquals(gold, builder.toString());
  }

  //Some sanity checks, but not a full-fledged check
  public void testHTML() throws Exception {
    InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
    assertNotNull("Test resource htmlStripReaderTest.html not found", stream);
    String str;
    try {
      str = readAll(new LegacyHTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8"))));
    } finally {
      stream.close();
    }
    assertTrue("Entity not properly escaped", str.indexOf("&lt;") == -1);//there is one > in the text
    assertTrue("Forrest should have been stripped out", str.indexOf("forrest") == -1 && str.indexOf("Forrest") == -1);
    assertTrue("File should start with 'Welcome to Solr' after trimming", str.trim().startsWith("Welcome to Solr"));
    assertTrue("File should end with 'Foundation.' after trimming", str.trim().endsWith("Foundation."));
  }

  public void testGamma() throws Exception {
    String test = "&Gamma;";
    String gold = "\u0393";
    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), reservedTags());
    String result = readAll(reader);
    assertEquals(result + " is not equal to " + gold + "<EOS>", gold, result);
  }

  public void testEntities() throws Exception {
    String test = "&nbsp; &lt;foo&gt; &Uuml;bermensch &#61; &Gamma; bar &#x393;";
    String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), reservedTags());
    String result = readAll(reader);
    assertEquals(result + " is not equal to " + gold + "<EOS>", gold, result);
  }

  public void testMoreEntities() throws Exception {
    String test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;";
    String gold = " <junk/> ! @ and ";
    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), reservedTags());
    String result = readAll(reader);
    assertEquals(result + " is not equal to " + gold, gold, result);
  }

  /** Tags named in the escaped set must survive at fixed positions; other tags must go. */
  public void testReserved() throws Exception {
    String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), reservedTags());
    String result = readAll(reader);
    assertTrue("Escaped tag not preserved: " + result.indexOf("reserved"), result.indexOf("reserved") == 9);
    assertTrue("Escaped tag not preserved: " + result.indexOf("reserved", 15), result.indexOf("reserved", 15) == 38);
    assertTrue("Escaped tag not preserved: " + result.indexOf("reserved", 41), result.indexOf("reserved", 41) == 54);
    assertTrue("Other tag should be removed", result.indexOf("other") == -1);
  }

  public void testMalformedHTML() throws Exception {
    String test = "a <a hr<ef=aa<a>> </close</a>";
    String gold = "a <a hr<ef=aa > </close ";
    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)));
    String result = readAll(reader);
    assertEquals(result + " is not equal to " + gold + "<EOS>", gold, result);
  }

  /** Feeds constructs longer than the read-ahead limit and expects them to pass through unchanged. */
  public void testBufferOverflow() throws Exception {
    StringBuilder testBuilder = new StringBuilder(LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
    testBuilder.append("ah<?> ??????");
    appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
    processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions

    testBuilder.setLength(0);
    testBuilder.append("<!--");//comments
    appendChars(testBuilder, 3*LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
    testBuilder.append("-->foo");
    processBuffer(testBuilder.toString(), "Failed w/ comment");

    testBuilder.setLength(0);
    testBuilder.append("<?");
    appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
    testBuilder.append("?>");
    processBuffer(testBuilder.toString(), "Failed with proc. instr.");

    testBuilder.setLength(0);
    testBuilder.append("<b ");
    appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
    testBuilder.append("/>");
    processBuffer(testBuilder.toString(), "Failed on tag");
  }

  /** Appends {@code numChars / 2} repetitions of "a " to {@code testBuilder}. */
  private void appendChars(StringBuilder testBuilder, int numChars) {
    int i1 = numChars / 2;
    for (int i = 0; i < i1; i++){
      testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes LegacyHTMLStripCharFilter think it is a processing instruction
    }
  }

  /** Runs {@code test} through the filter and asserts that it comes out unchanged. */
  private void processBuffer(String test, String assertMsg) throws IOException {
    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
    String result = readAll(reader);
    assertEquals(assertMsg + "::: " + result + " is not equal to " + test, test, result);
  }

  public void testComment() throws Exception {
    String test = "<!--- three dashes, still a valid comment ---> ";
    String gold = " ";
    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
    String result = readAll(reader);
    assertEquals(result + " is not equal to " + gold + "<EOS>", gold, result);
  }

  /**
   * Reads {@code in} through the filter and, for every 'X' produced, asserts
   * that the corrected offset equals the index of the matching 'X' in {@code in}.
   */
  public void doTestOffsets(String in) throws Exception {
    LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
    int ch = 0;
    int off = 0; // offset in the reader
    int strOff = -1; // offset in the original string
    while ((ch = reader.read()) != -1) {
      int correctedOff = reader.correctOffset(off);
      if (ch == 'X') {
        strOff = in.indexOf('X',strOff+1);
        assertEquals(strOff, correctedOff);
      }
      off++;
    }
  }

  public void testOffsets() throws Exception {
    doTestOffsets("hello X how X are you");
    doTestOffsets("hello <p> X<p> how <p>X are you");
    doTestOffsets("X &amp; X &#40; X &lt; &gt; X");
    // test backtracking
    doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
  }

  @Ignore("broken offsets: see LUCENE-2208")
  public void testRandom() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }

      @Override
      protected Reader initReader(Reader reader) {
        return new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
      }
    };

    int numRounds = RANDOM_MULTIPLIER * 10000;
    checkRandomData(random, analyzer, numRounds);
  }

  /** Only checks that reading random HTML-ish input to the end does not throw. */
  public void testRandomBrokenHTML() throws Exception {
    int maxNumElements = 10000;
    String text = _TestUtil.randomHtmlishString(random, maxNumElements);
    Reader reader
        = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(text)));
    while (reader.read() != -1) {
      // discard: the output itself is not checked
    }
  }

  /** Only checks that reading random plain text to the end does not throw. */
  public void testRandomText() throws Exception {
    StringBuilder text = new StringBuilder();
    int minNumWords = 10;
    int maxNumWords = 10000;
    int minWordLength = 3;
    int maxWordLength = 20;
    int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
    switch (_TestUtil.nextInt(random, 0, 4)) {
      case 0: {
        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
          text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
          text.append(' ');
        }
        break;
      }
      case 1: {
        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
          text.append(_TestUtil.randomRealisticUnicodeString
              (random, minWordLength, maxWordLength));
          text.append(' ');
        }
        break;
      }
      default: { // ASCII 50% of the time
        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
          text.append(_TestUtil.randomSimpleString(random));
          text.append(' ');
        }
      }
    }
    Reader reader = new LegacyHTMLStripCharFilter
        (CharReader.get(new StringReader(text.toString())));
    while (reader.read() != -1) {
      // discard: the output itself is not checked
    }
  }
}

View File

@ -0,0 +1,130 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
/**
 * Simple tests to ensure this factory is working: each test configures the
 * factory's "escapedTags" argument differently and checks the resulting
 * whitespace-separated tokens together with their start and end offsets.
 */
public class TestHTMLStripCharFilterFactory extends BaseTokenTestCase {

  //                                              11111111112222222222333333333344
  //                                    012345678901234567890123456789012345678901
  private static final String MARKUP = "<u>this</u> is <b>only</b> a <I>test</I>.";

  /**
   * Builds a whitespace tokenizer over {@code text} filtered by a freshly
   * initialized factory.
   *
   * @param text        the input to filter and tokenize
   * @param escapedTags value for the "escapedTags" argument, or null to omit the argument
   */
  private static TokenStream tokenize(String text, String escapedTags) {
    Map<String,String> factoryArgs = new HashMap<String,String>();
    if (escapedTags != null) {
      factoryArgs.put("escapedTags", escapedTags);
    }
    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
    factory.init(factoryArgs);
    CharStream filtered = factory.create(CharReader.get(new StringReader(text)));
    return new MockTokenizer(filtered, MockTokenizer.WHITESPACE, false);
  }

  /** Asserts that every tag of {@link #MARKUP} was stripped from {@code stream}. */
  private static void assertAllTagsStripped(TokenStream stream) throws IOException {
    assertTokenStreamContents(stream,
        new String[] { "this", "is", "only", "a", "test." },
        new int[] { 3, 12, 18, 27, 32 },
        new int[] { 11, 14, 26, 28, 41 });
  }

  public void testNothingChanged() throws IOException {
    //                            11111111112
    //                  012345678901234567890
    final String text = "this is only a test.";
    assertTokenStreamContents(tokenize(text, "a, Title"),
        new String[] { "this", "is", "only", "a", "test." },
        new int[] { 0, 5, 8, 13, 15 },
        new int[] { 4, 7, 12, 14, 20 });
  }

  public void testNoEscapedTags() throws IOException {
    assertAllTagsStripped(tokenize(MARKUP, null));
  }

  public void testEscapedTags() throws IOException {
    assertTokenStreamContents(tokenize(MARKUP, "U i"),
        new String[] { "<u>this</u>", "is", "only", "a", "<I>test</I>." },
        new int[] { 0, 12, 18, 27, 29 },
        new int[] { 11, 14, 26, 28, 41 });
  }

  public void testSeparatorOnlyEscapedTags() throws IOException {
    assertAllTagsStripped(tokenize(MARKUP, ",, , "));
  }

  public void testEmptyEscapedTags() throws IOException {
    assertAllTagsStripped(tokenize(MARKUP, ""));
  }

  public void testSingleEscapedTag() throws IOException {
    assertTokenStreamContents(tokenize(MARKUP, ", B\r\n\t"),
        new String[] { "this", "is", "<b>only</b>", "a", "test." },
        new int[] { 3, 12, 15, 27, 32 },
        new int[] { 11, 14, 26, 28, 41 });
  }
}

View File

@ -0,0 +1,350 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta content="Apache Forrest" name="Generator">
<meta name="Forrest-version" content="0.8">
<meta name="Forrest-skin-name" content="pelt">
<title>Welcome to Solr</title>
<link type="text/css" href="skin/basic.css" rel="stylesheet">
<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
<link type="text/css" href="skin/profile.css" rel="stylesheet">
<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
<link rel="shortcut icon" href="images/favicon.ico">
</head>
<body onload="init()">
<script type="text/javascript">ndeSetTextSize();</script>
<div id="top">
<!--+
|breadtrail
+-->
<div class="breadtrail">
<a href="http://www.apache.org/">apache</a> &gt; <a href="http://lucene.apache.org/">lucene</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
+-->
<div class="header">
<!--+
|start group logo
+-->
<div class="grouplogo">
<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
+-->
<!--+
|start Project Logo
+-->
<div class="projectlogo">
<a href="http://lucene.apache.org/solr/"><img class="logoImage" alt="Solr" src="images/solr_small.png" title="Solr Description"></a>
</div>
<!--+
|end Project Logo
+-->
<!--+
|start Search
+-->
<div class="searchbox">
<form action="http://www.google.com/search" method="get" class="roundtopsmall">
<input value="lucene.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp;
<input name="Search" value="Search" type="submit">
</form>
</div>
<!--+
|end search
+-->
<!--+
|start Tabs
+-->
<ul id="tabs">
<li class="current">
<a class="selected" href="index.html">Main</a>
</li>
<li>
<a class="unselected" href="http://wiki.apache.org/solr">Wiki</a>
</li>
</ul>
<!--+
|end Tabs
+-->
</div>
</div>
<div id="main">
<div id="publishedStrip">
<!--+
|start Subtabs
+-->
<div id="level2tabs"></div>
<!--+
|end Endtabs
+-->
<script type="text/javascript"><!--
document.write("Last Published: " + document.lastModified);
// --></script>
</div>
<!--+
|breadtrail
+-->
<div class="breadtrail">
&nbsp;
</div>
<!--+
|start Menu, mainarea
+-->
<!--+
|start Menu
+-->
<div id="menu">
<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">About</div>
<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
<div class="menupage">
<div class="menupagetitle">Welcome</div>
</div>
<div class="menuitem">
<a href="who.html" title="Solr Committers">Who We Are</a>
</div>
</div>
<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
<div id="menu_1.2" class="menuitemgroup">
<div class="menuitem">
<a href="features.html">Features</a>
</div>
<div class="menuitem">
<a href="tutorial.html">Tutorial</a>
</div>
<div class="menuitem">
<a href="http://wiki.apache.org/solr/">Docs (Wiki)</a>
</div>
<div class="menuitem">
<a href="http://wiki.apache.org/solr/FAQ">FAQ</a>
</div>
<div class="menuitem">
<a href="api/index.html">javadoc</a>
</div>
</div>
<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div>
<div id="menu_1.3" class="menuitemgroup">
<div class="menuitem">
<a href="http://www.apache.org/dyn/closer.cgi/lucene/solr/">Download</a>
</div>
<div class="menuitem">
<a href="mailing_lists.html">Mailing Lists</a>
</div>
<div class="menuitem">
<a href="issue_tracking.html">Issue Tracking</a>
</div>
<div class="menuitem">
<a href="version_control.html">Version Control</a>
</div>
</div>
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
<a href="http://lucene.apache.org/java/">Lucene Java</a>
</div>
<div class="menuitem">
<a href="http://lucene.apache.org/nutch/">Nutch</a>
</div>
</div>
<div id="credit">
<hr>
<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a>
</div>
<div id="roundbottom">
<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
<!--+
|alternative credits
+-->
<div id="credit2"></div>
</div>
<!--+
|end Menu
+-->
<!--+
|start content
+-->
<div id="content">
<div title="Portable Document Format" class="pdflink">
<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
PDF</a>
</div>
<h1>Welcome to Solr</h1>
<div id="minitoc-area">
<ul class="minitoc">
<li>
<a href="#intro">What Is Solr?</a>
</li>
<li>
<a href="#news">News</a>
<ul class="minitoc">
<li>
<a href="#02+October+2007+-+Solr+at+OSSummit+Asia">02 October 2007 - Solr at OSSummit Asia</a>
</li>
<li>
<a href="#03+September+2007+-+Lucene+at+ApacheCon+Atlanta">03 September 2007 - Lucene at ApacheCon Atlanta</a>
</li>
<li>
<a href="#06+June+2007%3A+Release+1.2+available">06 June 2007: Release 1.2 available</a>
</li>
<li>
<a href="#17+January+2007%3A+Solr+graduates+from+Incubator">17 January 2007: Solr graduates from Incubator</a>
</li>
<li>
<a href="#22+December+2006%3A+Release+1.1.0+available">22 December 2006: Release 1.1.0 available</a>
</li>
<li>
<a href="#15+August+2006%3A+Solr+at+ApacheCon+US">15 August 2006: Solr at ApacheCon US</a>
</li>
<li>
<a href="#21+April+2006%3A+Solr+at+ApacheCon">21 April 2006: Solr at ApacheCon</a>
</li>
<li>
<a href="#21+February+2006%3A+nightly+builds">21 February 2006: nightly builds</a>
</li>
<li>
<a href="#17+January+2006%3A+Solr+Joins+Apache+Incubator">17 January 2006: Solr Joins Apache Incubator</a>
</li>
</ul>
</li>
</ul>
</div>
<a name="N1000D"></a><a name="intro"></a>
<h2 class="boxed">What Is Solr?</h2>
<div class="section">
<p>
Solr is an open source enterprise search server based on the
<a href="http://lucene.apache.org/java/">Lucene Java</a> search library, with XML/HTTP and JSON APIs,
hit highlighting, faceted search, caching, replication, and a web administration interface.
It runs in a Java servlet container such as <a href="http://tomcat.apache.org">Tomcat</a>.
</p>
<p>
See the complete <a href="features.html">feature list</a> for more details, then check out the <a href="tutorial.html">tutorial</a>.
</p>
</div>
<a name="N1002A"></a><a name="news"></a>
<h2 class="boxed">News</h2>
<div class="section">
<a name="N10030"></a><a name="02+October+2007+-+Solr+at+OSSummit+Asia"></a>
<h3 class="boxed">02 October 2007 - Solr at OSSummit Asia</h3>
<p>
<a href="http://www.ossummit.com"><img alt="OSSummit Asia logo" class="float-right" src="http://www.ossummit.com/2007/images/logo.png"></a>
Lucene and Solr tutorials!
</p>
<p>The following talks and trainings are scheduled for the upcoming 2007 OSSummit:</p>
<ul>
<li>
<a href="http://www.ossummit.com/2007/program/talk/8">Lucene Boot Camp</a> by Erik Hatcher (originally by Grant Ingersoll). An all-day training focusing on getting started with Lucene - the core library under Solr.</li>
<li>
<a href="http://www.ossummit.com/2007/program/talk/25">Solr in a Day</a> by Erik Hatcher. All you need to know to use Solr effectively.</li>
<li>
<a href="http://www.ossummit.com/2007/program/talk/67">Lucene Case Studies</a> by Erik Hatcher. A rapid series of examples of many applications that use Lucene and Solr.</li>
</ul>
<a name="N10058"></a><a name="03+September+2007+-+Lucene+at+ApacheCon+Atlanta"></a>
<h3 class="boxed">03 September 2007 - Lucene at ApacheCon Atlanta</h3>
<p>
<a href="http://www.us.apachecon.com"><img alt="ApacheCon US logo" class="float-right" src="http://www.apache.org/ads/ApacheCon/2007-usa-125x125.png"></a>
Lucene will once again be well represented at ApacheCon USA in Atlanta this November 12-16, 2007.
</p>
<p>The following talks and trainings are scheduled for this year's conference:</p>
<ul>
<li>November 12: <a href="http://us.apachecon.com/us2007/program/talk/1859">Lucene Boot Camp</a> by Grant Ingersoll. An all-day training focusing on getting started with Lucene.</li>
<li>November 16, 9:00 am: <a href="http://us.apachecon.com/us2007/program/talk/1992">Apache Solr out of the Box</a> by Chris Hostetter. Introduction to Solr.</li>
<li>November 16, 10:00 am: <a href="http://us.apachecon.com/us2007/program/talk/1943">Building a Vertical Search Site using Apache Software</a> by Ken Krugler. Will cover many Lucene-based projects.</li>
<li>November 16, 3:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/1953">Apache Lucene Performance</a> by Grant Ingersoll. Tips and techniques for improving Lucene performance.</li>
<li>November 16, 4:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/2017"> Advanced Indexing Techniques with Apache Lucene</a> by Michael Busch. Information on payloads and advanced indexing techniques.</li>
</ul>
<a name="N10091"></a><a name="06+June+2007%3A+Release+1.2+available"></a>
<h3 class="boxed">06 June 2007: Release 1.2 available</h3>
<p>
This is the first release since Solr graduated from the Incubator,
bringing many new features, including CSV/delimited-text data
loading, time based autocommit, faster faceting, negative filters,
a spell-check handler, sounds-like word filters, regex text filters,
and more flexible plugins.
</p>
<p>See the <a href="http://svn.apache.org/repos/asf/lucene/solr/tags/release-1.2.0/CHANGES.txt">release notes</a> for more details.</p>
<a name="N100A2"></a><a name="17+January+2007%3A+Solr+graduates+from+Incubator"></a>
<h3 class="boxed">17 January 2007: Solr graduates from Incubator</h3>
<p>
Solr has graduated from the Apache Incubator, and is now a sub-project of Lucene.
</p>
<a name="N100AC"></a><a name="22+December+2006%3A+Release+1.1.0+available"></a>
<h3 class="boxed">22 December 2006: Release 1.1.0 available</h3>
<p>
This is the first release since Solr joined the Incubator, and brings
many new features and performance optimizations including highlighting,
faceted search, and JSON/Python/Ruby response formats.
</p>
<a name="N100B6"></a><a name="15+August+2006%3A+Solr+at+ApacheCon+US"></a>
<h3 class="boxed">15 August 2006: Solr at ApacheCon US</h3>
<p>Chris Hostetter will be presenting
<strong><a href="http://www.apachecon.com/2006/US/html/sessions.html#FR26">"Faceted Searching With Apache Solr"</a></strong>
at ApacheCon US 2006, on October 13th at 4:30pm.
See the <a href="http://www.us.apachecon.com/">ApacheCon</a> website for more details.
</p>
<a name="N100C9"></a><a name="21+April+2006%3A+Solr+at+ApacheCon"></a>
<h3 class="boxed">21 April 2006: Solr at ApacheCon</h3>
<p>Yonik Seeley will be presenting
<strong>"Apache Solr, a Full-Text Search Server based on Lucene"</strong>
at ApacheCon Europe 2006, on June 29th at 5:30pm.
See the <a href="http://www.eu.apachecon.com/">ApacheCon</a> website for more details.
</p>
<a name="N100DA"></a><a name="21+February+2006%3A+nightly+builds"></a>
<h3 class="boxed">21 February 2006: nightly builds</h3>
<p>Solr now has nightly builds. This automatically creates a
<a href="http://people.apache.org/builds/lucene/solr/nightly/">downloadable version of Solr every
night</a>. All unit tests must pass, or a message is sent to
the developers mailing list and no new version is created. This
also updates the <a href="api/index.html">javadoc</a>.</p>
<a name="N100EC"></a><a name="17+January+2006%3A+Solr+Joins+Apache+Incubator"></a>
<h3 class="boxed">17 January 2006: Solr Joins Apache Incubator</h3>
<p>Solr, a search server based on Lucene, has been accepted into the Apache Incubator.
Solr was originally developed by CNET Networks, and is widely used within CNET
to provide high relevancy search and faceted browsing capabilities.
</p>
</div>
</div>
<!--+
|end content
+-->
<div class="clearboth">&nbsp;</div>
</div>
<div id="footer">
<!--+
|start bottomstrip
+-->
<div class="lastmodified">
<script type="text/javascript"><!--
document.write("Last Published: " + document.lastModified);
// --></script>
</div>
<div class="copyright">
Copyright &copy;
2007 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
</div>
<div id="logos"></div>
<!--+
|end bottomstrip
+-->
</div>
</body>
</html>

View File

@ -326,8 +326,8 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
NamedList indexPart = textType.get("index");
assertNotNull("expecting an index token analysis for field type 'charfilthtmlmap'", indexPart);
assertEquals(" whátëvêr ", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
assertEquals(" whatever ", indexPart.get("org.apache.lucene.analysis.charfilter.MappingCharFilter"));
assertEquals("\n\nwhátëvêr\n\n", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
assertEquals("\n\nwhatever\n\n", indexPart.get("org.apache.lucene.analysis.charfilter.MappingCharFilter"));
List<NamedList> tokenList = (List<NamedList>)indexPart.get(MockTokenizer.class.getName());
assertNotNull("Expecting MockTokenizer analysis breakdown", tokenList);