mirror of https://github.com/apache/lucene.git
LUCENE-3690: Re-implemented HTMLStripCharFilter as a JFlex-generated scanner. Fixes LUCENE-2208, SOLR-882, and SOLR-42.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1234452 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
17fe719bb5
commit
f3a363708f
|
@ -793,6 +793,9 @@ New Features
|
|||
* LUCENE-3121: Add TypeTokenFilter that filters tokens based on
|
||||
their TypeAttribute. (Tommaso Teofili via Uwe Schindler)
|
||||
|
||||
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
|
||||
markup. (Steve Rowe)
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-3595: Fixed FieldCacheRangeFilter and FieldCacheTermsFilter
|
||||
|
|
|
@ -249,7 +249,42 @@ public class _TestUtil {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: make this more evil
|
||||
/**
 * Names of HTML character entities, without the leading '&amp;' or the
 * trailing ';'. randomHtmlishString picks a random element of this array
 * when it emits a named entity reference.
 */
private static final String[] HTML_CHAR_ENTITIES = {
|
||||
"AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
|
||||
"Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
|
||||
"Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT",
|
||||
"Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT",
|
||||
"Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
|
||||
"Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi",
|
||||
"QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta",
|
||||
"Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml",
|
||||
"Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym",
|
||||
"alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde",
|
||||
"auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil",
|
||||
"cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup",
|
||||
"curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide",
|
||||
"eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
|
||||
"equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall",
|
||||
"frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr",
|
||||
"harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave",
|
||||
"image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa",
|
||||
"lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
|
||||
"lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
|
||||
"mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash",
|
||||
"ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
|
||||
"oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf",
|
||||
"ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
|
||||
"perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop",
|
||||
"psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil",
|
||||
"rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo",
|
||||
"sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim",
|
||||
"spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe",
|
||||
"szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn",
|
||||
"tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
|
||||
"uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
|
||||
"yuml", "zeta", "zwj", "zwnj"
|
||||
};
|
||||
|
||||
public static String randomHtmlishString(Random random, int numElements) {
|
||||
final int end = random.nextInt(numElements);
|
||||
if (end == 0) {
|
||||
|
@ -258,17 +293,80 @@ public class _TestUtil {
|
|||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < end; i++) {
|
||||
int val = random.nextInt(10);
|
||||
int val = random.nextInt(25);
|
||||
switch(val) {
|
||||
case 0: sb.append("<p>"); break;
|
||||
case 1: sb.append("</p>"); break;
|
||||
case 2: sb.append("<!--"); break;
|
||||
case 3: sb.append("-->"); break;
|
||||
case 4: sb.append("&#"); break;
|
||||
case 5: sb.append(";"); break;
|
||||
case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
|
||||
default:
|
||||
sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
|
||||
case 1: {
|
||||
sb.append("<");
|
||||
sb.append(" ".substring(nextInt(random, 0, 4)));
|
||||
sb.append(randomSimpleString(random));
|
||||
for (int j = 0 ; j < nextInt(random, 0, 10) ; ++j) {
|
||||
sb.append(' ');
|
||||
sb.append(randomSimpleString(random));
|
||||
sb.append(" ".substring(nextInt(random, 0, 1)));
|
||||
sb.append('=');
|
||||
sb.append(" ".substring(nextInt(random, 0, 1)));
|
||||
sb.append("\"".substring(nextInt(random, 0, 1)));
|
||||
sb.append(randomSimpleString(random));
|
||||
sb.append("\"".substring(nextInt(random, 0, 1)));
|
||||
}
|
||||
sb.append(" ".substring(nextInt(random, 0, 4)));
|
||||
sb.append("/".substring(nextInt(random, 0, 1)));
|
||||
sb.append(">".substring(nextInt(random, 0, 1)));
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
sb.append("</");
|
||||
sb.append(" ".substring(nextInt(random, 0, 4)));
|
||||
sb.append(randomSimpleString(random));
|
||||
sb.append(" ".substring(nextInt(random, 0, 4)));
|
||||
sb.append(">".substring(nextInt(random, 0, 1)));
|
||||
break;
|
||||
}
|
||||
case 3: sb.append(">"); break;
|
||||
case 4: sb.append("</p>"); break;
|
||||
case 5: sb.append("<!--"); break;
|
||||
case 6: sb.append("<!--#"); break;
|
||||
case 7: sb.append("<script><!-- f('"); break;
|
||||
case 8: sb.append("</script>"); break;
|
||||
case 9: sb.append("<?"); break;
|
||||
case 10: sb.append("?>"); break;
|
||||
case 11: sb.append("\""); break;
|
||||
case 12: sb.append("\\\""); break;
|
||||
case 13: sb.append("'"); break;
|
||||
case 14: sb.append("\\'"); break;
|
||||
case 15: sb.append("-->"); break;
|
||||
case 16: {
|
||||
sb.append("&");
|
||||
switch(nextInt(random, 0, 2)) {
|
||||
case 0: sb.append(randomSimpleString(random)); break;
|
||||
case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break;
|
||||
}
|
||||
sb.append(";".substring(nextInt(random, 0, 1)));
|
||||
break;
|
||||
}
|
||||
case 17: {
|
||||
sb.append("&#");
|
||||
if (0 == nextInt(random, 0, 1)) {
|
||||
sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
|
||||
sb.append(";".substring(nextInt(random, 0, 1)));
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 18: {
|
||||
sb.append("&#x");
|
||||
if (0 == nextInt(random, 0, 1)) {
|
||||
sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
|
||||
sb.append(";".substring(nextInt(random, 0, 1)));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case 19: sb.append(";"); break;
|
||||
case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
|
||||
case 21: sb.append("\n");
|
||||
case 22: sb.append(" ".substring(nextInt(random, 0, 10)));
|
||||
default: sb.append(randomSimpleString(random));
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
|
|
|
@ -31,7 +31,8 @@
|
|||
<target name="compile-core" depends="jflex-notice, common.compile-core"/>
|
||||
|
||||
<target name="jflex" depends="jflex-check,clean-jflex,gen-uax29-supp-macros,
|
||||
jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
|
||||
jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,
|
||||
jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>
|
||||
|
||||
<target name="gen-uax29-supp-macros">
|
||||
<subant target="gen-uax29-supp-macros">
|
||||
|
@ -39,6 +40,29 @@
|
|||
</subant>
|
||||
</target>
|
||||
|
||||
<!-- Regenerates HTMLStripCharFilter.java from HTMLStripCharFilter.jflex.
     Runs only when the jflex.present property is set, and only after the
     init, jflex-check and generate-jflex-html-char-entities targets. -->
<target name="jflex-HTMLStripCharFilter"
|
||||
depends="init,jflex-check,generate-jflex-html-char-entities"
|
||||
if="jflex.present">
|
||||
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
|
||||
<classpath refid="jflex.classpath"/>
|
||||
</taskdef>
|
||||
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
|
||||
outdir="src/java/org/apache/lucene/analysis/charfilter"
|
||||
nobak="on"/>
|
||||
<!-- Remove the inappropriate JFlex-generated constructors -->
<!-- The pattern spans from the "Creates a new scanner" javadoc through the
     closing brace of the InputStreamReader-based constructor; the "s" flag
     lets "." match across line breaks. -->
|
||||
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
|
||||
match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
|
||||
replace="" flags="sg"/>
|
||||
</target>
|
||||
|
||||
<!-- Runs htmlentity.py with the interpreter named by ${python.exe}, using the
     charfilter source directory as working directory, and writes the script's
     output to HTMLCharacterEntities.jflex. failonerror="true" makes the build
     fail if the script exits with an error. -->
<target name="generate-jflex-html-char-entities">
|
||||
<exec dir="src/java/org/apache/lucene/analysis/charfilter"
|
||||
output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
|
||||
executable="${python.exe}" failonerror="true" logerror="true">
|
||||
<arg value="htmlentity.py"/>
|
||||
</exec>
|
||||
</target>
|
||||
|
||||
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
|
||||
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
|
||||
<classpath refid="jflex.classpath"/>
|
||||
|
|
|
@ -20,6 +20,8 @@ package org.apache.lucene.analysis.charfilter;
|
|||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Base utility class for implementing a {@link CharFilter}.
|
||||
* You subclass this, and then record mappings by calling
|
||||
|
@ -71,6 +73,19 @@ public abstract class BaseCharFilter extends CharFilter {
|
|||
0 : diffs[size-1];
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Adds an offset correction mapping at the given output stream offset.
|
||||
* </p>
|
||||
* <p>
|
||||
* Assumption: the offset given with each successive call to this method
|
||||
* will not be smaller than the offset given at the previous invocation.
|
||||
* </p>
|
||||
*
|
||||
* @param off The output stream offset at which to apply the correction
|
||||
* @param cumulativeDiff The input offset is given by adding this
|
||||
* to the output offset
|
||||
*/
|
||||
protected void addOffCorrectMap(int off, int cumulativeDiff) {
|
||||
if (offsets == null) {
|
||||
offsets = new int[64];
|
||||
|
@ -80,7 +95,15 @@ public abstract class BaseCharFilter extends CharFilter {
|
|||
diffs = ArrayUtil.grow(diffs);
|
||||
}
|
||||
|
||||
// Enforce the documented assumption that offsets arrive in non-decreasing
// order. The last recorded offset lives at index size - 1; slot [size] is
// the one about to be written, so it does not hold a recorded offset.
// The size == 0 guard keeps the size - 1 index in bounds.
assert (size == 0 || off >= offsets[size - 1])
    : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
      + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
|
||||
|
||||
if (size == 0 || off != offsets[size - 1]) {
|
||||
offsets[size] = off;
|
||||
diffs[size++] = cumulativeDiff;
|
||||
} else { // Overwrite the diff at the last recorded offset
|
||||
diffs[size - 1] = cumulativeDiff;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,153 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
|
||||
| "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
|
||||
| "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
|
||||
| "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
|
||||
| "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
|
||||
| "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
|
||||
| "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
|
||||
| "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
|
||||
| "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
|
||||
| "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
|
||||
| "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
|
||||
| "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
|
||||
| "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
|
||||
| "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
|
||||
| "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
|
||||
| "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
|
||||
| "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
|
||||
| "divide" | "eacute" | "ecirc" | "egrave" | "empty"
|
||||
| "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
|
||||
| "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
|
||||
| "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
|
||||
| "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
|
||||
| "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
|
||||
| "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
|
||||
| "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
|
||||
| "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
|
||||
| "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
|
||||
| "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
|
||||
| "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
|
||||
| "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
|
||||
| "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
|
||||
| "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
|
||||
| "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
|
||||
| "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
|
||||
| "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
|
||||
| "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
|
||||
| "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
|
||||
| "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
|
||||
| "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
|
||||
| "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
|
||||
| "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
|
||||
| "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
|
||||
| "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
|
||||
| "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
|
||||
| "zwj" | "zwnj" )
|
||||
%{
|
||||
private static final Set<String> upperCaseVariantsAccepted
|
||||
= new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));
|
||||
private static final CharArrayMap<Character> entityValues
|
||||
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
|
||||
static {
|
||||
String[] entities = {
|
||||
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
|
||||
"Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
|
||||
"Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
|
||||
"Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
|
||||
"Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
|
||||
"Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
|
||||
"Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
|
||||
"Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
|
||||
"Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
|
||||
"Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
|
||||
"Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
|
||||
"Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
|
||||
"Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
|
||||
"Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
|
||||
"Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
|
||||
"Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
|
||||
"Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
|
||||
"Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
|
||||
"aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
|
||||
"aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
|
||||
"alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
|
||||
"apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
|
||||
"atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
|
||||
"beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
|
||||
"ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
|
||||
"circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
|
||||
"crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
|
||||
"dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
|
||||
"diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
|
||||
"ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
|
||||
"emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
|
||||
"equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
|
||||
"euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
|
||||
"forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
|
||||
"frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
|
||||
"gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
|
||||
"hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
|
||||
"iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
|
||||
"infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
|
||||
"isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
|
||||
"lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
|
||||
"larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
|
||||
"lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
|
||||
"lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
|
||||
"mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
|
||||
"minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", " ",
|
||||
"ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
|
||||
"notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
|
||||
"oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
|
||||
"ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
|
||||
"omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
|
||||
"ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
|
||||
"otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
|
||||
"permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
|
||||
"piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
|
||||
"prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
|
||||
"quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
|
||||
"raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
|
||||
"rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
|
||||
"rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
|
||||
"sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
|
||||
"sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
|
||||
"sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
|
||||
"sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
|
||||
"sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
|
||||
"there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
|
||||
"thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
|
||||
"times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
|
||||
"uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
|
||||
"ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
|
||||
"upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
|
||||
"xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
|
||||
"zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
|
||||
};
|
||||
for (int i = 0 ; i < entities.length ; i += 2) {
|
||||
Character value = entities[i + 1].charAt(0);
|
||||
entityValues.put(entities[i], value);
|
||||
if (upperCaseVariantsAccepted.contains(entities[i])) {
|
||||
entityValues.put(entities[i].toUpperCase(), value);
|
||||
}
|
||||
}
|
||||
}
|
||||
%}
|
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Copyright 2010 The Apache Software Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated using ICU4J 4.8.1.1 on Friday, January 13, 2012 6:20:39 PM UTC
|
||||
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
|
||||
|
||||
|
||||
ID_Start_Supp = (
|
||||
[\uD81A][\uDC00-\uDE38]
|
||||
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
|
||||
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
|
||||
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
|
||||
| [\uD82C][\uDC00\uDC01]
|
||||
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF]
|
||||
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
|
||||
| [\uD87E][\uDC00-\uDE1D]
|
||||
| [\uD809][\uDC00-\uDC62]
|
||||
| [\uD808][\uDC00-\uDF6E]
|
||||
| [\uD803][\uDC00-\uDC48]
|
||||
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
|
||||
| [\uD80D][\uDC00-\uDC2E]
|
||||
| [\uD86E][\uDC00-\uDC1D]
|
||||
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
|
||||
| [\uD801][\uDC00-\uDC9D]
|
||||
)
|
||||
ID_Continue_Supp = (
|
||||
[\uD81A][\uDC00-\uDE38]
|
||||
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
|
||||
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
|
||||
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA]
|
||||
| [\uD82C][\uDC00\uDC01]
|
||||
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
|
||||
| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
|
||||
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
|
||||
| [\uD87E][\uDC00-\uDE1D]
|
||||
| [\uD809][\uDC00-\uDC62]
|
||||
| [\uD808][\uDC00-\uDF6E]
|
||||
| [\uD803][\uDC00-\uDC48]
|
||||
| [\uD80D][\uDC00-\uDC2E]
|
||||
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
|
||||
| [\uD86E][\uDC00-\uDC1D]
|
||||
| [\uDB40][\uDD00-\uDDEF]
|
||||
| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
|
||||
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
|
||||
)
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,737 @@
|
|||
package org.apache.lucene.analysis.charfilter;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.util.CharArrayMap;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.OpenStringBuilder;
|
||||
|
||||
|
||||
/**
|
||||
* A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
|
||||
*/
|
||||
%%
|
||||
|
||||
%unicode 6.0
|
||||
%apiprivate
|
||||
%type int
|
||||
%final
|
||||
%public
|
||||
%char
|
||||
%function nextChar
|
||||
%class HTMLStripCharFilter
|
||||
%extends BaseCharFilter
|
||||
%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
|
||||
%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
|
||||
%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
|
||||
%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
|
||||
%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
|
||||
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
|
||||
%xstate STYLE, STYLE_COMMENT
|
||||
|
||||
// From XML 1.0 <http://www.w3.org/TR/xml/>:
|
||||
//
|
||||
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
|
||||
// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | [...]
|
||||
// [5] Name ::= NameStartChar (NameChar)*
|
||||
//
|
||||
// From UAX #31: Unicode Identifier and Pattern Syntax
|
||||
// <http://unicode.org/reports/tr31/>:
|
||||
//
|
||||
// D1. Default Identifier Syntax
|
||||
//
|
||||
// <identifier> := <ID_Start> <ID_Continue>*
|
||||
//
|
||||
Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
|
||||
|
||||
// From Apache httpd mod_include documentation
|
||||
// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
|
||||
//
|
||||
// Basic Elements
|
||||
//
|
||||
// The document is parsed as an HTML document, with special commands
|
||||
// embedded as SGML comments. A command has the syntax:
|
||||
//
|
||||
// <!--#element attribute=value attribute=value ... -->
|
||||
//
|
||||
// The value will often be enclosed in double quotes, but single quotes (')
|
||||
// and backticks (`) are also possible. Many commands only allow a single
|
||||
// attribute-value pair. Note that the comment terminator (-->) should be
|
||||
// preceded by whitespace to ensure that it isn't considered part of an SSI
|
||||
// token. Note that the leading <!--# is one token and may not contain any
|
||||
// whitespaces.
|
||||
//
|
||||
|
||||
EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
|
||||
[bB][lL][uU][rR] |
|
||||
[cC][hH][aA][nN][gG][eE] |
|
||||
[cC][lL][iI][cC][kK] |
|
||||
[dD][bB][lL][cC][lL][iI][cC][kK] |
|
||||
[eE][rR][rR][oO][rR] |
|
||||
[fF][oO][cC][uU][sS] |
|
||||
[kK][eE][yY][dD][oO][wW][nN] |
|
||||
[kK][eE][yY][pP][rR][eE][sS][sS] |
|
||||
[kK][eE][yY][uU][pP] |
|
||||
[lL][oO][aA][dD] |
|
||||
[mM][oO][uU][sS][eE][dD][oO][wW][nN] |
|
||||
[mM][oO][uU][sS][eE][mM][oO][vV][eE] |
|
||||
[mM][oO][uU][sS][eE][oO][uU][tT] |
|
||||
[mM][oO][uU][sS][eE][oO][vV][eE][rR] |
|
||||
[mM][oO][uU][sS][eE][uU][pP] |
|
||||
[rR][eE][sS][eE][tT] |
|
||||
[sS][eE][lL][eE][cC][tT] |
|
||||
[sS][uU][bB][mM][iI][tT] |
|
||||
[uU][nN][lL][oO][aA][dD] )
|
||||
|
||||
SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
|
||||
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
|
||||
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
|
||||
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
|
||||
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
|
||||
|
||||
InlineElment = ( [aAbBiIqQsSuU] |
|
||||
[aA][bB][bB][rR] |
|
||||
[aA][cC][rR][oO][nN][yY][mM] |
|
||||
[bB][aA][sS][eE][fF][oO][nN][tT] |
|
||||
[bB][dD][oO] |
|
||||
[bB][iI][gG] |
|
||||
[cC][iI][tT][eE] |
|
||||
[cC][oO][dD][eE] |
|
||||
[dD][fF][nN] |
|
||||
[eE][mM] |
|
||||
[fF][oO][nN][tT] |
|
||||
[iI][mM][gG] |
|
||||
[iI][nN][pP][uU][tT] |
|
||||
[kK][bB][dD] |
|
||||
[lL][aA][bB][eE][lL] |
|
||||
[sS][aA][mM][pP] |
|
||||
[sS][eE][lL][eE][cC][tT] |
|
||||
[sS][mM][aA][lL][lL] |
|
||||
[sS][pP][aA][nN] |
|
||||
[sS][tT][rR][iI][kK][eE] |
|
||||
[sS][tT][rR][oO][nN][gG] |
|
||||
[sS][uU][bB] |
|
||||
[sS][uU][pP] |
|
||||
[tT][eE][xX][tT][aA][rR][eE][aA] |
|
||||
[tT][tT] |
|
||||
[vV][aA][rR] )
|
||||
|
||||
|
||||
%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
|
||||
|
||||
%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
|
||||
|
||||
%{
|
||||
private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
|
||||
private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
|
||||
private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
|
||||
private static final char BR_START_TAG_REPLACEMENT = '\n';
|
||||
private static final char BR_END_TAG_REPLACEMENT = '\n';
|
||||
private static final char SCRIPT_REPLACEMENT = '\n';
|
||||
private static final char STYLE_REPLACEMENT = '\n';
|
||||
|
||||
private CharArraySet escapedTags = null;
|
||||
private int inputStart;
|
||||
private int cumulativeDiff;
|
||||
private boolean escapeBR = false;
|
||||
private boolean escapeSCRIPT = false;
|
||||
private boolean escapeSTYLE = false;
|
||||
private int restoreState;
|
||||
private int previousRestoreState;
|
||||
private int outputCharCount;
|
||||
private int eofReturnValue;
|
||||
private TextSegment inputSegment
|
||||
= new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
|
||||
private TextSegment outputSegment = inputSegment;
|
||||
private TextSegment entitySegment = new TextSegment(2);
|
||||
|
||||
/**
|
||||
* Creates a filter that reads its input from the given stream.
*
* @param source the character stream to read from; it is handed to the
*        superclass constructor and also stored in zzReader
|
||||
*/
|
||||
public HTMLStripCharFilter(CharStream source) {
|
||||
super(source);
|
||||
// zzReader is not declared in this section (presumably the JFlex-generated
// scanner's input reader - confirm against the generated class).
this.zzReader = source;
|
||||
}
|
||||
|
||||
/**
 * Creates a filter over the given stream that leaves selected tags in place.
 *
 * @param source the character stream to read from
 * @param escapedTags Tags in this set (both start and end tags)
 *  will not be filtered out. May be null, in which case nothing is escaped.
 */
public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
  super(source);
  this.zzReader = source;
  if (escapedTags == null) {
    return;
  }
  for (String tagName : escapedTags) {
    // BR, SCRIPT and STYLE are tracked with dedicated flags rather than
    // being stored in the escapedTags set.
    if (tagName.equalsIgnoreCase("BR")) {
      escapeBR = true;
      continue;
    }
    if (tagName.equalsIgnoreCase("SCRIPT")) {
      escapeSCRIPT = true;
      continue;
    }
    if (tagName.equalsIgnoreCase("STYLE")) {
      escapeSTYLE = true;
      continue;
    }
    // The set is created lazily, on the first tag without a dedicated flag.
    if (this.escapedTags == null) {
      this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
    }
    this.escapedTags.add(tagName);
  }
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
if (outputSegment.isRead()) {
|
||||
if (zzAtEOF) {
|
||||
return -1;
|
||||
}
|
||||
int ch = nextChar();
|
||||
++outputCharCount;
|
||||
return ch;
|
||||
}
|
||||
int ch = outputSegment.nextChar();
|
||||
++outputCharCount;
|
||||
return ch;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char cbuf[], int off, int len) throws IOException {
|
||||
int i = 0;
|
||||
for ( ; i < len ; ++i) {
|
||||
int ch = read();
|
||||
if (ch == -1) break;
|
||||
cbuf[off++] = (char)ch;
|
||||
}
|
||||
return i > 0 ? i : (len == 0 ? 0 : -1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
yyclose();
|
||||
}
|
||||
|
||||
static int getInitialBufferSize() { // Package private, for testing purposes
|
||||
return ZZ_BUFFERSIZE;
|
||||
}
|
||||
|
||||
private class TextSegment extends OpenStringBuilder {
|
||||
/** The position from which the next char will be read. */
|
||||
int pos = 0;
|
||||
|
||||
/** Wraps the given buffer and sets this.len to the given length. */
|
||||
TextSegment(char[] buffer, int length) {
|
||||
super(buffer, length);
|
||||
}
|
||||
|
||||
/** Allocates an internal buffer of the given size. */
|
||||
TextSegment(int size) {
|
||||
super(size);
|
||||
}
|
||||
|
||||
/** Sets len = 0 and pos = 0. */
|
||||
void clear() {
|
||||
reset();
|
||||
restart();
|
||||
}
|
||||
|
||||
/** Sets pos = 0 */
|
||||
void restart() {
|
||||
pos = 0;
|
||||
}
|
||||
|
||||
/** Returns the next char in the segment. */
|
||||
int nextChar() {
|
||||
assert (! isRead()): "Attempting to read past the end of a segment.";
|
||||
return buf[pos++];
|
||||
}
|
||||
|
||||
/** Returns true when all characters in the text segment have been read */
|
||||
boolean isRead() {
|
||||
return pos >= len;
|
||||
}
|
||||
}
|
||||
%}
|
||||
|
||||
%eofval{
  return eofReturnValue;
%eofval}
%eof{
  // Decide, based on the lexical state in which input ended, what is
  // still owed to the reader.  The chosen value is stored in
  // eofReturnValue, which the eofval code above returns.
  switch (zzLexicalState) {
    case SCRIPT:
    case COMMENT:
    case SCRIPT_COMMENT:
    case STYLE:
    case STYLE_COMMENT:
    case SINGLE_QUOTED_STRING:
    case DOUBLE_QUOTED_STRING:
    case END_TAG_TAIL_EXCLUDE:
    case END_TAG_TAIL_SUBSTITUTE:
    case START_TAG_TAIL_EXCLUDE:
    case SERVER_SIDE_INCLUDE:
    case START_TAG_TAIL_SUBSTITUTE: { // Exclude
      // Everything consumed since inputStart is dropped from the output.
      cumulativeDiff += yychar - inputStart;
      addOffCorrectMap(outputCharCount, cumulativeDiff);
      outputSegment.clear();
      eofReturnValue = -1;
      break;
    }
    case CHARACTER_REFERENCE_TAIL: { // Substitute
      // At end of file, allow char refs without semicolons
      cumulativeDiff += inputSegment.length() - outputSegment.length();
      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
      eofReturnValue = outputSegment.nextChar();
      break;
    }
    case BANG:
    case CDATA:
    case AMPERSAND:
    case NUMERIC_CHARACTER:
    case END_TAG_TAIL_INCLUDE:
    case START_TAG_TAIL_INCLUDE:
    case LEFT_ANGLE_BRACKET:
    case LEFT_ANGLE_BRACKET_SLASH:
    case LEFT_ANGLE_BRACKET_SPACE: { // Include
      // Emit the partially matched input verbatim.
      outputSegment = inputSegment;
      // The "[CDATA[" rule clears inputSegment on entering CDATA, so at
      // EOF inside a CDATA section there is nothing buffered; calling
      // nextChar() on the empty segment would read past its end.
      eofReturnValue = outputSegment.isRead() ? -1 : outputSegment.nextChar();
      break;
    }
    default: {
      eofReturnValue = -1;
    }
  }
%eof}
|
||||
|
||||
%%

// A possible character reference: remember where it starts and buffer
// the '&' in case the text has to be emitted verbatim.
"&" {
  inputSegment.clear();
  inputSegment.append('&');
  inputStart = yychar;
  yybegin(AMPERSAND);
}

// A possible tag, comment or other markup: remember where it starts and
// buffer the '<' in case the text has to be emitted verbatim.
"<" {
  inputSegment.clear();
  inputSegment.append('<');
  inputStart = yychar;
  yybegin(LEFT_ANGLE_BRACKET);
}
|
||||
|
||||
<AMPERSAND> {
  // Named entity: buffer the raw name, and stage its one-char value in
  // entitySegment as the pending output.
  {CharacterEntities} {
    int nameLength = yylength();
    char replacement
        = entityValues.get(zzBuffer, zzStartRead, nameLength).charValue();
    inputSegment.write(zzBuffer, zzStartRead, nameLength);
    entitySegment.clear();
    entitySegment.append(replacement);
    outputSegment = entitySegment;
    yybegin(CHARACTER_REFERENCE_TAIL);
  }
  // Start of a numeric character reference.
  "#" {
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
  }
}
|
||||
|
||||
<NUMERIC_CHARACTER> {
  // Hexadecimal reference, e.g. "x3C" (the "&#" prefix is already
  // buffered in inputSegment).
  [xX] [0-9A-Fa-f]+ {
    int matchLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, matchLength);
    // The match includes the leading 'x', so the largest code point,
    // 10FFFF (6 hex digits), needs 7 matched chars.
    if (matchLength <= 7) {
      String hexCharRef
          = new String(zzBuffer, zzStartRead + 1, matchLength - 1);
      try {
        int codePoint = Integer.parseInt(hexCharRef, 16);
        if (codePoint <= 0x10FFFF) {
          // Stage the decoded char(s) as the pending output.
          outputSegment = entitySegment;
          outputSegment.clear();
          outputSegment.setLength
              (Character.toChars(codePoint, outputSegment.getArray(), 0));
          yybegin(CHARACTER_REFERENCE_TAIL);
        } else {
          // Out of range: emit the buffered text verbatim.
          outputSegment = inputSegment;
          yybegin(YYINITIAL);
          return outputSegment.nextChar();
        }
      } catch(NumberFormatException e) {
        assert false: "NumberFormatException parsing hex code point '"
                      + hexCharRef + "'";
      } catch(IllegalArgumentException e) {
        assert false: "IllegalArgumentException getting chars "
                      + "for hex code point '" + hexCharRef + "'";
      }
    } else {
      // Too many digits to be a valid code point: emit verbatim.
      outputSegment = inputSegment;
      yybegin(YYINITIAL);
      return outputSegment.nextChar();
    }
  }
  // Decimal reference, e.g. "60".
  [0-9]+ {
    int matchLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, matchLength);
    if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
      String decimalCharRef = yytext();
      try {
        int codePoint = Integer.parseInt(decimalCharRef);
        if (codePoint <= 0x10FFFF) {
          // Stage the decoded char(s) as the pending output.
          outputSegment = entitySegment;
          outputSegment.clear();
          outputSegment.setLength
              (Character.toChars(codePoint, outputSegment.getArray(), 0));
          yybegin(CHARACTER_REFERENCE_TAIL);
        } else {
          // Out of range: emit the buffered text verbatim.
          outputSegment = inputSegment;
          yybegin(YYINITIAL);
          return outputSegment.nextChar();
        }
      } catch(NumberFormatException e) {
        assert false: "NumberFormatException parsing code point '"
                      + decimalCharRef + "'";
      } catch(IllegalArgumentException e) {
        assert false: "IllegalArgumentException getting chars for code point '"
                      + decimalCharRef + "'";
      }
    } else {
      // Too many digits to be a valid code point: emit verbatim.
      outputSegment = inputSegment;
      yybegin(YYINITIAL);
      return outputSegment.nextChar();
    }
  }
}
|
||||
|
||||
<CHARACTER_REFERENCE_TAIL> {
  // The terminating semicolon completes the reference: account for the
  // buffered input plus the ';' being replaced by the staged output,
  // then start emitting that output.
  ";" {
    int replacementLength = outputSegment.length();
    cumulativeDiff += inputSegment.length() + yylength() - replacementLength;
    addOffCorrectMap(outputCharCount + replacementLength, cumulativeDiff);
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
  }
}
|
||||
|
||||
<LEFT_ANGLE_BRACKET_SLASH> {
  // Whitespace between "</" and the tag name is buffered and skipped.
  \s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
  // </br>
  [bB][rR] \s* ">" {
    yybegin(YYINITIAL);
    if (escapeBR) {
      // Escaped: emit the whole end tag verbatim.
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // The buffered "</" (plus any whitespace) and the matched text are
      // replaced by exactly one char.  The correction must not depend on
      // outputSegment, which at this point still refers to whatever was
      // output last and has nothing to do with this replacement.
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_END_TAG_REPLACEMENT;
    }
  }
  // End tag of an inline element: kept if escaped, otherwise removed
  // without a replacement.
  {InlineElment} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_EXCLUDE);
    }
  }
  // Any other end tag: kept if escaped, otherwise replaced.
  {Name} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_SUBSTITUTE);
    }
  }
}
|
||||
|
||||
// Escaped end tag: emit the buffered tag text, including the tail, as is.
<END_TAG_TAIL_INCLUDE> {
  \s* ">" {
    yybegin(YYINITIAL);
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    outputSegment = inputSegment;
    return outputSegment.nextChar();
  }
}

// End tag that is dropped with nothing output in its place.
<END_TAG_TAIL_EXCLUDE> {
  \s* ">" {
    int removedLength = inputSegment.length() + yylength();
    cumulativeDiff += removedLength;
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
}

// End tag that is replaced by a single char.
<END_TAG_TAIL_SUBSTITUTE> {
  \s* ">" {
    int removedLength = inputSegment.length() + yylength();
    cumulativeDiff += removedLength - 1;
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
  }
}
|
||||
|
||||
<LEFT_ANGLE_BRACKET> {
  "!" { inputSegment.append('!'); yybegin(BANG); }
  "/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
  \s+ {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    yybegin(LEFT_ANGLE_BRACKET_SPACE);
  }
  // "<?" ... "?>" or "<?" ... "/>": dropped with nothing output.
  "?" [^>]* [/?] ">" {
    cumulativeDiff += inputSegment.length() + yylength();
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
  // <br>, optionally with attributes and/or a trailing slash
  \s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    yybegin(YYINITIAL);
    if (escapeBR) {
      // Escaped: emit the whole tag verbatim.
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // The buffered "<" and the matched text are replaced by exactly
      // one char.  The correction must not depend on outputSegment,
      // which still refers to whatever was output last (possibly a
      // two-char entity replacement).
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_START_TAG_REPLACEMENT;
    }
  }
  // <script ...>: switch to SCRIPT; the start tag itself is only emitted
  // when SCRIPT is escaped.
  \s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s* ">" {
    yybegin(SCRIPT);
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      // The start tag is output, so move inputStart past the '<' and
      // the matched text.
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
  }
  // <style ...>: switch to STYLE; the start tag itself is only emitted
  // when STYLE is escaped.
  \s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
    yybegin(STYLE);
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      // The start tag is output, so move inputStart past the '<' and
      // the matched text.
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
  }
}
|
||||
|
||||
<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
  // Start tag of an inline element: kept if escaped, otherwise removed
  // without a replacement.
  {InlineElment} {
    int nameLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, nameLength);
    boolean escaped = null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, nameLength);
    yybegin(escaped ? START_TAG_TAIL_INCLUDE : START_TAG_TAIL_EXCLUDE);
  }
  // Any other start tag: kept if escaped, otherwise replaced.
  {Name} {
    int nameLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, nameLength);
    boolean escaped = null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, nameLength);
    yybegin(escaped ? START_TAG_TAIL_INCLUDE : START_TAG_TAIL_SUBSTITUTE);
  }
}
|
||||
|
||||
// Escaped start tag: emit the buffered tag text, including the tail, as is.
<START_TAG_TAIL_INCLUDE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    yybegin(YYINITIAL);
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    outputSegment = inputSegment;
    return outputSegment.nextChar();
  }
}

// Start tag that is dropped with nothing output in its place.
<START_TAG_TAIL_EXCLUDE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    int removedLength = inputSegment.length() + yylength();
    cumulativeDiff += removedLength;
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
  }
}

// Start tag that is replaced by a single char.
<START_TAG_TAIL_SUBSTITUTE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    int removedLength = inputSegment.length() + yylength();
    cumulativeDiff += removedLength - 1;
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
  }
}
|
||||
|
||||
<BANG> {
  // "<!--" opens a comment.
  "--" {
    yybegin(COMMENT);
  }
  // End of a "<!...>" declaration: drop everything buffered so far.
  ">" {
    int removedLength = inputSegment.length() + yylength();
    cumulativeDiff += removedLength;
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
  // From XML 1.0 <http://www.w3.org/TR/xml/>:
  //
  //    [18] CDSect  ::= CDStart CData CDEnd
  //    [19] CDStart ::= '<![CDATA['
  //    [20] CData   ::= (Char* - (Char* ']]>' Char*))
  //    [21] CDEnd   ::= ']]>'
  //
  // The CDATA start marker is dropped; the section content is handled
  // in the CDATA state.
  "[CDATA[" {
    int removedLength = inputSegment.length() + yylength();
    cumulativeDiff += removedLength;
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(CDATA);
  }
  // Anything else is buffered while waiting for the closing ">".
  [^] {
    char current = zzBuffer[zzStartRead];
    inputSegment.append(current);
  }
}
|
||||
|
||||
<CDATA> {
  // The end marker is dropped from the output.
  "]]>" {
    yybegin(YYINITIAL);
    cumulativeDiff += yylength();
    addOffCorrectMap(outputCharCount, cumulativeDiff);
  }
  // CDATA content is passed through one char at a time.
  [^] {
    return zzBuffer[zzStartRead];
  }
}
|
||||
|
||||
<COMMENT> {
  // A server-side include nested in the comment; come back here after it.
  "<!--#" {
    restoreState = COMMENT;
    yybegin(SERVER_SIDE_INCLUDE);
  }
  // End of comment: everything from inputStart through "-->" is dropped.
  "-->" {
    int removedLength = yychar - inputStart + yylength();
    cumulativeDiff += removedLength;
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
  // Comment content is discarded.
  [^] { /* discard */ }
}
|
||||
|
||||
<SERVER_SIDE_INCLUDE> {
  // End of the include: go back to the state that was active before it.
  "-->" {
    yybegin(restoreState);
  }
  // A quoted string inside the include: save the state to return to
  // after the include, and come back here when the string closes.
  "'" {
    previousRestoreState = restoreState;
    restoreState = SERVER_SIDE_INCLUDE;
    yybegin(SINGLE_QUOTED_STRING);
  }
  "\"" {
    previousRestoreState = restoreState;
    restoreState = SERVER_SIDE_INCLUDE;
    yybegin(DOUBLE_QUOTED_STRING);
  }
  // Include content is discarded.
  [^] { /* discard */ }
}
|
||||
|
||||
// A comment inside a SCRIPT section.
<SCRIPT_COMMENT> {
  "<!--#" {
    restoreState = SCRIPT_COMMENT;
    yybegin(SERVER_SIDE_INCLUDE);
  }
  "'" {
    restoreState = SCRIPT_COMMENT;
    yybegin(SINGLE_QUOTED_STRING);
  }
  "\"" {
    restoreState = SCRIPT_COMMENT;
    yybegin(DOUBLE_QUOTED_STRING);
  }
  // End of comment: back to the enclosing SCRIPT section.
  "-->" {
    yybegin(SCRIPT);
  }
  [^] { /* discard */ }
}

// A comment inside a STYLE section.
<STYLE_COMMENT> {
  "<!--#" {
    restoreState = STYLE_COMMENT;
    yybegin(SERVER_SIDE_INCLUDE);
  }
  "'" {
    restoreState = STYLE_COMMENT;
    yybegin(SINGLE_QUOTED_STRING);
  }
  "\"" {
    restoreState = STYLE_COMMENT;
    yybegin(DOUBLE_QUOTED_STRING);
  }
  // End of comment: back to the enclosing STYLE section.
  "-->" {
    yybegin(STYLE);
  }
  [^] { /* discard */ }
}
|
||||
|
||||
<SINGLE_QUOTED_STRING> {
  // A backslash consumes the following char, so an escaped quote does
  // not close the string.
  "\\" [^] { /* discard */ }
  // Closing quote: return to the state that opened the string.
  "'" {
    yybegin(restoreState);
    restoreState = previousRestoreState;
  }
  [^] { /* discard */ }
}

<DOUBLE_QUOTED_STRING> {
  // A backslash consumes the following char, so an escaped quote does
  // not close the string.
  "\\" [^] { /* discard */ }
  // Closing quote: return to the state that opened the string.
  "\"" {
    yybegin(restoreState);
    restoreState = previousRestoreState;
  }
  [^] { /* discard */ }
}
|
||||
|
||||
<SCRIPT> {
  "<!--" {
    yybegin(SCRIPT_COMMENT);
  }
  // </script>: the section content since inputStart is dropped; the end
  // tag is emitted verbatim when SCRIPT is escaped, otherwise it is
  // replaced by a single char.
  "</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
    inputSegment.clear();
    yybegin(YYINITIAL);
    cumulativeDiff += yychar - inputStart;
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      int firstChar = outputSegment.nextChar();
      addOffCorrectMap(outputCharCount, cumulativeDiff);
      return firstChar;
    }
    cumulativeDiff += yylength() - 1;
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    return SCRIPT_REPLACEMENT;
  }
  // Script content is discarded.
  [^] { /* discard */ }
}
|
||||
|
||||
<STYLE> {
  "<!--" {
    yybegin(STYLE_COMMENT);
  }
  // </style>: the section content since inputStart is dropped; the end
  // tag is emitted verbatim when STYLE is escaped, otherwise it is
  // replaced by a single char.
  "</" \s* [sS][tT][yY][lL][eE] \s* ">" {
    inputSegment.clear();
    yybegin(YYINITIAL);
    cumulativeDiff += yychar - inputStart;
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      int firstChar = outputSegment.nextChar();
      addOffCorrectMap(outputCharCount, cumulativeDiff);
      return firstChar;
    }
    cumulativeDiff += yylength() - 1;
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    return STYLE_REPLACEMENT;
  }
  // Style content is discarded.
  [^] { /* discard */ }
}
|
||||
|
||||
// Fallback for the markup states listed below: the char just read does
// not continue the construct, so it is pushed back to be rescanned in
// YYINITIAL and the text buffered so far is emitted verbatim from its
// start.
<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
  [^] {
    yypushback(1);
    yybegin(YYINITIAL);
    inputSegment.restart();
    outputSegment = inputSegment;
    return outputSegment.nextChar();
  }
}

// Any char not handled by an earlier rule is passed through unchanged.
[^] {
  return zzBuffer[zzStartRead];
}
|
|
@ -0,0 +1,530 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import re
|
||||
|
||||
# A simple python script to generate an HTML entity map and a regex alternation
|
||||
# for inclusion in HTMLStripCharFilter.jflex.
|
||||
|
||||
def main():
  """Print the generated JFlex fragment to standard output.

  The output consists of the license text, a CharacterEntities macro
  listing every entity name parsed from get_entity_text(), and a %{ ... %}
  Java block that fills the entityValues map.

  Only single-argument print(...) calls are used, so the output is the same
  under Python 2 and the function also parses under Python 3.
  """
  print(get_apache_license())
  codes = {}
  regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
  for line in get_entity_text().split('\n'):
    match = regex.match(line)
    if match:
      key = match.group(1)
      # quot and nbsp get hand-written values; everything else becomes a
      # \uXXXX escape built from the decimal code in the declaration.
      if key == 'quot':
        codes[key] = r'\"'
      elif key == 'nbsp':
        codes[key] = ' '
      else:
        codes[key] = r'\u%04X' % int(match.group(2))

  keys = sorted(codes)

  # Emit the CharacterEntities alternation, starting a new output line
  # whenever the next entry would reach 80 columns.
  first_entry = True
  output_line = 'CharacterEntities = ( '
  for key in keys:
    new_entry = ('"%s"' if first_entry else ' | "%s"') % key
    first_entry = False
    if len(output_line) + len(new_entry) >= 80:
      print(output_line)
      output_line = ' '
    output_line += new_entry
    # These entities are also accepted in all-uppercase form.
    if key in ('quot','copy','gt','lt','reg','amp'):
      new_entry = ' | "%s"' % key.upper()
      if len(output_line) + len(new_entry) >= 80:
        print(output_line)
        output_line = ' '
      output_line += new_entry
  print(output_line + ' )')

  # Emit the Java code that builds the entity-name -> char map.
  print('%{')
  print(' private static final Set<String> upperCaseVariantsAccepted')
  print(' = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));')
  print(' private static final CharArrayMap<Character> entityValues')
  print(' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys))
  print(' static {')
  print(' String[] entities = {')
  output_line = ' '
  for key in keys:
    new_entry = ' "%s", "%s",' % (key, codes[key])
    if len(output_line) + len(new_entry) >= 80:
      print(output_line)
      output_line = ' '
    output_line += new_entry
  # Drop the trailing comma after the last array element.
  print(output_line[:-1])
  print(' };')
  print(' for (int i = 0 ; i < entities.length ; i += 2) {')
  print(' Character value = entities[i + 1].charAt(0);')
  print(' entityValues.put(entities[i], value);')
  print(' if (upperCaseVariantsAccepted.contains(entities[i])) {')
  print(' entityValues.put(entities[i].toUpperCase(), value);')
  print(' }')
  print(' }')
  print(" }")
  print("%}")
|
||||
|
||||
def get_entity_text():
|
||||
# The text below is taken verbatim from
|
||||
# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
|
||||
text = r"""
|
||||
F.1. XHTML Character Entities
|
||||
|
||||
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
|
||||
F.1.1. XHTML Latin 1 Character Entities
|
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
|
||||
|
||||
<!-- ...................................................................... -->
|
||||
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
|
||||
<!-- file: xhtml-lat1.ent
|
||||
|
||||
Typical invocation:
|
||||
|
||||
<!ENTITY % xhtml-lat1
|
||||
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
|
||||
"xhtml-lat1.ent" >
|
||||
%xhtml-lat1;
|
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
|
||||
|
||||
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
|
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
|
||||
|
||||
Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
|
||||
|
||||
Portions (C) International Organization for Standardization 1986:
|
||||
Permission to copy in any form is granted for use with conforming
|
||||
SGML systems and applications as defined in ISO 8879, provided
|
||||
this notice is included in all copies.
|
||||
-->
|
||||
|
||||
<!ENTITY nbsp " " ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
|
||||
<!ENTITY iexcl "¡" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
|
||||
<!ENTITY cent "¢" ><!-- cent sign, U+00A2 ISOnum -->
|
||||
<!ENTITY pound "£" ><!-- pound sign, U+00A3 ISOnum -->
|
||||
<!ENTITY curren "¤" ><!-- currency sign, U+00A4 ISOnum -->
|
||||
<!ENTITY yen "¥" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
|
||||
<!ENTITY brvbar "¦" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
|
||||
<!ENTITY sect "§" ><!-- section sign, U+00A7 ISOnum -->
|
||||
<!ENTITY uml "¨" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
|
||||
<!ENTITY copy "©" ><!-- copyright sign, U+00A9 ISOnum -->
|
||||
<!ENTITY ordf "ª" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
|
||||
<!ENTITY laquo "«" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
|
||||
<!ENTITY not "¬" ><!-- not sign, U+00AC ISOnum -->
|
||||
<!ENTITY shy "­" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
|
||||
<!ENTITY reg "®" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
|
||||
<!ENTITY macr "¯" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
|
||||
<!ENTITY deg "°" ><!-- degree sign, U+00B0 ISOnum -->
|
||||
<!ENTITY plusmn "±" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
|
||||
<!ENTITY sup2 "²" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
|
||||
<!ENTITY sup3 "³" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
|
||||
<!ENTITY acute "´" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
|
||||
<!ENTITY micro "µ" ><!-- micro sign, U+00B5 ISOnum -->
|
||||
<!ENTITY para "¶" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
|
||||
<!ENTITY middot "·" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
|
||||
<!ENTITY cedil "¸" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
|
||||
<!ENTITY sup1 "¹" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
|
||||
<!ENTITY ordm "º" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
|
||||
<!ENTITY raquo "»" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
|
||||
<!ENTITY frac14 "¼" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
|
||||
<!ENTITY frac12 "½" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
|
||||
<!ENTITY frac34 "¾" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
|
||||
<!ENTITY iquest "¿" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
|
||||
<!ENTITY Agrave "À" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
|
||||
<!ENTITY Aacute "Á" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
|
||||
<!ENTITY Acirc "Â" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
|
||||
<!ENTITY Atilde "Ã" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
|
||||
<!ENTITY Auml "Ä" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
|
||||
<!ENTITY Aring "Å" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
|
||||
<!ENTITY AElig "Æ" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
|
||||
<!ENTITY Ccedil "Ç" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
|
||||
<!ENTITY Egrave "È" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
|
||||
<!ENTITY Eacute "É" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
|
||||
<!ENTITY Ecirc "Ê" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
|
||||
<!ENTITY Euml "Ë" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
|
||||
<!ENTITY Igrave "Ì" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
|
||||
<!ENTITY Iacute "Í" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
|
||||
<!ENTITY Icirc "Î" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
|
||||
<!ENTITY Iuml "Ï" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
|
||||
<!ENTITY ETH "Ð" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
|
||||
<!ENTITY Ntilde "Ñ" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
|
||||
<!ENTITY Ograve "Ò" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
|
||||
<!ENTITY Oacute "Ó" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
|
||||
<!ENTITY Ocirc "Ô" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
|
||||
<!ENTITY Otilde "Õ" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
|
||||
<!ENTITY Ouml "Ö" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
|
||||
<!ENTITY times "×" ><!-- multiplication sign, U+00D7 ISOnum -->
|
||||
<!ENTITY Oslash "Ø" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
|
||||
<!ENTITY Ugrave "Ù" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
|
||||
<!ENTITY Uacute "Ú" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
|
||||
<!ENTITY Ucirc "Û" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
|
||||
<!ENTITY Uuml "Ü" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
|
||||
<!ENTITY Yacute "Ý" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
|
||||
<!ENTITY THORN "Þ" ><!-- latin capital THORN, U+00DE ISOlat1 -->
|
||||
<!ENTITY szlig "ß" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
|
||||
<!ENTITY agrave "à" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
|
||||
<!ENTITY aacute "á" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
|
||||
<!ENTITY acirc "â" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
|
||||
<!ENTITY atilde "ã" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
|
||||
<!ENTITY auml "ä" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
|
||||
<!ENTITY aring "å" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
|
||||
<!ENTITY aelig "æ" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
|
||||
<!ENTITY ccedil "ç" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
|
||||
<!ENTITY egrave "è" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
|
||||
<!ENTITY eacute "é" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
|
||||
<!ENTITY ecirc "ê" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
|
||||
<!ENTITY euml "ë" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
|
||||
<!ENTITY igrave "ì" ><!-- latin small i with grave, U+00EC ISOlat1 -->
|
||||
<!ENTITY iacute "í" ><!-- latin small i with acute, U+00ED ISOlat1 -->
|
||||
<!ENTITY icirc "î" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
|
||||
<!ENTITY iuml "ï" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
|
||||
<!ENTITY eth "ð" ><!-- latin small eth, U+00F0 ISOlat1 -->
|
||||
<!ENTITY ntilde "ñ" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
|
||||
<!ENTITY ograve "ò" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
|
||||
<!ENTITY oacute "ó" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
|
||||
<!ENTITY ocirc "ô" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
|
||||
<!ENTITY otilde "õ" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
|
||||
<!ENTITY ouml "ö" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
|
||||
<!ENTITY divide "÷" ><!-- division sign, U+00F7 ISOnum -->
|
||||
<!ENTITY oslash "ø" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
|
||||
<!ENTITY ugrave "ù" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
|
||||
<!ENTITY uacute "ú" ><!-- latin small u with acute, U+00FA ISOlat1 -->
|
||||
<!ENTITY ucirc "û" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
|
||||
<!ENTITY uuml "ü" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
|
||||
<!ENTITY yacute "ý" ><!-- latin small y with acute, U+00FD ISOlat1 -->
|
||||
<!ENTITY thorn "þ" ><!-- latin small thorn with, U+00FE ISOlat1 -->
|
||||
<!ENTITY yuml "ÿ" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
|
||||
<!-- end of xhtml-lat1.ent -->
|
||||
|
||||
F.1.2. XHTML Special Characters
|
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
|
||||
|
||||
<!-- ...................................................................... -->
|
||||
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
|
||||
<!-- file: xhtml-special.ent
|
||||
|
||||
Typical invocation:
|
||||
|
||||
<!ENTITY % xhtml-special
|
||||
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
|
||||
"xhtml-special.ent" >
|
||||
%xhtml-special;
|
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
|
||||
|
||||
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
|
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
|
||||
|
||||
Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
|
||||
|
||||
Portions (C) International Organization for Standardization 1986:
|
||||
Permission to copy in any form is granted for use with conforming
|
||||
SGML systems and applications as defined in ISO 8879, provided
|
||||
this notice is included in all copies.
|
||||
|
||||
Revisions:
|
||||
2000-10-28: added ' and altered XML Predefined Entities for compatibility
|
||||
-->
|
||||
|
||||
<!-- Relevant ISO entity set is given unless names are newly introduced.
|
||||
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
|
||||
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
|
||||
numbers are given for each character, in hex. Entity values are
|
||||
decimal conversions of the ISO 10646 values and refer to the
|
||||
document character set. Names are Unicode [UNICODE] names.
|
||||
-->
|
||||
|
||||
<!-- C0 Controls and Basic Latin -->
|
||||
<!ENTITY lt "&#60;" ><!-- less-than sign, U+003C ISOnum -->
|
||||
<!ENTITY gt ">" ><!-- greater-than sign, U+003E ISOnum -->
|
||||
<!ENTITY amp "&#38;" ><!-- ampersand, U+0026 ISOnum -->
|
||||
<!ENTITY apos "'" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
|
||||
<!ENTITY quot """ ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
|
||||
|
||||
<!-- Latin Extended-A -->
|
||||
<!ENTITY OElig "Œ" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
|
||||
<!ENTITY oelig "œ" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
|
||||
|
||||
<!-- ligature is a misnomer, this is a separate character in some languages -->
|
||||
<!ENTITY Scaron "Š" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
|
||||
<!ENTITY scaron "š" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
|
||||
<!ENTITY Yuml "Ÿ" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
|
||||
|
||||
<!-- Spacing Modifier Letters -->
|
||||
<!ENTITY circ "ˆ" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
|
||||
<!ENTITY tilde "˜" ><!-- small tilde, U+02DC ISOdia -->
|
||||
|
||||
<!-- General Punctuation -->
|
||||
<!ENTITY ensp " " ><!-- en space, U+2002 ISOpub -->
|
||||
<!ENTITY emsp " " ><!-- em space, U+2003 ISOpub -->
|
||||
<!ENTITY thinsp " " ><!-- thin space, U+2009 ISOpub -->
|
||||
<!ENTITY zwnj "‌" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
|
||||
<!ENTITY zwj "‍" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
|
||||
<!ENTITY lrm "‎" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
|
||||
<!ENTITY rlm "‏" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
|
||||
<!ENTITY ndash "–" ><!-- en dash, U+2013 ISOpub -->
|
||||
<!ENTITY mdash "—" ><!-- em dash, U+2014 ISOpub -->
|
||||
<!ENTITY lsquo "‘" ><!-- left single quotation mark, U+2018 ISOnum -->
|
||||
<!ENTITY rsquo "’" ><!-- right single quotation mark, U+2019 ISOnum -->
|
||||
<!ENTITY sbquo "‚" ><!-- single low-9 quotation mark, U+201A NEW -->
|
||||
<!ENTITY ldquo "“" ><!-- left double quotation mark, U+201C ISOnum -->
|
||||
<!ENTITY rdquo "”" ><!-- right double quotation mark, U+201D ISOnum -->
|
||||
<!ENTITY bdquo "„" ><!-- double low-9 quotation mark, U+201E NEW -->
|
||||
<!ENTITY dagger "†" ><!-- dagger, U+2020 ISOpub -->
|
||||
<!ENTITY Dagger "‡" ><!-- double dagger, U+2021 ISOpub -->
|
||||
<!ENTITY permil "‰" ><!-- per mille sign, U+2030 ISOtech -->
|
||||
|
||||
<!-- lsaquo is proposed but not yet ISO standardized -->
|
||||
<!ENTITY lsaquo "‹" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
|
||||
<!-- rsaquo is proposed but not yet ISO standardized -->
|
||||
<!ENTITY rsaquo "›" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
|
||||
<!ENTITY euro "€" ><!-- euro sign, U+20AC NEW -->
|
||||
|
||||
<!-- end of xhtml-special.ent -->
|
||||
|
||||
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
|
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
|
||||
|
||||
<!-- ...................................................................... -->
|
||||
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
|
||||
<!-- file: xhtml-symbol.ent
|
||||
|
||||
Typical invocation:
|
||||
|
||||
<!ENTITY % xhtml-symbol
|
||||
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
|
||||
"xhtml-symbol.ent" >
|
||||
%xhtml-symbol;
|
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
|
||||
|
||||
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
|
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
|
||||
|
||||
Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
|
||||
|
||||
Portions (C) International Organization for Standardization 1986:
|
||||
Permission to copy in any form is granted for use with conforming
|
||||
SGML systems and applications as defined in ISO 8879, provided
|
||||
this notice is included in all copies.
|
||||
-->
|
||||
|
||||
<!-- Relevant ISO entity set is given unless names are newly introduced.
|
||||
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
|
||||
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
|
||||
numbers are given for each character, in hex. Entity values are
|
||||
decimal conversions of the ISO 10646 values and refer to the
|
||||
document character set. Names are Unicode [UNICODE] names.
|
||||
-->
|
||||
|
||||
<!-- Latin Extended-B -->
|
||||
<!ENTITY fnof "ƒ" ><!-- latin small f with hook = function
|
||||
= florin, U+0192 ISOtech -->
|
||||
|
||||
<!-- Greek -->
|
||||
<!ENTITY Alpha "Α" ><!-- greek capital letter alpha, U+0391 -->
|
||||
<!ENTITY Beta "Β" ><!-- greek capital letter beta, U+0392 -->
|
||||
<!ENTITY Gamma "Γ" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
|
||||
<!ENTITY Delta "Δ" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
|
||||
<!ENTITY Epsilon "Ε" ><!-- greek capital letter epsilon, U+0395 -->
|
||||
<!ENTITY Zeta "Ζ" ><!-- greek capital letter zeta, U+0396 -->
|
||||
<!ENTITY Eta "Η" ><!-- greek capital letter eta, U+0397 -->
|
||||
<!ENTITY Theta "Θ" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
|
||||
<!ENTITY Iota "Ι" ><!-- greek capital letter iota, U+0399 -->
|
||||
<!ENTITY Kappa "Κ" ><!-- greek capital letter kappa, U+039A -->
|
||||
<!ENTITY Lambda "Λ" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
|
||||
<!ENTITY Mu "Μ" ><!-- greek capital letter mu, U+039C -->
|
||||
<!ENTITY Nu "Ν" ><!-- greek capital letter nu, U+039D -->
|
||||
<!ENTITY Xi "Ξ" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
|
||||
<!ENTITY Omicron "Ο" ><!-- greek capital letter omicron, U+039F -->
|
||||
<!ENTITY Pi "Π" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
|
||||
<!ENTITY Rho "Ρ" ><!-- greek capital letter rho, U+03A1 -->
|
||||
<!-- there is no Sigmaf, and no U+03A2 character either -->
|
||||
<!ENTITY Sigma "Σ" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
|
||||
<!ENTITY Tau "Τ" ><!-- greek capital letter tau, U+03A4 -->
|
||||
<!ENTITY Upsilon "Υ" ><!-- greek capital letter upsilon,
|
||||
U+03A5 ISOgrk3 -->
|
||||
<!ENTITY Phi "Φ" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
|
||||
<!ENTITY Chi "Χ" ><!-- greek capital letter chi, U+03A7 -->
|
||||
<!ENTITY Psi "Ψ" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
|
||||
<!ENTITY Omega "Ω" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
|
||||
<!ENTITY alpha "α" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
|
||||
<!ENTITY beta "β" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
|
||||
<!ENTITY gamma "γ" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
|
||||
<!ENTITY delta "δ" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
|
||||
<!ENTITY epsilon "ε" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
|
||||
<!ENTITY zeta "ζ" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
|
||||
<!ENTITY eta "η" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
|
||||
<!ENTITY theta "θ" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
|
||||
<!ENTITY iota "ι" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
|
||||
<!ENTITY kappa "κ" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
|
||||
<!ENTITY lambda "λ" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
|
||||
<!ENTITY mu "μ" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
|
||||
<!ENTITY nu "ν" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
|
||||
<!ENTITY xi "ξ" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
|
||||
<!ENTITY omicron "ο" ><!-- greek small letter omicron, U+03BF NEW -->
|
||||
<!ENTITY pi "π" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
|
||||
<!ENTITY rho "ρ" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
|
||||
<!ENTITY sigmaf "ς" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
|
||||
<!ENTITY sigma "σ" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
|
||||
<!ENTITY tau "τ" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
|
||||
<!ENTITY upsilon "υ" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
|
||||
<!ENTITY phi "φ" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
|
||||
<!ENTITY chi "χ" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
|
||||
<!ENTITY psi "ψ" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
|
||||
<!ENTITY omega "ω" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
|
||||
<!ENTITY thetasym "ϑ" ><!-- greek small letter theta symbol, U+03D1 NEW -->
|
||||
<!ENTITY upsih "ϒ" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
|
||||
<!ENTITY piv "ϖ" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
|
||||
|
||||
<!-- General Punctuation -->
|
||||
<!ENTITY bull "•" ><!-- bullet = black small circle, U+2022 ISOpub -->
|
||||
<!-- bullet is NOT the same as bullet operator, U+2219 -->
|
||||
<!ENTITY hellip "…" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
|
||||
<!ENTITY prime "′" ><!-- prime = minutes = feet, U+2032 ISOtech -->
|
||||
<!ENTITY Prime "″" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
|
||||
<!ENTITY oline "‾" ><!-- overline = spacing overscore, U+203E NEW -->
|
||||
<!ENTITY frasl "⁄" ><!-- fraction slash, U+2044 NEW -->
|
||||
|
||||
<!-- Letterlike Symbols -->
|
||||
<!ENTITY weierp "℘" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
|
||||
<!ENTITY image "ℑ" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
|
||||
<!ENTITY real "ℜ" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
|
||||
<!ENTITY trade "™" ><!-- trade mark sign, U+2122 ISOnum -->
|
||||
<!ENTITY alefsym "ℵ" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
|
||||
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
|
||||
the same glyph could be used to depict both characters -->
|
||||
|
||||
<!-- Arrows -->
|
||||
<!ENTITY larr "←" ><!-- leftwards arrow, U+2190 ISOnum -->
|
||||
<!ENTITY uarr "↑" ><!-- upwards arrow, U+2191 ISOnum-->
|
||||
<!ENTITY rarr "→" ><!-- rightwards arrow, U+2192 ISOnum -->
|
||||
<!ENTITY darr "↓" ><!-- downwards arrow, U+2193 ISOnum -->
|
||||
<!ENTITY harr "↔" ><!-- left right arrow, U+2194 ISOamsa -->
|
||||
<!ENTITY crarr "↵" ><!-- downwards arrow with corner leftwards
|
||||
= carriage return, U+21B5 NEW -->
|
||||
<!ENTITY lArr "⇐" ><!-- leftwards double arrow, U+21D0 ISOtech -->
|
||||
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
|
||||
but also does not have any other character for that function. So ? lArr can
|
||||
be used for 'is implied by' as ISOtech suggests -->
|
||||
<!ENTITY uArr "⇑" ><!-- upwards double arrow, U+21D1 ISOamsa -->
|
||||
<!ENTITY rArr "⇒" ><!-- rightwards double arrow, U+21D2 ISOtech -->
|
||||
<!-- Unicode does not say this is the 'implies' character but does not have
|
||||
another character with this function so ?
|
||||
rArr can be used for 'implies' as ISOtech suggests -->
|
||||
<!ENTITY dArr "⇓" ><!-- downwards double arrow, U+21D3 ISOamsa -->
|
||||
<!ENTITY hArr "⇔" ><!-- left right double arrow, U+21D4 ISOamsa -->
|
||||
|
||||
<!-- Mathematical Operators -->
|
||||
<!ENTITY forall "∀" ><!-- for all, U+2200 ISOtech -->
|
||||
<!ENTITY part "∂" ><!-- partial differential, U+2202 ISOtech -->
|
||||
<!ENTITY exist "∃" ><!-- there exists, U+2203 ISOtech -->
|
||||
<!ENTITY empty "∅" ><!-- empty set = null set, U+2205 ISOamso -->
|
||||
<!ENTITY nabla "∇" ><!-- nabla = backward difference, U+2207 ISOtech -->
|
||||
<!ENTITY isin "∈" ><!-- element of, U+2208 ISOtech -->
|
||||
<!ENTITY notin "∉" ><!-- not an element of, U+2209 ISOtech -->
|
||||
<!ENTITY ni "∋" ><!-- contains as member, U+220B ISOtech -->
|
||||
<!-- should there be a more memorable name than 'ni'? -->
|
||||
<!ENTITY prod "∏" ><!-- n-ary product = product sign, U+220F ISOamsb -->
|
||||
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
|
||||
the same glyph might be used for both -->
|
||||
<!ENTITY sum "∑" ><!-- n-ary summation, U+2211 ISOamsb -->
|
||||
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
|
||||
though the same glyph might be used for both -->
|
||||
<!ENTITY minus "−" ><!-- minus sign, U+2212 ISOtech -->
|
||||
<!ENTITY lowast "∗" ><!-- asterisk operator, U+2217 ISOtech -->
|
||||
<!ENTITY radic "√" ><!-- square root = radical sign, U+221A ISOtech -->
|
||||
<!ENTITY prop "∝" ><!-- proportional to, U+221D ISOtech -->
|
||||
<!ENTITY infin "∞" ><!-- infinity, U+221E ISOtech -->
|
||||
<!ENTITY ang "∠" ><!-- angle, U+2220 ISOamso -->
|
||||
<!ENTITY and "∧" ><!-- logical and = wedge, U+2227 ISOtech -->
|
||||
<!ENTITY or "∨" ><!-- logical or = vee, U+2228 ISOtech -->
|
||||
<!ENTITY cap "∩" ><!-- intersection = cap, U+2229 ISOtech -->
|
||||
<!ENTITY cup "∪" ><!-- union = cup, U+222A ISOtech -->
|
||||
<!ENTITY int "∫" ><!-- integral, U+222B ISOtech -->
|
||||
<!ENTITY there4 "∴" ><!-- therefore, U+2234 ISOtech -->
|
||||
<!ENTITY sim "∼" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
|
||||
<!-- tilde operator is NOT the same character as the tilde, U+007E,
|
||||
although the same glyph might be used to represent both -->
|
||||
<!ENTITY cong "≅" ><!-- approximately equal to, U+2245 ISOtech -->
|
||||
<!ENTITY asymp "≈" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
|
||||
<!ENTITY ne "≠" ><!-- not equal to, U+2260 ISOtech -->
|
||||
<!ENTITY equiv "≡" ><!-- identical to, U+2261 ISOtech -->
|
||||
<!ENTITY le "≤" ><!-- less-than or equal to, U+2264 ISOtech -->
|
||||
<!ENTITY ge "≥" ><!-- greater-than or equal to, U+2265 ISOtech -->
|
||||
<!ENTITY sub "⊂" ><!-- subset of, U+2282 ISOtech -->
|
||||
<!ENTITY sup "⊃" ><!-- superset of, U+2283 ISOtech -->
|
||||
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
|
||||
font encoding and is not included. Should it be, for symmetry?
|
||||
It is in ISOamsn -->
|
||||
<!ENTITY nsub "⊄" ><!-- not a subset of, U+2284 ISOamsn -->
|
||||
<!ENTITY sube "⊆" ><!-- subset of or equal to, U+2286 ISOtech -->
|
||||
<!ENTITY supe "⊇" ><!-- superset of or equal to, U+2287 ISOtech -->
|
||||
<!ENTITY oplus "⊕" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
|
||||
<!ENTITY otimes "⊗" ><!-- circled times = vector product, U+2297 ISOamsb -->
|
||||
<!ENTITY perp "⊥" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
|
||||
<!ENTITY sdot "⋅" ><!-- dot operator, U+22C5 ISOamsb -->
|
||||
<!-- dot operator is NOT the same character as U+00B7 middle dot -->
|
||||
|
||||
<!-- Miscellaneous Technical -->
|
||||
<!ENTITY lceil "⌈" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
|
||||
<!ENTITY rceil "⌉" ><!-- right ceiling, U+2309 ISOamsc -->
|
||||
<!ENTITY lfloor "⌊" ><!-- left floor = apl downstile, U+230A ISOamsc -->
|
||||
<!ENTITY rfloor "⌋" ><!-- right floor, U+230B ISOamsc -->
|
||||
<!ENTITY lang "〈" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
|
||||
<!-- lang is NOT the same character as U+003C 'less than'
|
||||
or U+2039 'single left-pointing angle quotation mark' -->
|
||||
<!ENTITY rang "〉" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
|
||||
<!-- rang is NOT the same character as U+003E 'greater than'
|
||||
or U+203A 'single right-pointing angle quotation mark' -->
|
||||
|
||||
<!-- Geometric Shapes -->
|
||||
<!ENTITY loz "◊" ><!-- lozenge, U+25CA ISOpub -->
|
||||
|
||||
<!-- Miscellaneous Symbols -->
|
||||
<!ENTITY spades "♠" ><!-- black spade suit, U+2660 ISOpub -->
|
||||
<!-- black here seems to mean filled as opposed to hollow -->
|
||||
<!ENTITY clubs "♣" ><!-- black club suit = shamrock, U+2663 ISOpub -->
|
||||
<!ENTITY hearts "♥" ><!-- black heart suit = valentine, U+2665 ISOpub -->
|
||||
<!ENTITY diams "♦" ><!-- black diamond suit, U+2666 ISOpub -->
|
||||
|
||||
<!-- end of xhtml-symbol.ent -->
|
||||
"""
|
||||
return text
|
||||
|
||||
def get_apache_license():
|
||||
license = r"""/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
"""
|
||||
return license
|
||||
|
||||
main()
|
|
@ -17,6 +17,42 @@
|
|||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Filters that normalize text before tokenization.
|
||||
<p>
|
||||
Chainable filters that normalize text before tokenization and provide
|
||||
mappings between normalized text offsets and the corresponding offset
|
||||
in the original text.
|
||||
</p>
|
||||
<H2>CharFilter offset mappings</H2>
|
||||
<p>
|
||||
CharFilters modify an input stream via a series of substring
|
||||
replacements (including deletions and insertions) to produce an output
|
||||
stream. There are three possible replacement cases: the replacement
|
||||
string has the same length as the original substring; the replacement
|
||||
is shorter; and the replacement is longer. In the latter two cases
|
||||
(when the replacement has a different length than the original),
|
||||
one or more offset correction mappings are required.
|
||||
</p>
|
||||
<p>
|
||||
When the replacement is shorter than the original (e.g. when the
|
||||
replacement is the empty string), a single offset correction mapping
|
||||
should be added at the replacement's end offset in the output stream.
|
||||
The <code>cumulativeDiff</code> parameter to the
|
||||
<code>addOffCorrectMap()</code> method will be the sum of all
|
||||
previous replacement offset adjustments, with the addition of the
|
||||
difference between the lengths of the original substring and the
|
||||
replacement string (a positive value).
|
||||
</p>
|
||||
<p>
|
||||
When the replacement is longer than the original (e.g. when the
|
||||
original is the empty string), you should add as many offset
|
||||
correction mappings as the difference between the lengths of the
|
||||
replacement string and the original substring, starting at the
|
||||
end offset the original substring would have had in the output stream.
|
||||
The <code>cumulativeDiff</code> parameter to the
|
||||
<code>addOffCorrectMap()</code> method will be the sum of all
|
||||
previous replacement offset adjustments, with the addition of the
|
||||
difference between the lengths of the original substring and the
|
||||
replacement string so far (a negative value).
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.io.InputStream;
|
|||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -31,7 +32,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
|||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.junit.Ignore;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||
|
||||
|
@ -41,8 +42,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
|
||||
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
|
||||
"This is an entity: & plus a <. Here is an &. <!-- is a comment -->";
|
||||
String gold = " this is some text here is a link and " +
|
||||
"another link . " +
|
||||
String gold = "\nthis is some text\n here is a link and " +
|
||||
"another link. " +
|
||||
"This is an entity: & plus a <. Here is an &. ";
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
@ -56,7 +57,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
|
||||
position++;
|
||||
}
|
||||
assertEquals(gold, builder.toString());
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
//Some sanity checks, but not a full-fledged check
|
||||
|
@ -77,6 +79,24 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
|
||||
}
|
||||
|
||||
public void testMSWord14GeneratedHTML() throws Exception {
|
||||
InputStream stream = getClass().getResourceAsStream("MS-Word 14 generated.htm");
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
|
||||
String gold = "This is a test";
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString().trim());
|
||||
}
|
||||
|
||||
|
||||
public void testGamma() throws Exception {
|
||||
String test = "Γ";
|
||||
String gold = "\u0393";
|
||||
|
@ -89,9 +109,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
|
||||
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
|
||||
}
|
||||
|
||||
public void testEntities() throws Exception {
|
||||
|
@ -106,9 +124,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
|
||||
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
|
||||
}
|
||||
|
||||
public void testMoreEntities() throws Exception {
|
||||
|
@ -123,9 +139,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
|
||||
assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
|
||||
}
|
||||
|
||||
public void testReserved() throws Exception {
|
||||
|
@ -147,8 +161,176 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testMalformedHTML() throws Exception {
|
||||
String test = "a <a hr<ef=aa<a>> </close</a>";
|
||||
String gold = "a <a hr<ef=aa > </close ";
|
||||
String[] testGold = {
|
||||
"a <a hr<ef=aa<a>> </close</a>",
|
||||
"a <a hr<ef=aa> </close",
|
||||
|
||||
"<a href=http://dmoz.org/cgi-bin/add.cgi?where=/arts/\" class=lu style=\"font-size: 9px\" target=dmoz>Submit a Site</a>",
|
||||
"Submit a Site",
|
||||
|
||||
"<a href=javascript:ioSwitch('p8','http://www.csmonitor.com/') title=expand id=e8 class=expanded rel=http://www.csmonitor.com/>Christian Science",
|
||||
"Christian Science",
|
||||
|
||||
"<link rel=\"alternate\" type=\"application/rss+xml\" title=\"San Francisco \" 2008 RSS Feed\" href=\"http://2008.sf.wordcamp.org/feed/\" />",
|
||||
"\n",
|
||||
|
||||
// "<" before ">" inhibits tag recognition
|
||||
"<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
|
||||
"<a href=\" http://www.surgery4was.happyhost.org/video-of-arthroscopic-knee-surgery symptoms.html, heat congestive heart failure <a href=\" http://www.symptoms1bad.happyhost.org/canine",
|
||||
|
||||
"<a href=\"http://ucblibraries.colorado.edu/how/index.htm\"class=\"pageNavAreaText\">",
|
||||
"",
|
||||
|
||||
"<link title=\"^\\\" 21Sta's Blog\" rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"http://21sta.com/blog/inc/opensearch.php\" />",
|
||||
"\n",
|
||||
|
||||
"<a href=\"#postcomment\" title=\"\"Leave a comment\";\">?",
|
||||
"?",
|
||||
|
||||
"<a href='/modern-furniture' ' id='21txt' class='offtab' onMouseout=\"this.className='offtab'; return true;\" onMouseover=\"this.className='ontab'; return true;\">",
|
||||
"",
|
||||
|
||||
"<a href='http://alievi.wordpress.com/category/01-todos-posts/' style='font-size: 275%; padding: 1px; margin: 1px;' title='01 - Todos Post's (83)'>",
|
||||
"",
|
||||
|
||||
"The <a href=<a href=\"http://www.advancedmd.com>medical\">http://www.advancedmd.com>medical</a> practice software</a>",
|
||||
"The <a href=medical\">http://www.advancedmd.com>medical practice software",
|
||||
|
||||
"<a href=\"node/21426\" class=\"clipTitle2\" title=\"Levi.com/BMX 2008 Clip of the Week 29 \"Morgan Wade Leftover Clips\"\">Levi.com/BMX 2008 Clip of the Week 29...",
|
||||
"Levi.com/BMX 2008 Clip of the Week 29...",
|
||||
|
||||
"<a href=\"printer_friendly.php?branch=&year=&submit=go&screen=\";\">Printer Friendly",
|
||||
"Printer Friendly",
|
||||
|
||||
"<a href=#\" ondragstart=\"return false\" onclick=\"window.external.AddFavorite('http://www.amazingtextures.com', 'Amazing Textures');return false\" onmouseover=\"window.status='Add to Favorites';return true\">Add to Favorites",
|
||||
"Add to Favorites",
|
||||
|
||||
"<a href=\"../at_home/at_home_search.html\"../_home/at_home_search.html\">At",
|
||||
"At",
|
||||
|
||||
"E-mail: <a href=\"\"mailto:XXXXXX@example.com\" \">XXXXXX@example.com </a>",
|
||||
"E-mail: XXXXXX@example.com ",
|
||||
|
||||
"<li class=\"farsi\"><a title=\"A'13?\" alt=\"A'13?\" href=\"http://www.america.gov/persian\" alt=\"\" name=\"A'13?\"A'13? title=\"A'13?\">A'13?</a></li>",
|
||||
"\nA'13?\n",
|
||||
|
||||
"<li><a href=\"#28\" title=\"Hubert \"Geese\" Ausby\">Hubert \"Geese\" Ausby</a></li>",
|
||||
"\nHubert \"Geese\" Ausby\n",
|
||||
|
||||
"<href=\"http://anbportal.com/mms/login.asp\">",
|
||||
"\n",
|
||||
|
||||
"<a href=\"",
|
||||
"<a href=\"",
|
||||
|
||||
"<a href=\">",
|
||||
"",
|
||||
|
||||
"<a rel=\"nofollow\" href=\"http://anissanina31.skyrock.com/1895039493-Hi-tout-le-monde.html\" title=\" Hi, tout le monde !>#</a>",
|
||||
"#",
|
||||
|
||||
"<a href=\"http://annunciharleydavidsonusate.myblog.it/\" title=\"Annunci Moto e Accessori Harley Davidson\" target=\"_blank\"><img src=\"http://annunciharleydavidsonusate.myblog.it/images/Antipixel.gif\" /></a>",
|
||||
"",
|
||||
|
||||
"<a href=\"video/addvideo&v=120838887181\" onClick=\"return confirm('Are you sure you want add this video to your profile? If it exists some video in your profile will be overlapped by this video!!')\" \" onmouseover=\"this.className='border2'\" onmouseout=\"this.className=''\">",
|
||||
"",
|
||||
|
||||
"<a href=#Services & Support>",
|
||||
"",
|
||||
|
||||
// "<" and ">" chars are accepted in on[Event] attribute values
|
||||
"<input type=\"image\" src=\"http://apologyindex.com/ThemeFiles/83401-72905/images/btn_search.gif\"value=\"Search\" name=\"Search\" alt=\"Search\" class=\"searchimage\" onclick=\"incom ='&sc=' + document.getElementById('sel').value ; var dt ='&dt=' + document.getElementById('dt').value; var searchKeyword = document.getElementById('q').value ; searchKeyword = searchKeyword.replace(/\\s/g,''); if (searchKeyword.length < 3){alert('Nothing to search. Search keyword should contain atleast 3 chars.'); return false; } var al='&al=' + document.getElementById('advancedlink').style.display ; document.location.href='http://apologyindex.com/search.aspx?q=' + document.getElementById('q').value + incom + dt + al;\" />",
|
||||
"",
|
||||
|
||||
"<input type=\"image\" src=\"images/afbe.gif\" width=\"22\" height=\"22\" hspace=\"4\" title=\"Add to Favorite\" alt=\"Add to Favorite\"onClick=\" if(window.sidebar){ window.sidebar.addPanel(document.title,location.href,''); }else if(window.external){ window.external.AddFavorite(location.href,document.title); }else if(window.opera&&window.print) { return true; }\">",
|
||||
"",
|
||||
|
||||
"<area shape=\"rect\" coords=\"12,153,115,305\" href=\"http://statenislandtalk.com/v-web/gallery/Osmundsen-family\"Art's Norwegian Roots in Rogaland\">",
|
||||
"\n",
|
||||
|
||||
"<a rel=\"nofollow\" href=\"http://arth26.skyrock.com/660188240-bonzai.html\" title=\"bonza>#",
|
||||
"#",
|
||||
|
||||
"<a href= >",
|
||||
"",
|
||||
|
||||
"<ahref=http:..",
|
||||
"<ahref=http:..",
|
||||
|
||||
"<ahref=http:..>",
|
||||
"\n",
|
||||
|
||||
"<ahref=\"http://aseigo.bddf.ca/cms/1025\">A",
|
||||
"\nA",
|
||||
|
||||
"<a href=\"javascript:calendar_window=window.open('/calendar.aspx?formname=frmCalendar.txtDate','calendar_window','width=154,height=188');calendar_window.focus()\">",
|
||||
"",
|
||||
|
||||
"<a href=\"/applications/defenseaerospace/19+rackmounts\" title=\"19\" Rackmounts\">",
|
||||
"",
|
||||
|
||||
"<a href=http://www.azimprimerie.fr/flash/backup/lewes-zip-code/savage-model-110-manual.html title=savage model 110 manual rel=dofollow>",
|
||||
"",
|
||||
|
||||
"<a class=\"at\" name=\"Lamborghini href=\"http://lamborghini.coolbegin.com\">Lamborghini /a>",
|
||||
"Lamborghini /a>",
|
||||
|
||||
"<A href='newslink.php?news_link=http%3A%2F%2Fwww.worldnetdaily.com%2Findex.php%3Ffa%3DPAGE.view%26pageId%3D85729&news_title=Florida QB makes 'John 3:16' hottest Google search Tebow inscribed Bible reference on eye black for championship game' TARGET=_blank>",
|
||||
"",
|
||||
|
||||
"<a href=/myspace !style='color:#993333'>",
|
||||
"",
|
||||
|
||||
"<meta name=3DProgId content=3DExcel.Sheet>",
|
||||
"\n",
|
||||
|
||||
"<link id=3D\"shLink\" href=3D\"PSABrKelly-BADMINTONCupResults08FINAL2008_09_19=_files/sheet004.htm\">",
|
||||
"\n",
|
||||
|
||||
"<td bgcolor=3D\"#FFFFFF\" nowrap>",
|
||||
"\n",
|
||||
|
||||
"<a href=\"http://basnect.info/usersearch/\"predicciones-mundiales-2009\".html\">\"predicciones mundiales 2009\"</a>",
|
||||
"\"predicciones mundiales 2009\"",
|
||||
|
||||
"<a class=\"comment-link\" href=\"https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588\"location.href=https://www.blogger.com/comment.g?blogID=19402125&postID=114070605958684588;>",
|
||||
"",
|
||||
|
||||
"<a href = \"/videos/Bishop\"/\" title = \"click to see more Bishop\" videos\">Bishop\"</a>",
|
||||
"Bishop\"",
|
||||
|
||||
"<a href=\"http://bhaa.ie/calendar/event.php?eid=20081203150127531\"\">BHAA Eircom 2 & 5 miles CC combined start</a>",
|
||||
"BHAA Eircom 2 & 5 miles CC combined start",
|
||||
|
||||
"<a href=\"http://people.tribe.net/wolfmana\" onClick='setClick(\"Application[tribe].Person[bb7df210-9dc0-478c-917f-436b896bcb79]\")'\" title=\"Mana\">",
|
||||
"",
|
||||
|
||||
"<a href=\"http://blog.edu-cyberpg.com/ct.ashx?id=6143c528-080c-4bb2-b765-5ec56c8256d3&url=http%3a%2f%2fwww.gsa.ac.uk%2fmackintoshsketchbook%2f\"\" eudora=\"autourl\">",
|
||||
"",
|
||||
|
||||
// "<" before ">" inhibits tag recognition
|
||||
"<input type=\"text\" value=\"<search here>\">",
|
||||
"<input type=\"text\" value=\"\n\">",
|
||||
|
||||
"<input type=\"text\" value=\"<search here\">",
|
||||
"<input type=\"text\" value=\"\n",
|
||||
|
||||
"<input type=\"text\" value=\"search here>\">",
|
||||
"\">",
|
||||
|
||||
// "<" and ">" chars are accepted in on[Event] attribute values
|
||||
"<input type=\"text\" value=\"<search here>\" onFocus=\"this.value='<search here>'\">",
|
||||
"",
|
||||
|
||||
"<![if ! IE]>\n<link href=\"http://i.deviantart.com/icons/favicon.png\" rel=\"shortcut icon\"/>\n<![endif]>",
|
||||
"\n\n\n",
|
||||
|
||||
"<![if supportMisalignedColumns]>\n<tr height=0 style='display:none'>\n<td width=64 style='width:48pt'></td>\n</tr>\n<![endif]>",
|
||||
"\n\n\n\n\n\n\n\n",
|
||||
};
|
||||
for (int i = 0 ; i < testGold.length ; i += 2) {
|
||||
String test = testGold[i];
|
||||
String gold = testGold[i + 1];
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
|
@ -156,36 +338,71 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
|
||||
assertEquals("Test: '" + test + "'", gold, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void testBufferOverflow() throws Exception {
|
||||
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
|
||||
StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize() + 50);
|
||||
testBuilder.append("ah<?> ??????");
|
||||
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
|
||||
|
||||
testBuilder.setLength(0);
|
||||
testBuilder.append("<!--");//comments
|
||||
appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
|
||||
appendChars(testBuilder, 3 * HTMLStripCharFilter.getInitialBufferSize() + 500);//comments have two lookaheads
|
||||
|
||||
testBuilder.append("-->foo");
|
||||
processBuffer(testBuilder.toString(), "Failed w/ comment");
|
||||
String gold = "foo";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
|
||||
testBuilder.setLength(0);
|
||||
testBuilder.append("<?");
|
||||
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||
testBuilder.append("?>");
|
||||
processBuffer(testBuilder.toString(), "Failed with proc. instr.");
|
||||
gold = "";
|
||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
|
||||
ch = 0;
|
||||
builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
|
||||
testBuilder.setLength(0);
|
||||
testBuilder.append("<b ");
|
||||
appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||
appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
|
||||
testBuilder.append("/>");
|
||||
processBuffer(testBuilder.toString(), "Failed on tag");
|
||||
|
||||
gold = "";
|
||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(testBuilder.toString())));
|
||||
ch = 0;
|
||||
builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
private void appendChars(StringBuilder testBuilder, int numChars) {
|
||||
|
@ -208,7 +425,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
} finally {
|
||||
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
|
||||
}
|
||||
assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
|
||||
assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
|
||||
test, builder.toString());
|
||||
}
|
||||
|
||||
public void testComment() throws Exception {
|
||||
|
@ -225,7 +443,8 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
|
||||
|
@ -247,7 +466,7 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testOffsets() throws Exception {
|
||||
doTestOffsets("hello X how X are you");
|
||||
// doTestOffsets("hello X how X are you");
|
||||
doTestOffsets("hello <p> X<p> how <p>X are you");
|
||||
doTestOffsets("X & X ( X < > X");
|
||||
|
||||
|
@ -255,7 +474,24 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
|
||||
}
|
||||
|
||||
@Ignore("broken offsets: see LUCENE-2208")
|
||||
static void assertLegalOffsets(String in) throws Exception {
|
||||
int length = in.length();
|
||||
HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
|
||||
int ch = 0;
|
||||
int off = 0;
|
||||
while ((ch = reader.read()) != -1) {
|
||||
int correction = reader.correctOffset(off);
|
||||
assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length,
|
||||
correction <= length);
|
||||
off++;
|
||||
}
|
||||
}
|
||||
|
||||
public void testLegalOffsets() throws Exception {
|
||||
assertLegalOffsets("hello world");
|
||||
assertLegalOffsets("hello &#x world");
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
|
||||
|
@ -274,4 +510,311 @@ public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
|||
int numRounds = RANDOM_MULTIPLIER * 10000;
|
||||
checkRandomData(random, analyzer, numRounds);
|
||||
}
|
||||
|
||||
public void testServerSideIncludes() throws Exception {
|
||||
String test = "one<img src=\"image.png\"\n"
|
||||
+ " alt = \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}' -->\"\n\n"
|
||||
+ " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
|
||||
String gold = "onetwo";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
|
||||
|
||||
test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"--> --></script>two";
|
||||
gold = "one\ntwo";
|
||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
ch = 0;
|
||||
builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testScriptQuotes() throws Exception {
|
||||
String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\"); --></script>two";
|
||||
String gold = "one\ntwo";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
|
||||
test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
|
||||
gold = "hello\n";
|
||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
ch = 0;
|
||||
builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testEscapeScript() throws Exception {
|
||||
String test = "one<script no-value-attr>callSomeMethod();</script>two";
|
||||
String gold = "one<script no-value-attr></script>two";
|
||||
Set<String> escapedTags = new HashSet<String>(Arrays.asList("SCRIPT"));
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(test)), escapedTags);
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testStyle() throws Exception {
|
||||
String test = "one<style type=\"text/css\">\n"
|
||||
+ "<!--\n"
|
||||
+ "@import url('http://www.lasletrasdecanciones.com/css.css');\n"
|
||||
+ "-->\n"
|
||||
+ "</style>two";
|
||||
String gold = "one\ntwo";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testEscapeStyle() throws Exception {
|
||||
String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; } </style>two";
|
||||
String gold = "one<style type=\"text/css\"></style>two";
|
||||
Set<String> escapedTags = new HashSet<String>(Arrays.asList("STYLE"));
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(test)), escapedTags);
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testBR() throws Exception {
|
||||
String[] testGold = {
|
||||
"one<BR />two<br>three",
|
||||
"one\ntwo\nthree",
|
||||
|
||||
"one<BR some stuff here too>two</BR>",
|
||||
"one\ntwo\n",
|
||||
};
|
||||
for (int i = 0 ; i < testGold.length ; i += 2) {
|
||||
String test = testGold[i];
|
||||
String gold = testGold[i + 1];
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
assertEquals("Test: '" + test + "'", gold, result);
|
||||
}
|
||||
}
|
||||
public void testEscapeBR() throws Exception {
|
||||
String test = "one<BR class='whatever'>two</\nBR\n>";
|
||||
String gold = "one<BR class='whatever'>two</\nBR\n>";
|
||||
Set<String> escapedTags = new HashSet<String>(Arrays.asList("BR"));
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(test)), escapedTags);
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testInlineTagsNoSpace() throws Exception {
|
||||
String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
|
||||
String gold = "onetwo2e.three";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testCDATA() throws Exception {
|
||||
String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
|
||||
String gold = "one<one><two>three<four></four></two></one>two";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
|
||||
test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
|
||||
gold = "onetwo<![CDATA[three]]>fourfive";
|
||||
reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
ch = 0;
|
||||
builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testUppercaseCharacterEntityVariants() throws Exception {
|
||||
String test = " "-©>><<®&";
|
||||
String gold = " \"-\u00A9>><<\u00AE&";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testMSWordMalformedProcessingInstruction() throws Exception {
|
||||
String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" />two";
|
||||
String gold = "onetwo";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testSupplementaryCharsInTags() throws Exception {
|
||||
String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three 瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
|
||||
String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
|
||||
Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
|
||||
gold, builder.toString());
|
||||
}
|
||||
|
||||
public void testRandomBrokenHTML() throws Exception {
|
||||
int maxNumElements = 10000;
|
||||
String text = _TestUtil.randomHtmlishString(random, maxNumElements);
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(text)));
|
||||
while (reader.read() != -1);
|
||||
}
|
||||
|
||||
public void testRandomText() throws Exception {
|
||||
StringBuilder text = new StringBuilder();
|
||||
int minNumWords = 10;
|
||||
int maxNumWords = 10000;
|
||||
int minWordLength = 3;
|
||||
int maxWordLength = 20;
|
||||
int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
|
||||
switch (_TestUtil.nextInt(random, 0, 4)) {
|
||||
case 0: {
|
||||
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
|
||||
text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
|
||||
text.append(' ');
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
|
||||
text.append(_TestUtil.randomRealisticUnicodeString
|
||||
(random, minWordLength, maxWordLength));
|
||||
text.append(' ');
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: { // ASCII 50% of the time
|
||||
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
|
||||
text.append(_TestUtil.randomSimpleString(random));
|
||||
text.append(' ');
|
||||
}
|
||||
}
|
||||
}
|
||||
Reader reader = new HTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(text.toString())));
|
||||
while (reader.read() != -1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,653 @@
|
|||
<html xmlns:v="urn:schemas-microsoft-com:vml"
|
||||
xmlns:o="urn:schemas-microsoft-com:office:office"
|
||||
xmlns:w="urn:schemas-microsoft-com:office:word"
|
||||
xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
|
||||
xmlns="http://www.w3.org/TR/REC-html40">
|
||||
|
||||
<head>
|
||||
<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
|
||||
<meta name=ProgId content=Word.Document>
|
||||
<meta name=Generator content="Microsoft Word 14">
|
||||
<meta name=Originator content="Microsoft Word 14">
|
||||
<link rel=File-List href="This%20is%20a%20test_files/filelist.xml">
|
||||
<!--[if gte mso 9]><xml>
|
||||
<o:DocumentProperties>
|
||||
<o:Author>s</o:Author>
|
||||
<o:LastAuthor>s</o:LastAuthor>
|
||||
<o:Revision>1</o:Revision>
|
||||
<o:TotalTime>1</o:TotalTime>
|
||||
<o:Created>2012-01-13T03:36:00Z</o:Created>
|
||||
<o:LastSaved>2012-01-13T03:37:00Z</o:LastSaved>
|
||||
<o:Pages>1</o:Pages>
|
||||
<o:Words>8</o:Words>
|
||||
<o:Characters>48</o:Characters>
|
||||
<o:Lines>1</o:Lines>
|
||||
<o:Paragraphs>1</o:Paragraphs>
|
||||
<o:CharactersWithSpaces>55</o:CharactersWithSpaces>
|
||||
<o:Version>14.00</o:Version>
|
||||
</o:DocumentProperties>
|
||||
<o:OfficeDocumentSettings>
|
||||
<o:AllowPNG/>
|
||||
</o:OfficeDocumentSettings>
|
||||
</xml><![endif]-->
|
||||
<link rel=themeData href="This%20is%20a%20test_files/themedata.thmx">
|
||||
<link rel=colorSchemeMapping
|
||||
href="This%20is%20a%20test_files/colorschememapping.xml">
|
||||
<!--[if gte mso 9]><xml>
|
||||
<w:WordDocument>
|
||||
<w:SpellingState>Clean</w:SpellingState>
|
||||
<w:GrammarState>Clean</w:GrammarState>
|
||||
<w:TrackMoves>false</w:TrackMoves>
|
||||
<w:TrackFormatting/>
|
||||
<w:PunctuationKerning/>
|
||||
<w:ValidateAgainstSchemas/>
|
||||
<w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
|
||||
<w:IgnoreMixedContent>false</w:IgnoreMixedContent>
|
||||
<w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
|
||||
<w:DoNotPromoteQF/>
|
||||
<w:LidThemeOther>EN-US</w:LidThemeOther>
|
||||
<w:LidThemeAsian>X-NONE</w:LidThemeAsian>
|
||||
<w:LidThemeComplexScript>X-NONE</w:LidThemeComplexScript>
|
||||
<w:Compatibility>
|
||||
<w:BreakWrappedTables/>
|
||||
<w:SnapToGridInCell/>
|
||||
<w:WrapTextWithPunct/>
|
||||
<w:UseAsianBreakRules/>
|
||||
<w:DontGrowAutofit/>
|
||||
<w:SplitPgBreakAndParaMark/>
|
||||
<w:EnableOpenTypeKerning/>
|
||||
<w:DontFlipMirrorIndents/>
|
||||
<w:OverrideTableStyleHps/>
|
||||
</w:Compatibility>
|
||||
<m:mathPr>
|
||||
<m:mathFont m:val="Cambria Math"/>
|
||||
<m:brkBin m:val="before"/>
|
||||
<m:brkBinSub m:val="--"/>
|
||||
<m:smallFrac m:val="off"/>
|
||||
<m:dispDef/>
|
||||
<m:lMargin m:val="0"/>
|
||||
<m:rMargin m:val="0"/>
|
||||
<m:defJc m:val="centerGroup"/>
|
||||
<m:wrapIndent m:val="1440"/>
|
||||
<m:intLim m:val="subSup"/>
|
||||
<m:naryLim m:val="undOvr"/>
|
||||
</m:mathPr></w:WordDocument>
|
||||
</xml><![endif]--><!--[if gte mso 9]><xml>
|
||||
<w:LatentStyles DefLockedState="false" DefUnhideWhenUsed="true"
|
||||
DefSemiHidden="true" DefQFormat="false" DefPriority="99"
|
||||
LatentStyleCount="267">
|
||||
<w:LsdException Locked="false" Priority="0" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Normal"/>
|
||||
<w:LsdException Locked="false" Priority="9" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="heading 1"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 2"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 3"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 4"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 5"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 6"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 7"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 8"/>
|
||||
<w:LsdException Locked="false" Priority="9" QFormat="true" Name="heading 9"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 1"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 2"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 3"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 4"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 5"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 6"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 7"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 8"/>
|
||||
<w:LsdException Locked="false" Priority="39" Name="toc 9"/>
|
||||
<w:LsdException Locked="false" Priority="35" QFormat="true" Name="caption"/>
|
||||
<w:LsdException Locked="false" Priority="10" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Title"/>
|
||||
<w:LsdException Locked="false" Priority="1" Name="Default Paragraph Font"/>
|
||||
<w:LsdException Locked="false" Priority="11" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Subtitle"/>
|
||||
<w:LsdException Locked="false" Priority="22" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Strong"/>
|
||||
<w:LsdException Locked="false" Priority="20" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Emphasis"/>
|
||||
<w:LsdException Locked="false" Priority="59" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Table Grid"/>
|
||||
<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Placeholder Text"/>
|
||||
<w:LsdException Locked="false" Priority="1" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="No Spacing"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1 Accent 1"/>
|
||||
<w:LsdException Locked="false" UnhideWhenUsed="false" Name="Revision"/>
|
||||
<w:LsdException Locked="false" Priority="34" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="List Paragraph"/>
|
||||
<w:LsdException Locked="false" Priority="29" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Quote"/>
|
||||
<w:LsdException Locked="false" Priority="30" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Intense Quote"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid Accent 1"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid Accent 2"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid Accent 3"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid Accent 4"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1 Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2 Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid Accent 5"/>
|
||||
<w:LsdException Locked="false" Priority="60" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Shading Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="61" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light List Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="62" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Light Grid Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="63" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 1 Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="64" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Shading 2 Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="65" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 1 Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="66" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium List 2 Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="67" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 1 Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="68" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 2 Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="69" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Medium Grid 3 Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="70" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Dark List Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="71" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Shading Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="72" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful List Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="73" SemiHidden="false"
|
||||
UnhideWhenUsed="false" Name="Colorful Grid Accent 6"/>
|
||||
<w:LsdException Locked="false" Priority="19" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Subtle Emphasis"/>
|
||||
<w:LsdException Locked="false" Priority="21" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Intense Emphasis"/>
|
||||
<w:LsdException Locked="false" Priority="31" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Subtle Reference"/>
|
||||
<w:LsdException Locked="false" Priority="32" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Intense Reference"/>
|
||||
<w:LsdException Locked="false" Priority="33" SemiHidden="false"
|
||||
UnhideWhenUsed="false" QFormat="true" Name="Book Title"/>
|
||||
<w:LsdException Locked="false" Priority="37" Name="Bibliography"/>
|
||||
<w:LsdException Locked="false" Priority="39" QFormat="true" Name="TOC Heading"/>
|
||||
</w:LatentStyles>
|
||||
</xml><![endif]-->
|
||||
<style>
|
||||
<!--
|
||||
/* Font Definitions */
|
||||
@font-face
|
||||
{font-family:"Cambria Math";
|
||||
panose-1:2 4 5 3 5 4 6 3 2 4;
|
||||
mso-font-charset:1;
|
||||
mso-generic-font-family:roman;
|
||||
mso-font-format:other;
|
||||
mso-font-pitch:variable;
|
||||
mso-font-signature:0 0 0 0 0 0;}
|
||||
@font-face
|
||||
{font-family:Cambria;
|
||||
panose-1:2 4 5 3 5 4 6 3 2 4;
|
||||
mso-font-charset:0;
|
||||
mso-generic-font-family:roman;
|
||||
mso-font-pitch:variable;
|
||||
mso-font-signature:-536870145 1073743103 0 0 415 0;}
|
||||
@font-face
|
||||
{font-family:Calibri;
|
||||
panose-1:2 15 5 2 2 2 4 3 2 4;
|
||||
mso-font-charset:0;
|
||||
mso-generic-font-family:swiss;
|
||||
mso-font-pitch:variable;
|
||||
mso-font-signature:-520092929 1073786111 9 0 415 0;}
|
||||
/* Style Definitions */
|
||||
p.MsoNormal, li.MsoNormal, div.MsoNormal
|
||||
{mso-style-unhide:no;
|
||||
mso-style-qformat:yes;
|
||||
mso-style-parent:"";
|
||||
margin-top:0in;
|
||||
margin-right:0in;
|
||||
margin-bottom:10.0pt;
|
||||
margin-left:0in;
|
||||
line-height:115%;
|
||||
mso-pagination:widow-orphan;
|
||||
font-size:11.0pt;
|
||||
font-family:"Calibri","sans-serif";
|
||||
mso-ascii-font-family:Calibri;
|
||||
mso-ascii-theme-font:minor-latin;
|
||||
mso-fareast-font-family:Calibri;
|
||||
mso-fareast-theme-font:minor-latin;
|
||||
mso-hansi-font-family:Calibri;
|
||||
mso-hansi-theme-font:minor-latin;
|
||||
mso-bidi-font-family:"Times New Roman";
|
||||
mso-bidi-theme-font:minor-bidi;}
|
||||
h1
|
||||
{mso-style-priority:9;
|
||||
mso-style-unhide:no;
|
||||
mso-style-qformat:yes;
|
||||
mso-style-link:"Heading 1 Char";
|
||||
mso-style-next:Normal;
|
||||
margin-top:24.0pt;
|
||||
margin-right:0in;
|
||||
margin-bottom:0in;
|
||||
margin-left:0in;
|
||||
margin-bottom:.0001pt;
|
||||
line-height:115%;
|
||||
mso-pagination:widow-orphan lines-together;
|
||||
page-break-after:avoid;
|
||||
mso-outline-level:1;
|
||||
font-size:14.0pt;
|
||||
font-family:"Cambria","serif";
|
||||
mso-ascii-font-family:Cambria;
|
||||
mso-ascii-theme-font:major-latin;
|
||||
mso-fareast-font-family:"Times New Roman";
|
||||
mso-fareast-theme-font:major-fareast;
|
||||
mso-hansi-font-family:Cambria;
|
||||
mso-hansi-theme-font:major-latin;
|
||||
mso-bidi-font-family:"Times New Roman";
|
||||
mso-bidi-theme-font:major-bidi;
|
||||
color:#365F91;
|
||||
mso-themecolor:accent1;
|
||||
mso-themeshade:191;
|
||||
mso-font-kerning:0pt;}
|
||||
p.MsoTitle, li.MsoTitle, div.MsoTitle
|
||||
{mso-style-priority:10;
|
||||
mso-style-unhide:no;
|
||||
mso-style-qformat:yes;
|
||||
mso-style-link:"Title Char";
|
||||
mso-style-next:Normal;
|
||||
margin-top:0in;
|
||||
margin-right:0in;
|
||||
margin-bottom:15.0pt;
|
||||
margin-left:0in;
|
||||
mso-add-space:auto;
|
||||
mso-pagination:widow-orphan;
|
||||
border:none;
|
||||
mso-border-bottom-alt:solid #4F81BD 1.0pt;
|
||||
mso-border-bottom-themecolor:accent1;
|
||||
padding:0in;
|
||||
mso-padding-alt:0in 0in 4.0pt 0in;
|
||||
font-size:26.0pt;
|
||||
font-family:"Cambria","serif";
|
||||
mso-ascii-font-family:Cambria;
|
||||
mso-ascii-theme-font:major-latin;
|
||||
mso-fareast-font-family:"Times New Roman";
|
||||
mso-fareast-theme-font:major-fareast;
|
||||
mso-hansi-font-family:Cambria;
|
||||
mso-hansi-theme-font:major-latin;
|
||||
mso-bidi-font-family:"Times New Roman";
|
||||
mso-bidi-theme-font:major-bidi;
|
||||
color:#17365D;
|
||||
mso-themecolor:text2;
|
||||
mso-themeshade:191;
|
||||
letter-spacing:.25pt;
|
||||
mso-font-kerning:14.0pt;}
|
||||
p.MsoTitleCxSpFirst, li.MsoTitleCxSpFirst, div.MsoTitleCxSpFirst
|
||||
{mso-style-priority:10;
|
||||
mso-style-unhide:no;
|
||||
mso-style-qformat:yes;
|
||||
mso-style-link:"Title Char";
|
||||
mso-style-next:Normal;
|
||||
mso-style-type:export-only;
|
||||
margin:0in;
|
||||
margin-bottom:.0001pt;
|
||||
mso-add-space:auto;
|
||||
mso-pagination:widow-orphan;
|
||||
border:none;
|
||||
mso-border-bottom-alt:solid #4F81BD 1.0pt;
|
||||
mso-border-bottom-themecolor:accent1;
|
||||
padding:0in;
|
||||
mso-padding-alt:0in 0in 4.0pt 0in;
|
||||
font-size:26.0pt;
|
||||
font-family:"Cambria","serif";
|
||||
mso-ascii-font-family:Cambria;
|
||||
mso-ascii-theme-font:major-latin;
|
||||
mso-fareast-font-family:"Times New Roman";
|
||||
mso-fareast-theme-font:major-fareast;
|
||||
mso-hansi-font-family:Cambria;
|
||||
mso-hansi-theme-font:major-latin;
|
||||
mso-bidi-font-family:"Times New Roman";
|
||||
mso-bidi-theme-font:major-bidi;
|
||||
color:#17365D;
|
||||
mso-themecolor:text2;
|
||||
mso-themeshade:191;
|
||||
letter-spacing:.25pt;
|
||||
mso-font-kerning:14.0pt;}
|
||||
p.MsoTitleCxSpMiddle, li.MsoTitleCxSpMiddle, div.MsoTitleCxSpMiddle
|
||||
{mso-style-priority:10;
|
||||
mso-style-unhide:no;
|
||||
mso-style-qformat:yes;
|
||||
mso-style-link:"Title Char";
|
||||
mso-style-next:Normal;
|
||||
mso-style-type:export-only;
|
||||
margin:0in;
|
||||
margin-bottom:.0001pt;
|
||||
mso-add-space:auto;
|
||||
mso-pagination:widow-orphan;
|
||||
border:none;
|
||||
mso-border-bottom-alt:solid #4F81BD 1.0pt;
|
||||
mso-border-bottom-themecolor:accent1;
|
||||
padding:0in;
|
||||
mso-padding-alt:0in 0in 4.0pt 0in;
|
||||
font-size:26.0pt;
|
||||
font-family:"Cambria","serif";
|
||||
mso-ascii-font-family:Cambria;
|
||||
mso-ascii-theme-font:major-latin;
|
||||
mso-fareast-font-family:"Times New Roman";
|
||||
mso-fareast-theme-font:major-fareast;
|
||||
mso-hansi-font-family:Cambria;
|
||||
mso-hansi-theme-font:major-latin;
|
||||
mso-bidi-font-family:"Times New Roman";
|
||||
mso-bidi-theme-font:major-bidi;
|
||||
color:#17365D;
|
||||
mso-themecolor:text2;
|
||||
mso-themeshade:191;
|
||||
letter-spacing:.25pt;
|
||||
mso-font-kerning:14.0pt;}
|
||||
p.MsoTitleCxSpLast, li.MsoTitleCxSpLast, div.MsoTitleCxSpLast
|
||||
{mso-style-priority:10;
|
||||
mso-style-unhide:no;
|
||||
mso-style-qformat:yes;
|
||||
mso-style-link:"Title Char";
|
||||
mso-style-next:Normal;
|
||||
mso-style-type:export-only;
|
||||
margin-top:0in;
|
||||
margin-right:0in;
|
||||
margin-bottom:15.0pt;
|
||||
margin-left:0in;
|
||||
mso-add-space:auto;
|
||||
mso-pagination:widow-orphan;
|
||||
border:none;
|
||||
mso-border-bottom-alt:solid #4F81BD 1.0pt;
|
||||
mso-border-bottom-themecolor:accent1;
|
||||
padding:0in;
|
||||
mso-padding-alt:0in 0in 4.0pt 0in;
|
||||
font-size:26.0pt;
|
||||
font-family:"Cambria","serif";
|
||||
mso-ascii-font-family:Cambria;
|
||||
mso-ascii-theme-font:major-latin;
|
||||
mso-fareast-font-family:"Times New Roman";
|
||||
mso-fareast-theme-font:major-fareast;
|
||||
mso-hansi-font-family:Cambria;
|
||||
mso-hansi-theme-font:major-latin;
|
||||
mso-bidi-font-family:"Times New Roman";
|
||||
mso-bidi-theme-font:major-bidi;
|
||||
color:#17365D;
|
||||
mso-themecolor:text2;
|
||||
mso-themeshade:191;
|
||||
letter-spacing:.25pt;
|
||||
mso-font-kerning:14.0pt;}
|
||||
span.TitleChar
|
||||
{mso-style-name:"Title Char";
|
||||
mso-style-priority:10;
|
||||
mso-style-unhide:no;
|
||||
mso-style-locked:yes;
|
||||
mso-style-link:Title;
|
||||
mso-ansi-font-size:26.0pt;
|
||||
mso-bidi-font-size:26.0pt;
|
||||
font-family:"Cambria","serif";
|
||||
mso-ascii-font-family:Cambria;
|
||||
mso-ascii-theme-font:major-latin;
|
||||
mso-fareast-font-family:"Times New Roman";
|
||||
mso-fareast-theme-font:major-fareast;
|
||||
mso-hansi-font-family:Cambria;
|
||||
mso-hansi-theme-font:major-latin;
|
||||
mso-bidi-font-family:"Times New Roman";
|
||||
mso-bidi-theme-font:major-bidi;
|
||||
color:#17365D;
|
||||
mso-themecolor:text2;
|
||||
mso-themeshade:191;
|
||||
letter-spacing:.25pt;
|
||||
mso-font-kerning:14.0pt;}
|
||||
span.Heading1Char
|
||||
{mso-style-name:"Heading 1 Char";
|
||||
mso-style-priority:9;
|
||||
mso-style-unhide:no;
|
||||
mso-style-locked:yes;
|
||||
mso-style-link:"Heading 1";
|
||||
mso-ansi-font-size:14.0pt;
|
||||
mso-bidi-font-size:14.0pt;
|
||||
font-family:"Cambria","serif";
|
||||
mso-ascii-font-family:Cambria;
|
||||
mso-ascii-theme-font:major-latin;
|
||||
mso-fareast-font-family:"Times New Roman";
|
||||
mso-fareast-theme-font:major-fareast;
|
||||
mso-hansi-font-family:Cambria;
|
||||
mso-hansi-theme-font:major-latin;
|
||||
mso-bidi-font-family:"Times New Roman";
|
||||
mso-bidi-theme-font:major-bidi;
|
||||
color:#365F91;
|
||||
mso-themecolor:accent1;
|
||||
mso-themeshade:191;
|
||||
font-weight:bold;}
|
||||
.MsoChpDefault
|
||||
{mso-style-type:export-only;
|
||||
mso-default-props:yes;
|
||||
font-family:"Calibri","sans-serif";
|
||||
mso-ascii-font-family:Calibri;
|
||||
mso-ascii-theme-font:minor-latin;
|
||||
mso-fareast-font-family:Calibri;
|
||||
mso-fareast-theme-font:minor-latin;
|
||||
mso-hansi-font-family:Calibri;
|
||||
mso-hansi-theme-font:minor-latin;
|
||||
mso-bidi-font-family:"Times New Roman";
|
||||
mso-bidi-theme-font:minor-bidi;}
|
||||
.MsoPapDefault
|
||||
{mso-style-type:export-only;
|
||||
margin-bottom:10.0pt;
|
||||
line-height:115%;}
|
||||
@page WordSection1
|
||||
{size:8.5in 11.0in;
|
||||
margin:1.0in 1.0in 1.0in 1.0in;
|
||||
mso-header-margin:.5in;
|
||||
mso-footer-margin:.5in;
|
||||
mso-paper-source:0;}
|
||||
div.WordSection1
|
||||
{page:WordSection1;}
|
||||
-->
|
||||
</style>
|
||||
<!--[if gte mso 10]>
|
||||
<style>
|
||||
/* Style Definitions */
|
||||
table.MsoNormalTable
|
||||
{mso-style-name:"Table Normal";
|
||||
mso-tstyle-rowband-size:0;
|
||||
mso-tstyle-colband-size:0;
|
||||
mso-style-noshow:yes;
|
||||
mso-style-priority:99;
|
||||
mso-style-parent:"";
|
||||
mso-padding-alt:0in 5.4pt 0in 5.4pt;
|
||||
mso-para-margin-top:0in;
|
||||
mso-para-margin-right:0in;
|
||||
mso-para-margin-bottom:10.0pt;
|
||||
mso-para-margin-left:0in;
|
||||
line-height:115%;
|
||||
mso-pagination:widow-orphan;
|
||||
font-size:11.0pt;
|
||||
font-family:"Calibri","sans-serif";
|
||||
mso-ascii-font-family:Calibri;
|
||||
mso-ascii-theme-font:minor-latin;
|
||||
mso-hansi-font-family:Calibri;
|
||||
mso-hansi-theme-font:minor-latin;
|
||||
mso-bidi-font-family:"Times New Roman";
|
||||
mso-bidi-theme-font:minor-bidi;}
|
||||
</style>
|
||||
<![endif]--><!--[if gte mso 9]><xml>
|
||||
<o:shapedefaults v:ext="edit" spidmax="1026"/>
|
||||
</xml><![endif]--><!--[if gte mso 9]><xml>
|
||||
<o:shapelayout v:ext="edit">
|
||||
<o:idmap v:ext="edit" data="1"/>
|
||||
</o:shapelayout></xml><![endif]-->
|
||||
</head>
|
||||
|
||||
<body lang=EN-US style='tab-interval:.5in'>
|
||||
|
||||
<div class=WordSection1>
|
||||
|
||||
<div style='mso-element:para-border-div;border:none;border-bottom:solid #4F81BD 1.0pt;
|
||||
mso-border-bottom-themecolor:accent1;padding:0in 0in 4.0pt 0in'>
|
||||
|
||||
<p class=MsoTitle>This is a test</p>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
||||
|
|
@ -113,6 +113,23 @@ are part of the ICU4C package. See http://site.icu-project.org/ </echo>
|
|||
</java>
|
||||
</target>
|
||||
|
||||
<property name="html.strip.charfilter.supp.macros.output.file"
|
||||
location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
|
||||
|
||||
<!-- Regenerates the HTMLStripCharFilter supplementary-character JFlex macro file: after compile-tools has run, executes GenerateHTMLStripCharFilterSupplementaryMacros in a forked JVM, writes the program's output to the file named by html.strip.charfilter.supp.macros.output.file, and fails the build if the generator fails. -->
<target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
|
||||
<java
|
||||
classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
|
||||
dir="."
|
||||
fork="true"
|
||||
failonerror="true"
|
||||
output="${html.strip.charfilter.supp.macros.output.file}">
|
||||
<classpath>
|
||||
<path refid="additional.dependencies"/>
|
||||
<pathelement location="${build.dir}/classes/tools"/>
|
||||
</classpath>
|
||||
</java>
|
||||
</target>
|
||||
|
||||
<target name="compile-tools" depends="common.compile-tools">
|
||||
<compile
|
||||
srcdir="src/tools/java"
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
package org.apache.lucene.analysis.icu;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.DateFormat;
|
||||
import java.util.*;
|
||||
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
|
||||
/** creates a macro to augment jflex's unicode support for > BMP */
|
||||
public class GenerateHTMLStripCharFilterSupplementaryMacros {
|
||||
private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
|
||||
private static final String NL = System.getProperty("line.separator");
|
||||
private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
|
||||
(DateFormat.FULL, DateFormat.FULL, Locale.US);
|
||||
static {
|
||||
DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
}
|
||||
|
||||
private static final String APACHE_LICENSE
|
||||
= "/*" + NL
|
||||
+ " * Copyright 2010 The Apache Software Foundation." + NL
|
||||
+ " *" + NL
|
||||
+ " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
|
||||
+ " * you may not use this file except in compliance with the License." + NL
|
||||
+ " * You may obtain a copy of the License at" + NL
|
||||
+ " *" + NL
|
||||
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
|
||||
+ " *" + NL
|
||||
+ " * Unless required by applicable law or agreed to in writing, software" + NL
|
||||
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
|
||||
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
|
||||
+ " * See the License for the specific language governing permissions and" + NL
|
||||
+ " * limitations under the License." + NL
|
||||
+ " */" + NL + NL;
|
||||
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
outputHeader();
|
||||
outputMacro("ID_Start_Supp", "[:ID_Start:]");
|
||||
outputMacro("ID_Continue_Supp", "[:ID_Continue:]");
|
||||
}
|
||||
|
||||
static void outputHeader() {
|
||||
System.out.print(APACHE_LICENSE);
|
||||
System.out.print("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString() + " on ");
|
||||
System.out.println(DATE_FORMAT.format(new Date()));
|
||||
System.out.println("// by " + GenerateHTMLStripCharFilterSupplementaryMacros.class.getName());
|
||||
System.out.print(NL + NL);
|
||||
}
|
||||
|
||||
// we have to carefully output the possibilities as compact utf-16
|
||||
// range expressions, or jflex will OOM!
|
||||
static void outputMacro(String name, String pattern) {
|
||||
UnicodeSet set = new UnicodeSet(pattern);
|
||||
set.removeAll(BMP);
|
||||
System.out.println(name + " = (");
|
||||
// if the set is empty, we have to do this or jflex will barf
|
||||
if (set.isEmpty()) {
|
||||
System.out.println("\t []");
|
||||
}
|
||||
|
||||
HashMap<Character,UnicodeSet> utf16ByLead = new HashMap<Character,UnicodeSet>();
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
|
||||
char utf16[] = Character.toChars(it.codepoint);
|
||||
UnicodeSet trails = utf16ByLead.get(utf16[0]);
|
||||
if (trails == null) {
|
||||
trails = new UnicodeSet();
|
||||
utf16ByLead.put(utf16[0], trails);
|
||||
}
|
||||
trails.add(utf16[1]);
|
||||
}
|
||||
|
||||
Map<String,UnicodeSet> utf16ByTrail = new HashMap<String,UnicodeSet>();
|
||||
for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
|
||||
String trail = entry.getValue().getRegexEquivalent();
|
||||
UnicodeSet leads = utf16ByTrail.get(trail);
|
||||
if (leads == null) {
|
||||
leads = new UnicodeSet();
|
||||
utf16ByTrail.put(trail, leads);
|
||||
}
|
||||
leads.add(entry.getKey());
|
||||
}
|
||||
|
||||
boolean isFirst = true;
|
||||
for (Map.Entry<String,UnicodeSet> entry : utf16ByTrail.entrySet()) {
|
||||
System.out.print( isFirst ? "\t " : "\t| ");
|
||||
isFirst = false;
|
||||
System.out.println(entry.getValue().getRegexEquivalent() + entry.getKey());
|
||||
}
|
||||
System.out.println(")");
|
||||
}
|
||||
}
|
|
@ -401,6 +401,14 @@ Upgrading from Solr 3.5
|
|||
* As the doGet() methods in SimplePostTool were changed to static, the client applications of this
|
||||
class need to be recompiled.
|
||||
|
||||
* In Solr version 3.5 and earlier, HTMLStripCharFilter had known bugs in the
|
||||
character offsets it provided, triggering e.g. exceptions in highlighting.
|
||||
HTMLStripCharFilter has been re-implemented, addressing this and other
|
||||
issues. See the entry for LUCENE-3690 in the Bug Fixes section below for a
|
||||
detailed list of changes. For people who depend on the behavior of
|
||||
HTMLStripCharFilter in Solr version 3.5 and earlier: the old implementation
|
||||
(bugs and all) is preserved as LegacyHTMLStripCharFilter.
|
||||
|
||||
New Features
|
||||
----------------------
|
||||
* SOLR-2904: BinaryUpdateRequestHandler should be able to accept multiple update requests from
|
||||
|
@ -483,6 +491,41 @@ Bug Fixes
|
|||
|
||||
* SOLR-2970: CSV ResponseWriter returns fields defined as stored=false in schema (janhoy)
|
||||
|
||||
* LUCENE-3690, LUCENE-2208, SOLR-882, SOLR-42: Re-implemented
|
||||
HTMLStripCharFilter as a JFlex-generated scanner. See below for a list
|
||||
of bug fixes and other changes. To get the same behavior as
|
||||
HTMLStripCharFilter in Solr version 3.5 and earlier (including the bugs),
|
||||
use LegacyHTMLStripCharFilter, which is the previous implementation.
|
||||
|
||||
Behavior changes from the previous version:
|
||||
|
||||
- Known offset bugs are fixed.
|
||||
- The "Mark invalid" exceptions reported in SOLR-1283 are no longer
|
||||
triggered (the bug is still present in LegacyHTMLStripCharFilter).
|
||||
- The character entity "&apos;" is now always properly decoded.
|
||||
- More cases of <script> tags are now properly stripped.
|
||||
- CDATA sections are now handled properly.
|
||||
- Valid tag name characters now include the supplementary Unicode characters
|
||||
from Unicode character classes [:ID_Start:] and [:ID_Continue:].
|
||||
- Uppercase character entities "&QUOT;", "&COPY;", "&GT;", "&LT;", "&REG;",
|
||||
and "&AMP;" are now recognized and handled as if they were in lowercase.
|
||||
- Opening tags with unbalanced quotation marks are now properly stripped.
|
||||
- Literal "<" and ">" characters in opening tags, regardless of whether they
|
||||
appear inside quotation marks, now inhibit recognition (and stripping) of
|
||||
the tags. The only exception to this is for values of event-handler
|
||||
attributes, e.g. "onClick", "onLoad", "onSelect".
|
||||
- A newline '\n' is substituted instead of a space for stripped HTML markup.
|
||||
- Nothing is substituted for opening and closing inline tags - they are
|
||||
simply removed. The list of inline tags is (case insensitively): <a>,
|
||||
<abbr>, <acronym>, <b>, <basefont>, <bdo>, <big>, <cite>, <code>, <dfn>,
|
||||
<em>, <font>, <i>, <img>, <input>, <kbd>, <label>, <q>, <s>, <samp>,
|
||||
<select>, <small>, <span>, <strike>, <strong>, <sub>, <sup>, <textarea>,
|
||||
<tt>, <u>, and <var>.
|
||||
- HTMLStripCharFilterFactory now handles HTMLStripCharFilter's "escapedTags"
|
||||
feature: opening and closing tags with the given names, including any
|
||||
attributes and their values, are left intact in the output.
|
||||
(Steve Rowe)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
* SOLR-2922: Upgrade commons-io and commons-lang to 2.1 and 2.6, respectively. (koji)
|
||||
|
|
|
@ -21,12 +21,18 @@ package org.apache.solr.analysis;
|
|||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Factory for {@link HTMLStripCharFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_html" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <charFilter class="solr.HTMLStripCharFilterFactory"/>
|
||||
* <charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" />
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre
|
||||
|
@ -34,8 +40,31 @@ import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
|
|||
*/
|
||||
public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {
|
||||
|
||||
Set<String> escapedTags = null;
|
||||
Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+");
|
||||
|
||||
public HTMLStripCharFilter create(CharStream input) {
|
||||
return new HTMLStripCharFilter(input);
|
||||
HTMLStripCharFilter charFilter;
|
||||
if (null == escapedTags) {
|
||||
charFilter = new HTMLStripCharFilter(input);
|
||||
} else {
|
||||
charFilter = new HTMLStripCharFilter(input, escapedTags);
|
||||
}
|
||||
return charFilter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init(Map<String,String> args) {
|
||||
super.init(args);
|
||||
String escapedTagsArg = args.get("escapedTags");
|
||||
if (null != escapedTagsArg) {
|
||||
Matcher matcher = TAG_NAME_PATTERN.matcher(escapedTagsArg);
|
||||
while (matcher.find()) {
|
||||
if (null == escapedTags) {
|
||||
escapedTags = new HashSet<String>();
|
||||
}
|
||||
escapedTags.add(matcher.group(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,58 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
|
||||
/**
|
||||
* Factory for {@link LegacyHTMLStripCharFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_html_legacy" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <charFilter class="solr.LegacyHTMLStripCharFilterFactory"/>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* </analyzer>
|
||||
* </fieldType>
|
||||
* </pre>
|
||||
* <p>
|
||||
* This factory is <b>NOT</b> recommended for new users and should be
|
||||
* considered <b>UNSUPPORTED</b>.
|
||||
* </p>
|
||||
* <p>
|
||||
* In Solr version 3.5 and earlier, <tt>HTMLStripCharFilter(Factory)</tt>
|
||||
* had known bugs in the offsets it provided, triggering e.g. exceptions in
|
||||
* highlighting.
|
||||
* </p>
|
||||
* <p>
|
||||
* This class is provided as a possible alternative for people who depend on
|
||||
* the "broken" behavior of <tt>HTMLStripCharFilter</tt> in Solr version 3.5
|
||||
* and earlier, and/or who don't like the changes introduced by the Solr 3.6+
|
||||
* version of <tt>HTMLStripCharFilterFactory</tt>. (See the 3.6.0 release
|
||||
* section of lucene/CHANGES.txt for a list of differences in behavior.)
|
||||
* </p>
|
||||
* @deprecated use {@link HTMLStripCharFilterFactory}
|
||||
*/
|
||||
@Deprecated
|
||||
public class LegacyHTMLStripCharFilterFactory extends BaseCharFilterFactory {
|
||||
|
||||
public LegacyHTMLStripCharFilter create(CharStream input) {
|
||||
return new LegacyHTMLStripCharFilter(input);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,321 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.junit.Ignore;
|
||||
|
||||
public class LegacyHTMLStripCharFilterTest extends BaseTokenStreamTestCase {
|
||||
|
||||
//this is some text here is a link and another link . This is an entity: & plus a <. Here is an &
|
||||
//
|
||||
public void test() throws IOException {
|
||||
String html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
|
||||
"another <a href=\"http://lucene.apache.org/\">link</a>. " +
|
||||
"This is an entity: & plus a <. Here is an &. <!-- is a comment -->";
|
||||
String gold = " this is some text here is a link and " +
|
||||
"another link . " +
|
||||
"This is an entity: & plus a <. Here is an &. ";
|
||||
LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(html)));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = -1;
|
||||
char [] goldArray = gold.toCharArray();
|
||||
int position = 0;
|
||||
while ((ch = reader.read()) != -1){
|
||||
char theChar = (char) ch;
|
||||
builder.append(theChar);
|
||||
assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal: " + goldArray[position]
|
||||
+ " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
|
||||
position++;
|
||||
}
|
||||
assertEquals(gold, builder.toString());
|
||||
}
|
||||
|
||||
//Some sanity checks, but not a full-fledged check
|
||||
public void testHTML() throws Exception {
|
||||
InputStream stream = getClass().getResourceAsStream("htmlStripReaderTest.html");
|
||||
LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new InputStreamReader(stream, "UTF-8")));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = -1;
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
String str = builder.toString();
|
||||
assertTrue("Entity not properly escaped", str.indexOf("<") == -1);//there is one > in the text
|
||||
assertTrue("Forrest should have been stripped out", str.indexOf("forrest") == -1 && str.indexOf("Forrest") == -1);
|
||||
assertTrue("File should start with 'Welcome to Solr' after trimming", str.trim().startsWith("Welcome to Solr"));
|
||||
|
||||
assertTrue("File should start with 'Foundation.' after trimming", str.trim().endsWith("Foundation."));
|
||||
|
||||
}
|
||||
|
||||
public void testGamma() throws Exception {
|
||||
String test = "Γ";
|
||||
String gold = "\u0393";
|
||||
Set<String> set = new HashSet<String>();
|
||||
set.add("reserved");
|
||||
Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
|
||||
}
|
||||
|
||||
public void testEntities() throws Exception {
|
||||
String test = " <foo> Übermensch = Γ bar Γ";
|
||||
String gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";
|
||||
Set<String> set = new HashSet<String>();
|
||||
set.add("reserved");
|
||||
Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
|
||||
}
|
||||
|
||||
public void testMoreEntities() throws Exception {
|
||||
String test = " <junk/> ! @ and ’";
|
||||
String gold = " <junk/> ! @ and ’";
|
||||
Set<String> set = new HashSet<String>();
|
||||
set.add("reserved");
|
||||
Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold, result.equals(gold) == true);
|
||||
}
|
||||
|
||||
public void testReserved() throws Exception {
|
||||
String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
|
||||
Set<String> set = new HashSet<String>();
|
||||
set.add("reserved");
|
||||
Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Result: " + result);
|
||||
assertTrue("Escaped tag not preserved: " + result.indexOf("reserved"), result.indexOf("reserved") == 9);
|
||||
assertTrue("Escaped tag not preserved: " + result.indexOf("reserved", 15), result.indexOf("reserved", 15) == 38);
|
||||
assertTrue("Escaped tag not preserved: " + result.indexOf("reserved", 41), result.indexOf("reserved", 41) == 54);
|
||||
assertTrue("Other tag should be removed", result.indexOf("other") == -1);
|
||||
}
|
||||
|
||||
public void testMalformedHTML() throws Exception {
|
||||
String test = "a <a hr<ef=aa<a>> </close</a>";
|
||||
String gold = "a <a hr<ef=aa > </close ";
|
||||
Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)));
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
String result = builder.toString();
|
||||
// System.out.println("Resu: " + result + "<EOL>");
|
||||
// System.out.println("Gold: " + gold + "<EOL>");
|
||||
assertTrue(result + " is not equal to " + gold + "<EOS>", result.equals(gold) == true);
|
||||
}
|
||||
|
||||
public void testBufferOverflow() throws Exception {
|
||||
StringBuilder testBuilder = new StringBuilder(LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
|
||||
testBuilder.append("ah<?> ??????");
|
||||
appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||
processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
|
||||
|
||||
testBuilder.setLength(0);
|
||||
testBuilder.append("<!--");//comments
|
||||
appendChars(testBuilder, 3*LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
|
||||
|
||||
testBuilder.append("-->foo");
|
||||
processBuffer(testBuilder.toString(), "Failed w/ comment");
|
||||
|
||||
testBuilder.setLength(0);
|
||||
testBuilder.append("<?");
|
||||
appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||
testBuilder.append("?>");
|
||||
processBuffer(testBuilder.toString(), "Failed with proc. instr.");
|
||||
|
||||
testBuilder.setLength(0);
|
||||
testBuilder.append("<b ");
|
||||
appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
|
||||
testBuilder.append("/>");
|
||||
processBuffer(testBuilder.toString(), "Failed on tag");
|
||||
|
||||
}
|
||||
|
||||
private void appendChars(StringBuilder testBuilder, int numChars) {
|
||||
int i1 = numChars / 2;
|
||||
for (int i = 0; i < i1; i++){
|
||||
testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes LegacyHTMLStripCharFilter think it is a processing instruction
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processBuffer(String test, String assertMsg) throws IOException {
|
||||
// System.out.println("-------------------processBuffer----------");
|
||||
Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
|
||||
}
|
||||
assertTrue(assertMsg + "::: " + builder.toString() + " is not equal to " + test, builder.toString().equals(test) == true);
|
||||
}
|
||||
|
||||
public void testComment() throws Exception {
|
||||
|
||||
String test = "<!--- three dashes, still a valid comment ---> ";
|
||||
String gold = " ";
|
||||
Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
|
||||
int ch = 0;
|
||||
StringBuilder builder = new StringBuilder();
|
||||
try {
|
||||
while ((ch = reader.read()) != -1){
|
||||
builder.append((char)ch);
|
||||
}
|
||||
} finally {
|
||||
// System.out.println("String: " + builder.toString());
|
||||
}
|
||||
assertTrue(builder.toString() + " is not equal to " + gold + "<EOS>", builder.toString().equals(gold) == true);
|
||||
}
|
||||
|
||||
|
||||
public void doTestOffsets(String in) throws Exception {
|
||||
LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
|
||||
int ch = 0;
|
||||
int off = 0; // offset in the reader
|
||||
int strOff = -1; // offset in the original string
|
||||
while ((ch = reader.read()) != -1) {
|
||||
int correctedOff = reader.correctOffset(off);
|
||||
|
||||
if (ch == 'X') {
|
||||
strOff = in.indexOf('X',strOff+1);
|
||||
assertEquals(strOff, correctedOff);
|
||||
}
|
||||
|
||||
off++;
|
||||
}
|
||||
}
|
||||
|
||||
public void testOffsets() throws Exception {
|
||||
doTestOffsets("hello X how X are you");
|
||||
doTestOffsets("hello <p> X<p> how <p>X are you");
|
||||
doTestOffsets("X & X ( X < > X");
|
||||
|
||||
// test backtracking
|
||||
doTestOffsets("X < &zz >X &# < X > < &l > &g < X");
|
||||
}
|
||||
|
||||
@Ignore("broken offsets: see LUCENE-2208")
|
||||
public void testRandom() throws Exception {
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
|
||||
}
|
||||
};
|
||||
|
||||
int numRounds = RANDOM_MULTIPLIER * 10000;
|
||||
checkRandomData(random, analyzer, numRounds);
|
||||
}
|
||||
|
||||
public void testRandomBrokenHTML() throws Exception {
|
||||
int maxNumElements = 10000;
|
||||
String text = _TestUtil.randomHtmlishString(random, maxNumElements);
|
||||
Reader reader
|
||||
= new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(text)));
|
||||
while (reader.read() != -1);
|
||||
}
|
||||
|
||||
public void testRandomText() throws Exception {
|
||||
StringBuilder text = new StringBuilder();
|
||||
int minNumWords = 10;
|
||||
int maxNumWords = 10000;
|
||||
int minWordLength = 3;
|
||||
int maxWordLength = 20;
|
||||
int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
|
||||
switch (_TestUtil.nextInt(random, 0, 4)) {
|
||||
case 0: {
|
||||
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
|
||||
text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
|
||||
text.append(' ');
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
|
||||
text.append(_TestUtil.randomRealisticUnicodeString
|
||||
(random, minWordLength, maxWordLength));
|
||||
text.append(' ');
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: { // ASCII 50% of the time
|
||||
for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
|
||||
text.append(_TestUtil.randomSimpleString(random));
|
||||
text.append(' ');
|
||||
}
|
||||
}
|
||||
}
|
||||
Reader reader = new LegacyHTMLStripCharFilter
|
||||
(CharReader.get(new StringReader(text.toString())));
|
||||
while (reader.read() != -1);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,130 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure this factory is working
|
||||
*/
|
||||
public class TestHTMLStripCharFilterFactory extends BaseTokenTestCase {
|
||||
|
||||
|
||||
public void testNothingChanged() throws IOException {
|
||||
// 11111111112
|
||||
// 012345678901234567890
|
||||
final String text = "this is only a test.";
|
||||
HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("escapedTags", "a, Title");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(CharReader.get(new StringReader(text)));
|
||||
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "this", "is", "only", "a", "test." },
|
||||
new int[] { 0, 5, 8, 13, 15 },
|
||||
new int[] { 4, 7, 12, 14, 20 });
|
||||
}
|
||||
|
||||
public void testNoEscapedTags() throws IOException {
|
||||
// 11111111112222222222333333333344
|
||||
// 012345678901234567890123456789012345678901
|
||||
final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
|
||||
HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(CharReader.get(new StringReader(text)));
|
||||
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "this", "is", "only", "a", "test." },
|
||||
new int[] { 3, 12, 18, 27, 32 },
|
||||
new int[] { 11, 14, 26, 28, 41 });
|
||||
}
|
||||
|
||||
public void testEscapedTags() throws IOException {
|
||||
// 11111111112222222222333333333344
|
||||
// 012345678901234567890123456789012345678901
|
||||
final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
|
||||
HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("escapedTags", "U i");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(CharReader.get(new StringReader(text)));
|
||||
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "<u>this</u>", "is", "only", "a", "<I>test</I>." },
|
||||
new int[] { 0, 12, 18, 27, 29 },
|
||||
new int[] { 11, 14, 26, 28, 41 });
|
||||
}
|
||||
|
||||
public void testSeparatorOnlyEscapedTags() throws IOException {
|
||||
// 11111111112222222222333333333344
|
||||
// 012345678901234567890123456789012345678901
|
||||
final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
|
||||
HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("escapedTags", ",, , ");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(CharReader.get(new StringReader(text)));
|
||||
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "this", "is", "only", "a", "test." },
|
||||
new int[] { 3, 12, 18, 27, 32 },
|
||||
new int[] { 11, 14, 26, 28, 41 });
|
||||
}
|
||||
|
||||
public void testEmptyEscapedTags() throws IOException {
|
||||
// 11111111112222222222333333333344
|
||||
// 012345678901234567890123456789012345678901
|
||||
final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
|
||||
HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("escapedTags", "");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(CharReader.get(new StringReader(text)));
|
||||
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "this", "is", "only", "a", "test." },
|
||||
new int[] { 3, 12, 18, 27, 32 },
|
||||
new int[] { 11, 14, 26, 28, 41 });
|
||||
}
|
||||
|
||||
public void testSingleEscapedTag() throws IOException {
|
||||
// 11111111112222222222333333333344
|
||||
// 012345678901234567890123456789012345678901
|
||||
final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
|
||||
HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("escapedTags", ", B\r\n\t");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(CharReader.get(new StringReader(text)));
|
||||
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "this", "is", "<b>only</b>", "a", "test." },
|
||||
new int[] { 3, 12, 15, 27, 32 },
|
||||
new int[] { 11, 14, 26, 28, 41 });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,350 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-skin-name" content="pelt">
|
||||
<title>Welcome to Solr</title>
|
||||
<link type="text/css" href="skin/basic.css" rel="stylesheet">
|
||||
<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
|
||||
<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
|
||||
<link type="text/css" href="skin/profile.css" rel="stylesheet">
|
||||
<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
|
||||
<link rel="shortcut icon" href="images/favicon.ico">
|
||||
</head>
|
||||
<body onload="init()">
|
||||
<script type="text/javascript">ndeSetTextSize();</script>
|
||||
<div id="top">
|
||||
<!--+
|
||||
|breadtrail
|
||||
+-->
|
||||
<div class="breadtrail">
|
||||
<a href="http://www.apache.org/">apache</a> > <a href="http://lucene.apache.org/">lucene</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
|
||||
</div>
|
||||
<!--+
|
||||
|header
|
||||
+-->
|
||||
<div class="header">
|
||||
<!--+
|
||||
|start group logo
|
||||
+-->
|
||||
<div class="grouplogo">
|
||||
<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
|
||||
</div>
|
||||
<!--+
|
||||
|end group logo
|
||||
+-->
|
||||
<!--+
|
||||
|start Project Logo
|
||||
+-->
|
||||
<div class="projectlogo">
|
||||
<a href="http://lucene.apache.org/solr/"><img class="logoImage" alt="Solr" src="images/solr_small.png" title="Solr Description"></a>
|
||||
</div>
|
||||
<!--+
|
||||
|end Project Logo
|
||||
+-->
|
||||
<!--+
|
||||
|start Search
|
||||
+-->
|
||||
<div class="searchbox">
|
||||
<form action="http://www.google.com/search" method="get" class="roundtopsmall">
|
||||
<input value="lucene.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">
|
||||
<input name="Search" value="Search" type="submit">
|
||||
</form>
|
||||
</div>
|
||||
<!--+
|
||||
|end search
|
||||
+-->
|
||||
<!--+
|
||||
|start Tabs
|
||||
+-->
|
||||
<ul id="tabs">
|
||||
<li class="current">
|
||||
<a class="selected" href="index.html">Main</a>
|
||||
</li>
|
||||
<li>
|
||||
<a class="unselected" href="http://wiki.apache.org/solr">Wiki</a>
|
||||
</li>
|
||||
</ul>
|
||||
<!--+
|
||||
|end Tabs
|
||||
+-->
|
||||
</div>
|
||||
</div>
|
||||
<div id="main">
|
||||
<div id="publishedStrip">
|
||||
<!--+
|
||||
|start Subtabs
|
||||
+-->
|
||||
<div id="level2tabs"></div>
|
||||
<!--+
|
||||
|end Endtabs
|
||||
+-->
|
||||
<script type="text/javascript"><!--
|
||||
document.write("Last Published: " + document.lastModified);
|
||||
// --></script>
|
||||
</div>
|
||||
<!--+
|
||||
|breadtrail
|
||||
+-->
|
||||
<div class="breadtrail">
|
||||
|
||||
|
||||
</div>
|
||||
<!--+
|
||||
|start Menu, mainarea
|
||||
+-->
|
||||
<!--+
|
||||
|start Menu
|
||||
+-->
|
||||
<div id="menu">
|
||||
<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">About</div>
|
||||
<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
|
||||
<div class="menupage">
|
||||
<div class="menupagetitle">Welcome</div>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="who.html" title="Solr Committers">Who We Are</a>
|
||||
</div>
|
||||
</div>
|
||||
<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
|
||||
<div id="menu_1.2" class="menuitemgroup">
|
||||
<div class="menuitem">
|
||||
<a href="features.html">Features</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="tutorial.html">Tutorial</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="http://wiki.apache.org/solr/">Docs (Wiki)</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="http://wiki.apache.org/solr/FAQ">FAQ</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="api/index.html">javadoc</a>
|
||||
</div>
|
||||
</div>
|
||||
<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div>
|
||||
<div id="menu_1.3" class="menuitemgroup">
|
||||
<div class="menuitem">
|
||||
<a href="http://www.apache.org/dyn/closer.cgi/lucene/solr/">Download</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="mailing_lists.html">Mailing Lists</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="issue_tracking.html">Issue Tracking</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="version_control.html">Version Control</a>
|
||||
</div>
|
||||
</div>
|
||||
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
|
||||
<div id="menu_1.4" class="menuitemgroup">
|
||||
<div class="menuitem">
|
||||
<a href="http://lucene.apache.org/java/">Lucene Java</a>
|
||||
</div>
|
||||
<div class="menuitem">
|
||||
<a href="http://lucene.apache.org/nutch/">Nutch</a>
|
||||
</div>
|
||||
</div>
|
||||
<div id="credit">
|
||||
<hr>
|
||||
<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a>
|
||||
</div>
|
||||
<div id="roundbottom">
|
||||
<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
|
||||
<!--+
|
||||
|alternative credits
|
||||
+-->
|
||||
<div id="credit2"></div>
|
||||
</div>
|
||||
<!--+
|
||||
|end Menu
|
||||
+-->
|
||||
<!--+
|
||||
|start content
|
||||
+-->
|
||||
<div id="content">
|
||||
<div title="Portable Document Format" class="pdflink">
|
||||
<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
|
||||
PDF</a>
|
||||
</div>
|
||||
<h1>Welcome to Solr</h1>
|
||||
<div id="minitoc-area">
|
||||
<ul class="minitoc">
|
||||
<li>
|
||||
<a href="#intro">What Is Solr?</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#news">News</a>
|
||||
<ul class="minitoc">
|
||||
<li>
|
||||
<a href="#02+October+2007+-+Solr+at+OSSummit+Asia">02 October 2007 - Solr at OSSummit Asia</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#03+September+2007+-+Lucene+at+ApacheCon+Atlanta">03 September 2007 - Lucene at ApacheCon Atlanta</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#06+June+2007%3A+Release+1.2+available">06 June 2007: Release 1.2 available</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#17+January+2007%3A+Solr+graduates+from+Incubator">17 January 2007: Solr graduates from Incubator</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#22+December+2006%3A+Release+1.1.0+available">22 December 2006: Release 1.1.0 available</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#15+August+2006%3A+Solr+at+ApacheCon+US">15 August 2006: Solr at ApacheCon US</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#21+April+2006%3A+Solr+at+ApacheCon">21 April 2006: Solr at ApacheCon</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#21+February+2006%3A+nightly+builds">21 February 2006: nightly builds</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#17+January+2006%3A+Solr+Joins+Apache+Incubator">17 January 2006: Solr Joins Apache Incubator</a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<a name="N1000D"></a><a name="intro"></a>
|
||||
<h2 class="boxed">What Is Solr?</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
Solr is an open source enterprise search server based on the
|
||||
<a href="http://lucene.apache.org/java/">Lucene Java</a> search library, with XML/HTTP and JSON APIs,
|
||||
hit highlighting, faceted search, caching, replication, and a web administration interface.
|
||||
It runs in a Java servlet container such as <a href="http://tomcat.apache.org">Tomcat</a>.
|
||||
</p>
|
||||
<p>
|
||||
See the complete <a href="features.html">feature list</a> for more details, then check out the <a href="tutorial.html">tutorial</a>.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
|
||||
<a name="N1002A"></a><a name="news"></a>
|
||||
<h2 class="boxed">News</h2>
|
||||
<div class="section">
|
||||
<a name="N10030"></a><a name="02+October+2007+-+Solr+at+OSSummit+Asia"></a>
|
||||
<h3 class="boxed">02 October 2007 - Solr at OSSummit Asia</h3>
|
||||
<p>
|
||||
<a href="http://www.ossummit.com"><img alt="OSSummit Asia logo" class="float-right" src="http://www.ossummit.com/2007/images/logo.png"></a>
|
||||
Lucene and Solr tutorials!
|
||||
</p>
|
||||
<p>The following talks and trainings are scheduled for the upcoming 2008 OSSummit:</p>
|
||||
<ul>
|
||||
|
||||
<li>
|
||||
<a href="http://www.ossummit.com/2007/program/talk/8">Lucene Boot Camp</a> by Erik Hatcher (originally by Grant Ingersoll). An all-day training focusing on getting started with Lucene - the core library under Solr.</li>
|
||||
|
||||
<li>
|
||||
<a href="http://www.ossummit.com/2007/program/talk/25">Solr in a Day</a> by Erik Hatcher. All you need to know to use Solr effectively.</li>
|
||||
|
||||
<li>
|
||||
<a href="http://www.ossummit.com/2007/program/talk/67">Lucene Case Studies</a> by Erik Hatcher. A rapid series of examples of many Lucene and Solr using applications.</li>
|
||||
|
||||
</ul>
|
||||
<a name="N10058"></a><a name="03+September+2007+-+Lucene+at+ApacheCon+Atlanta"></a>
|
||||
<h3 class="boxed">03 September 2007 - Lucene at ApacheCon Atlanta</h3>
|
||||
<p>
|
||||
<a href="http://www.us.apachecon.com"><img alt="ApacheCon US logo" class="float-right" src="http://www.apache.org/ads/ApacheCon/2007-usa-125x125.png"></a>
|
||||
Lucene will once again be well represented at ApacheCon USA in Atlanta this November 12-16, 2007.
|
||||
</p>
|
||||
<p>The following talks and trainings are scheduled for this year's conference:</p>
|
||||
<ul>
|
||||
|
||||
<li>November 12: <a href="http://us.apachecon.com/us2007/program/talk/1859">Lucene Boot Camp</a> by Grant Ingersoll. An all-day training focusing on getting started with Lucene.</li>
|
||||
|
||||
<li>November 16, 9:00 am: <a href="http://us.apachecon.com/us2007/program/talk/1992">Apache Solr out of the Box</a> by Chris Hostetter. Introduction to Solr.</li>
|
||||
|
||||
<li>November 16, 10:00 am: <a href="http://us.apachecon.com/us2007/program/talk/1943">Building a Vertical Search Site using Apache Software</a> by Ken Krugler. Will cover many Lucene-based projects.</li>
|
||||
|
||||
<li>November 16, 3:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/1953">Apache Lucene Performance</a> by Grant Ingersoll. Tips and techniques for improving Lucene performance.</li>
|
||||
|
||||
<li>November 16, 4:00 pm: <a href="http://us.apachecon.com/us2007/program/talk/2017"> Advanced Indexing Techniques with Apache Lucene</a> by Michael Busch. Information on payloads and advanced indexing techniques.</li>
|
||||
|
||||
</ul>
|
||||
<a name="N10091"></a><a name="06+June+2007%3A+Release+1.2+available"></a>
|
||||
<h3 class="boxed">06 June 2007: Release 1.2 available</h3>
|
||||
<p>
|
||||
This is the first release since Solr graduated from the Incubator,
|
||||
bringing many new features, including CSV/delimited-text data
|
||||
loading, time based autocommit, faster faceting, negative filters,
|
||||
a spell-check handler, sounds-like word filters, regex text filters,
|
||||
and more flexible plugins.
|
||||
</p>
|
||||
<p>See the <a href="http://svn.apache.org/repos/asf/lucene/solr/tags/release-1.2.0/CHANGES.txt">release notes</a> for more details.</p>
|
||||
<a name="N100A2"></a><a name="17+January+2007%3A+Solr+graduates+from+Incubator"></a>
|
||||
<h3 class="boxed">17 January 2007: Solr graduates from Incubator</h3>
|
||||
<p>
|
||||
Solr has graduated from the Apache Incubator, and is now a sub-project of Lucene.
|
||||
</p>
|
||||
<a name="N100AC"></a><a name="22+December+2006%3A+Release+1.1.0+available"></a>
|
||||
<h3 class="boxed">22 December 2006: Release 1.1.0 available</h3>
|
||||
<p>
|
||||
This is the first release since Solr joined the Incubator, and brings
|
||||
many new features and performance optimizations including highlighting,
|
||||
faceted search, and JSON/Python/Ruby response formats.
|
||||
</p>
|
||||
<a name="N100B6"></a><a name="15+August+2006%3A+Solr+at+ApacheCon+US"></a>
|
||||
<h3 class="boxed">15 August 2006: Solr at ApacheCon US</h3>
|
||||
<p>Chris Hostetter will be presenting
|
||||
<strong><a href="http://www.apachecon.com/2006/US/html/sessions.html#FR26">"Faceted Searching With Apache Solr"</a></strong>
|
||||
at ApacheCon US 2006, on October 13th at 4:30pm.
|
||||
See the <a href="http://www.us.apachecon.com/">ApacheCon</a> website for more details.
|
||||
</p>
|
||||
<a name="N100C9"></a><a name="21+April+2006%3A+Solr+at+ApacheCon"></a>
|
||||
<h3 class="boxed">21 April 2006: Solr at ApacheCon</h3>
|
||||
<p>Yonik Seeley will be presenting
|
||||
<strong>"Apache Solr, a Full-Text Search Server based on Lucene"</strong>
|
||||
at ApacheCon Europe 2006, on June 29th at 5:30pm.
|
||||
See the <a href="http://www.eu.apachecon.com/">ApacheCon</a> website for more details.
|
||||
</p>
|
||||
<a name="N100DA"></a><a name="21+February+2006%3A+nightly+builds"></a>
|
||||
<h3 class="boxed">21 February 2006: nightly builds</h3>
|
||||
<p>Solr now has nightly builds. This automatically creates a
|
||||
<a href="http://people.apache.org/builds/lucene/solr/nightly/">downloadable version of Solr every
|
||||
night</a>. All unit tests must pass, or a message is sent to
|
||||
the developers mailing list and no new version is created. This
|
||||
also updates the <a href="api/index.html">javadoc</a>.</p>
|
||||
<a name="N100EC"></a><a name="17+January+2006%3A+Solr+Joins+Apache+Incubator"></a>
|
||||
<h3 class="boxed">17 January 2006: Solr Joins Apache Incubator</h3>
|
||||
<p>Solr, a search server based on Lucene, has been accepted into the Apache Incubator.
|
||||
Solr was originally developed by CNET Networks, and is widely used within CNET
|
||||
to provide high relevancy search and faceted browsing capabilities.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
|
||||
</div>
|
||||
<!--+
|
||||
|end content
|
||||
+-->
|
||||
<div class="clearboth"> </div>
|
||||
</div>
|
||||
<div id="footer">
|
||||
<!--+
|
||||
|start bottomstrip
|
||||
+-->
|
||||
<div class="lastmodified">
|
||||
<script type="text/javascript"><!--
|
||||
document.write("Last Published: " + document.lastModified);
|
||||
// --></script>
|
||||
</div>
|
||||
<div class="copyright">
|
||||
Copyright ©
|
||||
2007 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
|
||||
</div>
|
||||
<div id="logos"></div>
|
||||
<!--+
|
||||
|end bottomstrip
|
||||
+-->
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
|
@ -326,8 +326,8 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
|
|||
NamedList indexPart = textType.get("index");
|
||||
assertNotNull("expecting an index token analysis for field type 'charfilthtmlmap'", indexPart);
|
||||
|
||||
assertEquals(" whátëvêr ", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
|
||||
assertEquals(" whatever ", indexPart.get("org.apache.lucene.analysis.charfilter.MappingCharFilter"));
|
||||
assertEquals("\n\nwhátëvêr\n\n", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
|
||||
assertEquals("\n\nwhatever\n\n", indexPart.get("org.apache.lucene.analysis.charfilter.MappingCharFilter"));
|
||||
|
||||
List<NamedList> tokenList = (List<NamedList>)indexPart.get(MockTokenizer.class.getName());
|
||||
assertNotNull("Expecting MockTokenizer analysis breakdown", tokenList);
|
||||
|
|
Loading…
Reference in New Issue