mirror of https://github.com/apache/lucene.git
LUCENE-589, LUCENE-2246: fix intl bugs in contrib/demo
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1031460 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
863dfccc76
commit
048cdb57f4
|
@ -134,6 +134,12 @@ Bug fixes
|
|||
|
||||
* LUCENE-2616: FastVectorHighlighter: out of alignment when the first value is
|
||||
empty in multiValued field (Koji Sekiguchi)
|
||||
|
||||
* LUCENE-589: Fix contrib/demo for international documents.
|
||||
(Curtis d'Entremont via Robert Muir)
|
||||
|
||||
* LUCENE-2246: Fix contrib/demo for Turkish html documents.
|
||||
(Selim Nadi via Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
|
|
|
@ -18,7 +18,9 @@ package org.apache.lucene.demo;
|
|||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileReader;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
import org.apache.lucene.document.DateTools;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -40,7 +42,7 @@ public class FileDocument {
|
|||
Reader field;
|
||||
*/
|
||||
public static Document Document(File f)
|
||||
throws java.io.FileNotFoundException {
|
||||
throws java.io.IOException {
|
||||
|
||||
// make a new, empty document
|
||||
Document doc = new Document();
|
||||
|
@ -58,9 +60,9 @@ public class FileDocument {
|
|||
|
||||
// Add the contents of the file to a field named "contents". Specify a Reader,
|
||||
// so that the text of the file is tokenized and indexed, but not stored.
|
||||
// Note that FileReader expects the file to be in the system's default encoding.
|
||||
// Note that the file is expected to be in UTF-8 encoding.
|
||||
// If that's not the case searching for special characters will fail.
|
||||
doc.add(new Field("contents", new FileReader(f)));
|
||||
doc.add(new Field("contents", new InputStreamReader(new FileInputStream(f), "UTF-8")));
|
||||
|
||||
// return the document
|
||||
return doc;
|
||||
|
|
|
@ -64,7 +64,8 @@ public class HTMLDocument {
|
|||
doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));
|
||||
|
||||
FileInputStream fis = new FileInputStream(f);
|
||||
HTMLParser parser = new HTMLParser(fis);
|
||||
InputStreamReader reader = new InputStreamReader(fis, "UTF-8");
|
||||
HTMLParser parser = new HTMLParser(reader);
|
||||
|
||||
// Add the tag-stripped contents as a Reader-valued Text field so it will
|
||||
// get tokenized and indexed.
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.demo;
|
|||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
@ -124,7 +125,7 @@ public class SearchFiles {
|
|||
|
||||
BufferedReader in = null;
|
||||
if (queries != null) {
|
||||
in = new BufferedReader(new FileReader(queries));
|
||||
in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
|
||||
} else {
|
||||
in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
package org.apache.lucene.demo.html;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Locale;
|
||||
import java.util.Properties;
|
||||
|
||||
public class HTMLParser implements HTMLParserConstants {
|
||||
|
@ -40,14 +41,6 @@ public class HTMLParser implements HTMLParserConstants {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use HTMLParser(FileInputStream) instead
|
||||
*/
|
||||
@Deprecated
|
||||
public HTMLParser(File file) throws FileNotFoundException {
|
||||
this(new FileInputStream(file));
|
||||
}
|
||||
|
||||
public String getTitle() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
|
@ -231,7 +224,7 @@ InterruptedException {
|
|||
Token t1, t2;
|
||||
boolean inImg = false;
|
||||
t1 = jj_consume_token(TagName);
|
||||
String tagName = t1.image.toLowerCase();
|
||||
String tagName = t1.image.toLowerCase(Locale.ENGLISH);
|
||||
if(Tags.WS_ELEMS.contains(tagName) ) {
|
||||
addSpace();
|
||||
}
|
||||
|
@ -268,7 +261,7 @@ InterruptedException {
|
|||
)
|
||||
&& t2 != null)
|
||||
{
|
||||
currentMetaTag=t2.image.toLowerCase();
|
||||
currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
|
||||
if(currentMetaTag != null && currentMetaContent != null) {
|
||||
addMetaTag();
|
||||
}
|
||||
|
@ -276,7 +269,7 @@ InterruptedException {
|
|||
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
|
||||
null)
|
||||
{
|
||||
currentMetaContent=t2.image.toLowerCase();
|
||||
currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
|
||||
if(currentMetaTag != null && currentMetaContent != null) {
|
||||
addMetaTag();
|
||||
}
|
||||
|
@ -454,18 +447,18 @@ null)
|
|||
finally { jj_save(1, xla); }
|
||||
}
|
||||
|
||||
private boolean jj_3_1() {
|
||||
if (jj_scan_token(ArgQuote1)) return true;
|
||||
if (jj_scan_token(CloseQuote1)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean jj_3_2() {
|
||||
if (jj_scan_token(ArgQuote2)) return true;
|
||||
if (jj_scan_token(CloseQuote2)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean jj_3_1() {
|
||||
if (jj_scan_token(ArgQuote1)) return true;
|
||||
if (jj_scan_token(CloseQuote1)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Generated Token Manager. */
|
||||
public HTMLParserTokenManager token_source;
|
||||
SimpleCharStream jj_input_stream;
|
||||
|
|
|
@ -19,9 +19,9 @@
|
|||
|
||||
options {
|
||||
STATIC = false;
|
||||
OPTIMIZE_TOKEN_MANAGER = true;
|
||||
//DEBUG_LOOKAHEAD = true;
|
||||
//DEBUG_TOKEN_MANAGER = true;
|
||||
UNICODE_INPUT = true;
|
||||
}
|
||||
|
||||
PARSER_BEGIN(HTMLParser)
|
||||
|
@ -29,6 +29,7 @@ PARSER_BEGIN(HTMLParser)
|
|||
package org.apache.lucene.demo.html;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Locale;
|
||||
import java.util.Properties;
|
||||
|
||||
public class HTMLParser {
|
||||
|
@ -67,14 +68,6 @@ public class HTMLParser {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use HTMLParser(FileInputStream) instead
|
||||
*/
|
||||
@Deprecated
|
||||
public HTMLParser(File file) throws FileNotFoundException {
|
||||
this(new FileInputStream(file));
|
||||
}
|
||||
|
||||
public String getTitle() throws IOException, InterruptedException {
|
||||
if (pipeIn == null)
|
||||
getReader(); // spawn parsing thread
|
||||
|
@ -231,7 +224,7 @@ void Tag() throws IOException :
|
|||
}
|
||||
{
|
||||
t1=<TagName> {
|
||||
String tagName = t1.image.toLowerCase();
|
||||
String tagName = t1.image.toLowerCase(Locale.ENGLISH);
|
||||
if(Tags.WS_ELEMS.contains(tagName) ) {
|
||||
addSpace();
|
||||
}
|
||||
|
@ -253,7 +246,7 @@ void Tag() throws IOException :
|
|||
)
|
||||
&& t2 != null)
|
||||
{
|
||||
currentMetaTag=t2.image.toLowerCase();
|
||||
currentMetaTag=t2.image.toLowerCase(Locale.ENGLISH);
|
||||
if(currentMetaTag != null && currentMetaContent != null) {
|
||||
addMetaTag();
|
||||
}
|
||||
|
@ -261,7 +254,7 @@ void Tag() throws IOException :
|
|||
if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
|
||||
null)
|
||||
{
|
||||
currentMetaContent=t2.image.toLowerCase();
|
||||
currentMetaContent=t2.image.toLowerCase(Locale.ENGLISH);
|
||||
if(currentMetaTag != null && currentMetaContent != null) {
|
||||
addMetaTag();
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
/* Generated By:JavaCC: Do not edit this line. HTMLParserTokenManager.java */
|
||||
package org.apache.lucene.demo.html;
|
||||
import java.io.*;
|
||||
import java.util.Locale;
|
||||
import java.util.Properties;
|
||||
|
||||
/** Token Manager. */
|
||||
|
@ -218,6 +219,9 @@ private int jjStartNfaWithStates_0(int pos, int kind, int state)
|
|||
return jjMoveNfa_0(state, pos + 1);
|
||||
}
|
||||
static final long[] jjbitVec0 = {
|
||||
0xfffffffffffffffeL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
|
||||
};
|
||||
static final long[] jjbitVec2 = {
|
||||
0x0L, 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL
|
||||
};
|
||||
private int jjMoveNfa_0(int startState, int curPos)
|
||||
|
@ -460,6 +464,9 @@ private int jjMoveNfa_0(int startState, int curPos)
|
|||
}
|
||||
else
|
||||
{
|
||||
int hiByte = (int)(curChar >> 8);
|
||||
int i1 = hiByte >> 6;
|
||||
long l1 = 1L << (hiByte & 077);
|
||||
int i2 = (curChar & 0xff) >> 6;
|
||||
long l2 = 1L << (curChar & 077);
|
||||
do
|
||||
|
@ -468,7 +475,7 @@ private int jjMoveNfa_0(int startState, int curPos)
|
|||
{
|
||||
case 22:
|
||||
case 23:
|
||||
if ((jjbitVec0[i2] & l2) == 0L)
|
||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
break;
|
||||
if (kind > 2)
|
||||
kind = 2;
|
||||
|
@ -476,7 +483,7 @@ private int jjMoveNfa_0(int startState, int curPos)
|
|||
break;
|
||||
case 26:
|
||||
case 27:
|
||||
if ((jjbitVec0[i2] & l2) == 0L)
|
||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
break;
|
||||
if (kind > 3)
|
||||
kind = 3;
|
||||
|
@ -562,6 +569,9 @@ private int jjMoveNfa_5(int startState, int curPos)
|
|||
}
|
||||
else
|
||||
{
|
||||
int hiByte = (int)(curChar >> 8);
|
||||
int i1 = hiByte >> 6;
|
||||
long l1 = 1L << (hiByte & 077);
|
||||
int i2 = (curChar & 0xff) >> 6;
|
||||
long l2 = 1L << (curChar & 077);
|
||||
do
|
||||
|
@ -570,7 +580,7 @@ private int jjMoveNfa_5(int startState, int curPos)
|
|||
{
|
||||
case 1:
|
||||
case 0:
|
||||
if ((jjbitVec0[i2] & l2) == 0L)
|
||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
break;
|
||||
if (kind > 25)
|
||||
kind = 25;
|
||||
|
@ -660,6 +670,9 @@ private int jjMoveNfa_7(int startState, int curPos)
|
|||
}
|
||||
else
|
||||
{
|
||||
int hiByte = (int)(curChar >> 8);
|
||||
int i1 = hiByte >> 6;
|
||||
long l1 = 1L << (hiByte & 077);
|
||||
int i2 = (curChar & 0xff) >> 6;
|
||||
long l2 = 1L << (curChar & 077);
|
||||
do
|
||||
|
@ -667,7 +680,7 @@ private int jjMoveNfa_7(int startState, int curPos)
|
|||
switch(jjstateSet[--i])
|
||||
{
|
||||
case 0:
|
||||
if ((jjbitVec0[i2] & l2) == 0L)
|
||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
break;
|
||||
if (kind > 29)
|
||||
kind = 29;
|
||||
|
@ -753,6 +766,9 @@ private int jjMoveNfa_4(int startState, int curPos)
|
|||
}
|
||||
else
|
||||
{
|
||||
int hiByte = (int)(curChar >> 8);
|
||||
int i1 = hiByte >> 6;
|
||||
long l1 = 1L << (hiByte & 077);
|
||||
int i2 = (curChar & 0xff) >> 6;
|
||||
long l2 = 1L << (curChar & 077);
|
||||
do
|
||||
|
@ -761,7 +777,7 @@ private int jjMoveNfa_4(int startState, int curPos)
|
|||
{
|
||||
case 1:
|
||||
case 0:
|
||||
if ((jjbitVec0[i2] & l2) == 0L)
|
||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
break;
|
||||
if (kind > 23)
|
||||
kind = 23;
|
||||
|
@ -876,6 +892,9 @@ private int jjMoveNfa_3(int startState, int curPos)
|
|||
}
|
||||
else
|
||||
{
|
||||
int hiByte = (int)(curChar >> 8);
|
||||
int i1 = hiByte >> 6;
|
||||
long l1 = 1L << (hiByte & 077);
|
||||
int i2 = (curChar & 0xff) >> 6;
|
||||
long l2 = 1L << (curChar & 077);
|
||||
do
|
||||
|
@ -884,7 +903,7 @@ private int jjMoveNfa_3(int startState, int curPos)
|
|||
{
|
||||
case 0:
|
||||
case 1:
|
||||
if ((jjbitVec0[i2] & l2) == 0L)
|
||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
break;
|
||||
if (kind > 19)
|
||||
kind = 19;
|
||||
|
@ -1042,6 +1061,9 @@ private int jjMoveNfa_6(int startState, int curPos)
|
|||
}
|
||||
else
|
||||
{
|
||||
int hiByte = (int)(curChar >> 8);
|
||||
int i1 = hiByte >> 6;
|
||||
long l1 = 1L << (hiByte & 077);
|
||||
int i2 = (curChar & 0xff) >> 6;
|
||||
long l2 = 1L << (curChar & 077);
|
||||
do
|
||||
|
@ -1050,7 +1072,7 @@ private int jjMoveNfa_6(int startState, int curPos)
|
|||
{
|
||||
case 1:
|
||||
case 0:
|
||||
if ((jjbitVec0[i2] & l2) == 0L)
|
||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
break;
|
||||
if (kind > 27)
|
||||
kind = 27;
|
||||
|
@ -1183,6 +1205,9 @@ private int jjMoveNfa_1(int startState, int curPos)
|
|||
}
|
||||
else
|
||||
{
|
||||
int hiByte = (int)(curChar >> 8);
|
||||
int i1 = hiByte >> 6;
|
||||
long l1 = 1L << (hiByte & 077);
|
||||
int i2 = (curChar & 0xff) >> 6;
|
||||
long l2 = 1L << (curChar & 077);
|
||||
do
|
||||
|
@ -1191,14 +1216,14 @@ private int jjMoveNfa_1(int startState, int curPos)
|
|||
{
|
||||
case 1:
|
||||
case 0:
|
||||
if ((jjbitVec0[i2] & l2) == 0L)
|
||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
break;
|
||||
if (kind > 14)
|
||||
kind = 14;
|
||||
jjCheckNAdd(0);
|
||||
break;
|
||||
case 3:
|
||||
if ((jjbitVec0[i2] & l2) != 0L)
|
||||
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
jjAddStates(18, 19);
|
||||
break;
|
||||
default : break;
|
||||
|
@ -1336,6 +1361,9 @@ private int jjMoveNfa_2(int startState, int curPos)
|
|||
}
|
||||
else
|
||||
{
|
||||
int hiByte = (int)(curChar >> 8);
|
||||
int i1 = hiByte >> 6;
|
||||
long l1 = 1L << (hiByte & 077);
|
||||
int i2 = (curChar & 0xff) >> 6;
|
||||
long l2 = 1L << (curChar & 077);
|
||||
do
|
||||
|
@ -1344,7 +1372,7 @@ private int jjMoveNfa_2(int startState, int curPos)
|
|||
{
|
||||
case 0:
|
||||
case 1:
|
||||
if ((jjbitVec0[i2] & l2) == 0L)
|
||||
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
|
||||
break;
|
||||
if (kind > 16)
|
||||
kind = 16;
|
||||
|
@ -1371,6 +1399,18 @@ static final int[] jjnextStates = {
|
|||
20, 21, 24, 12, 14, 16, 5, 8, 0, 4, 6, 0, 4, 6, 5, 0,
|
||||
4, 6, 3, 4,
|
||||
};
|
||||
private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2)
|
||||
{
|
||||
switch(hiByte)
|
||||
{
|
||||
case 0:
|
||||
return ((jjbitVec2[i2] & l2) != 0L);
|
||||
default :
|
||||
if ((jjbitVec0[i1] & l1) != 0L)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/** Token literal values. */
|
||||
public static final String[] jjstrLiteralImages = {
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
package org.apache.lucene.demo;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestDemo extends LuceneTestCase {
|
||||
// LUCENE-589
|
||||
public void testUnicodeHtml() throws Exception {
|
||||
File dir = getDataFile("test-files/html");
|
||||
File indexDir = new File(TEMP_DIR, "demoIndex");
|
||||
IndexHTML.main(new String[] { "-create", "-index", indexDir.getPath(), dir.getPath() });
|
||||
File queries = getDataFile("test-files/queries.txt");
|
||||
PrintStream outSave = System.out;
|
||||
try {
|
||||
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
|
||||
PrintStream fakeSystemOut = new PrintStream(bytes);
|
||||
System.setOut(fakeSystemOut);
|
||||
SearchFiles.main(new String[] { "-index", indexDir.getPath(), "-queries", queries.getPath()});
|
||||
fakeSystemOut.flush();
|
||||
String output = bytes.toString(); // intentionally use default encoding
|
||||
assertTrue(output.contains("1 total matching documents"));
|
||||
} finally {
|
||||
System.setOut(outSave);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,126 @@
|
|||
package org.apache.lucene.demo.html;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.lucene.demo.html.HTMLParser;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestHtmlParser extends LuceneTestCase {
|
||||
|
||||
public void testUnicode() throws Exception {
|
||||
String text = "<html><body>汉语</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("汉语", parser);
|
||||
}
|
||||
|
||||
public void testEntities() throws Exception {
|
||||
String text = "<html><body>汉语¥</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("汉语¥", parser);
|
||||
}
|
||||
|
||||
public void testComments() throws Exception {
|
||||
String text = "<html><body>foo<!-- bar --><! baz --></body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("foo", parser);
|
||||
}
|
||||
|
||||
public void testScript() throws Exception {
|
||||
String text = "<html><body><script type=\"text/javascript\">" +
|
||||
"document.write(\"test\")</script>foo</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("foo", parser);
|
||||
}
|
||||
|
||||
public void testStyle() throws Exception {
|
||||
String text = "<html><head><style type=\"text/css\">" +
|
||||
"body{background-color:blue;}</style>" +
|
||||
"</head><body>foo</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("foo", parser);
|
||||
}
|
||||
|
||||
public void testDoctype() throws Exception {
|
||||
String text = "<!DOCTYPE HTML PUBLIC " +
|
||||
"\"-//W3C//DTD HTML 4.01 Transitional//EN\"" +
|
||||
"\"http://www.w3.org/TR/html4/loose.dtd\">" +
|
||||
"<html><body>foo</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("foo", parser);
|
||||
}
|
||||
|
||||
public void testMeta() throws Exception {
|
||||
String text = "<html><head>" +
|
||||
"<meta name=\"a\" content=\"1\" />" +
|
||||
"<meta name=\"b\" content=\"2\" />" +
|
||||
"<meta name=\"keywords\" content=\"this is a test\" />" +
|
||||
"<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\" />" +
|
||||
"</head><body>foobar</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
Properties tags = parser.getMetaTags();
|
||||
assertEquals(4, tags.size());
|
||||
assertEquals("1", tags.get("a"));
|
||||
assertEquals("2", tags.get("b"));
|
||||
assertEquals("this is a test", tags.get("keywords"));
|
||||
assertEquals("text/html;charset=utf-8", tags.get("content-type"));
|
||||
}
|
||||
|
||||
public void testTitle() throws Exception {
|
||||
String text = "<html><head><TITLE>foo</TITLE><head><body>bar</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertEquals("foo", parser.getTitle());
|
||||
}
|
||||
|
||||
public void testSummary() throws Exception {
|
||||
String text = "<html><head><TITLE>foo</TITLE><head><body>" +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"Summarize me. Summarize me. Summarize me. Summarize me. " +
|
||||
"</body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertEquals(200, parser.getSummary().length());
|
||||
}
|
||||
|
||||
// LUCENE-2246
|
||||
public void testTurkish() throws Exception {
|
||||
String text = "<html><body>" +
|
||||
"<IMG SRC=\"../images/head.jpg\" WIDTH=570 HEIGHT=47 BORDER=0 ALT=\"ş\">" +
|
||||
"<a title=\"(ııı)\"></body></html>";
|
||||
HTMLParser parser = new HTMLParser(new StringReader(text));
|
||||
assertReadsTo("[ş]", parser);
|
||||
}
|
||||
|
||||
private void assertReadsTo(String expected, HTMLParser parser) throws IOException {
|
||||
Reader reader = parser.getReader();
|
||||
StringBuilder builder = new StringBuilder();
|
||||
int ch = 0;
|
||||
while ((ch = reader.read()) != -1) {
|
||||
builder.append((char)ch);
|
||||
}
|
||||
assertEquals(expected, builder.toString());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
||||
</head>
|
||||
<body>
|
||||
汉语
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1 @@
|
|||
contents:汉语
|
Loading…
Reference in New Issue