mirror of https://github.com/apache/lucene.git
LUCENE-1591: workaround bug in xerces so we can process Wikipedia's XML
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@763416 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cb95e45419
commit
6b4831fd1d
|
@ -20,3 +20,50 @@ After that, ant enwiki should process the data set and run a load
|
|||
test. Ant targets get-enwiki, expand-enwiki, and extract-enwiki can
|
||||
also be used to download, decompress, and extract (to individual files
|
||||
in work/enwiki) the dataset, respectively.
|
||||
|
||||
NOTE: This bug in Xerces:
|
||||
|
||||
https://issues.apache.org/jira/browse/XERCESJ-1257
|
||||
|
||||
which is still present as of 2.9.1, causes an exception like this when
|
||||
processing Wikipedia's XML:
|
||||
|
||||
Caused by: org.apache.xerces.impl.io.MalformedByteSequenceException: Invalid byte 2 of 4-byte UTF-8 sequence.
|
||||
at org.apache.xerces.util.ErrorHandlerWrapper.createSAXParseException(Unknown Source)
|
||||
at org.apache.xerces.util.ErrorHandlerWrapper.fatalError(Unknown Source)
|
||||
at org.apache.xerces.impl.XMLErrorReporter.reportError(Unknown Source)
|
||||
at org.apache.xerces.impl.XMLErrorReporter.reportError(Unknown Source)
|
||||
at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContentDispatcher.dispatch(Unknown Source)
|
||||
at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source)
|
||||
at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
|
||||
at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
|
||||
at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
|
||||
at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source)
|
||||
at org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker$Parser.run(EnwikiDocMaker.java:77)
|
||||
... 1 more
|
||||
|
||||
The original poster in the Xerces bug provided this patch:
|
||||
|
||||
--- UTF8Reader.java 2006-11-23 00:36:53.000000000 +0100
|
||||
+++ /home/rainman/lucene/xerces-2_9_0/src/org/apache/xerces/impl/io/UTF8Reader.java 2008-04-04 00:40:58.000000000 +0200
|
||||
@@ -534,6 +534,16 @@
|
||||
invalidByte(4, 4, b2);
|
||||
}
|
||||
|
||||
+ // check if output buffer is large enough to hold 2 surrogate chars
|
||||
+ if( out + 1 >= offset + length ){
|
||||
+ fBuffer[0] = (byte)b0;
|
||||
+ fBuffer[1] = (byte)b1;
|
||||
+ fBuffer[2] = (byte)b2;
|
||||
+ fBuffer[3] = (byte)b3;
|
||||
+ fOffset = 4;
|
||||
+ return out - offset;
|
||||
+ }
|
||||
+
|
||||
// decode bytes into surrogate characters
|
||||
int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
|
||||
if (uuuuu > 0x10) {
|
||||
|
||||
which I've applied to Xerces 2.9.1 sources, and committed under
|
||||
lib/xerces-2.9.1-patched-XERCESJ-1257.jar. Once XERCESJ-1257 is fixed
|
||||
we can upgrade to a standard Xerces release.
|
||||
|
|
|
@ -104,7 +104,7 @@
|
|||
<property name="collections.jar" value="commons-collections-3.1.jar"/>
|
||||
<property name="logging.jar" value="commons-logging-1.0.4.jar"/>
|
||||
<property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
|
||||
<property name="xercesImpl.jar" value="xerces-2.9.0.jar"/>
|
||||
<property name="xercesImpl.jar" value="xerces-2.9.1-patched-XERCESJ-1257.jar"/>
|
||||
<property name="xml-apis.jar" value="xml-apis-2.9.0.jar"/>
|
||||
|
||||
<path id="classpath">
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[50df93072e50efffced25bc814f0f20426a6b69c] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[ea000b3b79ec201e637841bebce16cf004231096] was removed in git history.
|
||||
Apache SVN contains full history.
|
Loading…
Reference in New Issue