#65061 - Handle VmlDrawings containing spreadsheet-ml default namespace

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1885197 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2021-01-06 12:39:02 +00:00
parent eba33ddb6f
commit 4c0d0b1381
2 changed files with 47 additions and 17 deletions

View File

@ -18,6 +18,7 @@
package org.apache.poi.xssf.usermodel;
import static org.apache.poi.ooxml.POIXMLTypeLoader.DEFAULT_XML_OPTIONS;
import static org.apache.poi.xssf.usermodel.XSSFRelation.NS_SPREADSHEETML;
import java.io.IOException;
import java.io.InputStream;
@ -46,7 +47,6 @@ import com.microsoft.schemas.vml.CTShapetype;
import com.microsoft.schemas.vml.STExt;
import com.microsoft.schemas.vml.STStrokeJoinStyle;
import org.apache.poi.ooxml.POIXMLDocumentPart;
import org.apache.poi.ooxml.util.DocumentHelper;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.schemas.vmldrawing.XmlDocument;
import org.apache.poi.util.ReplacingInputStream;
@ -55,8 +55,6 @@ import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
import org.apache.xmlbeans.XmlOptions;
import org.openxmlformats.schemas.officeDocument.x2006.sharedTypes.STTrueFalse;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
/**
* Represents a SpreadsheetML VML drawing.
@ -129,23 +127,26 @@ public final class XSSFVMLDrawing extends POIXMLDocumentPart {
protected void read(InputStream is) throws IOException, XmlException {
Document doc;
try {
/*
* This is a seriously sick fix for the fact that some .xlsx files contain raw bits
* of HTML, without being escaped or properly turned into XML.
* The result is that they contain things like <br>, which breaks the XML parsing.
* This very sick InputStream wrapper attempts to spot these go past, and fix them.
*/
doc = DocumentHelper.readDocument(new ReplacingInputStream(is, "<br>", "<br/>"));
} catch (SAXException e) {
throw new XmlException(e.getMessage(), e);
}
XmlOptions xopt = new XmlOptions(DEFAULT_XML_OPTIONS);
xopt.setLoadSubstituteNamespaces(Collections.singletonMap("", QNAME_VMLDRAWING.getNamespaceURI()));
xopt.setDocumentType(XmlDocument.type);
/*
* This is a seriously sick fix for the fact that some .xlsx files contain raw bits
* of HTML, without being escaped or properly turned into XML.
* The result is that they contain things like <br>, which breaks the XML parsing.
* This very sick InputStream wrapper attempts to spot these go past, and fix them.
*
* Furthermore some documents contain a default namespace of
* http://schemas.openxmlformats.org/spreadsheetml/2006/main for the namespace-less "xml" document type.
* This definition is wrong, so it is stripped from the stream before parsing.
*/
root = XmlDocument.Factory.parse(
new ReplacingInputStream(
new ReplacingInputStream(is, "<br>", "<br/>"),
" xmlns=\""+NS_SPREADSHEETML+"\"", "")
, xopt);
root = XmlDocument.Factory.parse(doc, xopt);
XmlCursor cur = root.getXml().newCursor();
try {

View File

@ -29,6 +29,7 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
@ -42,6 +43,8 @@ import com.microsoft.schemas.vml.CTShadow;
import com.microsoft.schemas.vml.CTShape;
import com.microsoft.schemas.vml.CTShapetype;
import com.microsoft.schemas.vml.STExt;
import com.microsoft.schemas.vml.STStrokeJoinStyle;
import com.microsoft.schemas.vml.impl.CTShapetypeImpl;
import org.apache.poi.POIDataSamples;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
@ -181,4 +184,30 @@ public class TestXSSFVMLDrawing {
}
assertEquals(16, count);
}
@Test
public void bug65061_InvalidXmlns() throws IOException, XmlException {
    // The markup deliberately carries no <?xml ...?> declaration - as in the sample file -
    // and its root element declares the spreadsheet-ml namespace as the default namespace.
    final String vmlMarkup = String.join("\n",
        "<xml xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\" xmlns:o=\"urn:schemas-microsoft-com:office:office\" xmlns:v=\"urn:schemas-microsoft-com:vml\" xmlns:x=\"urn:schemas-microsoft-com:office:excel\">",
        "<v:shapetype id=\"_x0000_t202\" coordsize=\"21600,21600\" path=\"m,l,21600r21600,l21600,xe\" o:spt=\"202\">",
        "<v:stroke joinstyle=\"miter\"/>",
        "<v:path o:connecttype=\"rect\" gradientshapeok=\"t\"/>",
        "</v:shapetype>",
        "</xml>");
    final byte[] vmlBytes = vmlMarkup.getBytes(StandardCharsets.UTF_8);

    final XSSFVMLDrawing drawing = new XSSFVMLDrawing();
    drawing.read(new ByteArrayInputStream(vmlBytes));

    // the XmlBeans document must be available after reading
    assertNotNull(drawing.getDocument().getXml());

    // exactly one child item is expected: the shapetype element
    final List<XmlObject> items = drawing.getItems();
    assertEquals(1, items.size());

    final XmlObject firstItem = items.get(0);
    assertTrue(firstItem instanceof CTShapetypeImpl);

    // the nested stroke element must expose the "miter" join style
    final CTShapetype shapeType = (CTShapetype) firstItem;
    assertEquals(STStrokeJoinStyle.MITER, shapeType.getStrokeArray(0).getJoinstyle());
}
}