mirror of https://github.com/apache/poi.git
More ExtractorFactory support and tests
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@645870 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
586fc030ce
commit
84a1727a6d
|
@ -32,7 +32,9 @@ import org.openxml4j.opc.PackageRelationshipCollection;
|
||||||
import org.apache.poi.POITextExtractor;
|
import org.apache.poi.POITextExtractor;
|
||||||
import org.apache.poi.POIXMLDocument;
|
import org.apache.poi.POIXMLDocument;
|
||||||
import org.apache.poi.POIXMLTextExtractor;
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
|
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
import org.apache.poi.poifs.filesystem.Entry;
|
import org.apache.poi.poifs.filesystem.Entry;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
import org.apache.poi.xslf.XSLFSlideShow;
|
import org.apache.poi.xslf.XSLFSlideShow;
|
||||||
|
@ -51,20 +53,21 @@ public class ExtractorFactory {
|
||||||
public static final String CORE_DOCUMENT_REL =
|
public static final String CORE_DOCUMENT_REL =
|
||||||
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
|
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
|
||||||
|
|
||||||
public POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||||
FileInputStream finp = new FileInputStream(f);
|
InputStream inp = new PushbackInputStream(
|
||||||
|
new FileInputStream(f), 8);
|
||||||
|
|
||||||
if(POIFSFileSystem.hasPOIFSHeader(finp)) {
|
if(POIFSFileSystem.hasPOIFSHeader(inp)) {
|
||||||
return createExtractor(new POIFSFileSystem(finp));
|
return createExtractor(new POIFSFileSystem(inp));
|
||||||
}
|
}
|
||||||
if(POIXMLDocument.hasOOXMLHeader(finp)) {
|
if(POIXMLDocument.hasOOXMLHeader(inp)) {
|
||||||
finp.close();
|
inp.close();
|
||||||
return createExtractor(Package.open(f.toString()));
|
return createExtractor(Package.open(f.toString()));
|
||||||
}
|
}
|
||||||
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
|
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file");
|
||||||
}
|
}
|
||||||
|
|
||||||
public POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||||
// Figure out the kind of stream
|
// Figure out the kind of stream
|
||||||
// If clearly doesn't do mark/reset, wrap up
|
// If clearly doesn't do mark/reset, wrap up
|
||||||
if(! inp.markSupported()) {
|
if(! inp.markSupported()) {
|
||||||
|
@ -80,7 +83,7 @@ public class ExtractorFactory {
|
||||||
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
|
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
|
||||||
}
|
}
|
||||||
|
|
||||||
public POIXMLTextExtractor createExtractor(Package pkg) throws IOException, OpenXML4JException, XmlException {
|
public static POIXMLTextExtractor createExtractor(Package pkg) throws IOException, OpenXML4JException, XmlException {
|
||||||
PackageRelationshipCollection core =
|
PackageRelationshipCollection core =
|
||||||
pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
|
pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
|
||||||
if(core.size() != 1) {
|
if(core.size() != 1) {
|
||||||
|
@ -100,14 +103,23 @@ public class ExtractorFactory {
|
||||||
throw new IllegalArgumentException("No supported documents found in the OOXML package");
|
throw new IllegalArgumentException("No supported documents found in the OOXML package");
|
||||||
}
|
}
|
||||||
|
|
||||||
public POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
|
public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
|
||||||
// Look for certain entries in the stream, to figure it
|
// Look for certain entries in the stream, to figure it
|
||||||
// out from
|
// out from
|
||||||
for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
|
for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
|
||||||
Entry entry = (Entry)entries.next();
|
Entry entry = (Entry)entries.next();
|
||||||
|
|
||||||
|
System.err.println(entry.getName());
|
||||||
if(entry.getName().equals("Workbook")) {
|
if(entry.getName().equals("Workbook")) {
|
||||||
return new ExcelExtractor(fs);
|
return new ExcelExtractor(fs);
|
||||||
}
|
}
|
||||||
|
if(entry.getName().equals("WordDocument")) {
|
||||||
|
return new WordExtractor(fs);
|
||||||
|
}
|
||||||
|
if(entry.getName().equals("PowerPoint Document")) {
|
||||||
|
return new PowerPointExtractor(fs);
|
||||||
|
}
|
||||||
|
// TODO - visio
|
||||||
}
|
}
|
||||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,140 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.extractor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||||
|
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||||
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||||
|
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||||
|
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
||||||
|
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test that the extractor factory plays nicely
|
||||||
|
*/
|
||||||
|
public class TestExtractorFactory extends TestCase {
|
||||||
|
private String excel_dir;
|
||||||
|
private String word_dir;
|
||||||
|
private String powerpoint_dir;
|
||||||
|
|
||||||
|
private File txt;
|
||||||
|
|
||||||
|
private File xls;
|
||||||
|
private File xlsx;
|
||||||
|
|
||||||
|
private File doc;
|
||||||
|
private File docx;
|
||||||
|
|
||||||
|
private File ppt;
|
||||||
|
private File pptx;
|
||||||
|
|
||||||
|
protected void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
|
||||||
|
excel_dir = System.getProperty("HSSF.testdata.path");
|
||||||
|
word_dir = System.getProperty("HWPF.testdata.path");
|
||||||
|
powerpoint_dir = System.getProperty("HSLF.testdata.path");
|
||||||
|
|
||||||
|
txt = new File(excel_dir, "SampleSS.txt");
|
||||||
|
|
||||||
|
xls = new File(excel_dir, "SampleSS.xls");
|
||||||
|
xlsx = new File(excel_dir, "SampleSS.xlsx");
|
||||||
|
|
||||||
|
doc = new File(word_dir, "SampleDoc.doc");
|
||||||
|
docx = new File(word_dir, "SampleDoc.docx");
|
||||||
|
|
||||||
|
ppt = new File(powerpoint_dir, "SampleShow.ppt");
|
||||||
|
pptx = new File(powerpoint_dir, "SampleShow.pptx");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFile() throws Exception {
|
||||||
|
// Excel
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(xls)
|
||||||
|
instanceof ExcelExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(xls).getText().length() > 200
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(xlsx)
|
||||||
|
instanceof XSSFExcelExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(xlsx).getText().length() > 200
|
||||||
|
);
|
||||||
|
|
||||||
|
// Word
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(doc)
|
||||||
|
instanceof WordExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(doc).getText().length() > 120
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(docx)
|
||||||
|
instanceof XWPFWordExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(docx).getText().length() > 120
|
||||||
|
);
|
||||||
|
|
||||||
|
// PowerPoint
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(ppt)
|
||||||
|
instanceof PowerPointExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(ppt).getText().length() > 120
|
||||||
|
);
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(pptx)
|
||||||
|
instanceof XSLFPowerPointExtractor
|
||||||
|
);
|
||||||
|
assertTrue(
|
||||||
|
ExtractorFactory.createExtractor(pptx).getText().length() > 120
|
||||||
|
);
|
||||||
|
|
||||||
|
// Visio
|
||||||
|
// TODO
|
||||||
|
|
||||||
|
// Text
|
||||||
|
try {
|
||||||
|
ExtractorFactory.createExtractor(txt);
|
||||||
|
fail();
|
||||||
|
} catch(IllegalArgumentException e) {
|
||||||
|
// Good
|
||||||
|
}
|
||||||
|
}
|
||||||
|
public void testInputStream() throws Exception {
|
||||||
|
|
||||||
|
}
|
||||||
|
public void testPOIFS() throws Exception {
|
||||||
|
|
||||||
|
}
|
||||||
|
public void testPackage() throws Exception {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue