mirror of https://github.com/apache/poi.git
Fix a typo in the file name, and add a generic method to POITextExtractor to get the appropriate metadata text extractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@685267 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
80ea376dc0
commit
8ad0a4f34c
|
@ -37,7 +37,7 @@
|
||||||
|
|
||||||
<!-- Don't forget to update status.xml too! -->
|
<!-- Don't forget to update status.xml too! -->
|
||||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||||
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</action>
|
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
|
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
|
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
|
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
|
||||||
|
|
|
@ -95,7 +95,7 @@
|
||||||
<p>If all you are interested in is getting the textual content of
|
<p>If all you are interested in is getting the textual content of
|
||||||
all the document properties, such as for full text indexing, then
|
all the document properties, such as for full text indexing, then
|
||||||
take a look at
|
take a look at
|
||||||
<code>org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</code>. However,
|
<code>org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</code>. However,
|
||||||
if you want full access to the properties, please read on!</p>
|
if you want full access to the properties, please read on!</p>
|
||||||
|
|
||||||
<p>The first thing you should understand is that a Microsoft Office file is
|
<p>The first thing you should understand is that a Microsoft Office file is
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
<!-- Don't forget to update changes.xml too! -->
|
<!-- Don't forget to update changes.xml too! -->
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||||
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</action>
|
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
|
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
|
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
|
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.poi;
|
||||||
|
|
||||||
import org.apache.poi.hpsf.DocumentSummaryInformation;
|
import org.apache.poi.hpsf.DocumentSummaryInformation;
|
||||||
import org.apache.poi.hpsf.SummaryInformation;
|
import org.apache.poi.hpsf.SummaryInformation;
|
||||||
|
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Common Parent for OLE2 based Text Extractors
|
* Common Parent for OLE2 based Text Extractors
|
||||||
|
@ -50,4 +51,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
||||||
public SummaryInformation getSummaryInformation() {
|
public SummaryInformation getSummaryInformation() {
|
||||||
return document.getSummaryInformation();
|
return document.getSummaryInformation();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an HPSF powered text extractor for the
|
||||||
|
* document properties metadata, such as title and author.
|
||||||
|
*/
|
||||||
|
public POITextExtractor getMetadataTextExtractor() {
|
||||||
|
return new HPSFPropertiesExtractor(this);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -54,4 +54,11 @@ public abstract class POITextExtractor {
|
||||||
* @return All the text from the document
|
* @return All the text from the document
|
||||||
*/
|
*/
|
||||||
public abstract String getText();
|
public abstract String getText();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns another text extractor, which is able to
|
||||||
|
* output the textual content of the document
|
||||||
|
* metadata / properties, such as author and title.
|
||||||
|
*/
|
||||||
|
public abstract POITextExtractor getMetadataTextExtractor();
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,14 +36,14 @@ import org.apache.poi.util.LittleEndian;
|
||||||
* build in and custom, returning them in
|
* build in and custom, returning them in
|
||||||
* textual form.
|
* textual form.
|
||||||
*/
|
*/
|
||||||
public class HPFSPropertiesExtractor extends POITextExtractor {
|
public class HPSFPropertiesExtractor extends POITextExtractor {
|
||||||
public HPFSPropertiesExtractor(POITextExtractor mainExtractor) {
|
public HPSFPropertiesExtractor(POITextExtractor mainExtractor) {
|
||||||
super(mainExtractor);
|
super(mainExtractor);
|
||||||
}
|
}
|
||||||
public HPFSPropertiesExtractor(POIDocument doc) {
|
public HPSFPropertiesExtractor(POIDocument doc) {
|
||||||
super(doc);
|
super(doc);
|
||||||
}
|
}
|
||||||
public HPFSPropertiesExtractor(POIFSFileSystem fs) {
|
public HPSFPropertiesExtractor(POIFSFileSystem fs) {
|
||||||
super(new PropertiesOnlyDocument(fs));
|
super(new PropertiesOnlyDocument(fs));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -127,6 +127,13 @@ public class HPFSPropertiesExtractor extends POITextExtractor {
|
||||||
public String getText() {
|
public String getText() {
|
||||||
return getSummaryInformationText() + getDocumentSummaryInformationText();
|
return getSummaryInformationText() + getDocumentSummaryInformationText();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prevent recursion!
|
||||||
|
*/
|
||||||
|
public POITextExtractor getMetadataTextExtractor() {
|
||||||
|
throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* So we can get at the properties of any
|
* So we can get at the properties of any
|
|
@ -25,7 +25,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
public class TestHPFSPropertiesExtractor extends TestCase {
|
public class TestHPSFPropertiesExtractor extends TestCase {
|
||||||
private String dir;
|
private String dir;
|
||||||
|
|
||||||
protected void setUp() throws Exception {
|
protected void setUp() throws Exception {
|
||||||
|
@ -37,7 +37,7 @@ public class TestHPFSPropertiesExtractor extends TestCase {
|
||||||
POIFSFileSystem fs = new POIFSFileSystem(
|
POIFSFileSystem fs = new POIFSFileSystem(
|
||||||
new FileInputStream(new File(dir, "TestMickey.doc"))
|
new FileInputStream(new File(dir, "TestMickey.doc"))
|
||||||
);
|
);
|
||||||
HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs);
|
HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
|
||||||
ext.getText();
|
ext.getText();
|
||||||
|
|
||||||
// Check each bit in turn
|
// Check each bit in turn
|
||||||
|
@ -60,7 +60,7 @@ public class TestHPFSPropertiesExtractor extends TestCase {
|
||||||
POIFSFileSystem fs = new POIFSFileSystem(
|
POIFSFileSystem fs = new POIFSFileSystem(
|
||||||
new FileInputStream(new File(dir, "TestUnicode.xls"))
|
new FileInputStream(new File(dir, "TestUnicode.xls"))
|
||||||
);
|
);
|
||||||
HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs);
|
HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
|
||||||
ext.getText();
|
ext.getText();
|
||||||
|
|
||||||
// Check each bit in turn
|
// Check each bit in turn
|
||||||
|
@ -83,7 +83,7 @@ public class TestHPFSPropertiesExtractor extends TestCase {
|
||||||
POIFSFileSystem fs = new POIFSFileSystem(
|
POIFSFileSystem fs = new POIFSFileSystem(
|
||||||
new FileInputStream(new File(dir, "TestMickey.doc"))
|
new FileInputStream(new File(dir, "TestMickey.doc"))
|
||||||
);
|
);
|
||||||
HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs);
|
HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
|
||||||
|
|
||||||
// Custom properties are part of the document info stream
|
// Custom properties are part of the document info stream
|
||||||
String dinfText = ext.getDocumentSummaryInformationText();
|
String dinfText = ext.getDocumentSummaryInformationText();
|
||||||
|
@ -102,9 +102,9 @@ public class TestHPFSPropertiesExtractor extends TestCase {
|
||||||
HSSFWorkbook wb = new HSSFWorkbook(fs);
|
HSSFWorkbook wb = new HSSFWorkbook(fs);
|
||||||
ExcelExtractor excelExt = new ExcelExtractor(wb);
|
ExcelExtractor excelExt = new ExcelExtractor(wb);
|
||||||
|
|
||||||
String fsText = (new HPFSPropertiesExtractor(fs)).getText();
|
String fsText = (new HPSFPropertiesExtractor(fs)).getText();
|
||||||
String hwText = (new HPFSPropertiesExtractor(wb)).getText();
|
String hwText = (new HPSFPropertiesExtractor(wb)).getText();
|
||||||
String eeText = (new HPFSPropertiesExtractor(excelExt)).getText();
|
String eeText = (new HPSFPropertiesExtractor(excelExt)).getText();
|
||||||
|
|
||||||
assertEquals(fsText, hwText);
|
assertEquals(fsText, hwText);
|
||||||
assertEquals(fsText, eeText);
|
assertEquals(fsText, eeText);
|
Loading…
Reference in New Issue