mirror of https://github.com/apache/poi.git
Merged revisions 638786-638802,638805-638811,638813-638814,638816-639230,639233-639241,639243-639253,639255-639486,639488-639601,639603-639835,639837-639917,639919-640056,640058-640710,640712-641156,641158-641184,641186-641795,641797-641798,641800-641933,641935-641963,641965-641966,641968-641995,641997-642230,642232-642562,642564-642565,642568-642570,642572-642573,642576-642736,642739-642877,642879,642881-642890,642892-642903,642905-642945,642947-643624,643626-643653,643655-643669,643671,643673-643830,643832-643833,643835-644342,644344-644472,644474-644508,644510-645347,645349-645351,645353-645559,645561-645565,645568-645951,645953-646193,646195-646311,646313-646404,646406-646665,646667-646853,646855-646869,646871-647151,647153-647185,647187-647277,647279-647566,647568-647573,647575,647578-647711,647714-647737,647739-647823,647825-648155,648157-648202,648204-648273,648275,648277-648302,648304-648333,648335-648588,648590-648622,648625-648673,648675-649141,649144,649146-649556,649558-649795,649799,649801-649910,649912-649913,649915-650128,650131-650132,650134-650137,650140-650914,650916-651991,651993-652284,652286-652287,652289,652291,652293-652297,652299-652328,652330-652425,652427-652445,652447-652560,652562-652933,652935,652937-652993,652995-653116,653118-653124,653126-653483,653487-653519,653522-653550,653552-653607,653609-653667,653669-653674,653676-653814,653817-653830,653832-653891,653893-653944,653946-654055,654057-654355,654357-654365,654367-654648,654651-655215,655217-655277,655279-655281,655283-655911,655913-656212,656214,656216-656251,656253-656698,656700-656756,656758-656892,656894-657135,657137-657165,657168-657179,657181-657354,657356-657357,657359-657701,657703-657874,657876-658032,658034-658284,658286,658288-658301,658303-658307,658309-658321,658323-658335,658337-658348,658351,658353-658832,658834-658983,658985,658987-659066,659068-659402,659404-659428,659430-659451,659453-659454,659456-659461,659463-659477,659479-659524,659526-659571,659574,659576-66
0255,660257-660262,660264-660279,660281-660343,660345-660473,660475-660827,660829-660833,660835-660888,660890-663321,663323-663435,663437-663764,663766-663854,663856-664219,664221-664489,664494-664514,664516-668013,668015-668142,668144-668152,668154,668156-668256,668258,668260-669139,669141-669455,669457-669657,669659-669808,669810-670189,670191-671321,671323-672229,672231-672549,672551-672552,672554-672561,672563-672566,672568,672571-673049,673051-673852,673854-673862,673864-673986,673988-673996,673998-674347,674349-674890,674892-674910,674912-674936,674938-674952,674954-675078,675080-675085,675087-675217,675219-675660,675662-675670,675672-675716,675718-675726,675728-675733,675735-675775,675777-675782,675784,675786-675791,675794-675852,675854-676200,676202,676204,676206-676220,676222-676309,676311-676456,676458-676994,676996-677027,677030-677040,677042-677056,677058-677375,677377-677968,677970-677971,677973,677975-677994,677996-678286,678288-678538,678540-680393,680395-680469,680471-680529,680531-680852,680854-681529,681531-681571,681573-682224,682226,682228,682231-682281,682283-682335,682337-682507,682509,682512-682517,682519-682532,682534-682619,682622-682777,682779-682998,683000-683019,683021-683022,683024-683080,683082-683092,683094-683095,683097-683127,683129-683131,683133-683166,683168-683698,683700-683705,683707-683757,683759-683787,683789-683870,683872-683879,683881-683900,683902-684066,684068-684074,684076-684222,684224-684254,684257-684281,684283-684286,684288-684292,684294-684298,684300-684301,684303-684308,684310-684317,684320,684323-684335,684337-684348,684350-684354,684356-684361,684363-684369,684371-684453,684455-684883,684885-684937,684940-684958,684960-684970,684972-684985,684987-685053,685055-685063,685065-685284 via svnmerge from
https://svn.apache.org/repos/asf/poi/trunk ........ r685260 | nick | 2008-08-12 19:44:50 +0100 (Tue, 12 Aug 2008) | 1 line New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor ........ r685263 | nick | 2008-08-12 19:55:47 +0100 (Tue, 12 Aug 2008) | 1 line Few documentation updates for recent new code ........ r685267 | nick | 2008-08-12 20:02:41 +0100 (Tue, 12 Aug 2008) | 1 line Fix a typo in the file name, and add a generic method to POITextExtractor to get the appropriate metadata text extractor ........ r685283 | nick | 2008-08-12 20:57:04 +0100 (Tue, 12 Aug 2008) | 1 line Add HWPF support for stripping out fields (eg macros), and make this optionally happen always for headers and footers ........ r685284 | nick | 2008-08-12 20:59:35 +0100 (Tue, 12 Aug 2008) | 1 line Update changelog ........ git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@685288 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0ba363365c
commit
3638f76a8a
|
@ -58,6 +58,8 @@
|
|||
<action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action>
|
||||
</release>
|
||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="fix">45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text)</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
|
||||
|
|
|
@ -92,6 +92,12 @@
|
|||
properties. Chances are that you will find here what you need and don't
|
||||
have to read the other sections.</note>
|
||||
|
||||
<p>If all you are interested in is getting the textual content of
|
||||
all the document properties, such as for full text indexing, then
|
||||
take a look at
|
||||
<code>org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</code>. However,
|
||||
if you want full access to the properties, please read on!</p>
|
||||
|
||||
<p>The first thing you should understand is that a Microsoft Office file is
|
||||
not one large bunch of bytes but has an internal filesystem structure with
|
||||
files and directories. You can access these files and directories using
|
||||
|
|
|
@ -55,13 +55,25 @@ can then get text and other properties.
|
|||
</p>
|
||||
</section>
|
||||
|
||||
<section><title>Headers and Footers</title>
|
||||
<p>To get at the headers and footers of a Word document, first create a
|
||||
<code>org.apache.poi.hwpf.HWPFDocument</code>. Next, you need to create a
|
||||
<code>org.apache.poi.hwpf.usermodel.HeaderStories</code>, passing it your
|
||||
HWPFDocument. Finally, the HeaderStories gives you access to the headers and
|
||||
footers, including first / even / odd page ones if defined in your
|
||||
document. Additionally, HeaderStories provides a method for removing
|
||||
any macros in the text, which is helpful as many headers and footers
|
||||
do end up with macros in them.</p>
|
||||
</section>
|
||||
|
||||
<section><title>Changing Text</title>
|
||||
<p>It is possible to change the text via
|
||||
<code>insertBefore()</code> and <code>insertAfter()</code>
|
||||
on a <code>Range</code> object (either a <code>Range</code>,
|
||||
<code>Paragraph</code> or <code>CharacterRun</code>).
|
||||
It is also possible to delete a <code>Range</code>, but this
|
||||
code is know to have bugs in it.
|
||||
It is also possible to delete a <code>Range</code>.
|
||||
This code will work in many, but not all cases, and patches to
|
||||
improve it are gratefully received!
|
||||
</p>
|
||||
</section>
|
||||
|
||||
|
|
|
@ -55,6 +55,8 @@
|
|||
<action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action>
|
||||
</release>
|
||||
<release version="3.1.1-alpha1" date="2008-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="fix">45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text)</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.poi;
|
|||
|
||||
import org.apache.poi.hpsf.DocumentSummaryInformation;
|
||||
import org.apache.poi.hpsf.SummaryInformation;
|
||||
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
|
||||
|
||||
/**
|
||||
* Common Parent for OLE2 based Text Extractors
|
||||
|
@ -50,4 +51,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
|||
public SummaryInformation getSummaryInformation() {
|
||||
return document.getSummaryInformation();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an HPSF powered text extractor for the
|
||||
* document properties metadata, such as title and author.
|
||||
*/
|
||||
public POITextExtractor getMetadataTextExtractor() {
|
||||
return new HPSFPropertiesExtractor(this);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -37,6 +37,14 @@ public abstract class POITextExtractor {
|
|||
public POITextExtractor(POIDocument document) {
|
||||
this.document = document;
|
||||
}
|
||||
/**
|
||||
* Creates a new text extractor, using the same
|
||||
* document as another text extractor. Normally
|
||||
* only used by properties extractors.
|
||||
*/
|
||||
protected POITextExtractor(POITextExtractor otherExtractor) {
|
||||
this.document = otherExtractor.document;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all the text from the document.
|
||||
|
@ -46,4 +54,11 @@ public abstract class POITextExtractor {
|
|||
* @return All the text from the document
|
||||
*/
|
||||
public abstract String getText();
|
||||
|
||||
/**
|
||||
* Returns another text extractor, which is able to
|
||||
* output the textual content of the document
|
||||
* metadata / properties, such as author and title.
|
||||
*/
|
||||
public abstract POITextExtractor getMetadataTextExtractor();
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.util.Date;
|
|||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
|
||||
|
||||
|
@ -293,8 +294,18 @@ public class CustomProperties extends HashMap
|
|||
final CustomProperty cp = new CustomProperty(p, name);
|
||||
return put(cp);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a set of all the names of our
|
||||
* custom properties
|
||||
*/
|
||||
public Set keySet() {
|
||||
return dictionaryNameToID.keySet();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* <p>Sets the codepage.</p>
|
||||
*
|
||||
* @param codepage the codepage
|
||||
|
|
|
@ -45,6 +45,9 @@ public class DocumentSummaryInformation extends SpecialPropertySet
|
|||
public static final String DEFAULT_STREAM_NAME =
|
||||
"\005DocumentSummaryInformation";
|
||||
|
||||
public PropertyIDMap getPropertySetIDMap() {
|
||||
return PropertyIDMap.getDocumentSummaryInformationProperties();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.InputStream;
|
|||
import java.io.OutputStream;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
|
||||
/**
|
||||
|
@ -57,6 +58,11 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
|||
*/
|
||||
public abstract class SpecialPropertySet extends MutablePropertySet
|
||||
{
|
||||
/**
|
||||
* The id to name mapping of the properties
|
||||
* in this set.
|
||||
*/
|
||||
public abstract PropertyIDMap getPropertySetIDMap();
|
||||
|
||||
/**
|
||||
* <p>The "real" property set <code>SpecialPropertySet</code>
|
||||
|
|
|
@ -40,6 +40,9 @@ public class SummaryInformation extends SpecialPropertySet
|
|||
*/
|
||||
public static final String DEFAULT_STREAM_NAME = "\005SummaryInformation";
|
||||
|
||||
public PropertyIDMap getPropertySetIDMap() {
|
||||
return PropertyIDMap.getSummaryInformationProperties();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
|
|
@ -0,0 +1,151 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hpsf.extractor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.hpsf.CustomProperties;
|
||||
import org.apache.poi.hpsf.DocumentSummaryInformation;
|
||||
import org.apache.poi.hpsf.Property;
|
||||
import org.apache.poi.hpsf.SpecialPropertySet;
|
||||
import org.apache.poi.hpsf.SummaryInformation;
|
||||
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
|
||||
/**
|
||||
* Extracts all of the HPSF properties, both
|
||||
* built-in and custom, returning them in
|
||||
* textual form.
|
||||
*/
|
||||
public class HPSFPropertiesExtractor extends POITextExtractor {
|
||||
public HPSFPropertiesExtractor(POITextExtractor mainExtractor) {
|
||||
super(mainExtractor);
|
||||
}
|
||||
public HPSFPropertiesExtractor(POIDocument doc) {
|
||||
super(doc);
|
||||
}
|
||||
public HPSFPropertiesExtractor(POIFSFileSystem fs) {
|
||||
super(new PropertiesOnlyDocument(fs));
|
||||
}
|
||||
|
||||
public String getDocumentSummaryInformationText() {
|
||||
DocumentSummaryInformation dsi = document.getDocumentSummaryInformation();
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
// Normal properties
|
||||
text.append( getPropertiesText(dsi) );
|
||||
|
||||
// Now custom ones
|
||||
CustomProperties cps = dsi.getCustomProperties();
|
||||
Iterator keys = cps.keySet().iterator();
|
||||
while(keys.hasNext()) {
|
||||
String key = (String)keys.next();
|
||||
String val = getPropertyValueText( cps.get(key) );
|
||||
text.append(key + " = " + val + "\n");
|
||||
}
|
||||
|
||||
// All done
|
||||
return text.toString();
|
||||
}
|
||||
public String getSummaryInformationText() {
|
||||
SummaryInformation si = document.getSummaryInformation();
|
||||
|
||||
// Just normal properties
|
||||
return getPropertiesText(si);
|
||||
}
|
||||
|
||||
private static String getPropertiesText(SpecialPropertySet ps) {
|
||||
if(ps == null) {
|
||||
// Not defined, oh well
|
||||
return "";
|
||||
}
|
||||
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
PropertyIDMap idMap = ps.getPropertySetIDMap();
|
||||
Property[] props = ps.getProperties();
|
||||
for(int i=0; i<props.length; i++) {
|
||||
String type = Long.toString( props[i].getID() );
|
||||
Object typeObj = idMap.get(props[i].getID());
|
||||
if(typeObj != null) {
|
||||
type = typeObj.toString();
|
||||
}
|
||||
|
||||
String val = getPropertyValueText( props[i].getValue() );
|
||||
text.append(type + " = " + val + "\n");
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
private static String getPropertyValueText(Object val) {
|
||||
if(val == null) {
|
||||
return "(not set)";
|
||||
}
|
||||
if(val instanceof byte[]) {
|
||||
byte[] b = (byte[])val;
|
||||
if(b.length == 0) {
|
||||
return "";
|
||||
}
|
||||
if(b.length == 1) {
|
||||
return Byte.toString(b[0]);
|
||||
}
|
||||
if(b.length == 2) {
|
||||
return Integer.toString( LittleEndian.getUShort(b) );
|
||||
}
|
||||
if(b.length == 4) {
|
||||
return Long.toString( LittleEndian.getUInt(b) );
|
||||
}
|
||||
// Maybe it's a string? who knows!
|
||||
return new String(b);
|
||||
}
|
||||
return val.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the text of all the properties defined in
|
||||
* the document.
|
||||
*/
|
||||
public String getText() {
|
||||
return getSummaryInformationText() + getDocumentSummaryInformationText();
|
||||
}
|
||||
|
||||
/**
|
||||
* Prevent recursion!
|
||||
*/
|
||||
public POITextExtractor getMetadataTextExtractor() {
|
||||
throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
|
||||
}
|
||||
|
||||
/**
|
||||
* So we can get at the properties of any
|
||||
* random OLE2 document.
|
||||
*/
|
||||
private static class PropertiesOnlyDocument extends POIDocument {
|
||||
private PropertiesOnlyDocument(POIFSFileSystem fs) {
|
||||
super(fs);
|
||||
}
|
||||
|
||||
public void write(OutputStream out) throws IOException {
|
||||
throw new IllegalStateException("Unable to write, only for properties!");
|
||||
}
|
||||
}
|
||||
}
|
|
@ -30,7 +30,7 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
|
|||
* Creates a new text extractor for the given document
|
||||
*/
|
||||
public POIXMLTextExtractor(POIXMLDocument document) {
|
||||
super(null);
|
||||
super((POIDocument)null);
|
||||
|
||||
this.document = document;
|
||||
}
|
||||
|
@ -54,4 +54,13 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
|
|||
public POIXMLDocument getDocument(){
|
||||
return document;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns an OOXML properties text extractor for the
|
||||
* document properties metadata, such as title and author.
|
||||
*/
|
||||
public POITextExtractor getMetadataTextExtractor() {
|
||||
throw new RuntimeException("Not yet supported for OOXML!");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -230,4 +230,12 @@ public class WordExtractor extends POIOLE2TextExtractor {
|
|||
|
||||
return ret.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes any fields (eg macros, page markers etc)
|
||||
* from the string.
|
||||
*/
|
||||
public static String stripFields(String text) {
|
||||
return Range.stripFields(text);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -35,6 +35,8 @@ public class HeaderStories {
|
|||
private Range headerStories;
|
||||
private PlexOfCps plcfHdd;
|
||||
|
||||
private boolean stripFields = false;
|
||||
|
||||
public HeaderStories(HWPFDocument doc) {
|
||||
this.headerStories = doc.getHeaderStoryRange();
|
||||
FileInformationBlock fib = doc.getFileInformationBlock();
|
||||
|
@ -157,8 +159,15 @@ public class HeaderStories {
|
|||
return "";
|
||||
}
|
||||
|
||||
// Return the contents
|
||||
return headerStories.text().substring(prop.getStart(), prop.getEnd());
|
||||
// Grab the contents
|
||||
String text =
|
||||
headerStories.text().substring(prop.getStart(), prop.getEnd());
|
||||
|
||||
// Strip off fields and macros if requested
|
||||
if(stripFields) {
|
||||
return Range.stripFields(text);
|
||||
}
|
||||
return text;
|
||||
}
|
||||
|
||||
public Range getRange() {
|
||||
|
@ -167,4 +176,22 @@ public class HeaderStories {
|
|||
protected PlexOfCps getPlcfHdd() {
|
||||
return plcfHdd;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are fields currently being stripped from
|
||||
* the text that this {@link HeaderStories} returns?
|
||||
* Default is false, but can be changed
|
||||
*/
|
||||
public boolean areFieldsStripped() {
|
||||
return stripFields;
|
||||
}
|
||||
/**
|
||||
* Should fields (eg macros) be stripped from
|
||||
* the text that this class returns?
|
||||
* Default is not to strip.
|
||||
* @param stripFields true to strip fields from the returned text, false to leave them in
|
||||
*/
|
||||
public void setAreFieldsStripped(boolean stripFields) {
|
||||
this.stripFields = stripFields;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -299,6 +299,63 @@ public class Range
|
|||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes any fields (eg macros, page markers etc)
|
||||
* from the string.
|
||||
* Normally used to make some text suitable for showing
|
||||
* to humans, and the resultant text should not normally
|
||||
* be saved back into the document!
|
||||
*/
|
||||
public static String stripFields(String text) {
|
||||
// First up, fields can be nested...
|
||||
// A field can be 0x13 [contents] 0x15
|
||||
// Or it can be 0x13 [contents] 0x14 [real text] 0x15
|
||||
|
||||
// If there are no fields, all easy
|
||||
if(text.indexOf('\u0013') == -1) return text;
|
||||
|
||||
// Loop over until they're all gone
|
||||
// That's when we're out of both 0x13s and 0x15s
|
||||
while( text.indexOf('\u0013') > -1 &&
|
||||
text.indexOf('\u0015') > -1) {
|
||||
int first13 = text.indexOf('\u0013');
|
||||
int next13 = text.indexOf('\u0013', first13+1);
|
||||
int first14 = text.indexOf('\u0014', first13+1);
|
||||
int last15 = text.lastIndexOf('\u0015');
|
||||
|
||||
// If they're the wrong way around, give up
|
||||
if(last15 < first13) {
|
||||
break;
|
||||
}
|
||||
|
||||
// If no more 13s and 14s, just zap
|
||||
if(next13 == -1 && first14 == -1) {
|
||||
text = text.substring(0, first13) +
|
||||
text.substring(last15+1);
|
||||
break;
|
||||
}
|
||||
|
||||
// If a 14 comes before the next 13, then
|
||||
// zap from the 13 to the 14, and remove
|
||||
// the 15
|
||||
if(first14 != -1 && (first14 < next13 || next13 == -1)) {
|
||||
text = text.substring(0, first13) +
|
||||
text.substring(first14+1, last15) +
|
||||
text.substring(last15+1);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Another 13 comes before the next 14.
|
||||
// This means there's nested stuff, so we
|
||||
// can just zap the lot
|
||||
text = text.substring(0, first13) +
|
||||
text.substring(last15+1);
|
||||
continue;
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used to get the number of sections in a range. If this range is smaller
|
||||
|
|
Binary file not shown.
|
@ -35,6 +35,7 @@ public class TestHeaderStories extends TestCase {
|
|||
private HWPFDocument oddEven;
|
||||
private HWPFDocument diffFirst;
|
||||
private HWPFDocument unicode;
|
||||
private HWPFDocument withFields;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
String dirname = System.getProperty("HWPF.testdata.path");
|
||||
|
@ -60,6 +61,9 @@ public class TestHeaderStories extends TestCase {
|
|||
unicode = new HWPFDocument(
|
||||
new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
|
||||
);
|
||||
withFields = new HWPFDocument(
|
||||
new FileInputStream(new File(dirname, "HeaderWithMacros.doc"))
|
||||
);
|
||||
}
|
||||
|
||||
public void testNone() throws Exception {
|
||||
|
@ -186,4 +190,15 @@ public class TestHeaderStories extends TestCase {
|
|||
assertEquals("\r\r", hs.getEvenFooter());
|
||||
assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
|
||||
}
|
||||
|
||||
public void testWithFields() throws Exception {
|
||||
HeaderStories hs = new HeaderStories(withFields);
|
||||
assertFalse(hs.areFieldsStripped());
|
||||
|
||||
assertEquals("HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r", hs.getOddHeader());
|
||||
|
||||
// Now turn on stripping
|
||||
hs.setAreFieldsStripped(true);
|
||||
assertEquals("HEADER GOES HERE. 8/12/2008 Eric Roch\r\r\r", hs.getOddHeader());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.poi.hwpf.usermodel;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.poi.hwpf.usermodel;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Tests for Range which aren't around deletion, insertion,
|
||||
* text replacement or textual contents
|
||||
*/
|
||||
public class TestRange extends TestCase {
|
||||
public void testFieldStripping() throws Exception {
|
||||
String exp = "This is some text.";
|
||||
|
||||
String single = "This is some \u0013Blah!\u0015text.";
|
||||
String with14 = "This is \u0013Blah!\u0014some\u0015 text.";
|
||||
String withNested =
|
||||
"This is \u0013Blah!\u0013Blah!\u0015\u0015some text.";
|
||||
String withNested14 =
|
||||
"This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text.";
|
||||
String withNestedIn14 =
|
||||
"This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text.";
|
||||
|
||||
// Check all comes out right
|
||||
assertEquals(exp, Range.stripFields(exp));
|
||||
assertEquals(exp, Range.stripFields(single));
|
||||
assertEquals(exp, Range.stripFields(with14));
|
||||
assertEquals(exp, Range.stripFields(withNested));
|
||||
assertEquals(exp, Range.stripFields(withNested14));
|
||||
assertEquals(exp, Range.stripFields(withNestedIn14));
|
||||
|
||||
// Ones that are odd and we won't change
|
||||
String odd1 = "This\u0015 is \u0013 odd";
|
||||
String odd2 = "This\u0015 is \u0014 also \u0013 odd";
|
||||
|
||||
assertEquals(odd1, Range.stripFields(odd1));
|
||||
assertEquals(odd2, Range.stripFields(odd2));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,115 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hpsf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestHPSFPropertiesExtractor extends TestCase {
|
||||
private String dir;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
dir = System.getProperty("HPSF.testdata.path");
|
||||
assertNotNull("HPSF.testdata.path not set", dir);
|
||||
}
|
||||
|
||||
public void testNormalProperties() throws Exception {
|
||||
POIFSFileSystem fs = new POIFSFileSystem(
|
||||
new FileInputStream(new File(dir, "TestMickey.doc"))
|
||||
);
|
||||
HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
|
||||
ext.getText();
|
||||
|
||||
// Check each bit in turn
|
||||
String sinfText = ext.getSummaryInformationText();
|
||||
String dinfText = ext.getDocumentSummaryInformationText();
|
||||
|
||||
assertTrue(sinfText.indexOf("TEMPLATE = Normal") > -1);
|
||||
assertTrue(sinfText.indexOf("SUBJECT = sample subject") > -1);
|
||||
assertTrue(dinfText.indexOf("MANAGER = sample manager") > -1);
|
||||
assertTrue(dinfText.indexOf("COMPANY = sample company") > -1);
|
||||
|
||||
// Now overall
|
||||
String text = ext.getText();
|
||||
assertTrue(text.indexOf("TEMPLATE = Normal") > -1);
|
||||
assertTrue(text.indexOf("SUBJECT = sample subject") > -1);
|
||||
assertTrue(text.indexOf("MANAGER = sample manager") > -1);
|
||||
assertTrue(text.indexOf("COMPANY = sample company") > -1);
|
||||
}
|
||||
public void testNormalUnicodeProperties() throws Exception {
|
||||
POIFSFileSystem fs = new POIFSFileSystem(
|
||||
new FileInputStream(new File(dir, "TestUnicode.xls"))
|
||||
);
|
||||
HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
|
||||
ext.getText();
|
||||
|
||||
// Check each bit in turn
|
||||
String sinfText = ext.getSummaryInformationText();
|
||||
String dinfText = ext.getDocumentSummaryInformationText();
|
||||
|
||||
assertTrue(sinfText.indexOf("AUTHOR = marshall") > -1);
|
||||
assertTrue(sinfText.indexOf("TITLE = Titel: \u00c4h") > -1);
|
||||
assertTrue(dinfText.indexOf("COMPANY = Schreiner") > -1);
|
||||
assertTrue(dinfText.indexOf("SCALE = false") > -1);
|
||||
|
||||
// Now overall
|
||||
String text = ext.getText();
|
||||
assertTrue(text.indexOf("AUTHOR = marshall") > -1);
|
||||
assertTrue(text.indexOf("TITLE = Titel: \u00c4h") > -1);
|
||||
assertTrue(text.indexOf("COMPANY = Schreiner") > -1);
|
||||
assertTrue(text.indexOf("SCALE = false") > -1);
|
||||
}
|
||||
public void testCustomProperties() throws Exception {
|
||||
POIFSFileSystem fs = new POIFSFileSystem(
|
||||
new FileInputStream(new File(dir, "TestMickey.doc"))
|
||||
);
|
||||
HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
|
||||
|
||||
// Custom properties are part of the document info stream
|
||||
String dinfText = ext.getDocumentSummaryInformationText();
|
||||
assertTrue(dinfText.indexOf("Client = sample client") > -1);
|
||||
assertTrue(dinfText.indexOf("Division = sample division") > -1);
|
||||
|
||||
String text = ext.getText();
|
||||
assertTrue(text.indexOf("Client = sample client") > -1);
|
||||
assertTrue(text.indexOf("Division = sample division") > -1);
|
||||
}
|
||||
|
||||
public void testConstructors() throws Exception {
|
||||
POIFSFileSystem fs = new POIFSFileSystem(
|
||||
new FileInputStream(new File(dir, "TestUnicode.xls"))
|
||||
);
|
||||
HSSFWorkbook wb = new HSSFWorkbook(fs);
|
||||
ExcelExtractor excelExt = new ExcelExtractor(wb);
|
||||
|
||||
String fsText = (new HPSFPropertiesExtractor(fs)).getText();
|
||||
String hwText = (new HPSFPropertiesExtractor(wb)).getText();
|
||||
String eeText = (new HPSFPropertiesExtractor(excelExt)).getText();
|
||||
|
||||
assertEquals(fsText, hwText);
|
||||
assertEquals(fsText, eeText);
|
||||
|
||||
assertTrue(fsText.indexOf("AUTHOR = marshall") > -1);
|
||||
assertTrue(fsText.indexOf("TITLE = Titel: \u00c4h") > -1);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue