Add HWPF support for stripping out fields (e.g. macros), and optionally apply this stripping to all header and footer text

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@685283 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-08-12 19:57:04 +00:00
parent 8ad0a4f34c
commit 97e8e39eb6
7 changed files with 162 additions and 3 deletions

View File

@ -230,4 +230,12 @@ public class WordExtractor extends POIOLE2TextExtractor {
return ret.toString(); return ret.toString();
} }
/**
* Removes any fields (eg macros, page markers etc)
* from the string.
* This simply delegates to {@link Range#stripFields(String)}.
* @param text the text to remove the fields from
* @return the result of {@link Range#stripFields(String)} for the given text
*/
public static String stripFields(String text) {
return Range.stripFields(text);
}
} }

View File

@ -35,6 +35,8 @@ public class HeaderStories {
private Range headerStories; private Range headerStories;
private PlexOfCps plcfHdd; private PlexOfCps plcfHdd;
private boolean stripFields = false;
public HeaderStories(HWPFDocument doc) { public HeaderStories(HWPFDocument doc) {
this.headerStories = doc.getHeaderStoryRange(); this.headerStories = doc.getHeaderStoryRange();
FileInformationBlock fib = doc.getFileInformationBlock(); FileInformationBlock fib = doc.getFileInformationBlock();
@ -157,8 +159,15 @@ public class HeaderStories {
return ""; return "";
} }
// Return the contents // Grab the contents
return headerStories.text().substring(prop.getStart(), prop.getEnd()); String text =
headerStories.text().substring(prop.getStart(), prop.getEnd());
// Strip off fields and macros if requested
if(stripFields) {
return Range.stripFields(text);
}
return text;
} }
public Range getRange() { public Range getRange() {
@ -167,4 +176,22 @@ public class HeaderStories {
protected PlexOfCps getPlcfHdd() { protected PlexOfCps getPlcfHdd() {
return plcfHdd; return plcfHdd;
} }
/**
* Are fields currently being stripped from
* the text that this {@link HeaderStories} returns?
* Default is false, but can be changed with
* {@link #setAreFieldsStripped(boolean)}.
* @return true if fields are being stripped, false if not
*/
public boolean areFieldsStripped() {
return stripFields;
}
/**
* Should fields (eg macros) be stripped from
* the text that this class returns?
* Default is not to strip.
* @param stripFields true to have fields stripped from the
*  returned text, false to have the text returned as-is
*/
public void setAreFieldsStripped(boolean stripFields) {
this.stripFields = stripFields;
}
} }

View File

@ -300,6 +300,63 @@ public class Range
return sb.toString(); return sb.toString();
} }
/**
 * Removes any fields (eg macros, page markers etc)
 * from the string.
 * Normally used to make some text suitable for showing
 * to humans, and the resultant text should not normally
 * be saved back into the document!
 * <p>
 * A field is either <code>0x13 [code] 0x15</code>, which is removed
 * entirely, or <code>0x13 [code] 0x14 [result] 0x15</code>, of which
 * only the result text is kept. Fields may be nested, and any number
 * of fields may follow one another; the text between separate fields
 * is always kept. The result of a nested field is only kept when every
 * field around it is itself showing its result.
 * <p>
 * Marks that are not part of a well-formed field are left in the text
 * untouched: a 0x15 with no open 0x13 before it, a 0x13 that is never
 * closed, and a 0x14 that is outside any field or is repeated within
 * the result of a field.
 *
 * @param text the text to strip the fields from, must not be null
 * @return the text with all the well-formed fields stripped out
 */
public static String stripFields(String text) {
    final char fieldBegin = '\u0013';
    final char fieldSeparator = '\u0014';
    final char fieldEnd = '\u0015';

    // If there are no fields, all easy
    if (text.indexOf(fieldBegin) == -1) {
        return text;
    }

    final int length = text.length();

    // Pass 1: pair each end mark with the nearest begin mark that is
    // still open. Matching the real partner (rather than the last end
    // mark in the whole string) is what keeps the text between two
    // sibling fields from being thrown away.
    boolean[] paired = new boolean[length];
    int[] openMarks = new int[length];
    int open = 0;
    for (int i = 0; i < length; i++) {
        char c = text.charAt(i);
        if (c == fieldBegin) {
            openMarks[open++] = i;
        } else if (c == fieldEnd && open > 0) {
            paired[openMarks[--open]] = true;
            paired[i] = true;
        }
    }

    // Pass 2: copy across only the text that is visible. Each open
    // field is either still in its code part (hidden) or, once its
    // separator has been seen, in its result part (shown). Text is
    // visible only while no enclosing field is in its code part.
    StringBuffer stripped = new StringBuffer(length);
    boolean[] showingResult = new boolean[length];
    int depth = 0;
    int hiddenBy = 0;
    for (int i = 0; i < length; i++) {
        char c = text.charAt(i);
        if (c == fieldBegin && paired[i]) {
            showingResult[depth++] = false;
            hiddenBy++;
        } else if (c == fieldEnd && paired[i]) {
            // A field that never reached a separator was still hiding
            // its contents, so closing it lifts that hiding
            if (!showingResult[--depth]) {
                hiddenBy--;
            }
        } else if (c == fieldSeparator && depth > 0 && !showingResult[depth - 1]) {
            // First separator of the innermost field: code is over,
            // what follows is the result text
            showingResult[depth - 1] = true;
            hiddenBy--;
        } else if (hiddenBy == 0) {
            // Plain text, or a mark that isn't part of a proper field
            stripped.append(c);
        }
    }
    return stripped.toString();
}
/** /**
* Used to get the number of sections in a range. If this range is smaller * Used to get the number of sections in a range. If this range is smaller
* than a section, it will return 1 for its containing section. * than a section, it will return 1 for its containing section.

View File

@ -35,6 +35,7 @@ public class TestHeaderStories extends TestCase {
private HWPFDocument oddEven; private HWPFDocument oddEven;
private HWPFDocument diffFirst; private HWPFDocument diffFirst;
private HWPFDocument unicode; private HWPFDocument unicode;
private HWPFDocument withFields;
protected void setUp() throws Exception { protected void setUp() throws Exception {
String dirname = System.getProperty("HWPF.testdata.path"); String dirname = System.getProperty("HWPF.testdata.path");
@ -60,6 +61,9 @@ public class TestHeaderStories extends TestCase {
unicode = new HWPFDocument( unicode = new HWPFDocument(
new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc")) new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
); );
withFields = new HWPFDocument(
new FileInputStream(new File(dirname, "HeaderWithMacros.doc"))
);
} }
public void testNone() throws Exception { public void testNone() throws Exception {
@ -186,4 +190,15 @@ public class TestHeaderStories extends TestCase {
assertEquals("\r\r", hs.getEvenFooter()); assertEquals("\r\r", hs.getEvenFooter());
assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter()); assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
} }
public void testWithFields() throws Exception {
    final String rawOddHeader =
        "HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r";
    final String strippedOddHeader =
        "HEADER GOES HERE. 8/12/2008 Eric Roch\r\r\r";

    HeaderStories stories = new HeaderStories(withFields);

    // Stripping is off to begin with, so the field marks
    // and the field code show up in the header text
    assertFalse(stories.areFieldsStripped());
    assertEquals(rawOddHeader, stories.getOddHeader());

    // With stripping on, only the field's result text is left
    stories.setAreFieldsStripped(true);
    assertEquals(strippedOddHeader, stories.getOddHeader());
}
} }

View File

@ -18,7 +18,6 @@ package org.apache.poi.hwpf.usermodel;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileOutputStream;
import junit.framework.TestCase; import junit.framework.TestCase;

View File

@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.poi.hwpf.usermodel;
import junit.framework.TestCase;
/**
 * Covers the parts of Range that have nothing to do with
 * deleting, inserting or replacing text, nor with reading
 * its textual contents
 */
public class TestRange extends TestCase {
    /**
     * Checks that Range.stripFields turns each field-bearing sample
     * into the plain expected text, and hands back unchanged the
     * samples whose field marks are in an odd order.
     */
    public void testFieldStripping() throws Exception {
        final String expected = "This is some text.";

        // Each of these must come out as the expected text:
        // no fields at all, a single field, a field with a
        // 0x14 separator, and three flavours of nesting
        final String[] strippable = new String[] {
            expected,
            "This is some \u0013Blah!\u0015text.",
            "This is \u0013Blah!\u0014some\u0015 text.",
            "This is \u0013Blah!\u0013Blah!\u0015\u0015some text.",
            "This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text.",
            "This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text."
        };
        for (int i = 0; i < strippable.length; i++) {
            assertEquals(expected, Range.stripFields(strippable[i]));
        }

        // The marks in these are the wrong way around,
        // so the text must come back exactly as it went in
        final String[] leftAlone = new String[] {
            "This\u0015 is \u0013 odd",
            "This\u0015 is \u0014 also \u0013 odd"
        };
        for (int i = 0; i < leftAlone.length; i++) {
            assertEquals(leftAlone[i], Range.stripFields(leftAlone[i]));
        }
    }
}