From 896840bb82919b74932e32fd355998a085d6f916 Mon Sep 17 00:00:00 2001 From: Yegor Kozlov Date: Mon, 31 Aug 2009 17:02:06 +0000 Subject: [PATCH] fix for extraction paragraphs and sections from headers/footers with XWPFWordExtractor, see Bugzilla 47727 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@809662 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 3 +- .../poi/xwpf/extractor/XWPFWordExtractor.java | 86 ++++++++++++------ .../xwpf/model/XWPFHeaderFooterPolicy.java | 13 ++- .../xwpf/extractor/TestXWPFWordExtractor.java | 9 ++ test-data/document/Headers.docx | Bin 0 -> 13076 bytes 5 files changed, 81 insertions(+), 30 deletions(-) create mode 100755 test-data/document/Headers.docx diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index a27f2971d2..0191867715 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -33,7 +33,8 @@ - 47773 - Support for extraction of header / footer images in HWPF + 47773 - Fix for extraction paragraphs and sections from headers/footers with XWPFWordExtractor + 47727 - Support for extraction of header / footer images in HWPF moved all test data to a top-level directory 47721 - Added implementation for INDIRECT() 45583 - Avoid exception when reading ClipboardData packet in OLE property sets diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java index 63059653f7..2c604b60e4 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java +++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java @@ -21,6 +21,7 @@ import java.util.Iterator; import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.POIXMLException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xwpf.model.XWPFCommentsDecorator; @@ -31,6 +32,7 @@ import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.xmlbeans.XmlException; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; /** * Helper class to extract text from an OOXML Word file @@ -72,45 +74,77 @@ public class XWPFWordExtractor extends POIXMLTextExtractor { public String getText() { StringBuffer text = new StringBuffer(); XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); - + // Start out with all headers - // TODO - put them in where they're needed - if(hfPolicy.getFirstPageHeader() != null) { - text.append( hfPolicy.getFirstPageHeader().getText() ); - } - if(hfPolicy.getEvenPageHeader() != null) { - text.append( hfPolicy.getEvenPageHeader().getText() ); - } - if(hfPolicy.getDefaultHeader() != null) { - text.append( hfPolicy.getDefaultHeader().getText() ); - } + extractHeaders(text, hfPolicy); // First up, all our paragraph based text Iterator i = document.getParagraphsIterator(); while(i.hasNext()) { - XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( - new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks)); - text.append(decorator.getText()+"\n"); - } + XWPFParagraph paragraph = i.next(); + + + try { + CTSectPr ctSectPr = null; + if (paragraph.getCTP().getPPr()!=null) { + ctSectPr = paragraph.getCTP().getPPr().getSectPr(); + } + + XWPFHeaderFooterPolicy headerFooterPolicy = null; + + if (ctSectPr!=null) { + headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr); + + extractHeaders(text, headerFooterPolicy); + } + + XWPFParagraphDecorator decorator = new XWPFCommentsDecorator( + new XWPFHyperlinkDecorator(paragraph, null, fetchHyperlinks)); + text.append(decorator.getText()).append('\n'); + + if (ctSectPr!=null) { + extractFooters(text, headerFooterPolicy); + } + } catch (IOException e) { + throw new POIXMLException(e); + } catch (XmlException e) { + throw new POIXMLException(e); + } + } // Then our table based text Iterator j = document.getTablesIterator(); while(j.hasNext()) { - text.append(j.next().getText()+"\n"); + text.append(j.next().getText()).append('\n'); } // Finish up with all the footers - // TODO - put them in where they're needed - if(hfPolicy.getFirstPageFooter() != null) { - text.append( hfPolicy.getFirstPageFooter().getText() ); - } - if(hfPolicy.getEvenPageFooter() != null) { - text.append( hfPolicy.getEvenPageFooter().getText() ); - } - if(hfPolicy.getDefaultFooter() != null) { - text.append( hfPolicy.getDefaultFooter().getText() ); - } + extractFooters(text, hfPolicy); return text.toString(); } + + private void extractFooters(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { + if(hfPolicy.getFirstPageFooter() != null) { + text.append( hfPolicy.getFirstPageFooter().getText() ); + } + if(hfPolicy.getEvenPageFooter() != null) { + text.append( hfPolicy.getEvenPageFooter().getText() ); + } + if(hfPolicy.getDefaultFooter() != null) { + text.append( hfPolicy.getDefaultFooter().getText() ); + } + } + + private void extractHeaders(StringBuffer text, XWPFHeaderFooterPolicy hfPolicy) { + if(hfPolicy.getFirstPageHeader() != null) { + text.append( hfPolicy.getFirstPageHeader().getText() ); + } + if(hfPolicy.getEvenPageHeader() != null) { + text.append( hfPolicy.getEvenPageHeader().getText() ); + } + if(hfPolicy.getDefaultHeader() != null) { + text.append( hfPolicy.getDefaultHeader().getText() ); + } + } } diff --git a/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java b/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java index b6bc5f15eb..a0ffb87bf9 100644 --- a/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java +++ b/src/ooxml/java/org/apache/poi/xwpf/model/XWPFHeaderFooterPolicy.java @@ -83,19 +83,26 @@ public class XWPFHeaderFooterPolicy { private XWPFHeader defaultHeader; private XWPFFooter defaultFooter; - + /** + * Figures out the policy for the given document, + * and creates any header and footer objects + * as required. + */ + public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException { + this(doc, doc.getDocument().getBody().getSectPr()); + } + /** * Figures out the policy for the given document, * and creates any header and footer objects * as required. */ - public XWPFHeaderFooterPolicy(XWPFDocument doc) throws IOException, XmlException { + public XWPFHeaderFooterPolicy(XWPFDocument doc, CTSectPr sectPr) throws IOException, XmlException { // Grab what headers and footers have been defined // For now, we don't care about different ranges, as it // doesn't seem that .docx properly supports that // feature of the file format yet this.doc = doc; - CTSectPr sectPr = doc.getDocument().getBody().getSectPr(); for(int i=0; igzFQcXFe~wWxj!qHK{>1(DmfzBB@X(kspmMEJOm1b?(;7?h#Tv54k)Hpz$!C)#uUCR?+Nz?hj{ z>R`Ea>+L^b7+tVxOq4zCU*iG*PfuU~xxX1SUJO>_8E}@$0PPhHXwW+LMwSk=Z+`6m zYsmk>*8AJ1hsQ2Tfbqiz9zm??4SKRJ!V*n=SrN-|A=(4;PuT}?K^)CrXm86SXqH>l z{(inSB0HDnmo~#$#06QYr#72CpQbCiYT;_t7zQP$HmH&6Yl0v;US(!&Ge^r66Q9Es zPG6S0Q5(ol!b4=tSe2(4Fxalt9E~FmqZk!4ihU7Y*4#dD{uV4XrEozF!OY}{uYN$4 zMQ09CixtZY*7#_CBm{L3O3&uY8+}Sk@qYUVbBYUUEK9lx(u^4W53nq2X1TK$LQgtj zkpa{&IWQp;XC6-JVCvdPj$d-!1}Zu63Dv04Jl*dE{c34Vd4R6%-)&_48zccU@JcfT z0DuF4263^mH>CZ;Pz-GhoUDNU?}y|2(`Z0|-Vb>9fA;nve#F}E2m2@7A`RZ@6Ik8K z5u_C>G|)T%ppcLtEDdGx?>$9}N%Dmc`?2$zg54`B#w;6jw6HziS4XH&g!KeNWW&;6 zD;Mn3c(=Rh`UvObo7VfrBj|o%X=z_*<)ez`sd1WKe6JZVtrjuQYiAvX#i1v-?3c@0 zz>>j-CEe#Pk0T7h6_U%$>VTA}JTUICA>B-Li`kYCig2laZ zued$)$u_HhwT8YTFw@{P_U8h&pa7D zNoRaD=b{516{&<>Xc#$V=SxHhC!GAj{``7p*))X6pqeC^O5DLQ7joH3yFq&b3qs|DyBwCX3Ggu38Yk}9sbzw#8wW!Hw5i7_NUr+X6Kiv83-`hSunZkeg+!M5z&@6gNCl-NR>4QJO*58i#24ieHSmD$7Q zbl>@u(n=qV?$Ts{zI!K_jfOor6-Pwg$ReNSgEk}>8Efdw>z-El8pp~k1ZhcI^*$oQ zIqVQE(g@;Fbksp;bK5RzV~pc)tcqoPth|F;Ee%tBJ_B)^R`*KOy{ct;NWZ2v>xN+l zxX`QxqWg_?~boWpZaDe%!W7++_8r#@7TH81pIsBl$ zUd5lWUMuklQp{1gqLQ|}B6ulV9w@#pPLTS%_J}jq?bU}kw${?pjqZXN6BiQ`=fFn3n1v;i>PPKpX+>?+#rjwL`s=APdHU~ zmueiYMnlh4rfq?Es|Ka%8ns=D6d7K7FL$K(9_UxtM7pms#Y2pz!kJFEy!O2{S5T5f z?^c);Jh8ges5@{J=g{;_Vus*GaNyRU1!^e8S2oo=)~i)4z)xo{JiOSWU~ubRGFvEJ z_SP*i zhC5_^`kXuIr>M3|wqye7>5Nr_4KWp{grUzif_qO@&2LW$7uiCXz&h^#kyp~r3%pE$ z>I?z{0FeJwr>T*ip^-h~4`tTH{YKsH`lj)v0zSpcGfuO0_-k0NKHQQn@||7b(iKJt zqbTn}`k5Ry8BA0ydy2)Jk8)Hfhny-0zAP-RXyQoRzSyL16AU>MniVqblW^2WJSg2O zJCLiZm*y+i{3Mutv4VoZxxgU4B!OK>FmFl~5+v*18#{kBp06Y+$mRXj_wH5uP>lwY zfnx|qStw?VgiN_}^GRO*q;wBOT2gUU(lz9!_OuMcCb_6(%6qIjdm7d^Q{QyVYHO)7n}P=<0}xEN`X3g_~*yux%IHsM2);JHF?YnSGRiUtE8RX(&4(!yN7`Ea`5 zvM2PLUM|ipZk@GYRg|Ro<7q|}BAB;u#yH_R4?3xhzTel^B1C6C^RK<3#2}7?hX9_| z%%xuBvOGM)=_}{95F@{D9L#sEnYKSB?;@xt&lUOx`Twq!d$ktgAJk_Eq`tp=u#vUl zFXmHP1g>)7Jw-S0OIvMr3hm0~q_aaznAX4qQfq=qB4ZtmyrZUBgf*F@TaTxP2QI6l z4xF33K2t8jZILF&jux^H`LD}~pQ*=g72ORn4|(Vo>cfV=9^!Yc(ay-CSrkYf{=s~( zU&FZHx%^~4mp_<~;|KFGEmO^owlO<*^J}1e3sYji+2)#{9pVEcHA|;}3=K(K@(JST zjR)OTx2JDa2;?lV&T0Z#G1U`z8^e)P1x4pYUM_2STWR^)E=lZiKxn^S!mgaVKz&m+ z>~6Bu<}c|3xLVWo5)}>b)3GsqyZT@d+`1P(m~RKje0|G(e=wiw6p;D6 zl7BE?@bNj2`HBtNF(Rz;kaeX20&gW3&>Ccq*cH;+;Hohuxq~XdvfANDMHlW^AJNm3 zHhv|q*Dcqas7{}hgFdKX_F~rf^}+&Kwo1h|;p96T*l=8{6cXzK1Vv=ax`f`I zB5r;0eRmUu(gMQF4!f=gm{r3sxX)~ifWmRwcKACf7aKOa`% z0Tj+QxZFtksFL%lEXA#=%FFOTGqF>^F)`#h$fXUcjIC?8jgluYn*DdJWVd4U*FypT ze%Js2>Yx6|(bUMwi1z3Hr)u`qCakfAF*=A(5cAp^t6y&xO%IUj&lzaMzdHOn;C+V6p0EWbH+Z^kLgGx zy*zjM9wGJOoJ?TkK;mHP*1rRUjycStUVTi(t$PAcb>z3HlEgJ3;a;*d==z=Sa*z7R zxJ145<2BAx0A_|ogh-ycq*JvN!k}0A=vtCQSVFxUlno!6PDa;TEQ_)4xqhq!VLItu zjkL39RBu+^3y6Hpz^p?wZOO3D;)6`LLg`Kdv@1!hE20?okSx{(gG9}mInZ{yd^82~ zAv09{gi#_d068BdWV2CB{(27dgrTd6)p;n1QBs{Hwmmzf)cuGYBh+~KXfy6ehsVty z@d_XR@X5pV>2durDltZ|-sdZU?8U~S?4yS((!9aT z)rkt79R&hVo}T_{Sh%85kapazl6;&LpxT%#b|{Db`o*R`-^%p90a)2h5Z=JH*(#3= zH#KpF`{}RiCl6Q|v}%yFgcy*wm+OIgGD}Jh92<+_FgIdEq9Te(p-dfJBN9T)Ef>z5 z?2%nSBza^{T7_aIqc!JCy34Rghp?D}+hL-9fj@EfsC9#5W}~Wkm$Bb0Rt!4|8-Mij z`^OVlJW3qY>iv!2OTP1K+m`c zB6JFrWr__q1cyw~in9#k;Jke3#T>ijfp&5i0%7`vjZ*0Lp=H=l6Ub0F_}GUU@oein zP1sBMK6z72{O4&olhyun2aapO{obD=F$26T9oFs_=$SgVba`(|L%(}(G7Oo~iRSv&z8B#GUC!vh)+8g&& zaRS9TA0e0|+hfL&fL3O_Kxd<<2xZUgst11k`k^~#H-1`^tw0#;+B@*?RjDxQlkhTB zmg@?`D5GAedPe0lB?So)=7ksRQ)FaCx5gfwN?R;5!E9IE}cfVkL0!+ZYjbHqWkk7WU$qm^u*EO{Wl%w{!w zr$$novjr$EGV%F#d*uu852DqP#(9#{Q5Cr-84W3jkmcF$4)>NW6Sw#~IU#AzxVHoe zh&fG5PqFCQik?k)%1{-fZxfR1*vKHI?6);QKJE++pfO@>Ug|MU8a`$ft5%h%zZ1+n zXmb-h58Fvr9x4U(-)!(zSHG+*?#66XU#vE!&a5;Z=0`E13G5r3untIZG%Ih|{ zOoDIAalTXCA!&>199xzjGD64}FL4rgy0!I|tN}e>KG4_w6-zo8IXaqIoBUiwrz=~@ zF47@C@|Zml4qvgDnfVxjGx6;US4pFIy$5={fP-4gqJp~%&H?%yOF`2tecp-XuPtXc z_V$;rg=_Ts?d6O((!QH@DUemg6y<(E>+N7&Uf!VRni@J!Ck^xJ=iF z+BG*AC-M~gFI*Eyj6L9X7*$qH4f4{Dl~{PQA~9nY;+FZr&Wmb~shiPaSlg;X*KpnFVNRc<@ayD;bR9QT9Cg+JOs^W!SFaX1*C%xcwy1vyKHxw zr62~i^O>wLVn_HtUJcy-3k-RNON-4buo`AdA@=Iom8!JHnrj zNx>hBnn)sKJYr>dx@@|yQpKSOOb=8vTE`wIM^kyLIv^CNuY;_HDHDsm8jDd%O?hTJ zuKbNEP}YyzL`4gAXov(^#2CHRbl5cvyi4L?Wv`}wp%Y#_m%hHl77!wRoc7h9CJr>$=qYj zzf8%ws5*w!3sLFO??ohXrqd4#rV+8ktlkX971^IOI6%zGUE2l0ch-E@3PPwmvYiB- zxPveE+Se+?*tPQULSIAh6IhQ!!%(4a)b%*#CZb8xccs_?hK#L|$4E#Ee&@+1ZjH)s z5f^x=avRVx5s#&p>32bQL}NG!$@PsE+by*ym!TulSoqgpYTb16No5C+>Pj$c{Q07jzP03a|5MCoe!W zMa|%=4N0t4F3*`Qla}A1WL=*eA?cV|@yIhVARO0Ntt`fgsY^eS;edrErQMhwv&<VsjCn_SG#CWR=Z>gv6nx z!PCjNtWiB%aun8E-A#y!S`iJgSY2@Hc8?Svrhj4Zi+(7tt3Ep{y0D$W(PMp-%RkzF zwtPVOkGeTBu!Qhi3eDP4Nl)L>=#RC>Cz)Ra~JJYEUq(k$AaQD_~hhzo$ zpM83Q7iXnmyZc8yLh66IumkFG8OY-=fTbW{I$qA+#@2z>z{dWk8i7UA|JHwiLl+(^ zFAJ>y46I7uBjz8;Vf0&qtL7TZzM;1h`Kn=Mqk$V=c+h`+iC0zq!t=P?<*GyH=+TOU zH!;8kU|>|h@DhUXgzrQ`tbS;4(R^Jt!~|E-7d62*oJn_xtD{Z4NygP^txpLnJWiLJ zgd)v0Bc|MeBWLE6CVmvMSWZ>*ZJ{BeU~Nw73GN~w!+)Kdri;D{Cy!?GGgwS|v@eoa z8>=uBa=HKr>1z`+6ruB6CtZm<6T!Dl3W-z9SS}n9?qg`Jw1~TT34T=-d)bjK^tO+| zUesASEy|uH>XmHKT&P@jQ>f8J=k~pE%if+~E}rk8t2zJjDM2nXQ8fi@I z-QIY@)%A&4JpubI{aH^5Z;U%!+#X1yqaJa7bYiwJ&HLNk=8Pb{Ikl?M}z zG*I*DEL2@h=yipJf?c{|TyD|IyPc9G9)5WO;Zy=Q$ejodPDuq6icm(fX;5zP(8O=| zCN9R_f{UDuhpnma<|HxwD^Bm6_tLZ$W83pYD!e~6PE6n}eKgb1Be{<`j;Fo5On^V& zZ0va4o!ivI&~js_>wJ%o2kGhlQn79sK0;HMSR*oEM6EQV@}&<`q0n5QU7E4g^xv{bc=mD2kpVO0)jgzL0i{uc+kV z40DmoF&0OreaFo69u%$Z%hu$?F7;3r2ARjSvDLa>U(dPr#8(p^BhGyAuzZ)>X?(H$ zh^;K4ic&m`3pq8?xI6^k8h85yfHCUhh(iPX-mLLzYB?ED&3nOkvx~Fy~1lz;^CzLBNVTZy$W}&B$jN&sXjj()*uT zeTFn_Xx?440bJ)If`=lcRHpo}ChBSYiQ3Yxb-6vU^T1)lJ z+Qx5FmnYn`f=H$kX@no;^;53Ba5l$Yxw{qFYSuUzk>h$qq5jSybJC*}Jhz!A_QyPl z63;k~0QU8;q5hZ78!)0W@vN~vPGmYM!3>$+r5}9UMv5IPj8*Yl(BlTNq^iEDggQy0 zwRw<+KMYt0TU0qQI!P8P7^S21Sc_{XVtW&b9(trwBdlBHTeyxxJIH`)Ewu}|a|rb< zE3?Dl0)|nfn$~1N4qn}}_66AJ3?y%KPzZmnlRaKVsjyfWxSz+ElVpC=? z8(%rbV>0N!A9%XH4P%0`@kL8Rpiry@UamfghBf-*A(r#dHcJ;eI4j_&&hJBy$%?D; z;=s&rV()@`pS3)z7T9pe=C;z^to8zF#HrNFFN^SJPD;n7l7ZnPV*VLTdNmF5jZBJ#lW}s-|cBp z&u4diRX1-bmXiA8JtKmYUc&)jBMsp;ohBWf6kpaVfdu3to!`FHhkE`E%u8ju%(W7n z5dcSu;l)mLs_zgyW+>?nf5t|&0xk7I*P)EKZ*;*@&K4!tlxIMZBSQpq#!8}$Z>@<- zc~Bup;W19(%QrW2$kyJ2St%@@yTBpF%v!`Z*&HO3!FTB=xMJ?~tO08~s9a2vhU@h9uO@PJCKCfPmDm$Ai@IC# za!jzs)_9{l+Eym#g1k8y331zA_ET$iP;noHK%X~@nyYw~kq^W)EiCkLBr`HA$>$et zPC&Uz;c$O31gNm=h2NBt%eQ}f^x%jGId`+txV#KBVlC2=_Mg^*$TZkcc-y*ES&vY= zK$PK2-YktUg@ua=yZynv5PNenjE!d!`I-oKp-dw7!i9#ZrG5D_>MBazu5_S0IJ$8@ zA0{g#n#af7c~L6B)4g&r%r348DzD@-t_}P4K6x5jHTriA9Qub1$bq(VNi#o#Ar=ik zA4Vj?O~_Og2!aDt759^ojpfszP1!Knu&c0+^1h(Htv=D~E3+Ht6R#WRo39&r_1CBs zkJZ;z>|`y*t~9SRO0(mI-)HdN1z<*?rbp&)$u8BHarlyRx?L;@%7EE@G7&J?+@rmT*~smZ8vRZfUXe;*S~cP=f5+9fIF8C}NT;lT%_L55 zhAum0ruU?`u=J!~$Gv7HF?OPy0e%Ki{FNUrE2GalWKJ|-W6ErUs9XOzVrQN?{RlC# z0SGN&3N;n*?}!X(LC9I$009~F8v{obstwJ}j9H3rlX53j;@KE7e#zD#GDZQn+-mAb zw~Ek>bG2dY!PUBN#p}iUtOaF|Kh)WO{PxKxPg4Y$7o1uUKS5{YElI;I0^~vCJ<6Uf zbMKkwE-QP=X`e!4s6=gs&Y6s>svViB>-+KrBoNknydi|Q_A2edwaaIX^d#3@UQ+D5 z>B_AvF;?team!ILI=}({Ben(sr2!@>{<#bOr(S;B{X2Fr5Fb z2lqMf@2!Zx122GC#s6Jn;&VyQyUTvbvIO=g{9CWtbNKU~pkMF{xPQU_)*a%KVSY2^v~9b=ji7}{a@%~U_Z=H^zSA9=kVuMLhyN^GK1V;VcK$*u0kdE~(a&q2&qX{>B>xh@1uUWd6!Cjn`8oV~;_Vk)67OH| zXKA?S5}s$Xeo0`%|Ca<~;QxjEmD_rb|9ckY7aDj(2<)|Zo>O@af4;c=C7_u2&+q?h psrwxKcSrLJ3IM24|Mu*Exg9x42%ug4a7oC3TcBxz)BgDNzW_E=n%MvV literal 0 HcmV?d00001