diff --git a/CHANGES.txt b/CHANGES.txt
index d6d3ed7476e..bc91b25ffa4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -61,6 +61,13 @@ API Changes
7. LUCENE-1234: Make BoostingSpanScorer protected. (Andi Vajda via Grant Ingersoll)
+ 8. LUCENE-510: The index now stores strings as true UTF-8 bytes
+ (previously it was Java's modified UTF-8). If any text, either
+ stored fields or a token, has illegal UTF-16 surrogate characters,
+ these characters are now silently replaced with the Unicode
+ replacement character U+FFFD. This is a change to the index file
+ format. (Marvin Humphrey via Mike McCandless)
+
Bug fixes
1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimize a single
diff --git a/LICENSE.txt b/LICENSE.txt
index d6456956733..59dbf938fcc 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -200,3 +200,32 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
+
+
+
+Some code in src/java/org/apache/lucene/util/UnicodeUtil.java was
+derived from unicode conversion examples available at
+http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright
+from those sources:
+
+/*
+ * Copyright 2001-2004 Unicode, Inc.
+ *
+ * Disclaimer
+ *
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ *
+ * Limitations on Rights to Redistribute This Code
+ *
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
diff --git a/NOTICE.txt b/NOTICE.txt
index 3331b4400f5..92fd3447bc7 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -9,4 +9,3 @@ The snowball stemmers in
were developed by Martin Porter and Richard Boulton.
The full snowball package is available from
http://snowball.tartarus.org/
-
diff --git a/docs/fileformats.html b/docs/fileformats.html
index 40152affad2..5f860df10a3 100644
--- a/docs/fileformats.html
+++ b/docs/fileformats.html
@@ -1237,16 +1237,14 @@ document.write("Last Published: " + document.lastModified);
Lucene writes unicode
- character sequences using Java's
- "modified
- UTF-8 encoding"
- .
+ character sequences as UTF-8 encoded bytes.
-
- Lucene writes strings as a VInt representing the length, followed by
- the character data.
+ Lucene writes strings as UTF-8 encoded bytes.
+ First the length, in bytes, is written as a VInt,
+ followed by the bytes.
String --> VInt, Chars
@@ -1254,13 +1252,13 @@ document.write("Last Published: " + document.lastModified);
-
+
There
diff --git a/docs/fileformats.pdf b/docs/fileformats.pdf
index 2dd3e7f0f62..63f066cfeb6 100644
--- a/docs/fileformats.pdf
+++ b/docs/fileformats.pdf
@@ -5,10 +5,10 @@
/Producer (FOP 0.20.5) >>
endobj
5 0 obj
-<< /Length 1113 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1115 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gb!$G9lo#B&;KZO$6@53W]k9ICdOP`P=a5[dnAEt!C8gORi4Y_IpYbOI4uP7>VL*sJQDKN]6[Q8S%SK06ig^-JUHXo\)4)`fne(e3=[b6f>EEC"UTpJnTI4b:+&Q\[CnNTGc/7;_)qPA_)lrGchW__JWg47o`BO[p&Um!+.u0W#O_5XQks>]'NNfml7k4h>AP)7<_:=9$tb55Sr>k,OS]7BE[U-Ab\Y@C53O7U[j+kjGtTb7cGJWt4]4q%1?L1!CQQ<5`TI,I2_)adekIJ>*t/^>pAl3uDLFdf5&^rP`F@@)9W(IcTW(NY#\]*sIM'Z\]8oJGjSbj1prR?4Z*aJdu7J43Z2RImnNO,g&5I3M5VH2':-I_Sk%/*h!,Ube%='Nl=)%igfBIK6kB('./d.ond,XEb"Gj0GB>!mi6:P'nJ.nk=omFh!NY##@\@,j[:b1"cq>'#cGHH=j_*[ELH%0iiFuF6Ypa8)d6R)6hg!:TBoHp'bhG-KhP`1"^1W>96'NidM=<:"39-:ZdRbNHV-aWUlu;WS;@ccG>Q&E%qrkRV5YNNK?0HTYmqU0t*ir#5_'Mql>(l\qQ((N0FFA,D72uTGCqlqqeq^]kh-tK]%BZrG5]kQueW@*6=,bdmL:Ahs+\@db%=c0>at7&VLcYb2'f+E?G+`RQ%F6g?W_$D)>,7:$@rrQZf%*]lD&$,O1he6&Y^a7t[t/~>
+Gb!$G9lo#B&;KZO$6@53W]k9ICdOP`P=a5[dnAEt!C8gORi4Y_IpYbOI4uP7>VL*sJQDKN]6[Q8S%SK06ig^-JUHXo\)4)`fne(e3=[b6f>EEC"UTpJnTI4b:+&Q\[CnNTGc/7;_)qPA_)lrGchW__JWg47o`BO[p&Um!+.u0W#O_5XQks>]'NNfml7k4h>AP)7<_:=9$tb55Sr>k,OS]7BE[U-Ab\Y@C53O7U[j+kjGtTb7cGJWt4]4q%1?L1!CQQ<5`TI,I2_)adekIJ>*t/^>pAl3uDLFdf5&^rP`F@@)9W(IcTW(NY#\]*sIM'Z\]8oJGjSbj1prR?4Z*aJdu7J43Z2RImnNO,g&5I3M5VH2':-I_Sk%/*h!,Ube%='Nl=)%igfBIK6kB('./d.ond,XEb"Gj0GB>!mi6:P'nJ.nk=omFh!NY##@\@,j[:b1"cq>'#cGHH=j_*[ELH%0iiFuF6Ypa8)d6R)6hg!:TBoHp'bhG-KhP`1"^1W>96'NidM=<:"39-:ZdRbNHV-aWUlu;WS;@ccG>Q&E%qrkRV5YNNK?0HTYmqU0t*irpd"?qn,:d"C#-YFE^n1Pdh+p.E)6ormsXZ\W*VA:%RP,"e:+j.Cs4o`&ee8aUZ1jYiR+jsG2oQ:j
endstream
endobj
6 0 obj
@@ -424,10 +424,10 @@ endobj
>>
endobj
77 0 obj
-<< /Length 2395 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 2356 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gatn*9lo&I&;KZM'qANtbesnj&0uMb9TuG2an3qO>c\gQ^Dc_j(=,X6i'&C^Fmh\D*-W-1@nhoOrq]!hd\4Ep(.AJPS58J1N(Sl`'e7$Y!3L?q&&G^W>F*Z0K3f-9L^Z76sH_RTcPLcIAX]_goB]1_D$#NrQXd(-RGf#S%96mBDJ$iK+?*&n\6_YcA02%3_bG3U9o_6PTlht>h.f))-II&IO'@nT=9/F;tWD-L)N2/a;1:-\-.q^_QdRIcjZ)jDSl'2[bX.>Sa^7Qak@>'Edc1h@<3+:dCh7[eW\[-D'i\47=C?8C)W=N4J3[kePV),j$WD6R.S:se1FqY1--hSeQp]*if([,3mL0D-9Ft#KQJ3\=ui=9*Q"`b6clYeM5.]@1AHs&'q3,OQAqGY6@V"!$Ef2Gj&n*O`*iPRZSn\l"!4BOa]Ye4mJtESS>i,'h6M`*GHX.PSI5LbCl8C1h)V3hIAHId4dS*IOH.NB_eW:dM?7$@B%H[=DG-ZhT2]>p$V!1W86CfX(L02mXGn[WnN\81Y'"$4NlrW:W?7=AgENG?efL""=U=8b[;nYQHV'QbBBSi_QUq^L_2$XDcnkp1Cs9!J-f'qPKC8M.%o1=t<,,V;JNRUru91F*YfGlsLhM_?\_pi%t\CE6PEN9DB>+$t2!>0\8/eYUnRtoI5I`5Tt$JF^64#b9HG7N_"[3G-=%o3%[HdI].DPLc4rNS=%+qTt(86>R6.TCP,P*i5.4T[f>CA&gJRbbOKS3.fo`pD2\f=>X.YXN[`QL5U!qGF^;84b9EIU+1#k>,-6.B*jVAk%,4Toj[]RbgJUpg#JbA&"Fm,\fLLX%k;ns-!8A$iMh%X8qKX4,pRKCckD2%i2$I$2e%Pf*HmHuE:'#tTa3<8iZnEOu+(2E[AEp(1XR1@7H2Pp79;p9Di1p1:Fq@$`;PciQPsgA!K3Y]e6lmJU_el/>sLP(ZL@;/>8a!_j7E@JpDI*=W/'_FO!ppbZ6[G0I&TD]g<9O;OXHO4m_kC:I[M3)h$3JW(=`2%X")+18Fae-uQ3:I$e3)[O8LX!8+J#'/+JWt;][J]T:5%UVr@/B[5(m1OY3apVm;o&WjF[:rO#hc#V10*H!@i!$ZkV59GM92^QB"%(.cFVE;4PZOr?CMV?HG?YUb2.$M$eJ-Qb]\D=9qJb)Qs7OYgF[%W/lK3PDl7o=]#c&F1!BK>S!0W`??IL0`50HG!O_^;5?f1QqQo<\gkeUIfAm[b)&-Q8WoL:PuanHYa"I65LmBpe%2g0/F1?/9X/EiZ#uILe`rg>pX)$fp"+RA[he!g!+N7O-i~>
+Gatn*968iG'SZ;X'jN(A0giTEE7nd%9KHIdNLESrKd'C&Ae`C]jLG$["UW6M*s)qj3YU8D0;3_'mWV(VK^Vs"T3$qh$^0;G^E45&Q=`^[kNXi>mItgVb-]C`UoBM1Mr,?>JerCC;X+Q).U8puKFmhPNY^oS6$/F89RWjT7e&l:mj=f@fg`Q/BHaLYP2onOe=D_L-^RMX*2C\$)Xcg/JIofDmr=oHcML1[Mc&sd-WJQJ>NEZ*h5;DPlX6n0!1B\$5Fb-B_\Vt4\(C=bPN^AKrdMSk7+l^RDJ??+uIjkX_4;@7@.>f:'H*;%COC3-Jc@^hB9p%e^1'*JT^n>r%JQi%ftISlM5q%@3",[DVFl8msnGQ@^V"icnL`"u04EiU)URGJFRq+ZV&qisr:Ed_#/cUcYgml2J'qU^Kc0Q>LV,UUrofE/8U5ZBt83*n#$G6?pa)9iHSc`pDDFdTn:1k`^m_Bu*>1'oAtgj+?SE]Y-$g3aO4ff<)#fdW&EcGO*;F]Ll#kE!A-h-OX#*0E//tp2LIl]c0VEdLdj#QXn@H"sK+"cQnFjj)kn/V9Ghnh"+WZk;HFJ?TF=.hJh7rZFF=L`j#GZDco\+4cIF?ZJetp(/2"e)5$PYG4"eR[3J.1oS;Vs)HR%Cm+*(40XuSKgDs4oj+TgMh3toe*LB3@[slQ4)iLpA=F8,8*]%V"UNIb@1i23833:\/]t'\pPO:@6$ShlfOglruj0Xr*T-hjjhO8;P2e2&]k1r<1Uj`Y@WUXQ_SG>S2e1>AH'm!:)#G4YgFQ,];S`1bk_K?EiP=/7KLO+l%uo.s#\_r4EbF*m^L<:2V`7!N#b>2c1FU&m0tVVuXhCW&E8VP+#fPAt2&^(7r<1UmqQ8RR`&@`(g:RI>(s3g3_iqk$8`$&Y6=-K".O=Y"$Ldp\u12)Qd#jWFcn/)G67>TZ#obk8Im?h0@C`k!8O;OZ^LtYuh)Rs2R"OnR65fS]V$j9WW7.%&oNIenA8Z5.MKQSY&7DhBf<"#<8FMR^&(\,t[G';/D]ebJH'[S1IsDgnA01u$Cl[EIgWDO_3IZBmWmb[3XR4-mL]*pPLSsbCE,_BrWqQ,!4fih=rFTEecGOD[U]#XGD3!ANa.'*6PSm-JYFL^JX2mo$,gCaXrJ,lM8G"^'PcJklB.?oZAV)o]qK?!eKMBS*ZEt`RbG';2[5ksN=?!-.cDW@k1bErlGst>O%rkq5L9+(:+Stb[IG.Z.,O'9U5J,\><:=%ZS:+6`XZ,FCEOMS;p/KQ]pTh3urL^.PSTM<:DAO9T#'Ti-pOT]sDKWd>JeVe=7=!RCgt@N[kGE$t.pG:HP4FcC?BXiC,?mHe6:p8KD8iJI@A_,4j2"Zak6L1(%on%-IX#?um-DJSma"Z,N6bKeS`/:OqtX`MIUfbOlbRkY]3:9OnSgHOpPZPEENb)Q/nlMpr^3pu[HE43h>QSX^!S>E79UHe9cX+=:>PT'51\KRhqjctYp4VTfR-tsHcVZQ!L8DN5;%V'OY!?'kF#j9B6AtP$^:."lhg[>'i)Ru:GSi2(0i-YH"J6g>37#k>ZG^@It)kDdHp
endstream
endobj
78 0 obj
@@ -439,10 +439,10 @@ endobj
>>
endobj
79 0 obj
-<< /Length 1783 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1821 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gb!;dD/\/e&H;*)Tl3SSQk^oEP4oT^:$XE9k%(m)023&*12RDa7$j).Z-4RH;`7PE;:q6a&4cE[0*'Z<2oQ4Z,tj;4]h@SCqgBfX.pW^'3I#,S%^%#'4RM^4CnseD&q90mJocd14$0/bFe@h1j`>/bJdPSY@L.8?,X`a5Sg*.m9,N(iI-_[!%75(VZ/[Q]b0,?.6IL1N;eeHtFTq=;5tl;oJT5XE%0;V+H%fmk"0(8Gs#b&<'/)%-oHbM+;EiF!1'\@!;8FR?8'UJY1<3fAq1kT06JC`"7_RH?q]RN6mO`.&T.a?=N_M^`]o$6*)*2QK)-Z4C@>\$/BX8f+`u$_Z-:=01":!qHUSb6ZBH*((B'mgF$"_?=VqOa;8'MI`cPTa4kf\i&BE$]'LuE28C-c8T$QDGTXV*L>_lkO<[d$`!c*i(AX@Ms9;*2Wr1pHY#bC?9LB5-OUeCeE*d4HFe.VJe"(i="neg2PK]K^u"$).h=/?-uqSb5gVYb:]M#2V_Al(p\]&sp1-k313'4LMa0#M4%I9!Zt5(*;"um\/#Xa$;"Kc-$3[]Ed=#1+"k"J9BS?BuB?HGj[*W/[F!Q$J6*@ZUVbc`l`sZ4qPMO56#N1Mj[O@P[]BKpWk^nRC!Q*Yli\qJ^i+]J.iUo18@8b8-/4rpGSYq/>;-_;hc)PH/OE$?KrqN1OrRT4^2YHD6OkEAq(Y[-gJ),8.9TH4G6]
+Gb!;dD/\/e&H;*)Tl3SSQkf:s]``#t]$$C"=`*e(;ZsI$gHm46V<#;UfC'tXk0^%E#:iUZ5nY20*UfSHmWd`\l/:B>n9%!u-2bJ!`&=m&+:-o/kL\$ILD%XjgmS]O*kZd_l;o4fU(0jj(B_9pabCmPD-uoU'%id5c)jbhP=h^kneLV"33mhjQ&0.&S7_9N&)H]HM02b8u4`tKA$R)E_h+o2$A-1-/+j[/143'4M8DN$7PJ+2#LHYW*"+`MiBFV)d,`YTmkUe"#Y0R)P%3P1@%01oQ-#Cb"\=jM-5U5-*ZorQ?(>]NO:,qlc=H?Q*\&Iet;=bfK>/OF-jgn$X03)VB73L;D987TbFnkp!Iob1LjisFJ)>k@q$8N^>fa^gj6_1WPWW/N@`1$"@G!-=b%MU0U&$]r/#/7_pmJ(%nZ^Qp)&-!D`5aBXkjBiVu@U2=i(4kW)$j[sD?p=n;G#l'D#dc@CbZpQsg3$:tO#Jb_6WoXpqY\7e"Ncn6:d3NAHl@8paLrcNHjZ&`2XMdu7=INZqbf$CD_6i9=#"1s+Koec$Y6C/uD)QL,MCNs!jG;QNF&9TW`!#E)oPp/^sj3bg/2B5u6Wg.+ohQ,u`Q>+X"1T&.EMVEc`l`SU?$Qc:Ye_1):mWhkfA1$->!HL!n:Hr(!9g1GX9P9?dT#\$R^&.jOqn?7D9L'fqhI!I]KQ^N[bH/*!VFGEj\J`TaqSiBm9Nc)]?%(Q5XGrXP]dVbO*7c[6kT'mCa@,TmhD?[chVJke>_6R-R5E'mh25>H;\p;Vs,oDUVd-I4%]4-hMe.1JE4h`.;h-,H4%7lAG7u9UDdaeM(a,292UeEbmg6;L9Rr'bGVL4;!M:TtN>:Nf<76C0E?D?HY)M1jbQ[js?AgI#`)3rrnG5VsX~>
endstream
endobj
80 0 obj
@@ -454,10 +454,10 @@ endobj
>>
endobj
81 0 obj
-<< /Length 2177 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 2155 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-GatU5;01_T&:WeD0_]TZ-h5m02UHfnOcZB`OZ(Q]B1dpk[L@&59e?Gj^V7"YdB5&5PKlm?,\0KYhsPs*P.jXoGE^Um/Yg#;J+p(ZY3BfDEVTR"S[GZgNhJK#KXU'G?X)F5rqssdDAsoERJ8?sT;D./7f+mg2Dn"6g65P6c\l5%X@RZj^,GSJ/cmBRf7mrPn9aM[p=%$X&J]^]Z`_(N*RUHbN9Oe?U\$;3A#9^-?bYDkB&gqZrR9S=`-\iLj3uiV;SD7#mE&-=QQ.TNI*@VSa=aFb]a'[h;/aG67O]A)$+o3aJJ4MRC#;?`#YFefqCNb4Rq+;P)c[_La`[ZeTd=V-o'i]S"t8'RZB&_0/-*d8r"4ME.SRR_'imJcZqSH_?qm"jmBfR>A:A6^j`H06T8lBM>G*hZ-6%KKb"uJNbRe!C;9A;%3Y*D4G=7&n@J/5Bp0.7)H0Z^c>4hS1MG]kn6m`)m@gIUSf:1$l$jT<3:/EcPUJt5L-&TZi]Fg^bL-Z&O;YQeUN9j\>=,KgA2PRAUs"naHFfF0&Z_AHp-7Fa'sZZOUM/X7WHZrd;a@4L+[6&LQ),KWKL0FYdC8CUOHh;ba;ZLcdc0Kd8-(U"VD")lIW]"HFd1W>aV&[GhghTuEFfrF7%B,K3pobL9B)L'0]p&<+&%#4>BWATqOG"uSc*MC0ep!0%crXq16Ib6duAVIT<'iBZGfiL.XA&fKhq[5#*/ab_'T@BeZbr3mmsh`mm'ZNTnD78=*3W$qG7DIH=\nA%!.!X6J_I3RUO/:n^2^oS"9Nc`A!<[6+_9A1b?-r^`U6a⋙H:JP(Vp+WU%,NJ84X;WrKRJFY-\PP35/%<\Vl!;YHY1S<,]ap:j%R:%he*WEI`m:^m9!VR@#Fq\:*b)_7WjZ[)GR+_@_9W<.#S6_UW,@q,!TI9$k$@KC?aMJ^f"ALDIBH'.jCD^^=^lh>4>Rm33K$u&iFN1&4=D@MOil&:j=e=%`eG@BXt'VLa,D&VQ0/9MWK:8;9T*.2c?I%5Yj%^d!SWcIl@"Xmf2.O$XPmMqH%XHFK6m&:Wk;b&8FoeB'5a@,Ws0rVmUI5i;MSdat^s+K9&nLW5_3k,Lf=I+p+%Nk=ShmVC-gR%tHbp^L;8>`Pj[=RY.%CSAarA5tQ!\eCnVVcKQO/)DdcO?E3,t?;+4E:h@mD[ZSAP"nbUM,ZFeX^-RM42(A&F#&.,7+nqp'4?Hso`(i$NBU,JsBouin4E`!\5uqCbUGIk'r*!1`Vh:=%`roc$k!Kk[17I[ZD:j2Z`GAR(f0jP`#43fJ_TGI`SLmZ?Pqo+u\`tYDrN``*8`9'267H40;LP`*Vb/%Q>IC@'e6[X'.]rPYBXW8VQ+`n9A\1E%1X,hafpak&YO`M()8>!N9_d7`r95-m*m<`Nj"D^TM]Zuf/Aiufs34kj%q+^/X_S`6^Ue%2g!d-355cBtRJW<$lP:]E65FgP5#BnQ^>[qE1R;Xgn/e_#4LfCkee~>
+Gatm=D/\/e&H;*)+nSEk+]e8RGm+DJ:>BGsSLsPi!V9^G&nPjbV,W]>359p,/;JE^C!4=h1_;7$I.$C4j71ToF$&W6n!Mg\s8Um=)nM_gT#^jlDXlM)D--J=N,%t0Dk0[:J,Wm?C8Su-9VRPEc8,/Z;qoW?/_r3&D6hal]!u7&FT%SEP&M8AkgcN\[E9dK^(<2_PJrE>dVN39GEW6k4O#:G/DbT_5Dd3^P5CU"hqI/S)@YBMSHFfohh:'";TupkVKaHGpGmjSjfoQBIB8f`ec6i9L]hcLGcEt[Q#l\F5ag!0J'2P/\g*!N<>N2>9,#n0lo%MBnG?2#&7_3`om0B:s)]a^!sbOE$UXnr>cJ'a.nBd.-mguWR\__U&bm;`QJc'MhM3!nh*Qj7'F.@_/)8[>!S&+^g'/e!#=J1WSI*7>SHdG$>a2MKSH>Z))#j+sf$TFd-8'k&Y0(rVZ:id&B5RZSQqS++D4e-MdB.*rgc)\mbCUKN0!!C=o"T`?W6MY*L5c_*XucRL%[.$,sZ'gYe<7u3KcN5plHnZtPJA@6.Q\5\\l:DR>kHjQc(-n(;,j!o9a"=fsCD@N?gU6]"n*5Akd`b%Ah%L0n!F]Jc-a[r!fIE/>fCUfNG]]6UJ`,Fl)1CQJop<(XLFO81D*XN==10@D8E.dEuGG1iU.%Hs5d#K!rJ\XR7b.pPIdfZ+:@8rnW'hQlaC4i%LB]u0ieM&;,`Eg%eKCssb#@Gp;hcc!*=AIJ*:d[.8cS>GV$#4Q*d?30"9JlWFk@t&J\k[ERC<48&Sj%]o?YBY&Z)/dTm;1FlM5D9=CfDOG?HQR2#4+%tnFDI>'Kh-k(r`u4cUjFbSO2!u29-pNGR&U(9pD,8H4/>C>nS.K&0fFq6%q^<-`sH0Z.s;1W3bZpDJGNQt$E4/`E3_TAR=Gn=i%%0fk-'B=R?>b0p%QsOUhHn!NpeTV1pctC'M5%a+P[[pfSP\XIe(+QF_!e$bAbBqf80o\:$F(?N+ld87og;d/bYEj)5id4`DSugq<)J]jT^k&_60]B*@lQO-)!.JZdaaMM+9L1FhDVBLQ0q(bhCu`O?=PeT*;,/B]rDn'"S/NuW-Xs^(2COL&0VbLLTL=7EX;MebC>I=7Rt.?,aD(fMD-R\#%.M4L/Zf2g;Uob-9pUTL4+\;a2GiafWZbOr"nS/e6C7h;\fkB?Ot]mqqUEQ_O=D>B\P\qS0`%R;9&2X=a%6t!&oi=uE"l'#;=0BPWlhNXKQtL_5Zpo%q8^PkRkuL\]K[q`(/L7L:SIt&PVt;'K':$`@Y1>r9,S>Sb$PEZdL76V/^3k:83"MsJs-qMh*'F/p6`6atk1IPs7Bc^IbUI7O$Y($&`D!cQWp!h70\`T=8j>
endstream
endobj
82 0 obj
@@ -469,10 +469,10 @@ endobj
>>
endobj
83 0 obj
-<< /Length 1887 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1945 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gatm<=`<%a&:W67+T1:#d>#9onc4OK3U._Ui4?>C(9%,7,+&"t&ukA^m=4":.>fq2dS\^XE6MVU1%WXK(Z2>15HP].(>QHUhiE]JAZc[L!W;E0p3Gm-&X+tn9'r\r^Fn@`2Rk;q,M."j^HUiJ?g5Rmj-9;`r6*t]3eo5]5@21b=aK:d2cl]j4a=BW(2bLf$id`cDtQ7/F5a4/c$C5]7G?i8FS$Ws:dopK3k&%6#EQC\7>:,S/q^<9&%"(SsmS]6o"/nI6g@p!Ylo9rSfYq`IWUd;uIR5Y,O+U5.VLdaj/QK_<1of/5%`V)MH&'O-tZe,Jq1n+r+:nN8[WjI]e`Wr=nre-mi!YAQV'HDnq69>rtHu*ipV*USoV9TM#>aK&`D>+*F=bB`/W>\XSLYi)o(N=OOf^+j?EQ9H8+JSbFhH8bTtGq?P=<,"s*(St.'+D0T*H)\9lSnq#WUtL>Fe_Ma/`ertU3ChKDH.;m#=D!9"U<'Ts76W$k$p:llSlNO9Oh.7R$r_6((K"RsFU#m0>jOi1-)dP$*"IRbfdbPe"e([Ha=ZJff10mK(.\V%Zq.a@'so77B\M3A3(`c]G8&U,Fu3!9JE!9pO(<.]CLh^b*:1SB.a7Wt`fXFqc4kB0[Unk%mT$&!X,;r]U5&pVQX[\,2FD@&"YLIcVfL]a&e?-:^2.l4KTY.iu%:>^bPIG'Nt>H>%V;>no#]KZ!@cMSRO-R[nPMe#8e`Jdpb":Cle$?J+[Os8LoGF4dD0)(rOc+IBL0eoVuiJ=4IBes1X'AP.k#.'lFFOV:PmrY"I>IN(q9m;W@J6*>(pAH6*!p"h-I<3K48;[=Y>3",^U%`E67ai#?@g3ZE2K![U".f2R@6AV^E\Z?1r:dAbIHfe@bfT\jOPsH[E.JI"*EW[mh^\4@D=\U6-D'4n5UuG:,OpB(Q9:pM/18%17FuUeM!:^?]6aZR#tS5>Kt`K.?6tU%%iu@2s$D8Lf5pgQc^tkPU1F,i3>D/s*?c?)09ToR5JWb][W+$iZN+3c@q]cLaB#b5++`E9)`U?8IJ`2r;oHdFaH%`tolsRSlBj&f%8]:=3NiY;%W[\2L/,T(t8gTHl`k'1aq+kE4O$;?#g,ei5dD(1=3sTTe<*5HSpQ97aoHT?/eR"Uu&VDKNG%m?r'6Oi&?^H1%qgnK<=R;r_gG=NkV(etDnlnpnZD$'@cX9'V!KaFE[^>Xi4X.-aV<.3gV%1#VM9)-<_du0Ck/[aJn>Y%Q&h/Y'`":V>K@DjFT&.LnUc3LEHNG?Z#]q.QfQ&ED*(U;Q6SA.1md-N>L*)d+r!$0gr
+Gatm<>Ar7S'Roe[&Fu%e%4b?+%2]V/C-$+Kq?lD)]lS55\$Gf%]H9;9V&&8=495@o\-6#AU:2)*D!Yc:lCu4O2Yg8e(l,(>FTQ?@d9#NleX:0\oh51U\_?!uY<(oN/+aPXoYZn]*L[*B5Dg:"!8.te=9QPdUaF9,WUh$JZRg:WXZUtpLRc=AkbZsM&VTuh,`C*EAFZ>be_fk;K)ihj#6i.Vqk5#@cQtW>,<\D\gX\LbD>tHk,$oL#I5a0P%Fr^<;/%X"$>Ta7766<"!D!>(%R5+a!ql#Ph7e]Hm9F%6tHPf3ZS"40BY9:'@ik_Mo],fF:?^_$f5"T[X_3mkKNp-#<8MgW;D(Du\Akb[iUtG'.9eA,9Xp9XbS]0bY1s#A8Z_EC)o2&o-3%rEF1''hf0e[iD[Zgso1nZX#SN$SO@eehN(G&R.NL@YR\O(bjL$a'[UKS:bSKZ^.[lft1$b(7Sebu%^*%"2JcQFQ=?:GEMG3X\*J@'Eb5BBQ!q.U<*uXDL4car5spB(.`o>,+t+4W9EjtT*acV,2C_KX@ZB@0K?73]o+[,A,bO_HtneF.!M$j/"JpkFB2gVeB:g'\&Z]rdUpAW<*)Y*3H9O0qld0dUD#-H0RFFVPu%#Tm]k2tf=9!Yq1!gcSF]i2?8/^#W'BA5((J^&"ha/R14FslVM(-Md'\3UYKc:%"`5Kmrlp0PW`UDbr#Hn8IdTC3tuE:f8Yn!Q?,lI*oCdWYG2(CCB"V,RtHSs+>u/DEfXg7<>HcYd?9.RH>qg!cLV3LI0NoDUQ1%O!JHm+%[=hFN08ND+qLLjDlc,<'Fci`Yg>c[+`tkUd\oe@*1$'_X.MRg>HMf>S;69,-rHS@5JX.d[\=i9nZp*,:R,.Y%`XQ#I`+r/%j-hWtM#OKn?S9hok=[k5^;1FDM1AKIYoJ6At[Hp\HS`*&%tj#!OMi:IDL?O4ZTWaUntlb^;ml?lidh1B0PYPk,r@FV*tl\E3(!bE)fM!n2Lfdm:$O90.7?%>aZ.[PcCN3fRs)l:V^NuAr*/keW;O1UE'HN?`TVH1FMe-\eRteKo"6AY$`IESIiq9/rkqElb,o:jhd+u@3BMk4AFXnd>+Dm5L%t)=ZoN%6mbB;0;7!>46NP_I?T@KC%a'GK"GC+Z<+[lZY+"tF?n$>"C*Uj.`pe^DF+%rrsFG(!rc7Oara,t>s?lU2dcjp\6h>".olA),8qjH3irDd(=0sn$[6pKn\Y0\"0=aB5"YlVN;!B3]L;Z~>
endstream
endobj
84 0 obj
@@ -514,10 +514,10 @@ endobj
>>
endobj
89 0 obj
-<< /Length 1768 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1808 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gb"/(997gc&AJ$C#ei,'X;/E#?OAV]?#6bp[TGJTD@a$@fgI.792]Q;^V5qKR7!D7HBB>*U'MC!'Sl;<2]mg=%u\i]Mp_.;G?4;'0;FruH;9VKo%tj5Io#'3jou;_c>rRSkeimV^RrQ,+36*nEo_@@T)@V1^6c:&ESr0Mj=G'/cZbFE=lS7s1C:XphG#=]WkkH:'9*k&Z9K_+ZrgKIW+8\HbTJ_n!h&nYVd3lh479k]LYFJ%6f@-kak'Y1JR_esItU&m%V`i"o.V5_U+0pZhJSZ.d``5VchW9?IU[i'&H!>c;(bFuZBMQ>P24c`1_uegT*W:EfE0TK.S'_X)Od\nUUna_CsqS.T/Z(5k$s:g`pUN>-%R>dKP!JE5_X$Xt)S?qY8o[qL%)/uGIVAP$#6IF8qc0qj:-/t2uO*a!C=jG8ZH/VaMJ,P2qPVqDY69VeYSLmc/6#o5M$e3koHB=U-=(d1?(Zb$6/>YV-gmfh0plehg%Pc+\mf*tEHV7t>fK1H%79i)BZ4$<=h$K6hb3GdQW7F`mp?j_T0.mc9DL,uu%K.Ou"4I^s]bW$0P%"&sH.7%GOaNqLQ_aa(!+5^8?FOZ^Fi;&7u1c0+720PO](US!C2J69ZHAZG!/Sk>DJ9uiNLG?%HNfeZS'Xhf"@6.!JZJ7*bCA3>*^-Kak$N]p#aPpGe;fm&mIBQ3:S2S;PMZq@p)A0fZHWho]P_dOu\N!iX-]Pstq!j'NBW8Gj$/IYSrj2c8cY!L0&+>>-_#S$@-=[,?9pIH#f]%e+.<&G4nq#k0DMc:I+e,`D':9trB4*@t/mOL//Z*df+PgD2nP(R4)uV&(>)iMIN.R.Xn=*`(kXF0ToPCdMj7%/#7X-lp<-,'[^E^T;Nq-8HI)C-=CTjKqS5'g\r/YkG9Hlo@s!aS/C3n/M_M'h8%ida#+bc?;Da.+S[?]8qJ#gcCi438bId`tjFEcZ-%P+KC')3;,X/UX'M\Qhtk0?9VKo4f5/YBc26XZGsh9nA/cgN0\qL!QQpJ8JiIcmkd';Vlp9og4uC3b38r*W'W-AkqA(CG7^;E-/Q%\+PM#$52[9p,`QChoU^X$8:jjEl7bi?n6l0nF^?_.*11LWtd8[_4.]"V@t)YbHE?F3YY9R%b.EqIM30)5"n9"DKGWn99FC?4krP'L$%FapK^iQlu`W9V>pUC'oTlh,Oon.GQWRJd\MW$JLoGLP`XueinV2GG%B`#RC;iU+ec`A$eFC5d9:9dG&H4S*L@bt=;(EE*aAi=N\X<].uF]n)hn56/.d@0`kohQq%WoaPY5&c]tM%ZlT!?5Cd/*TG"]VpOu,3C=CJT7n8tE_=AQW@gOZYRPE5)mEql?SrBYWD9UkBA^pX&"gU.[gP$'8Z%3=dfE/r-o(-bDE2AJ*irf@><_23gI-Zu1?b+AI'lNac6N^%Bdr"*pP!md`8!;h5CWJNMoL%mJ\5N5[;FA9p7UTX.E[dZBW^,L24nZhEK\O#$Er6)h;mi-,87;b.LRf&8LFCfn,4SHJ+[H"f30TBQ:rs!K0cB?gO,O)plf]sdg'(qcV$
+Gb"/(95iiK&AJ$C#ekB_XL3@I^(ff!ejYJ'8.k;sRN?!rZAD2L6W(,E?XHEkbb7]k:)iPn-l,sQP_Sef2]mgMO3>Y+bK85O[FX@+Vod`!*@m!knUD+Y^Ls74fR_#'[@7"Oq=s7,5:f.Eou*&*0l]6'8,RE[f1B/nd'FrEYa1RIqn%W\..I5URX*^emq/opjkt\G9\)/R+4g#GnKP\_#K%b[%#IF@&6\VLHR\r^Ada?b^e?ifN(T[n\U-g?j10q0n[=q9lZNrr)g!S7J%[.5.h^-;gDNDF)dH%(!gcHe-p=h7Y6=?9\!hSd5hd?N;:#FUFfTE[l9.)aRF'Gh'QqNYsDndJs@nP6qe-7MNj-OIgGdF'EK!A3\1h0Ml5C@>1JU(Y>?'?E5((_p2nCn$_em_hmQ5jbRbXX#Df4((C%eBi"%;A2uL_;&_<%GK90MIjANmeVk@DE,,*i#"W>rS#7T]d*h4_*3q.X1sW"e(#0<V-'RtX+ZHU(62"e%T(PAgB,h1d$6DoCPb=uVjC_f_@3a>_9]$f-<7GVK&1!+n)$Q!]HAu3CqZSQ59_B8>7'U9k7E`9WNo6M(emV72KVRXH&;AqpehEWtG8\[h:6kECAT],+(HI9I8n;lnLa8P` 0`2L\'A"bE5AM[H,V[XBe?j99`4ZP?F3\dl@a&G*NREC>,M=LW9p][7tZO(2;'pXT!s7/20`Il5BXCp1GD1<)Tn0>cI#sjg(^CWDng+HHma5F('/REqEms&8$7'*FEj=:/o"$CL&P_f*rtq#7jh1gEWUq=@''R^e=)u9a0$QXG@/4G;QtC,71NU`0FN:#]*D5>'`i[r]R1@O=q[km\r]5XP@ra]\'R.UgMc>uJrIGh3mc+NX2kb_W$&fgb+<3unDia`'k2T)qCm($im@cfqJ<0qd--]>QF3Za4(Bko102l<$oIP/581QEH^N3B72)/B3iC0'+1fn.>)1k,o"A2^nU+hDG=k2"KKj.$=E=TA`6,Vg?'*\8i'%/5oQ:*X>!h,Ge?ll^SfrDqiS1-O\b@-Er9=drDD/a9^D$Q_6Nl1\o/n_"LF?^(cpBAisb0V/?U($CapopI1Z\ZhMHnKX6oP?.6D^$g\'^L'n=l5feMW`cC`.>#Q&tJWL5M&L%mL2kj^,M\gHOs;>3ANrNsr9GFdBH_u=G)30qGT>=:2nMf?'^cN?7cH^!pI[AqIDJ]Fp74E_]+n_t[Eh_W-rSk#K>"N#T]\CN%=4YXP*FL=B(FGLM6KcGAu'?O+DdDl7n>rR5*"lJt^c/[#75Ul.mg'2"t.0lF3)N7NF!7`qmLWKE'Wd?m.~>
endstream
endobj
90 0 obj
@@ -1026,37 +1026,37 @@ endobj
39 0 obj
<<
/S /GoTo
-/D [80 0 R /XYZ 85.0 659.0 null]
+/D [80 0 R /XYZ 85.0 637.8 null]
>>
endobj
41 0 obj
<<
/S /GoTo
-/D [80 0 R /XYZ 85.0 606.666 null]
+/D [80 0 R /XYZ 85.0 585.466 null]
>>
endobj
43 0 obj
<<
/S /GoTo
-/D [84 0 R /XYZ 85.0 659.0 null]
+/D [84 0 R /XYZ 85.0 637.8 null]
>>
endobj
45 0 obj
<<
/S /GoTo
-/D [84 0 R /XYZ 85.0 520.547 null]
+/D [84 0 R /XYZ 85.0 499.347 null]
>>
endobj
47 0 obj
<<
/S /GoTo
-/D [84 0 R /XYZ 85.0 442.894 null]
+/D [84 0 R /XYZ 85.0 421.694 null]
>>
endobj
49 0 obj
<<
/S /GoTo
-/D [84 0 R /XYZ 85.0 190.441 null]
+/D [84 0 R /XYZ 85.0 169.241 null]
>>
endobj
51 0 obj
@@ -1115,147 +1115,147 @@ endobj
xref
0 142
0000000000 65535 f
-0000048685 00000 n
-0000048880 00000 n
-0000048973 00000 n
+0000048762 00000 n
+0000048957 00000 n
+0000049050 00000 n
0000000015 00000 n
0000000071 00000 n
-0000001276 00000 n
-0000001396 00000 n
-0000001568 00000 n
-0000049125 00000 n
-0000001703 00000 n
-0000049188 00000 n
-0000001838 00000 n
-0000049254 00000 n
-0000001975 00000 n
-0000049318 00000 n
-0000002112 00000 n
-0000049384 00000 n
-0000002249 00000 n
-0000049450 00000 n
-0000002386 00000 n
-0000049516 00000 n
-0000002523 00000 n
-0000049580 00000 n
-0000002660 00000 n
-0000049646 00000 n
-0000002797 00000 n
-0000049710 00000 n
-0000002934 00000 n
-0000049776 00000 n
-0000003071 00000 n
-0000049842 00000 n
-0000003208 00000 n
-0000049907 00000 n
-0000003345 00000 n
-0000049973 00000 n
-0000003482 00000 n
-0000050037 00000 n
-0000003618 00000 n
-0000050103 00000 n
-0000003755 00000 n
-0000050167 00000 n
-0000003891 00000 n
-0000050233 00000 n
-0000004028 00000 n
-0000050297 00000 n
-0000004165 00000 n
-0000050363 00000 n
-0000004301 00000 n
-0000050429 00000 n
-0000004438 00000 n
-0000050495 00000 n
-0000004574 00000 n
-0000005293 00000 n
-0000005416 00000 n
-0000005485 00000 n
-0000050559 00000 n
-0000005618 00000 n
-0000050623 00000 n
-0000005751 00000 n
-0000050687 00000 n
-0000005884 00000 n
-0000050751 00000 n
-0000006017 00000 n
-0000050815 00000 n
-0000006150 00000 n
-0000050879 00000 n
-0000006282 00000 n
-0000050944 00000 n
-0000006415 00000 n
-0000008563 00000 n
-0000008671 00000 n
-0000010822 00000 n
-0000010930 00000 n
-0000013211 00000 n
-0000013319 00000 n
-0000015395 00000 n
-0000015503 00000 n
-0000017991 00000 n
-0000018099 00000 n
-0000019975 00000 n
-0000020083 00000 n
-0000022353 00000 n
-0000022461 00000 n
-0000024441 00000 n
-0000024549 00000 n
-0000026017 00000 n
-0000026125 00000 n
-0000027509 00000 n
-0000027617 00000 n
-0000029478 00000 n
-0000029586 00000 n
-0000031316 00000 n
-0000031424 00000 n
-0000033608 00000 n
-0000033716 00000 n
-0000035499 00000 n
-0000035607 00000 n
-0000037552 00000 n
-0000037660 00000 n
-0000039066 00000 n
-0000039175 00000 n
-0000041074 00000 n
-0000041184 00000 n
-0000042398 00000 n
-0000051009 00000 n
-0000042508 00000 n
-0000042708 00000 n
-0000042926 00000 n
-0000043132 00000 n
-0000043340 00000 n
-0000043508 00000 n
-0000043708 00000 n
-0000043866 00000 n
-0000044041 00000 n
-0000044282 00000 n
-0000044411 00000 n
-0000044565 00000 n
-0000044719 00000 n
-0000044863 00000 n
-0000045013 00000 n
-0000045154 00000 n
-0000045394 00000 n
-0000045576 00000 n
-0000045749 00000 n
-0000045952 00000 n
-0000046140 00000 n
-0000046392 00000 n
-0000046533 00000 n
-0000046742 00000 n
-0000046928 00000 n
-0000047102 00000 n
-0000047347 00000 n
-0000047538 00000 n
-0000047744 00000 n
-0000047905 00000 n
-0000048019 00000 n
-0000048130 00000 n
-0000048242 00000 n
-0000048351 00000 n
-0000048458 00000 n
-0000048575 00000 n
+0000001278 00000 n
+0000001398 00000 n
+0000001570 00000 n
+0000049202 00000 n
+0000001705 00000 n
+0000049265 00000 n
+0000001840 00000 n
+0000049331 00000 n
+0000001977 00000 n
+0000049395 00000 n
+0000002114 00000 n
+0000049461 00000 n
+0000002251 00000 n
+0000049527 00000 n
+0000002388 00000 n
+0000049593 00000 n
+0000002525 00000 n
+0000049657 00000 n
+0000002662 00000 n
+0000049723 00000 n
+0000002799 00000 n
+0000049787 00000 n
+0000002936 00000 n
+0000049853 00000 n
+0000003073 00000 n
+0000049919 00000 n
+0000003210 00000 n
+0000049984 00000 n
+0000003347 00000 n
+0000050050 00000 n
+0000003484 00000 n
+0000050114 00000 n
+0000003620 00000 n
+0000050180 00000 n
+0000003757 00000 n
+0000050244 00000 n
+0000003893 00000 n
+0000050310 00000 n
+0000004030 00000 n
+0000050374 00000 n
+0000004167 00000 n
+0000050440 00000 n
+0000004303 00000 n
+0000050506 00000 n
+0000004440 00000 n
+0000050572 00000 n
+0000004576 00000 n
+0000005295 00000 n
+0000005418 00000 n
+0000005487 00000 n
+0000050636 00000 n
+0000005620 00000 n
+0000050700 00000 n
+0000005753 00000 n
+0000050764 00000 n
+0000005886 00000 n
+0000050828 00000 n
+0000006019 00000 n
+0000050892 00000 n
+0000006152 00000 n
+0000050956 00000 n
+0000006284 00000 n
+0000051021 00000 n
+0000006417 00000 n
+0000008565 00000 n
+0000008673 00000 n
+0000010824 00000 n
+0000010932 00000 n
+0000013213 00000 n
+0000013321 00000 n
+0000015397 00000 n
+0000015505 00000 n
+0000017954 00000 n
+0000018062 00000 n
+0000019976 00000 n
+0000020084 00000 n
+0000022332 00000 n
+0000022440 00000 n
+0000024478 00000 n
+0000024586 00000 n
+0000026054 00000 n
+0000026162 00000 n
+0000027546 00000 n
+0000027654 00000 n
+0000029555 00000 n
+0000029663 00000 n
+0000031393 00000 n
+0000031501 00000 n
+0000033685 00000 n
+0000033793 00000 n
+0000035576 00000 n
+0000035684 00000 n
+0000037629 00000 n
+0000037737 00000 n
+0000039143 00000 n
+0000039252 00000 n
+0000041151 00000 n
+0000041261 00000 n
+0000042475 00000 n
+0000051086 00000 n
+0000042585 00000 n
+0000042785 00000 n
+0000043003 00000 n
+0000043209 00000 n
+0000043417 00000 n
+0000043585 00000 n
+0000043785 00000 n
+0000043943 00000 n
+0000044118 00000 n
+0000044359 00000 n
+0000044488 00000 n
+0000044642 00000 n
+0000044796 00000 n
+0000044940 00000 n
+0000045090 00000 n
+0000045231 00000 n
+0000045471 00000 n
+0000045653 00000 n
+0000045826 00000 n
+0000046029 00000 n
+0000046217 00000 n
+0000046469 00000 n
+0000046610 00000 n
+0000046819 00000 n
+0000047005 00000 n
+0000047179 00000 n
+0000047424 00000 n
+0000047615 00000 n
+0000047821 00000 n
+0000047982 00000 n
+0000048096 00000 n
+0000048207 00000 n
+0000048319 00000 n
+0000048428 00000 n
+0000048535 00000 n
+0000048652 00000 n
trailer
<<
/Size 142
@@ -1263,5 +1263,5 @@ trailer
/Info 4 0 R
>>
startxref
-51063
+51140
%%EOF
diff --git a/src/java/org/apache/lucene/document/Document.java b/src/java/org/apache/lucene/document/Document.java
index 85bb1eaceee..7291cc2ac26 100644
--- a/src/java/org/apache/lucene/document/Document.java
+++ b/src/java/org/apache/lucene/document/Document.java
@@ -18,9 +18,9 @@ package org.apache.lucene.document;
*/
import java.util.*; // for javadoc
-import org.apache.lucene.search.Hits; // for javadoc
-import org.apache.lucene.search.Searcher; // for javadoc
-import org.apache.lucene.index.IndexReader; // for javadoc
+import org.apache.lucene.search.Hits; // for javadoc
+import org.apache.lucene.search.Searcher; // for javadoc
+import org.apache.lucene.index.IndexReader; // for javadoc
/** Documents are the unit of indexing and search.
*
diff --git a/src/java/org/apache/lucene/index/DocumentsWriter.java b/src/java/org/apache/lucene/index/DocumentsWriter.java
index a6896d01b45..2232f48f9e8 100644
--- a/src/java/org/apache/lucene/index/DocumentsWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentsWriter.java
@@ -28,6 +28,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.util.UnicodeUtil;
import java.io.IOException;
import java.io.PrintStream;
@@ -291,7 +292,7 @@ final class DocumentsWriter {
assert docStoreSegment != null;
fieldsWriter.close();
fieldsWriter = null;
- assert numDocsInStore*8 == directory.fileLength(docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION):
+ assert 4+numDocsInStore*8 == directory.fileLength(docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION):
"after flush: fdx size mismatch: " + numDocsInStore + " docs vs " + directory.fileLength(docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) + " length in bytes of " + docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
}
@@ -754,27 +755,26 @@ final class DocumentsWriter {
return segment + "." + extension;
}
- static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
+ private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
while(true) {
final char c1 = text1[pos1++];
final char c2 = text2[pos2++];
- if (c1 < c2)
+ if (c1 != c2) {
if (0xffff == c2)
return 1;
- else
- return -1;
- else if (c2 < c1)
- if (0xffff == c1)
+ else if (0xffff == c1)
return -1;
else
- return 1;
- else if (0xffff == c1)
+ return c1-c2;
+ } else if (0xffff == c1)
return 0;
}
}
private final TermInfo termInfo = new TermInfo(); // minimize consing
+ final UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result();
+
/* Walk through all unique text tokens (Posting
* instances) found in this field and serialize them
* into a single RAM segment. */
@@ -831,9 +831,6 @@ final class DocumentsWriter {
final char[] text = termStates[0].text;
final int start = termStates[0].textOffset;
- int pos = start;
- while(text[pos] != 0xffff)
- pos++;
long freqPointer = freqOut.getFilePointer();
long proxPointer = proxOut.getFilePointer();
@@ -932,7 +929,17 @@ final class DocumentsWriter {
// Write term
termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
- termsOut.add(fieldNumber, text, start, pos-start, termInfo);
+
+ // TODO: we could do this incrementally
+ UnicodeUtil.UTF16toUTF8(text, start, termsUTF8);
+
+ // TODO: we could save O(n) re-scan of the term by
+ // computing the shared prefix with the last term
+ // while during the UTF8 encoding
+ termsOut.add(fieldNumber,
+ termsUTF8.result,
+ termsUTF8.length,
+ termInfo);
}
}
@@ -1048,7 +1055,12 @@ final class DocumentsWriter {
// This call is not synchronized and does all the work
state.processDocument(analyzer);
} finally {
- // This call is synchronized but fast
+ // Note that we must call finishDocument even on
+ // exception, because for a non-aborting
+ // exception, a portion of the document has been
+ // indexed (and its ID is marked for deletion), so
+ // all index files must be updated to record this
+ // document. This call is synchronized but fast.
finishDocument(state);
}
success = true;
diff --git a/src/java/org/apache/lucene/index/DocumentsWriterFieldData.java b/src/java/org/apache/lucene/index/DocumentsWriterFieldData.java
index f6912716ff0..3f14cc6a9bf 100644
--- a/src/java/org/apache/lucene/index/DocumentsWriterFieldData.java
+++ b/src/java/org/apache/lucene/index/DocumentsWriterFieldData.java
@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.UnicodeUtil;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
@@ -337,12 +338,36 @@ final class DocumentsWriterFieldData implements Comparable {
int code = 0;
- // Compute hashcode
+ // Compute hashcode & replace any invalid UTF16 sequences
int downto = tokenTextLen;
- while (downto > 0)
- code = (code*31) + tokenText[--downto];
+ while (downto > 0) {
+ char ch = tokenText[--downto];
- // System.out.println(" addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
+ if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
+ if (0 == downto) {
+ // Unpaired
+ ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+ } else {
+ final char ch2 = tokenText[downto-1];
+ if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
+ // OK: high followed by low. This is a valid
+ // surrogate pair.
+ code = ((code*31) + ch)*31+ch2;
+ downto--;
+ continue;
+ } else {
+ // Unpaired
+ ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+ }
+ }
+ } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END)
+ // Unpaired
+ ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+
+ code = (code*31) + ch;
+ }
+
+ // System.out.println(" addPosition: field=" + fieldInfo.name + " buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
int hashPos = code & postingsHashMask;
@@ -713,7 +738,8 @@ final class DocumentsWriterFieldData implements Comparable {
threadState.doVectorSort(postingsVectors, numPostingsVectors);
- Posting lastPosting = null;
+ int encoderUpto = 0;
+ int lastTermBytesCount = 0;
final ByteSliceReader reader = vectorSliceReader;
final char[][] charBuffers = threadState.charPool.buffers;
@@ -723,40 +749,37 @@ final class DocumentsWriterFieldData implements Comparable {
Posting posting = vector.p;
final int freq = posting.docFreq;
- final int prefix;
final char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
- int pos2 = start2;
+ // We swap between two encoders to save copying
+ // last Term's byte array
+ final UnicodeUtil.UTF8Result utf8Result = threadState.utf8Results[encoderUpto];
+
+ // TODO: we could do this incrementally
+ UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
+ final int termBytesCount = utf8Result.length;
+
+ // TODO: UTF16toUTF8 could tell us this prefix
// Compute common prefix between last term and
// this term
- if (lastPosting == null)
- prefix = 0;
- else {
- final char[] text1 = charBuffers[lastPosting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
- final int start1 = lastPosting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
- int pos1 = start1;
- while(true) {
- final char c1 = text1[pos1];
- final char c2 = text2[pos2];
- if (c1 != c2 || c1 == 0xffff) {
- prefix = pos1-start1;
+ int prefix = 0;
+ if (j > 0) {
+ final byte[] lastTermBytes = threadState.utf8Results[1-encoderUpto].result;
+ final byte[] termBytes = threadState.utf8Results[encoderUpto].result;
+ while(prefix < lastTermBytesCount && prefix < termBytesCount) {
+ if (lastTermBytes[prefix] != termBytes[prefix])
break;
- }
- pos1++;
- pos2++;
+ prefix++;
}
}
- lastPosting = posting;
+ encoderUpto = 1-encoderUpto;
+ lastTermBytesCount = termBytesCount;
- // Compute length
- while(text2[pos2] != 0xffff)
- pos2++;
-
- final int suffix = pos2 - start2 - prefix;
+ final int suffix = termBytesCount - prefix;
tvfLocal.writeVInt(prefix);
tvfLocal.writeVInt(suffix);
- tvfLocal.writeChars(text2, start2 + prefix, suffix);
+ tvfLocal.writeBytes(utf8Result.result, prefix, suffix);
tvfLocal.writeVInt(freq);
if (doVectorPositions) {
diff --git a/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java b/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
index 0d97297a20d..78a50139978 100644
--- a/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
+++ b/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java
@@ -24,6 +24,7 @@ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.UnicodeUtil;
/** Used by DocumentsWriter to maintain per-thread state.
* We keep a separate Posting hash and other state for each
@@ -311,6 +312,7 @@ final class DocumentsWriterThreadState {
if (docWriter.fieldsWriter == null) {
assert docWriter.docStoreSegment == null;
assert docWriter.segment != null;
+ docWriter.files = null;
docWriter.docStoreSegment = docWriter.segment;
// If we hit an exception while init'ing the
// fieldsWriter, we must abort this segment
@@ -321,7 +323,6 @@ final class DocumentsWriterThreadState {
} catch (Throwable t) {
throw new AbortException(t, docWriter);
}
- docWriter.files = null;
}
localFieldsWriter = new FieldsWriter(null, fdtLocal, docWriter.fieldInfos);
}
@@ -331,17 +332,18 @@ final class DocumentsWriterThreadState {
if (docHasVectors) {
if (docWriter.tvx == null) {
assert docWriter.docStoreSegment != null;
+ docWriter.files = null;
// If we hit an exception while init'ing the term
// vector output files, we must abort this segment
// because those files will be in an unknown
// state:
try {
docWriter.tvx = docWriter.directory.createOutput(docWriter.docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
- docWriter.tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
+ docWriter.tvx.writeInt(TermVectorsReader.FORMAT_CURRENT);
docWriter.tvd = docWriter.directory.createOutput(docWriter.docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
- docWriter.tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
+ docWriter.tvd.writeInt(TermVectorsReader.FORMAT_CURRENT);
docWriter.tvf = docWriter.directory.createOutput(docWriter.docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
- docWriter.tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
+ docWriter.tvf.writeInt(TermVectorsReader.FORMAT_CURRENT);
// We must "catch up" for all docs before us
// that had no vectors:
@@ -353,7 +355,6 @@ final class DocumentsWriterThreadState {
} catch (Throwable t) {
throw new AbortException(t, docWriter);
}
- docWriter.files = null;
}
numVectorFields = 0;
}
@@ -672,21 +673,23 @@ final class DocumentsWriterThreadState {
int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
final char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+
+ assert text1 != text2 || pos1 != pos2;
+
while(true) {
final char c1 = text1[pos1++];
final char c2 = text2[pos2++];
- if (c1 < c2)
+ if (c1 != c2) {
if (0xffff == c2)
return 1;
- else
- return -1;
- else if (c2 < c1)
- if (0xffff == c1)
+ else if (0xffff == c1)
return -1;
else
- return 1;
- else if (0xffff == c1)
- return 0;
+ return c1-c2;
+ } else
+ // This method should never compare equal postings
+ // unless p1==p2
+ assert c1 != 0xffff;
}
}
@@ -715,5 +718,8 @@ final class DocumentsWriterThreadState {
// Used to read a string value for a field
ReusableStringReader stringReader = new ReusableStringReader();
+
+ final UnicodeUtil.UTF8Result utf8Results[] = {new UnicodeUtil.UTF8Result(),
+ new UnicodeUtil.UTF8Result()};
}
diff --git a/src/java/org/apache/lucene/index/FieldsReader.java b/src/java/org/apache/lucene/index/FieldsReader.java
index 13a87778bc1..45e06f2fd04 100644
--- a/src/java/org/apache/lucene/index/FieldsReader.java
+++ b/src/java/org/apache/lucene/index/FieldsReader.java
@@ -51,6 +51,8 @@ final class FieldsReader {
private int numTotalDocs;
private int size;
private boolean closed;
+ private final int format;
+ private final int formatSize;
// The docID offset where our docs begin in the index
// file. This will be 0 if we have our own private file.
@@ -72,9 +74,33 @@ final class FieldsReader {
try {
fieldInfos = fn;
- cloneableFieldsStream = d.openInput(segment + ".fdt", readBufferSize);
+ cloneableFieldsStream = d.openInput(segment + "." + IndexFileNames.FIELDS_EXTENSION, readBufferSize);
+ indexStream = d.openInput(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION, readBufferSize);
+
+ // First version of fdx did not include a format
+ // header, but, the first int will always be 0 in that
+ // case
+ int firstInt = indexStream.readInt();
+ if (firstInt == 0)
+ format = 0;
+ else
+ format = firstInt;
+
+ if (format > FieldsWriter.FORMAT_CURRENT)
+ throw new CorruptIndexException("Incompatible format version: " + format + " expected "
+ + FieldsWriter.FORMAT_CURRENT + " or lower");
+
+ if (format > FieldsWriter.FORMAT)
+ formatSize = 4;
+ else
+ formatSize = 0;
+
+ if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
+ cloneableFieldsStream.setModifiedUTF8StringsMode();
+
fieldsStream = (IndexInput) cloneableFieldsStream.clone();
- indexStream = d.openInput(segment + ".fdx", readBufferSize);
+
+ final long indexSize = indexStream.length()-formatSize;
if (docStoreOffset != -1) {
// We read only a slice out of this shared fields file
@@ -83,13 +109,13 @@ final class FieldsReader {
// Verify the file is long enough to hold all of our
// docs
- assert ((int) (indexStream.length() / 8)) >= size + this.docStoreOffset;
+ assert ((int) (indexSize / 8)) >= size + this.docStoreOffset;
} else {
this.docStoreOffset = 0;
- this.size = (int) (indexStream.length() >> 3);
+ this.size = (int) (indexSize >> 3);
}
- numTotalDocs = (int) (indexStream.length() >> 3);
+ numTotalDocs = (int) (indexSize >> 3);
success = true;
} finally {
// With lock-less commits, it's entirely possible (and
@@ -142,8 +168,12 @@ final class FieldsReader {
return size;
}
+ private final void seekIndex(int docID) throws IOException {
+ indexStream.seek(formatSize + (docID + docStoreOffset) * 8L);
+ }
+
final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
- indexStream.seek((n + docStoreOffset) * 8L);
+ seekIndex(n);
long position = indexStream.readLong();
fieldsStream.seek(position);
@@ -195,7 +225,7 @@ final class FieldsReader {
* startDocID. Returns the IndexInput (the fieldStream),
* already seeked to the starting point for startDocID.*/
final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException {
- indexStream.seek((docStoreOffset+startDocID) * 8L);
+ seekIndex(startDocID);
long startOffset = indexStream.readLong();
long lastOffset = startOffset;
int count = 0;
@@ -225,13 +255,12 @@ final class FieldsReader {
}
private void skipField(boolean binary, boolean compressed, int toRead) throws IOException {
- if (binary || compressed) {
- long pointer = fieldsStream.getFilePointer();
- fieldsStream.seek(pointer + toRead);
- } else {
- //We need to skip chars. This will slow us down, but still better
- fieldsStream.skipChars(toRead);
- }
+ if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) {
+ fieldsStream.seek(fieldsStream.getFilePointer() + toRead);
+ } else {
+ // We need to skip chars. This will slow us down, but still better
+ fieldsStream.skipChars(toRead);
+ }
}
private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
@@ -265,7 +294,10 @@ final class FieldsReader {
int length = fieldsStream.readVInt();
long pointer = fieldsStream.getFilePointer();
//Skip ahead of where we are by the length of what is stored
- fieldsStream.skipChars(length);
+ if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
+ fieldsStream.seek(pointer+length);
+ else
+ fieldsStream.skipChars(length);
f = new LazyField(fi.name, store, index, termVector, length, pointer, binary);
f.setOmitNorms(fi.omitNorms);
}
@@ -471,10 +503,16 @@ final class FieldsReader {
localFieldsStream.readBytes(b, 0, b.length);
fieldsData = new String(uncompress(b), "UTF-8");
} else {
- //read in chars b/c we already know the length we need to read
- char[] chars = new char[toRead];
- localFieldsStream.readChars(chars, 0, toRead);
- fieldsData = new String(chars);
+ if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
+ byte[] bytes = new byte[toRead];
+ localFieldsStream.readBytes(bytes, 0, toRead);
+ fieldsData = new String(bytes, "UTF-8");
+ } else {
+ //read in chars b/c we already know the length we need to read
+ char[] chars = new char[toRead];
+ localFieldsStream.readChars(chars, 0, toRead);
+ fieldsData = new String(chars);
+ }
}
} catch (IOException e) {
throw new FieldReaderException(e);
diff --git a/src/java/org/apache/lucene/index/FieldsWriter.java b/src/java/org/apache/lucene/index/FieldsWriter.java
index ac733e35507..5dfb5b6f665 100644
--- a/src/java/org/apache/lucene/index/FieldsWriter.java
+++ b/src/java/org/apache/lucene/index/FieldsWriter.java
@@ -33,6 +33,17 @@ final class FieldsWriter
static final byte FIELD_IS_TOKENIZED = 0x1;
static final byte FIELD_IS_BINARY = 0x2;
static final byte FIELD_IS_COMPRESSED = 0x4;
+
+ // Original format
+ static final int FORMAT = 0;
+
+ // Changed strings to UTF8
+ static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = 1;
+
+ // NOTE: if you introduce a new format, make it 1 higher
+ // than the current one, and always change this if you
+ // switch to a new format!
+ static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
private FieldInfos fieldInfos;
@@ -44,8 +55,34 @@ final class FieldsWriter
FieldsWriter(Directory d, String segment, FieldInfos fn) throws IOException {
fieldInfos = fn;
- fieldsStream = d.createOutput(segment + ".fdt");
- indexStream = d.createOutput(segment + ".fdx");
+
+ boolean success = false;
+ final String fieldsName = segment + "." + IndexFileNames.FIELDS_EXTENSION;
+ try {
+ fieldsStream = d.createOutput(fieldsName);
+ fieldsStream.writeInt(FORMAT_CURRENT);
+ success = true;
+ } finally {
+ if (!success) {
+ close();
+ d.deleteFile(fieldsName);
+ }
+ }
+
+ success = false;
+ final String indexName = segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
+ try {
+ indexStream = d.createOutput(indexName);
+ indexStream.writeInt(FORMAT_CURRENT);
+ success = true;
+ } finally {
+ if (!success) {
+ close();
+ d.deleteFile(fieldsName);
+ d.deleteFile(indexName);
+ }
+ }
+
doClose = true;
}
@@ -73,8 +110,10 @@ final class FieldsWriter
final void close() throws IOException {
if (doClose) {
- fieldsStream.close();
- indexStream.close();
+ if (fieldsStream != null)
+ fieldsStream.close();
+ if (indexStream != null)
+ indexStream.close();
}
}
diff --git a/src/java/org/apache/lucene/index/IndexWriter.java b/src/java/org/apache/lucene/index/IndexWriter.java
index d18d3e11e29..a09d8efab3c 100644
--- a/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/src/java/org/apache/lucene/index/IndexWriter.java
@@ -67,6 +67,7 @@ import java.util.Iterator;
(which just deletes and then adds the entire document).
When finished adding, deleting and updating documents, close should be called.
+
These changes are buffered in memory and periodically
flushed to the {@link Directory} (during the above method
calls). A flush is triggered when there are enough
@@ -1843,26 +1844,30 @@ public class IndexWriter {
* partially succeeded).
*
* This method periodically flushes pending documents
- * to the Directory (every {@link #setMaxBufferedDocs}),
- * and also periodically merges segments in the index
- * (every {@link #setMergeFactor} flushes). When this
- * occurs, the method will take more time to run (possibly
- * a long time if the index is large), and will require
- * free temporary space in the Directory to do the
- * merging.
+ * to the Directory (see above), and
+ * also periodically triggers segment merges in the index
+ * according to the {@link MergePolicy} in use.
*
- * The amount of free space required when a merge is triggered is
- * up to 1X the size of all segments being merged, when no
- * readers/searchers are open against the index, and up to 2X the
- * size of all segments being merged when readers/searchers are open
- * against the index (see {@link #optimize()} for details). The
- * sequence of primitive merge operations performed is governed by
- * the merge policy.
+ * Merges temporarily consume space in the
+ * directory. The amount of space required is up to 1X the
+ * size of all segments being merged, when no
+ * readers/searchers are open against the index, and up to
+ * 2X the size of all segments being merged when
+ * readers/searchers are open against the index (see
+ * {@link #optimize()} for details). The sequence of
+ * primitive merge operations performed is governed by the
+ * merge policy.
*
* Note that each term in the document can be no longer
* than 16383 characters, otherwise an
* IllegalArgumentException will be thrown.
*
+ * Note that it's possible to create an invalid Unicode
+ * string in java if a UTF16 surrogate pair is malformed.
+ * In this case, the invalid characters are silently
+ * replaced with the Unicode replacement character
+ * U+FFFD.
+ *
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
diff --git a/src/java/org/apache/lucene/index/SegmentMerger.java b/src/java/org/apache/lucene/index/SegmentMerger.java
index 60a8328698a..f4f299aa7f4 100644
--- a/src/java/org/apache/lucene/index/SegmentMerger.java
+++ b/src/java/org/apache/lucene/index/SegmentMerger.java
@@ -349,7 +349,7 @@ final class SegmentMerger {
fieldsWriter.close();
}
- assert docCount*8 == directory.fileLength(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) :
+ assert 4+docCount*8 == directory.fileLength(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) :
"after mergeFields: fdx size mismatch: " + docCount + " docs vs " + directory.fileLength(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) + " length in bytes of " + segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
} else
diff --git a/src/java/org/apache/lucene/index/SegmentTermEnum.java b/src/java/org/apache/lucene/index/SegmentTermEnum.java
index 46e1a7943ec..03c18e1b0aa 100644
--- a/src/java/org/apache/lucene/index/SegmentTermEnum.java
+++ b/src/java/org/apache/lucene/index/SegmentTermEnum.java
@@ -61,8 +61,8 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
format = firstInt;
// check that it is a format we can understand
- if (format < TermInfosWriter.FORMAT)
- throw new CorruptIndexException("Unknown format version:" + format);
+ if (format < TermInfosWriter.FORMAT_CURRENT)
+ throw new CorruptIndexException("Unknown format version:" + format + " expected " + TermInfosWriter.FORMAT_CURRENT + " or higher");
size = input.readLong(); // read the size
@@ -77,13 +77,17 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
} else {
indexInterval = input.readInt();
skipInterval = input.readInt();
- if (format == -3) {
+ if (format <= TermInfosWriter.FORMAT) {
// this new format introduces multi-level skipping
maxSkipLevels = input.readInt();
}
}
}
-
+ if (format > TermInfosWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
+ termBuffer.setPreUTF8Strings();
+ scanBuffer.setPreUTF8Strings();
+ prevBuffer.setPreUTF8Strings();
+ }
}
protected Object clone() {
diff --git a/src/java/org/apache/lucene/index/TermBuffer.java b/src/java/org/apache/lucene/index/TermBuffer.java
index 72054bb656a..4b180ce0beb 100644
--- a/src/java/org/apache/lucene/index/TermBuffer.java
+++ b/src/java/org/apache/lucene/index/TermBuffer.java
@@ -19,28 +19,31 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.UnicodeUtil;
final class TermBuffer implements Cloneable {
- private static final char[] NO_CHARS = new char[0];
private String field;
- private char[] text = NO_CHARS;
- private int textLength;
private Term term; // cached
+ private boolean preUTF8Strings; // true if strings are stored in modified UTF8 encoding (LUCENE-510)
+ private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes)
+
+ private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
+ private UnicodeUtil.UTF8Result bytes = new UnicodeUtil.UTF8Result();
public final int compareTo(TermBuffer other) {
- if (field == other.field) // fields are interned
- return compareChars(text, textLength, other.text, other.textLength);
+ if (field == other.field) // fields are interned
+ return compareChars(text.result, text.length, other.text.result, other.text.length);
else
return field.compareTo(other.field);
}
- private static final int compareChars(char[] v1, int len1,
- char[] v2, int len2) {
- int end = Math.min(len1, len2);
+ private static final int compareChars(char[] chars1, int len1,
+ char[] chars2, int len2) {
+ final int end = len1 < len2 ? len1:len2;
for (int k = 0; k < end; k++) {
- char c1 = v1[k];
- char c2 = v2[k];
+ char c1 = chars1[k];
+ char c2 = chars2[k];
if (c1 != c2) {
return c1 - c2;
}
@@ -48,13 +51,11 @@ final class TermBuffer implements Cloneable {
return len1 - len2;
}
- private final void setTextLength(int newLength) {
- if (text.length < newLength) {
- char[] newText = new char[newLength];
- System.arraycopy(text, 0, newText, 0, textLength);
- text = newText;
- }
- textLength = newLength;
+ /** Call this if the IndexInput passed to {@link #read}
+ * stores terms in the "modified UTF8" (pre LUCENE-510)
+ * format. */
+ void setPreUTF8Strings() {
+ preUTF8Strings = true;
}
public final void read(IndexInput input, FieldInfos fieldInfos)
@@ -63,8 +64,25 @@ final class TermBuffer implements Cloneable {
int start = input.readVInt();
int length = input.readVInt();
int totalLength = start + length;
- setTextLength(totalLength);
- input.readChars(this.text, start, length);
+ if (preUTF8Strings) {
+ text.setLength(totalLength);
+ input.readChars(text.result, start, length);
+ } else {
+
+ if (dirty) {
+ // Fully convert all bytes since bytes is dirty
+ UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
+ bytes.setLength(totalLength);
+ input.readBytes(bytes.result, start, length);
+ UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text);
+ dirty = false;
+ } else {
+ // Incrementally convert only the UTF8 bytes that are new:
+ bytes.setLength(totalLength);
+ input.readBytes(bytes.result, start, length);
+ UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text);
+ }
+ }
this.field = fieldInfos.fieldName(input.readVInt());
}
@@ -73,27 +91,27 @@ final class TermBuffer implements Cloneable {
reset();
return;
}
-
- // copy text into the buffer
- setTextLength(term.text().length());
- term.text().getChars(0, term.text().length(), text, 0);
-
- this.field = term.field();
+ final String termText = term.text();
+ final int termLen = termText.length();
+ text.setLength(termLen);
+ termText.getChars(0, termLen, text.result, 0);
+ dirty = true;
+ field = term.field();
this.term = term;
}
public final void set(TermBuffer other) {
- setTextLength(other.textLength);
- System.arraycopy(other.text, 0, text, 0, textLength);
-
- this.field = other.field;
- this.term = other.term;
+ text.copyText(other.text);
+ dirty = true;
+ field = other.field;
+ term = other.term;
}
public void reset() {
- this.field = null;
- this.textLength = 0;
- this.term = null;
+ field = null;
+ text.setLength(0);
+ term = null;
+ dirty = true;
}
public Term toTerm() {
@@ -101,7 +119,7 @@ final class TermBuffer implements Cloneable {
return null;
if (term == null)
- term = new Term(field, new String(text, 0, textLength), false);
+ term = new Term(field, new String(text.result, 0, text.length), false);
return term;
}
@@ -112,9 +130,10 @@ final class TermBuffer implements Cloneable {
clone = (TermBuffer)super.clone();
} catch (CloneNotSupportedException e) {}
- clone.text = new char[text.length];
- System.arraycopy(text, 0, clone.text, 0, textLength);
-
+ clone.dirty = true;
+ clone.bytes = new UnicodeUtil.UTF8Result();
+ clone.text = new UnicodeUtil.UTF16Result();
+ clone.text.copyText(text);
return clone;
}
}
diff --git a/src/java/org/apache/lucene/index/TermInfosReader.java b/src/java/org/apache/lucene/index/TermInfosReader.java
index 4d627f6b1cd..e08eb42dcef 100644
--- a/src/java/org/apache/lucene/index/TermInfosReader.java
+++ b/src/java/org/apache/lucene/index/TermInfosReader.java
@@ -58,12 +58,12 @@ final class TermInfosReader {
segment = seg;
fieldInfos = fis;
- origEnum = new SegmentTermEnum(directory.openInput(segment + ".tis",
+ origEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_EXTENSION,
readBufferSize), fieldInfos, false);
size = origEnum.size;
totalIndexInterval = origEnum.indexInterval;
- indexEnum = new SegmentTermEnum(directory.openInput(segment + ".tii",
+ indexEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION,
readBufferSize), fieldInfos, true);
success = true;
diff --git a/src/java/org/apache/lucene/index/TermInfosWriter.java b/src/java/org/apache/lucene/index/TermInfosWriter.java
index c04fd920148..a15309c305d 100644
--- a/src/java/org/apache/lucene/index/TermInfosWriter.java
+++ b/src/java/org/apache/lucene/index/TermInfosWriter.java
@@ -21,6 +21,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.UnicodeUtil;
/** This stores a monotonically increasing set of pairs in a
Directory. A TermInfos can be written once, in order. */
@@ -29,6 +30,13 @@ final class TermInfosWriter {
/** The file format version, a negative number. */
public static final int FORMAT = -3;
+ // Changed strings to true utf8 with length-in-bytes not
+ // length-in-chars
+ public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
+
+ // NOTE: always change this if you switch to a new format!
+ public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
+
private FieldInfos fieldInfos;
private IndexOutput output;
private TermInfo lastTi = new TermInfo();
@@ -62,13 +70,12 @@ final class TermInfosWriter {
private long lastIndexPointer;
private boolean isIndex;
- private char[] lastTermText = new char[10];
- private int lastTermTextLength;
+ private byte[] lastTermBytes = new byte[10];
+ private int lastTermBytesLength = 0;
private int lastFieldNumber = -1;
- private char[] termTextBuffer = new char[10];
-
private TermInfosWriter other;
+ private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
TermInfosWriter(Directory directory, String segment, FieldInfos fis,
int interval)
@@ -89,27 +96,32 @@ final class TermInfosWriter {
fieldInfos = fis;
isIndex = isi;
output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis"));
- output.writeInt(FORMAT); // write format
+ output.writeInt(FORMAT_CURRENT); // write format
output.writeLong(0); // leave space for size
- output.writeInt(indexInterval); // write indexInterval
- output.writeInt(skipInterval); // write skipInterval
- output.writeInt(maxSkipLevels); // write maxSkipLevels
+ output.writeInt(indexInterval); // write indexInterval
+ output.writeInt(skipInterval); // write skipInterval
+ output.writeInt(maxSkipLevels); // write maxSkipLevels
+ assert initUTF16Results();
}
void add(Term term, TermInfo ti) throws IOException {
+ UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result);
+ add(fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti);
+ }
- final int length = term.text.length();
- if (termTextBuffer.length < length)
- termTextBuffer = new char[(int) (length*1.25)];
+ // Currently used only by assert statements
+ UnicodeUtil.UTF16Result utf16Result1;
+ UnicodeUtil.UTF16Result utf16Result2;
- term.text.getChars(0, length, termTextBuffer, 0);
-
- add(fieldInfos.fieldNumber(term.field), termTextBuffer, 0, length, ti);
+ // Currently used only by assert statements
+ private boolean initUTF16Results() {
+ utf16Result1 = new UnicodeUtil.UTF16Result();
+ utf16Result2 = new UnicodeUtil.UTF16Result();
+ return true;
}
// Currently used only by assert statement
- private int compareToLastTerm(int fieldNumber, char[] termText, int start, int length) {
- int pos = 0;
+ private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
if (lastFieldNumber != fieldNumber) {
final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
@@ -121,45 +133,42 @@ final class TermInfosWriter {
return cmp;
}
- while(pos < length && pos < lastTermTextLength) {
- final char c1 = lastTermText[pos];
- final char c2 = termText[pos + start];
- if (c1 < c2)
- return -1;
- else if (c1 > c2)
- return 1;
- pos++;
- }
-
- if (pos < lastTermTextLength)
- // Last term was longer
- return 1;
- else if (pos < length)
- // Last term was shorter
- return -1;
+ UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
+ UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
+ final int len;
+ if (utf16Result1.length < utf16Result2.length)
+ len = utf16Result1.length;
else
- return 0;
+ len = utf16Result2.length;
+
+ for(int i=0;i<len;i++) {
+ final char ch1 = utf16Result1.result[i];
+ final char ch2 = utf16Result2.result[i];
+ if (ch1 != ch2)
+ return ch1-ch2;
+ }
+ return utf16Result1.length - utf16Result2.length;
+ }
- /** Adds a new <Term, TermInfo> pair to the set.
+ /** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
Term must be lexicographically greater than all previous Terms added.
TermInfo pointers must be positive and greater than all previous.*/
- void add(int fieldNumber, char[] termText, int termTextStart, int termTextLength, TermInfo ti)
+ void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
throws IOException {
- assert compareToLastTerm(fieldNumber, termText, termTextStart, termTextLength) < 0 ||
- (isIndex && termTextLength == 0 && lastTermTextLength == 0) :
+ assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
+ (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
"Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
- " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
- " text=" + new String(termText, termTextStart, termTextLength) + " lastText=" + new String(lastTermText, 0, lastTermTextLength);
+ " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
+ " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
if (!isIndex && size % indexInterval == 0)
- other.add(lastFieldNumber, lastTermText, 0, lastTermTextLength, lastTi); // add an index term
+ other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
- writeTerm(fieldNumber, termText, termTextStart, termTextLength); // write term
+ writeTerm(fieldNumber, termBytes, termBytesLength); // write term
output.writeVInt(ti.docFreq); // write doc freq
output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
@@ -174,34 +183,36 @@ final class TermInfosWriter {
lastIndexPointer = other.output.getFilePointer(); // write pointer
}
- if (lastTermText.length < termTextLength)
- lastTermText = new char[(int) (termTextLength*1.25)];
- System.arraycopy(termText, termTextStart, lastTermText, 0, termTextLength);
- lastTermTextLength = termTextLength;
lastFieldNumber = fieldNumber;
-
lastTi.set(ti);
size++;
}
- private void writeTerm(int fieldNumber, char[] termText, int termTextStart, int termTextLength)
+ private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
throws IOException {
+ // TODO: UTF16toUTF8 could tell us this prefix
// Compute prefix in common with last term:
int start = 0;
- final int limit = termTextLength < lastTermTextLength ? termTextLength : lastTermTextLength;
+ final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
while(start < limit) {
- if (termText[termTextStart+start] != lastTermText[start])
+ if (termBytes[start] != lastTermBytes[start])
break;
start++;
}
- int length = termTextLength - start;
-
+ final int length = termBytesLength - start;
output.writeVInt(start); // write shared prefix length
output.writeVInt(length); // write delta length
- output.writeChars(termText, start+termTextStart, length); // write delta chars
+ output.writeBytes(termBytes, start, length); // write delta bytes
output.writeVInt(fieldNumber); // write field num
+ if (lastTermBytes.length < termBytesLength) {
+ byte[] newArray = new byte[(int) (termBytesLength*1.5)];
+ System.arraycopy(lastTermBytes, 0, newArray, 0, start);
+ lastTermBytes = newArray;
+ }
+ System.arraycopy(termBytes, start, lastTermBytes, start, length);
+ lastTermBytesLength = termBytesLength;
}
/** Called to complete TermInfos creation. */
diff --git a/src/java/org/apache/lucene/index/TermVectorsReader.java b/src/java/org/apache/lucene/index/TermVectorsReader.java
index 533a716661c..c90d1c2f0e8 100644
--- a/src/java/org/apache/lucene/index/TermVectorsReader.java
+++ b/src/java/org/apache/lucene/index/TermVectorsReader.java
@@ -32,8 +32,16 @@ class TermVectorsReader implements Cloneable {
// NOTE: if you make a new format, it must be larger than
// the current format
static final int FORMAT_VERSION = 2;
+
+ // Changes to speed up bulk merging of term vectors:
static final int FORMAT_VERSION2 = 3;
+ // Changed strings to UTF8 with length-in-bytes not length-in-chars
+ static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
+
+ // NOTE: always change this if you switch to a new format!
+ static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
+
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
static final int FORMAT_SIZE = 4;
@@ -134,7 +142,7 @@ class TermVectorsReader implements Cloneable {
}
boolean canReadRawDocs() {
- return format >= FORMAT_VERSION2;
+ return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
}
/** Retrieve the length (in bytes) of the tvd and tvf
@@ -190,9 +198,9 @@ class TermVectorsReader implements Cloneable {
private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
{
int format = in.readInt();
- if (format > FORMAT_VERSION2) {
+ if (format > FORMAT_CURRENT) {
throw new CorruptIndexException("Incompatible format version: " + format + " expected "
- + FORMAT_VERSION2 + " or less");
+ + FORMAT_CURRENT + " or less");
}
return format;
}
@@ -434,24 +442,45 @@ class TermVectorsReader implements Cloneable {
int start = 0;
int deltaLength = 0;
int totalLength = 0;
- char [] buffer = new char[10]; // init the buffer with a length of 10 character
- char[] previousBuffer = {};
-
+ byte[] byteBuffer;
+ char[] charBuffer;
+ final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
+
+ // init the buffers
+ if (preUTF8) {
+ charBuffer = new char[10];
+ byteBuffer = null;
+ } else {
+ charBuffer = null;
+ byteBuffer = new byte[20];
+ }
+
for (int i = 0; i < numTerms; i++) {
start = tvf.readVInt();
deltaLength = tvf.readVInt();
totalLength = start + deltaLength;
- if (buffer.length < totalLength) { // increase buffer
- buffer = null; // give a hint to garbage collector
- buffer = new char[totalLength];
-
- if (start > 0) // just copy if necessary
- System.arraycopy(previousBuffer, 0, buffer, 0, start);
- }
+
+ final String term;
- tvf.readChars(buffer, start, deltaLength);
- String term = new String(buffer, 0, totalLength);
- previousBuffer = buffer;
+ if (preUTF8) {
+ // Term stored as java chars
+ if (charBuffer.length < totalLength) {
+ char[] newCharBuffer = new char[(int) (1.5*totalLength)];
+ System.arraycopy(charBuffer, 0, newCharBuffer, 0, start);
+ charBuffer = newCharBuffer;
+ }
+ tvf.readChars(charBuffer, start, deltaLength);
+ term = new String(charBuffer, 0, totalLength);
+ } else {
+ // Term stored as utf8 bytes
+ if (byteBuffer.length < totalLength) {
+ byte[] newByteBuffer = new byte[(int) (1.5*totalLength)];
+ System.arraycopy(byteBuffer, 0, newByteBuffer, 0, start);
+ byteBuffer = newByteBuffer;
+ }
+ tvf.readBytes(byteBuffer, start, deltaLength);
+ term = new String(byteBuffer, 0, totalLength, "UTF-8");
+ }
int freq = tvf.readVInt();
int [] positions = null;
if (storePositions) { //read in the positions
diff --git a/src/java/org/apache/lucene/index/TermVectorsWriter.java b/src/java/org/apache/lucene/index/TermVectorsWriter.java
index 9ac2104b9fc..0d5e4fc4378 100644
--- a/src/java/org/apache/lucene/index/TermVectorsWriter.java
+++ b/src/java/org/apache/lucene/index/TermVectorsWriter.java
@@ -20,6 +20,7 @@ package org.apache.lucene.index;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.UnicodeUtil;
import java.io.IOException;
@@ -27,17 +28,19 @@ final class TermVectorsWriter {
private IndexOutput tvx = null, tvd = null, tvf = null;
private FieldInfos fieldInfos;
+ final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(),
+ new UnicodeUtil.UTF8Result()};
public TermVectorsWriter(Directory directory, String segment,
FieldInfos fieldInfos)
throws IOException {
// Open files for TermVector storage
tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
- tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
+ tvx.writeInt(TermVectorsReader.FORMAT_CURRENT);
tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
- tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
+ tvd.writeInt(TermVectorsReader.FORMAT_CURRENT);
tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
- tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
+ tvf.writeInt(TermVectorsReader.FORMAT_CURRENT);
this.fieldInfos = fieldInfos;
}
@@ -97,15 +100,22 @@ final class TermVectorsWriter {
final String[] terms = vectors[i].getTerms();
final int[] freqs = vectors[i].getTermFrequencies();
- String lastTermText = "";
+ int utf8Upto = 0;
+ utf8Results[1].length = 0;
+
for (int j=0; j bytes.length)
+ bytes = new byte[(int) (length*1.25)];
+ readBytes(bytes, 0, length);
+ return new String(bytes, 0, length, "UTF-8");
+ }
+
+ private String readModifiedUTF8String() throws IOException {
int length = readVInt();
if (chars == null || length > chars.length)
chars = new char[length];
@@ -113,11 +133,15 @@ public abstract class IndexInput implements Cloneable {
return new String(chars, 0, length);
}
- /** Reads UTF-8 encoded characters into an array.
+ /** Reads Lucene's old "modified UTF-8" encoded
+ * characters into an array.
* @param buffer the array to read characters into
* @param start the offset in the array to start storing characters
* @param length the number of characters to read
* @see IndexOutput#writeChars(String,int,int)
+ * @deprecated -- please use readString or readBytes
+ * instead, and construct the string
+ * from those utf8 bytes
*/
public void readChars(char[] buffer, int start, int length)
throws IOException {
@@ -144,6 +168,8 @@ public abstract class IndexInput implements Cloneable {
* and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine
* how many more bytes to read
* @param length The number of chars to read
+ * @deprecated this method operates on old "modified utf8" encoded
+ * strings
*/
public void skipChars(int length) throws IOException{
for (int i = 0; i < length; i++) {
@@ -194,6 +220,7 @@ public abstract class IndexInput implements Cloneable {
clone = (IndexInput)super.clone();
} catch (CloneNotSupportedException e) {}
+ clone.bytes = null;
clone.chars = null;
return clone;
diff --git a/src/java/org/apache/lucene/store/IndexOutput.java b/src/java/org/apache/lucene/store/IndexOutput.java
index 648355db08f..acd4dcf4ac4 100644
--- a/src/java/org/apache/lucene/store/IndexOutput.java
+++ b/src/java/org/apache/lucene/store/IndexOutput.java
@@ -18,6 +18,7 @@ package org.apache.lucene.store;
*/
import java.io.IOException;
+import org.apache.lucene.util.UnicodeUtil;
/** Abstract base class for output to a file in a Directory. A random-access
* output stream. Used for all Lucene index output operations.
@@ -26,6 +27,8 @@ import java.io.IOException;
*/
public abstract class IndexOutput {
+ private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
+
/** Writes a single byte.
* @see IndexInput#readByte()
*/
@@ -96,16 +99,18 @@ public abstract class IndexOutput {
* @see IndexInput#readString()
*/
public void writeString(String s) throws IOException {
- int length = s.length();
- writeVInt(length);
- writeChars(s, 0, length);
+ UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result);
+ writeVInt(utf8Result.length);
+ writeBytes(utf8Result.result, 0, utf8Result.length);
}
- /** Writes a sequence of UTF-8 encoded characters from a string.
+ /** Writes a sub sequence of characters from s as the old
+ * format (modified UTF-8 encoded bytes).
* @param s the source of the characters
* @param start the first character in the sequence
* @param length the number of characters in the sequence
- * @see IndexInput#readChars(char[],int,int)
+ * @deprecated -- please pre-convert to utf8 bytes
+ * instead or use {@link #writeString}
*/
public void writeChars(String s, int start, int length)
throws IOException {
@@ -125,11 +130,12 @@ public abstract class IndexOutput {
}
}
- /** Writes a sequence of UTF-8 encoded characters from a char[].
+ /** Writes a sub sequence of characters from char[] as
+ * the old format (modified UTF-8 encoded bytes).
* @param s the source of the characters
* @param start the first character in the sequence
* @param length the number of characters in the sequence
- * @see IndexInput#readChars(char[],int,int)
+ * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString}
*/
public void writeChars(char[] s, int start, int length)
throws IOException {
diff --git a/src/java/org/apache/lucene/util/StringHelper.java b/src/java/org/apache/lucene/util/StringHelper.java
index 7d422b3ccb0..7ffdd92aacb 100644
--- a/src/java/org/apache/lucene/util/StringHelper.java
+++ b/src/java/org/apache/lucene/util/StringHelper.java
@@ -25,6 +25,22 @@ package org.apache.lucene.util;
*/
public abstract class StringHelper {
+ /**
+ * Compares two byte[] arrays, element by element, and returns the
+ * first position where they differ (i.e. the count of leading elements
+ * common to both arrays).
+ *
+ * @param bytes1 The first byte[] to compare (first len1 elements are valid)
+ * @param bytes2 The second byte[] to compare (first len2 elements are valid)
+ */
+ public static final int bytesDifference(byte[] bytes1, int len1, byte[] bytes2, int len2) {
+ int len = len1 < len2 ? len1 : len2;
+ for (int i = 0; i < len; i++)
+ if (bytes1[i] != bytes2[i])
+ return i;
+ return len;
+ }
+
/**
* Compares two strings, character by character, and returns the
* first position where the two strings differ from one another.
@@ -45,7 +61,6 @@ public abstract class StringHelper {
return len;
}
-
private StringHelper() {
}
}
diff --git a/src/java/org/apache/lucene/util/UnicodeUtil.java b/src/java/org/apache/lucene/util/UnicodeUtil.java
new file mode 100644
index 00000000000..020bc694245
--- /dev/null
+++ b/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -0,0 +1,447 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * Some of this code came from the excellent Unicode
+ * conversion examples from:
+ *
+ * http://www.unicode.org/Public/PROGRAMS/CVTUTF
+ *
+ * Full Copyright for that code follows:
+*/
+
+/*
+ * Copyright 2001-2004 Unicode, Inc.
+ *
+ * Disclaimer
+ *
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ *
+ * Limitations on Rights to Redistribute This Code
+ *
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+/**
+ * Class to encode java's UTF16 char[] into UTF8 byte[]
+ * without always allocating a new byte[] as
+ * String.getBytes("UTF-8") does.
+ *
+ * WARNING: This API is new and experimental and
+ * may suddenly change.
+ */
+
+final public class UnicodeUtil {
+
+ public static final int UNI_SUR_HIGH_START = 0xD800;
+ public static final int UNI_SUR_HIGH_END = 0xDBFF;
+ public static final int UNI_SUR_LOW_START = 0xDC00;
+ public static final int UNI_SUR_LOW_END = 0xDFFF;
+ public static final int UNI_REPLACEMENT_CHAR = 0xFFFD;
+
+ private static final long UNI_MAX_BMP = 0x0000FFFF;
+
+ private static final int HALF_BASE = 0x0010000;
+ private static final long HALF_SHIFT = 10;
+ private static final long HALF_MASK = 0x3FFL;
+
+ public static final class UTF8Result {
+ public byte[] result = new byte[10];
+ public int length;
+
+ public void setLength(int newLength) {
+ if (result.length < newLength) {
+ byte[] newArray = new byte[(int) (1.5*newLength)];
+ System.arraycopy(result, 0, newArray, 0, length);
+ result = newArray;
+ }
+ length = newLength;
+ }
+ }
+
+ public static final class UTF16Result {
+ public char[] result = new char[10];
+ public int[] offsets = new int[10];
+ public int length;
+
+ public void setLength(int newLength) {
+ if (result.length < newLength) {
+ char[] newArray = new char[(int) (1.5*newLength)];
+ System.arraycopy(result, 0, newArray, 0, length);
+ result = newArray;
+ }
+ length = newLength;
+ }
+
+ public void copyText(UTF16Result other) {
+ setLength(other.length);
+ System.arraycopy(other.result, 0, result, 0, length);
+ }
+ }
+
+ /** Encode characters from a char[] source, starting at
+ * offset and stopping when the character 0xffff is seen.
+ * The encoded bytes and their length are stored in result. */
+ public static void UTF16toUTF8(final char[] source, final int offset, UTF8Result result) {
+
+ int upto = 0;
+ int i = offset;
+ byte[] out = result.result;
+
+ while(true) {
+
+ final int code = (int) source[i++];
+
+ if (upto+4 > out.length) {
+ byte[] newOut = new byte[2*out.length];
+ assert newOut.length >= upto+4;
+ System.arraycopy(out, 0, newOut, 0, upto);
+ result.result = out = newOut;
+ }
+ if (code < 0x80)
+ out[upto++] = (byte) code;
+ else if (code < 0x800) {
+ out[upto++] = (byte) (0xC0 | (code >> 6));
+ out[upto++] = (byte)(0x80 | (code & 0x3F));
+ } else if (code < 0xD800 || code > 0xDFFF) {
+ if (code == 0xffff)
+ // END
+ break;
+ out[upto++] = (byte)(0xE0 | (code >> 12));
+ out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+ out[upto++] = (byte)(0x80 | (code & 0x3F));
+ } else {
+ // surrogate pair
+ // confirm valid high surrogate
+ if (code < 0xDC00 && source[i] != 0xffff) {
+ int utf32 = (int) source[i];
+ // confirm valid low surrogate and write pair
+ if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+ utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+ i++;
+ out[upto++] = (byte)(0xF0 | (utf32 >> 18));
+ out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+ out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+ out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+ continue;
+ }
+ }
+ // replace unpaired surrogate or out-of-order low surrogate
+ // with substitution character
+ out[upto++] = (byte) 0xEF;
+ out[upto++] = (byte) 0xBF;
+ out[upto++] = (byte) 0xBD;
+ }
+ }
+ //assert matches(source, offset, i-offset-1, out, upto);
+ result.length = upto;
+ }
+
+ /** Encode characters from a char[] source, starting at
+ * offset for length chars. The encoded bytes and their
+ * length are stored in result. */
+ public static void UTF16toUTF8(final char[] source, final int offset, final int length, UTF8Result result) {
+
+ int upto = 0;
+ int i = offset;
+ final int end = offset + length;
+ byte[] out = result.result;
+
+ while(i < end) {
+
+ final int code = (int) source[i++];
+
+ if (upto+4 > out.length) {
+ byte[] newOut = new byte[2*out.length];
+ assert newOut.length >= upto+4;
+ System.arraycopy(out, 0, newOut, 0, upto);
+ result.result = out = newOut;
+ }
+ if (code < 0x80)
+ out[upto++] = (byte) code;
+ else if (code < 0x800) {
+ out[upto++] = (byte) (0xC0 | (code >> 6));
+ out[upto++] = (byte)(0x80 | (code & 0x3F));
+ } else if (code < 0xD800 || code > 0xDFFF) {
+ out[upto++] = (byte)(0xE0 | (code >> 12));
+ out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+ out[upto++] = (byte)(0x80 | (code & 0x3F));
+ } else {
+ // surrogate pair
+ // confirm valid high surrogate
+ if (code < 0xDC00 && i < end && source[i] != 0xffff) {
+ int utf32 = (int) source[i];
+ // confirm valid low surrogate and write pair
+ if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+ utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+ i++;
+ out[upto++] = (byte)(0xF0 | (utf32 >> 18));
+ out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+ out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+ out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+ continue;
+ }
+ }
+ // replace unpaired surrogate or out-of-order low surrogate
+ // with substitution character
+ out[upto++] = (byte) 0xEF;
+ out[upto++] = (byte) 0xBF;
+ out[upto++] = (byte) 0xBD;
+ }
+ }
+ //assert matches(source, offset, length, out, upto);
+ result.length = upto;
+ }
+
+ /** Encode characters from this String, starting at offset
+ * for length characters. The encoded bytes and their
+ * length are stored in result. */
+ public static void UTF16toUTF8(final String s, final int offset, final int length, UTF8Result result) {
+ final int end = offset + length;
+
+ byte[] out = result.result;
+
+ int upto = 0;
+ for(int i=offset;i<end;i++) {
+
+ final int code = (int) s.charAt(i);
+
+ if (upto+4 > out.length) {
+ byte[] newOut = new byte[2*out.length];
+ assert newOut.length >= upto+4;
+ System.arraycopy(out, 0, newOut, 0, upto);
+ result.result = out = newOut;
+ }
+ if (code < 0x80)
+ out[upto++] = (byte) code;
+ else if (code < 0x800) {
+ out[upto++] = (byte) (0xC0 | (code >> 6));
+ out[upto++] = (byte)(0x80 | (code & 0x3F));
+ } else if (code < 0xD800 || code > 0xDFFF) {
+ out[upto++] = (byte)(0xE0 | (code >> 12));
+ out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+ out[upto++] = (byte)(0x80 | (code & 0x3F));
+ } else {
+ // surrogate pair
+ // confirm valid high surrogate
+ if (code < 0xDC00 && (i < end-1)) {
+ int utf32 = (int) s.charAt(i+1);
+ // confirm valid low surrogate and write pair
+ if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+ utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+ i++;
+ out[upto++] = (byte)(0xF0 | (utf32 >> 18));
+ out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+ out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+ out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+ continue;
+ }
+ }
+ // replace unpaired surrogate or out-of-order low surrogate
+ // with substitution character
+ out[upto++] = (byte) 0xEF;
+ out[upto++] = (byte) 0xBF;
+ out[upto++] = (byte) 0xBD;
+ }
+ }
+ //assert matches(s, offset, length, out, upto);
+ result.length = upto;
+ }
+
+ /** Convert UTF8 bytes into UTF16 characters. If offset
+ * is non-zero, conversion starts at that starting point
+ * in utf8, re-using the results from the previous call
+ * up until offset. */
+ public static void UTF8toUTF16(final byte[] utf8, final int offset, final int length, final UTF16Result result) {
+
+ final int end = offset + length;
+ char[] out = result.result;
+ if (result.offsets.length <= end) {
+ int[] newOffsets = new int[2*end];
+ System.arraycopy(result.offsets, 0, newOffsets, 0, result.offsets.length);
+ result.offsets = newOffsets;
+ }
+ final int[] offsets = result.offsets;
+
+ // If incremental decoding fell in the middle of a
+ // single unicode character, rollback to its start:
+ int upto = offset;
+ while(offsets[upto] == -1)
+ upto--;
+
+ int outUpto = offsets[upto];
+
+ // Pre-allocate for worst case 1-for-1
+ if (outUpto+length >= out.length) {
+ char[] newOut = new char[2*(outUpto+length)];
+ System.arraycopy(out, 0, newOut, 0, outUpto);
+ result.result = out = newOut;
+ }
+
+ while (upto < end) {
+
+ final int b = utf8[upto]&0xff;
+ final int ch;
+
+ offsets[upto++] = outUpto;
+
+ if (b < 0xc0) {
+ assert b < 0x80;
+ ch = b;
+ } else if (b < 0xe0) {
+ ch = ((b&0x1f)<<6) + (utf8[upto]&0x3f);
+ offsets[upto++] = -1;
+ } else if (b < 0xf0) {
+ ch = ((b&0xf)<<12) + ((utf8[upto]&0x3f)<<6) + (utf8[upto+1]&0x3f);
+ offsets[upto++] = -1;
+ offsets[upto++] = -1;
+ } else {
+ assert b < 0xf8;
+ ch = ((b&0x7)<<18) + ((utf8[upto]&0x3f)<<12) + ((utf8[upto+1]&0x3f)<<6) + (utf8[upto+2]&0x3f);
+ offsets[upto++] = -1;
+ offsets[upto++] = -1;
+ offsets[upto++] = -1;
+ }
+
+ if (ch <= UNI_MAX_BMP) {
+ // target is a character <= 0xFFFF
+ out[outUpto++] = (char) ch;
+ } else {
+ // target is a character in range 0xFFFF - 0x10FFFF
+ final int chHalf = ch - HALF_BASE;
+ out[outUpto++] = (char) ((chHalf >> HALF_SHIFT) + UNI_SUR_HIGH_START);
+ out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
+ }
+ }
+
+ offsets[upto] = outUpto;
+ result.length = outUpto;
+ }
+
+ // Only called from assert
+ /*
+ private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
+ try {
+ String s1 = new String(source, offset, length);
+ String s2 = new String(result, 0, upto, "UTF-8");
+ if (!s1.equals(s2)) {
+ //System.out.println("DIFF: s1 len=" + s1.length());
+ //for(int i=0;i= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+ if (i < size-1) {
+ i++;
+ char nextCH = s.charAt(i);
+ if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) {
+ // Valid surrogate pair
+ } else
+ // Unmatched high surrogate
+ return false;
+ } else
+ // Unmatched high surrogate
+ return false;
+ } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
+ // Unmatched low surrogate
+ return false;
+ }
+
+ return true;
+ }
+
+ public static final boolean validUTF16String(char[] s, int size) {
+ for(int i=0;i<size;i++) {
+ char ch = s[i];
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+ if (i < size-1) {
+ i++;
+ char nextCH = s[i];
+ if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) {
+ // Valid surrogate pair
+ } else
+ return false;
+ } else
+ return false;
+ } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
+ // Unmatched low surrogate
+ return false;
+ }
+
+ return true;
+ }
+ */
+}
diff --git a/src/site/src/documentation/content/xdocs/fileformats.xml b/src/site/src/documentation/content/xdocs/fileformats.xml
index a776abf36eb..a1d7f961c14 100644
--- a/src/site/src/documentation/content/xdocs/fileformats.xml
+++ b/src/site/src/documentation/content/xdocs/fileformats.xml
@@ -736,10 +736,7 @@
Lucene writes unicode
- character sequences using Java's
- "modified
- UTF-8 encoding"
- .
+ character sequences as UTF-8 encoded bytes.
@@ -748,8 +745,9 @@
String
- Lucene writes strings as a VInt representing the length, followed by
- the character data.
+ Lucene writes strings as UTF-8 encoded bytes.
+ First the length, in bytes, is written as a VInt,
+ followed by the bytes.
@@ -1233,10 +1231,12 @@
--> VInt
- This
- file is sorted by Term. Terms are ordered first lexicographically
- by the term's field name, and within that lexicographically by the
- term's text.
+
+ This file is sorted by Term. Terms are
+ ordered first lexicographically (by UTF16
+ character code) by the term's field name,
+ and within that lexicographically (by
+ UTF16 character code) by the term's text.
TIVersion names the version of the format
of this file and is -2 in Lucene 1.4.
diff --git a/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
index ad4309f3e83..6add9cdd67c 100644
--- a/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
+++ b/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
@@ -20,6 +20,7 @@ package org.apache.lucene.index;
import org.apache.lucene.util.LuceneTestCase;
import java.util.Arrays;
+import java.util.List;
import java.util.Enumeration;
import java.util.zip.ZipFile;
import java.util.zip.ZipEntry;
@@ -39,6 +40,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.util._TestUtil;
/*
Verify we can read the pre-2.1 file format, do searches
@@ -131,7 +133,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
for(int i=0;i> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
+ chars[len++] = (char) (((ch-0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
+ }
+
+ UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);
+
+ String s1 = new String(chars, 0, len);
+ String s2 = new String(utf8.result, 0, utf8.length, "UTF-8");
+ assertEquals("codepoint " + ch, s1, s2);
+
+ UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
+ assertEquals("codepoint " + ch, s1, new String(utf16.result, 0, utf16.length));
+
+ byte[] b = s1.getBytes("UTF-8");
+ assertEquals(utf8.length, b.length);
+ for(int j=0;j 0 && buffer[offset] >= 0xdc00 && buffer[offset] < 0xe000)
+ // Don't start in the middle of a valid surrogate pair
+ offset--;
+
+ for(int i=offset;i |