mirror of https://github.com/apache/lucene.git
LUCENE-510: change index format to store strings as true UTF8 not modified UTF8
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@641303 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0bd6f8ec04
commit
8af3598b74
|
@ -61,6 +61,13 @@ API Changes
|
|||
|
||||
7. LUCENE-1234: Make BoostingSpanScorer protected. (Andi Vajda via Grant Ingersoll)
|
||||
|
||||
8. LUCENE-510: The index now stores strings as true UTF-8 bytes
|
||||
(previously it was Java's modified UTF-8). If any text, either
|
||||
a stored field or a token, contains illegal UTF-16 surrogate characters,
|
||||
these characters are now silently replaced with the Unicode
|
||||
replacement character U+FFFD. This is a change to the index file
|
||||
format. (Marvin Humphrey via Mike McCandless)
|
||||
|
||||
Bug fixes
|
||||
|
||||
1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimize a single
|
||||
|
|
29
LICENSE.txt
29
LICENSE.txt
|
@ -200,3 +200,32 @@
|
|||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
|
||||
|
||||
Some code in src/java/org/apache/lucene/util/UnicodeUtil.java was
|
||||
derived from unicode conversion examples available at
|
||||
http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright
|
||||
from those sources:
|
||||
|
||||
/*
|
||||
* Copyright 2001-2004 Unicode, Inc.
|
||||
*
|
||||
* Disclaimer
|
||||
*
|
||||
* This source code is provided as is by Unicode, Inc. No claims are
|
||||
* made as to fitness for any particular purpose. No warranties of any
|
||||
* kind are expressed or implied. The recipient agrees to determine
|
||||
* applicability of information provided. If this file has been
|
||||
* purchased on magnetic or optical media from Unicode, Inc., the
|
||||
* sole remedy for any claim will be exchange of defective media
|
||||
* within 90 days of receipt.
|
||||
*
|
||||
* Limitations on Rights to Redistribute This Code
|
||||
*
|
||||
* Unicode, Inc. hereby grants the right to freely use the information
|
||||
* supplied in this file in the creation of products supporting the
|
||||
* Unicode Standard, and to make copies of this file in any form
|
||||
* for internal or external distribution as long as this notice
|
||||
* remains attached.
|
||||
*/
|
||||
|
|
|
@ -9,4 +9,3 @@ The snowball stemmers in
|
|||
were developed by Martin Porter and Richard Boulton.
|
||||
The full snowball package is available from
|
||||
http://snowball.tartarus.org/
|
||||
|
||||
|
|
|
@ -1237,16 +1237,14 @@ document.write("Last Published: " + document.lastModified);
|
|||
<h3 class="boxed">Chars</h3>
|
||||
<p>
|
||||
Lucene writes unicode
|
||||
character sequences using Java's
|
||||
<a href="http://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8">"modified
|
||||
UTF-8 encoding"</a>
|
||||
.
|
||||
character sequences as UTF-8 encoded bytes.
|
||||
</p>
|
||||
<a name="N10433"></a><a name="String"></a>
|
||||
<a name="N1042F"></a><a name="String"></a>
|
||||
<h3 class="boxed">String</h3>
|
||||
<p>
|
||||
Lucene writes strings as a VInt representing the length, followed by
|
||||
the character data.
|
||||
Lucene writes strings as UTF-8 encoded bytes.
|
||||
First the length, in bytes, is written as a VInt,
|
||||
followed by the bytes.
|
||||
</p>
|
||||
<p>
|
||||
String --> VInt, Chars
|
||||
|
@ -1254,13 +1252,13 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N10440"></a><a name="Per-Index Files"></a>
|
||||
<a name="N1043C"></a><a name="Per-Index Files"></a>
|
||||
<h2 class="boxed">Per-Index Files</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
The files in this section exist one-per-index.
|
||||
</p>
|
||||
<a name="N10448"></a><a name="Segments File"></a>
|
||||
<a name="N10444"></a><a name="Segments File"></a>
|
||||
<h3 class="boxed">Segments File</h3>
|
||||
<p>
|
||||
The active segments in the index are stored in the
|
||||
|
@ -1421,7 +1419,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
This is used to verify integrity of the file on
|
||||
opening the index.
|
||||
</p>
|
||||
<a name="N104DC"></a><a name="Lock File"></a>
|
||||
<a name="N104D8"></a><a name="Lock File"></a>
|
||||
<h3 class="boxed">Lock File</h3>
|
||||
<p>
|
||||
The write lock, which is stored in the index
|
||||
|
@ -1439,7 +1437,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
Note that prior to version 2.1, Lucene also used a
|
||||
commit lock. This was removed in 2.1.
|
||||
</p>
|
||||
<a name="N104E8"></a><a name="Deletable File"></a>
|
||||
<a name="N104E4"></a><a name="Deletable File"></a>
|
||||
<h3 class="boxed">Deletable File</h3>
|
||||
<p>
|
||||
Prior to Lucene 2.1 there was a file "deletable"
|
||||
|
@ -1448,7 +1446,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
the files that are deletable, instead, so no file
|
||||
is written.
|
||||
</p>
|
||||
<a name="N104F1"></a><a name="Compound Files"></a>
|
||||
<a name="N104ED"></a><a name="Compound Files"></a>
|
||||
<h3 class="boxed">Compound Files</h3>
|
||||
<p>Starting with Lucene 1.4, the compound file format became the default. This
|
||||
is simply a container for all files described in the next section
|
||||
|
@ -1475,14 +1473,14 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N10519"></a><a name="Per-Segment Files"></a>
|
||||
<a name="N10515"></a><a name="Per-Segment Files"></a>
|
||||
<h2 class="boxed">Per-Segment Files</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
The remaining files are all per-segment, and are
|
||||
thus defined by suffix.
|
||||
</p>
|
||||
<a name="N10521"></a><a name="Fields"></a>
|
||||
<a name="N1051D"></a><a name="Fields"></a>
|
||||
<h3 class="boxed">Fields</h3>
|
||||
<p>
|
||||
|
||||
|
@ -1701,7 +1699,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
|
||||
</ol>
|
||||
<a name="N105DC"></a><a name="Term Dictionary"></a>
|
||||
<a name="N105D8"></a><a name="Term Dictionary"></a>
|
||||
<h3 class="boxed">Term Dictionary</h3>
|
||||
<p>
|
||||
The term dictionary is represented as two files:
|
||||
|
@ -1764,10 +1762,12 @@ document.write("Last Published: " + document.lastModified);
|
|||
--> VInt
|
||||
</p>
|
||||
|
||||
<p>This
|
||||
file is sorted by Term. Terms are ordered first lexicographically
|
||||
by the term's field name, and within that lexicographically by the
|
||||
term's text.
|
||||
<p>
|
||||
This file is sorted by Term. Terms are
|
||||
ordered first lexicographically (by UTF-16
|
||||
character code) by the term's field name,
|
||||
and within that lexicographically (by
|
||||
UTF-16 character code) by the term's text.
|
||||
</p>
|
||||
|
||||
<p>TIVersion names the version of the format
|
||||
|
@ -1887,7 +1887,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
|
||||
</ol>
|
||||
<a name="N1065C"></a><a name="Frequencies"></a>
|
||||
<a name="N10658"></a><a name="Frequencies"></a>
|
||||
<h3 class="boxed">Frequencies</h3>
|
||||
<p>
|
||||
The .frq file contains the lists of documents
|
||||
|
@ -2005,7 +2005,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
entry in level-1. In the example, entry 15 on level 1 has a pointer to entry 15 on level 0 and entry 31 on level 1 has a pointer
|
||||
to entry 31 on level 0.
|
||||
</p>
|
||||
<a name="N106DE"></a><a name="Positions"></a>
|
||||
<a name="N106DA"></a><a name="Positions"></a>
|
||||
<h3 class="boxed">Positions</h3>
|
||||
<p>
|
||||
The .prx file contains the lists of positions that
|
||||
|
@ -2071,7 +2071,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
Payload. If PayloadLength is not stored, then this Payload has the same
|
||||
length as the Payload at the previous position.
|
||||
</p>
|
||||
<a name="N1071A"></a><a name="Normalization Factors"></a>
|
||||
<a name="N10716"></a><a name="Normalization Factors"></a>
|
||||
<h3 class="boxed">Normalization Factors</h3>
|
||||
<p>
|
||||
|
||||
|
@ -2175,7 +2175,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
<b>2.1 and above:</b>
|
||||
Separate norm files are created (when adequate) for both compound and non-compound segments.
|
||||
</p>
|
||||
<a name="N10783"></a><a name="Term Vectors"></a>
|
||||
<a name="N1077F"></a><a name="Term Vectors"></a>
|
||||
<h3 class="boxed">Term Vectors</h3>
|
||||
<p>
|
||||
Term Vector support is optional on a field by
|
||||
|
@ -2308,7 +2308,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
|
||||
</ol>
|
||||
<a name="N10819"></a><a name="Deleted Documents"></a>
|
||||
<a name="N10815"></a><a name="Deleted Documents"></a>
|
||||
<h3 class="boxed">Deleted Documents</h3>
|
||||
<p>The .del file is
|
||||
optional, and only exists when a segment contains deletions.
|
||||
|
@ -2380,7 +2380,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N1085C"></a><a name="Limitations"></a>
|
||||
<a name="N10858"></a><a name="Limitations"></a>
|
||||
<h2 class="boxed">Limitations</h2>
|
||||
<div class="section">
|
||||
<p>There
|
||||
|
|
|
@ -5,10 +5,10 @@
|
|||
/Producer (FOP 0.20.5) >>
|
||||
endobj
|
||||
5 0 obj
|
||||
<< /Length 1113 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 1115 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
Gb!$G9lo#B&;KZO$6@53W]k9ICdOP`P=a5[dnAEt!C8gORi4Y_IpYbOI4uP7>VL*sJQDKN]6[Q8S%SK06ig^-JUH<Al=8O8@*ujlaYsElqCQ+]dUW@K(&6J^]Uhf(`"fd/ks57"^Y;LPl&;h5f5Weas6O](/=h']Wa8Dgbn^!_nuqmJZM#o`9CmhRK^LMlr+=3/eTP@k\(dc/b(WtL"dm!4)WMB;gocr=T%j/$lE>Xo\)4)`fne(e3=[b6f>EEC"UTpJnTI4b:+&Q\[CnNTGc/7;_)qPA_)lrGchW__JWg47o`BO[p&Um!+.u0W#O_5XQks>]'NNfml7k4h>AP)7<_:=9$tb55Sr>k,OS]7BE[U-Ab\Y@C53O7U[j+kjGtTb7cGJWt4]4q%1?L1!CQQ<5`TI,I2_)adekIJ>*t/^>pAl3uDLFdf5&^rP`F@@)9W(IcTW(NY#\]*sIM'Z<d&t3hRVikJNVEBpodoAaHU1t%N=<1'@Znt!e]BL\HZ/a>\]8oJGjSbj1prR?4Z*aJdu7J43Z2RImnNO,g&5I3M5VH2':-I_Sk%/*h!,Ube%='Nl=)%ig<O]S?L^)IJD0):,^^6[jHQQCW-C^9o*fNn)K>fBIK6kB('./d.ond,XEb"Gj0GB>!mi6:P'nJ.nk=omFh!NY##@\@,j[:b1"cq>'#cGHH=j_*[ELH%0iiFuF6Ypa8)d6R)6hg!:TBoHp'bhG-KhP`1"^1W>96'N<D13]Y/+UHG@&2r2F2$s\)JV&fP*2-,dk)TVE]n5qOOP6%ca`=h%]EG39p)4Bs]@HW+DQO4f/#dl(-2uJWE(:fZI2,;9_4U]5%3*,-VM=__!qNI>idM=<:"39-:<OT#"5S7H:k4\oWGFH"NYnA(]m_P[!j#Ab-=JX4=f8QF"m2]WUJ\p_Mdg>ZdRbNHV-aWUlu;WS;@ccG>Q&E%qrkRV5YNNK?0HTYmqU0t*ir#5_'Mql>(l\qQ((N0FFA,D72uTGCqlqqeq^]kh-tK]%BZrG5]kQueW@*6=,bdmL:Ahs+\@db%=c0>at7&VLcYb2'f+E?G+`RQ%F6g?W_$D)>,7:$@rrQZf%*]lD&$,O1he6&Y^a7t[t/~>
|
||||
Gb!$G9lo#B&;KZO$6@53W]k9ICdOP`P=a5[dnAEt!C8gORi4Y_IpYbOI4uP7>VL*sJQDKN]6[Q8S%SK06ig^-JUH<Al=8O8@*ujlaYsElqCQ+]dUW@K(&6J^]Uhf(`"fd/ks57"^Y;LPl&;h5f5Weas6O](/=h']Wa8Dgbn^!_nuqmJZM#o`9CmhRK^LMlr+=3/eTP@k\(dc/b(WtL"dm!4)WMB;gocr=T%j/$lE>Xo\)4)`fne(e3=[b6f>EEC"UTpJnTI4b:+&Q\[CnNTGc/7;_)qPA_)lrGchW__JWg47o`BO[p&Um!+.u0W#O_5XQks>]'NNfml7k4h>AP)7<_:=9$tb55Sr>k,OS]7BE[U-Ab\Y@C53O7U[j+kjGtTb7cGJWt4]4q%1?L1!CQQ<5`TI,I2_)adekIJ>*t/^>pAl3uDLFdf5&^rP`F@@)9W(IcTW(NY#\]*sIM'Z<d&t3hRVikJNVEBpodoAaHU1t%N=<1'@Znt!e]BL\HZ/a>\]8oJGjSbj1prR?4Z*aJdu7J43Z2RImnNO,g&5I3M5VH2':-I_Sk%/*h!,Ube%='Nl=)%ig<O]S?L^)IJD0):,^^6[jHQQCW-C^9o*fNn)K>fBIK6kB('./d.ond,XEb"Gj0GB>!mi6:P'nJ.nk=omFh!NY##@\@,j[:b1"cq>'#cGHH=j_*[ELH%0iiFuF6Ypa8)d6R)6hg!:TBoHp'bhG-KhP`1"^1W>96'N<D13]Y/+UHG@&2r2F2$s\)JV&fP*2-,dk)TVE]n5qOOP6%ca`=h%]EG39p)4Bs]@HW+DQO4f/#dl(-2uJWE(:fZI2,;9_4U]5%3*,-VM=__!qNI>idM=<:"39-:<OT#"5S7H:k4\oWGFH"NYnA(]m_P[!j#Ab-=JX4=f8QF"m2]WUJ\p_Mdg>ZdRbNHV-aWUlu;WS;@ccG>Q&E%qrkRV5YNNK?0HTYmqU0t*irpd"?qn,:d"C#-YFE^n1Pdh+p.E)6ormsXZ\W*VA:%RP,"e:+j.Cs4o`&e<tlkNO/Af5Gf:G\LtAq%r#BNX.t.*T_1rqm[19=O!>e8aUZ1jYiR+jsG2oQ:j<dfA8JsYn?Y(bb.E)%nMP[Pl~>
|
||||
endstream
|
||||
endobj
|
||||
6 0 obj
|
||||
|
@ -424,10 +424,10 @@ endobj
|
|||
>>
|
||||
endobj
|
||||
77 0 obj
|
||||
<< /Length 2395 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 2356 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
Gatn*9lo&I&;KZM'qANtbesnj&0uMb9TuG2an3q<C+a?6MF?-fO[Y%gj6#5(,Tm3e&F`#f1Qb+P$1>O>c\gQ^Dc_j(=,X6i'&C^Fmh\D*-W-1@nhoOrq]!hd\4Ep(.AJPS58J1N(Sl`'e7$Y!3L?q&&G^W>F*Z0K3f-9L^Z76sH_RTcPLcIAX]_goB]1_D$#NrQXd(-RGf#S%96mBDJ$iK+?*&n\6_YcA02%3_bG3U9o_6PTlht>h.f))-II&IO'@nT=9/F;tWD-L)N2/a;1:-\-.q^_QdRIcjZ)jDSl'2[bX.>Sa^7Qak@>'Edc1h@<3+:dCh7[eW\[-D'i\47=C?8C)W=N4J3[kePV),j$WD6R.S:se<?1FqY1--hSeQp]*if([,3mL0D-9Ft#KQJ3\=ui=9*Q"`b6clYeM5.]@1AHs&'q3,OQAq<?rS>GY6@V"!$Ef2Gj&n*O`*iPRZSn\l"!4BOa]Ye4mJtESS>i,'h6M`*GHX.PSI5LbCl8C1h)V3hIAHId4dS*IOH.NB_eW:dM?7$@B%H[=DG-ZhT2]>p$V!1W86CfX(L02mXGn[WnN\81Y'"$4NlrW:W?7=AgENG?efL""=U=8b[;nYQHV'QbBBSi_QUq^L_2$XDcnkp1Cs9!J-f'qPKC8M.%o<DGle3150!Kf[=W8$M#6YCmFfjtuFBB=;9`5+YQ7YM9HgN!@P_Y>1=t<,,V;JNRUru91F*YfGlsLhM_?\_pi%t\CE6PEN9DB>+$t2!>0\8/eYUnRtoI5I`5Tt$JF^64#b9HG7N_"[3G-=%o3%[HdI].DPLc4rNS=%+qTt(86>R6.TCP,P*i5.4T[f>CA&gJRbbOKS3.fo`pD2\f=>X.YXN[`QL5U!qGF^;84b9EIU+1#k>,-6.B*jVAk%,4Toj[]RbgJU<gf#;4O",-.E2Cgc3!A#3HCO94QM4pNc*KT=`MEl\t)l!TAerN2KTdKQC,$L)MmPb=XC'q0h@57&E!:)#&2)<PnZEUm0BC0,+842N[W3pXkL$!D)\)q_<E&7`'#Tno0K%%NN_D/C*^F@fE3"QpZKf'$5Og#Z[0X-!GWXdi!3&[fWK7u/d<0=C%F]3h`'@%MaBF:%We.g\R&"<V,)<%:?aq4]7n5=#`<eNL4@4jeN+,R=Ue%ic90se='.G/)X!aJM.\0=*LB>pg#JbA&"Fm,\fLLX%k;ns-!8A$iMh%X8qKX4,pRK<ebp3G411^,3mVL>CckD2%i2$I$2e%Pf*HmHuE:'#tTa3<8iZnEOu+(2E[<ug/(E+dUs&B4g+\@L15KeP-;^b\"q!C>AEp(1XR1@7H2Pp79;p9Di1p1:Fq@$`;PciQPsgA!K3Y]e6lmJU_el/<ZX"XjF,G&<2,)-B*5'o;^"[hWnB5f[\,I0!@o/4SG3@/Ybb7OIs5&"<uRefOMg)DW0uXd(+FRLHKdo_Uu?kR4m94r@qOVjW+D@qKS-?V!P=K)pl)@%7t,Ad6hI6Ns4]!iX2-/Z^UuMM/sIqZ<jkJ8.LNQX3l.f.TE7%\X)Id>>sLP(ZL@;/>8a!_j7E@JpDI*=W/'_FO!ppbZ6[G0I&TD]g<9O;OXHO4m_kC:I[M3)h$3JW(=`2%X")+18Fae-uQ3:I$e3)[O8LX!8+J#'/+JW<P]\OGP2sh)V3hI1:i:NXZd+K^^-H.l]s5/fZZ_A&F"N5^ap'Ea8]ubJKfAT4/fOe-FVWf+mV]/Adh&DIZsV:og.doQJ+_c&t%Zdc\;/8$`roTrGe,gX8-hWUCptnp%8$K/raL=7&Y%GW]"']sXK2X:D`%Yh\WG9iW79c"a01]\f$EH<^)fP;*gs@8Znb-)CLbP<qZ7EuSEaiLfQ`[AT1bo%\LdV6%%W1Fo/;Q?bmNPUnABF$c-CZA)iKj1DpGF1fj;jb8r=H((Ao84d_5kN+NG7Ed_gp\X>t;][J]T:5%UVr@/B[5(m1OY3apVm<SU:k!gD&"DEea'L5ZY.q)gZ@;`2^>;o&WjF[:
rO#hc#V10*H!@i!$ZkV59GM92^QB"%(<MIoji'5_R?ajLVksju6FKE[ij;7KpY=1V7C#@lUeU[qAaA!Z@%CESn'0V5+)EEqZ6`4sd`^>.cFVE;4PZOr?CMV?HG?YUb2.$M$eJ-Qb]\D<T@;JSQ=Xq$oO^]IkkV6JS"KR$]UgJ3c&tO_W]QG#QPh)KkO+[logX4pkAcVs1pYVZ5'>=9qJb)Qs7OYgF[%W/lK3PDl7o=]#c&F1!BK>S!0W`??IL0`50HG!O_^;5?f1QqQ<OASDe4q4bu5[O%F'Rt\B&n?;Am0fra#Kl]r>o<\gkeUIfAm[b)&-Q8WoL:PuanHYa"I65LmBpe%2g0/F1?/9X/EiZ#uILe`rg>pX)$fp"+RA[he!g!+N7O-i~>
|
||||
Gatn*968iG'SZ;X'jN(A0giTEE7nd%9KHIdNLESrKd'C&Ae`C]jLG$["UW6M*s)qj3YU8D0;3_'mWV(VK^Vs"T3$qh$^0;G^E45&Q=`^[kNXi>mItgVb-]C`UoBM1Mr,?>JerCC;X+Q).U8puKFmhPNY^oS6$/F89RWjT7e&l:mj=f@fg`<b`6m!L9JbD"%S#bPFRV[BDjT`_>Q/BHaLYP2onOe=D_L-^RMX*2C\$)Xcg/JIofDmr=oHcML1[Mc&sd-WJQJ>NEZ*h5;DPlX6n0!1B\$5Fb-B_\Vt4\(C=bPN^AKrdMSk7+l^RDJ??+uIjkX_4;@7@.>f:'H*;%COC3-Jc@<K(H1ob]<(O4JeX&lMB$;E!9FZ?.Kelb0&)aKo*\<,`Uc"GIEZ80F>^hB9p%e^1'*JT^n>r%JQi%ftISlM5q%@3",[DVFl8msnGQ@^V"icnL`"u04EiU)URGJFRq+ZV&qisr:Ed_#/cUcYgml2J'qU^Kc0Q>LV,UUrofE/8U5ZBt83*n#$G6?pa)9iHSc`pDDFdTn:1k`^m_Bu*>1'oAtgj+?SE]Y-$g3aO4ff<)#fdW&EcGO*;F]<MY+J[L9/$Pl)j(p+J!_*QN"mFe>Ll#kE!A-h-OX#*0E//tp2LIl]c0VEdLdj#QXn@H"sK+"cQnFjj)kn/V9Ghnh"+WZk;HFJ?TF=.hJh7rZFF=L`j#GZDco\+4cIF?ZJetp(/2"e)5$P<GV<cNir=?'pE%W1tX!t8VFd(%R7#I#br]BN6"j+ThTgmYf?(m[VM4$"Pd6J0'H!+B3RN;V&d/eJa:6o_N*#so"1_RL-g2n'R#3P^>YG4"eR[3J.1oS;Vs)HR%Cm+*(40XuSKgDs4oj+TgMh3toe*LB3@[slQ4)iLpA=F8,8*]%V"UNIb@1i23833:<hX@EO.!c1nWL'(O;B%INuh@i96RsT%t,8>\/]<t*1%'6(O\jK9_>t'\pPO:@6$ShlfOgl<h.DYdODi[SZ668(P$`lKJ"WfId6_ecJ*1G&jiQUCskqN?HL_,ELMDN&`_D/ZGfG%RF4:$n^_Fsu1,d'T=%'6(O]#0ne^h/jAYV9!@'oB!=Qia@]o_-@Z.6+So.JVL9RKA<`iM"X[&H[s]343fX_E@iP(El2&`*(%s?sqr\>ruj0Xr*T-hjjhO8;P2e2&]k1r<1Uj`Y@WUXQ_SG>S2e1>AH'm!:)#G4YgFQ,];<P=F4E_9ZBd'd-*(o'-&b2<eAf7#U)M[c72QkK+#`dg=WbcU<jHZZVoBpi3EXWVGA@o(CG/'C[)gH>S`1bk_K?EiP=/7KLO+l%uo.s#\_r4EbF*m^L<:2V`7!N#b>2c1FU&m0tVVuXhCW&E8VP+#fPAt2&^(7r<1UmqQ8RR`&@`<ZN^.+X@QTU`YW%]Upii@HpP\F"p8WOI=]C3#17+T#_aUXN0Ct/E/u[1>(g:R<M*\(erJFk_nNorL4^(qKOC<MUL=AX:I#@uHi[2WbSK=KW$64=)IJ$fP>I>(s3g3_iqk$8`$&Y6=-K".O=Y"$Ldp\u12)Qd#jWFcn/)G67>TZ#obk8Im?h0@C`k!8O;OZ^LtYuh)Rs2R"OnR65fS]V$j9WW7.%&oNIenA8Z5.<JIlY)3oU]+OK1rH)$hg'-B;b]Q\YqX)$lK/)_OMa&OLYAELJF`3+gd^L'nTIL.Ii"\crA*7<gb>MKQSY&7DhBf<"#<8FMR^&(\,t[G';/D]ebJH'[S1IsDgnA01u$Cl[EIgWDO_3IZBmWmb[3XR4-mL]*pPLSsbCE,_BrWqQ,!4fih=rFTEecGOD[U]#XGD3!ANa.'*6PSm-JYFL^JX2mo$,gCaXrJ,lM8G"^'PcJklB.?oZAV)o]qK?!eKMBS*ZEt`RbG';2[5ksN=?!-.cDW@k1bErlGst>O%rkq5L9+(:+Stb[IG.Z.,O'9U5J,\><:=%ZS:+6`XZ,FCEOMS;p/KQ]pTh3urL^.PSTM<:DAO9T#'Ti-pOT]sDKWd>
JeVe=7=!RCgt@N[kGE$t.pG:HP4FcC?BXiC,?mHe6:p8KD8iJI@A_,4j2"Zak6L1(%on%-IX#?um-DJSma"Z,N6bKeS`/:OqtX`MIUfbOlbRkY]3:9OnSgHOpPZPEENb)Q/nlMpr^3pu[HE43h>QSX^!S>E79UHe9cX+=:>PT'51\KRhqjctYp4VTfR-tsHcVZQ!L8DN5;%V'OY!?'kF#j9B6AtP$^:."lhg[>'i)Ru:GSi2(0i-YH"J6g>37#k>ZG^@It)kDdHp<D\dg,l4+Ho^'uV0@/BdG.asbd\36HBaT8pPTo=Aqe/<<U"G(!E:SN<*OVq)+/CS.-Wl;TXuX0(C`rrM?$_+j~>
|
||||
endstream
|
||||
endobj
|
||||
78 0 obj
|
||||
|
@ -439,10 +439,10 @@ endobj
|
|||
>>
|
||||
endobj
|
||||
79 0 obj
|
||||
<< /Length 1783 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 1821 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
Gb!;dD/\/e&H;*)Tl3SSQk^oEP4oT^:$XE9k%(m)023&*12RDa7$j).Z<tA1/;6T`C-6T'0a*;,ME3rqgiLLlhol;nAqM`=Qc,ZOoptq8TX=]/mO35u[;u*cIX(;i7P8\OO9P,<@.j7;AVBo2UP(EFr&K!X)n[(Z%]pPkQ35`A%DW&C;.oW<i5>-4RH;`7PE;:q6a&4cE[0*'Z<2oQ4Z,tj;4]h@SCqgBfX.pW^'3I#,S%^%#'4RM^4CnseD&q90mJocd14$0/bFe@h1j`>/bJdPSY@L.8?,X`a5Sg*.m9,N(iI-_[!%75(VZ/[Q]b0,?.6IL1N;eeHtFTq=;5tl;oJT5XE%0;V+H%fmk"0(8Gs#b&<'/)%-oHbM+;EiF!1'\@!;8FR?8'UJY<p`YaJ*5YrKTk],t@iPboqU%aj*,&;T0ShE'gq/Uo)N/0\t%_-@h'5m)fL8S'(-DK;ObpR1kLO4Wqgg^A8UZeRh<OB1\=FW"72JYg*oKi$RKNjQ1!!Mpq,fVlM2#Qr<<ahq:)j"agEorg>1<3fAq1kT06JC`"7_RH?q]RN6mO`.&T.a?=N_M^`]<gS4NElTASKucY.(2$_QBY[Ir<*E^[dpn!E;)/_0=t,dDY0?LEWmY?fCiA,mfg8a$HJDKd[l&C9pl3MGjJa5iKb#)aT]6,f^-+>o$6*)*2<e7e:i=ZJ_3=PC"AfXAW.O1lS(G<k;5e`.@AV30[d)hY'H7NK7l,iu3I\M@lijmj$4@5ZTIXNIe[$;I4=_]f>QK)-Z4C@>\$/BX8f+`u$_Z-:=01":!qHUSb6ZBH*((B'mgF$"_?=VqOa;8'MI`cPTa4kf\i&BE$]'LuE28C-c8T$QDGTXV*L>_lkO<[d$`!c*i(AX@Ms9;*2Wr1pHY#bC?9LB5-OUeCeE*d4HFe.VJe"(i="neg2PK]K^u"$<?).h=/?-uqSb5gVYb:]M#2V_Al(p\]&sp1-k313'4LMa0#M4%I9!Zt5<V?pG1`+%i2e$Ij/:iaJ)iMa2Wd)GYd3Z%ko8R_s*M`g.,.K0Rhm5B,UG`Q"K[Rr([G]&=HeW\(Qd@LQA:4@7O4U_MX6Y]$YMN!OHJL7Nld37,4i?o))71SN+u&$MJf#qOY(0oZ(>(*;"um\/#Xa$;<k,'+*n!GY$_fjp3ra,_(ddci;87A.9+Qg@H8('eJlcXK^NKaTR0fBZ(NN_Z7W/KSRn5I01X&S;_[L.S;"W)9^gg1'C1ki'0Vhn.i*G5;1mdsr]6eFH+2h?t/Ks=`kZ-II]^)c6S3frl_upX3buT>"Kc-$3[]Ed=#1+"k"J9BS?BuB?HGj[*W/[F!Q$J6*@ZU<AS_ZG7fk@a;iM%CfX\>Vbc`l`sZ4qPMO56#N1Mj[O@P[]BKpWk^nRC!Q*Yli\qJ^i+]J.iUo18@8b8-/4rpGSYq/>;-_;h<alC1`DKm;;`Gc7ON=Gu6]\]q5a?)H_+&?qqIF%Em]!L*3udI\LE1e,55%[D,IjH`=nUNu(X3=MuNLSh`XlH5]1HE9mmeMM[io)[BV0nXjaKstj(hp&ZbnS7DN0a[=q=5\DCMjY5E'rToerQ2G2)S7CAUF>c)PH/OE$?KrqN1OrRT4^2YHD6OkEAq(Y[-gJ),8.9TH4G6]<aX-(]?<4G&"<T)5PCXF5iS_2ID].KIb8L@gT@Z&8aJkH6PmD"0es*("*1.XD)4_#SlE:)2o1L5VJ#N<eK`C_6To2Q"a:&:?8J1qAbFDVhX"joFi9`/OJ(429FmT*9/IE\;"nFM/nP%`hd3.!~>
|
||||
Gb!;dD/\/e&H;*)Tl3SSQkf:s]``#t]$$C"=`*e(;ZsI$gHm46V<#;UfC'tXk0^%E#:iUZ5nY20*UfSHmWd`\l/:B>n9%!u-2bJ!`&=m&+:-o/kL\$ILD%XjgmS]O*kZd_l;o4fU(0jj(B_9pabCmPD-uoU'%id5c)jbhP=h^kne<o;MGAoDF!Z4H/?1KP?EBMF:T_sooT-+;2MpG7G-1VU7qqqZ\)YQ8jO3=cr:&QX4`DDSFWOAO:tO0D#VG=YIF.?dB4EmQg)gl;h[Ppib@8a=e^,_h\7W#V+a"+,fU:ZO0aW6:#=mSSgEo\*2.=)UC;\#6_bNl5oj"Us1fNt@8d1[GO(h`G;&"Df#)CX\+n?XNpEQ/Pb39i(4eDb!fANtWg;FGq%X3elTC?F;MlP9Xn&=)n*9'h>LV"33mhjQ&0.&S7_9N&)H]HM02b8u4`tKA$R)E_h+o2$A-1-/+j[/143'4M8DN$7PJ+2#LHYW*"+`MiBFV)d,`YTmkUe"#Y0R)P%3P1@%01oQ-#Cb"\=jM-5U5-*ZorQ?(>]NO:,qlc=H?Q*\&Iet;=bfK>/OF-jgn$X03)VB73L;D987TbFnkp!Iob1LjisFJ)>k@q$8N^>fa^gj6_1WPWW/N@`1$"@G!-=b%MU0U&$]r/#/7_pmJ(%nZ^Qp)&-!D`5aBXkjBiVu@U2=i(4kW)$j[sD?p=n;G#l'D#dc@CbZpQsg3$:tO#Jb_6WoXpqY\7e"Ncn6:d3NAHl@8paLrcNHjZ&`2XMdu7=INZqbf$CD_6i9=#"1s+Koe<D\X;gh\4I#0$B_A=A&Hj'*\Aon"V+c<p(h53Ja#DS*-!3.`+W=><Q0:?6Qh.\]o:nI_D0QbB77V`;OB^$3Kap!5`olYe/2Vo;RcKaFTF2si^Kk\iI"RHkP!C6dpY9)FUgW'=s+$*!`p:6?484)Z5H<LQn(P@p?L=n3#TM^o5%LR\uMYX-SFVU)0,o8%`u_%Sefs?S>c$Y6C/uD)QL,MCNs!jG;QNF&9TW`!#E)oPp/^sj3bg/2B5u6Wg.+ohQ,u`Q>+X"1T&.<i)p+!M)OLWlVt.opS1AjRHA;P'KSbkL.D:d#HhQYc:2Xh1^KqD,:h'He+U69*\X3u.`poN_dBUFG?<B4IpTgX9[pZ2eDnod4[$^PB,S'#4XAD6p2)Rr0l!ik?m4]8.qSuF+*bk]qFn#1:Jj9N#r`Vf2tU)KhafqaiqKd*nZ\A2^<&i`688=;p2LHo)B$HgF<NP2V=eGn[i%S$^V\W*5KmMG!_*.EBK/u^?*8A9N!hdQYoC@A4,RY0T)WcU%H]aGibo21H5CmmpDiYu4"D[T;_gN@$V,ND5Mj0]cZ!)hY6"l@`aAjqJaJhL=^DBt&N@\A<dt$"jII,F`"$d:34aIL1;RYn,"+T^>EMVEc`l`SU?$Qc:Ye_1):mWhkfA1$->!HL!n:Hr(!9g1GX9P9?dT<HN;mPDijbr%4!InCD`eL&:t/hMkL=l+Sn8@HeC.[(Gk.]djJIVfneAbI!d8[1%RE$5TYoVRRbf9[#3bsDO@][CDLp:i%m(d-Fq-XpcCRe7]t1MiR3(bO/SCjXe]>#\$R^&.jOqn?7D9L'fqhI!I]KQ^N[bH/*!VFGEj\J`TaqSiBm9Nc)]?%(Q5XGrXP]dVbO*7c[6kT'mCa@,TmhD?[chVJke>_6R-R5E'mh25>H;\p;Vs,oDUVd-I4%]4-hMe.1JE4h`.;h-,H4%7lAG7u9UDdaeM(a,292UeEbmg6;L9Rr'bGVL4;!M:TtN>:Nf<76C0E?D?HY)M1jbQ[js?AgI#`)3rrnG5VsX~>
|
||||
endstream
|
||||
endobj
|
||||
80 0 obj
|
||||
|
@ -454,10 +454,10 @@ endobj
|
|||
>>
|
||||
endobj
|
||||
81 0 obj
|
||||
<< /Length 2177 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 2155 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
GatU5;01_T&:WeD0_]TZ-h5m02UHfnOcZB`OZ(Q]B1dpk[L@&59e?Gj^V7"YdB5&5PKlm?,\0KYhsPs*P.jXoGE^Um/Yg#;J+p(ZY3BfDEVTR"S[GZgNhJK#KXU'G?X)F5rqssdDAsoERJ8?sT;D./7f+mg2Dn"6g65P6c\l5%X@RZj^,GSJ/cmBRf7mrPn9aM[p=%$X&J]^]Z`_(N*RUHbN9Oe?U\$;3A#9^-?bYDkB&gqZrR9S=`-\iLj3uiV;SD7#mE&-=<B>QQ.TNI*@VSa=aFb]a'[h;/aG67O]A)$+o3aJJ4MRC#;?`#YFefqCNb4Rq+<drUpV[t4g^S]Rs7ZD$i`MXK*o5E>;P)c[_La`[ZeTd=V-o'i]S"t8'RZB&_0/-*d8r"4ME.SRR_'imJcZqSH_?qm"jmBfR>A:A6^j`H06T8lBM>G*h<Saf;UZ4\d6XWL"Y7GlfJ"^P)N4<@j-l&D-l_]q^f2CJ),Yo$3uA][)MD'5XTO0@4$t@!>Z-6%KKb"uJNbRe!C;9A;%3Y*D4G=7&n@J/5Bp0.7)H0Z^c>4hS1MG]kn6m`)m@gIUSf:1$l$jT<c/AQ-_co2@iW-=ZGpK6LCOO=4$"l+3(caR6+d;"JCRohK^a:hK_R,.mpYu?AM;iO:'Zd6CJAc(jM?_*KH!l(h++gd'FH4d/$rdfFs+"&`A1lecrb_3$&+<FW<,N:e?><3:/EcPUJt5L-&TZi]Fg^bL-Z&O;YQeUN9j\>=,KgA2PRAUs"naHF<SK!V[Fi?rLn4u!(8cnSned<R*=SB=S.S"^_eRo,1atm%@h+SjdD)`M5HV;f=,S3%\q9`(sl*qF4c%@Hn5/c,RN-S^`N#4Man]T%QFegXh\iZ#^#?@FW`nAj+M`!'KUcXMO@p_6JXWkcCoeFi]cA6<MEe7@&M7j<p31!0L;*?.DSkM+Q%%>fF0&Z_AHp-7Fa'sZZOUM/X7WHZrd;a@4L+[6&LQ),KWKL0FYdC8C<K4"'t_NFdPcCBP^(tW*%-N"o-F)4]b@?5g*@,;MV1!mg6R8W/f!@$WA'JcDr5Aks8DT0m1'55tm>UOHh;ba;Z<oC6?Wsc!8#'8T;g>Lcdc0Kd8-(U"VD")lIW]"HFd1W>aV&[GhghTuEFfrF7%B,K3pobL9B)L'0]p&<+&%#4>BWATqOG"uSc*MC0ep!0%crXq16Ib6duAVIT<'iBZ<C#TNA[$>GfiL.XA&fKhq[5#*/ab_'T@BeZbr3mmsh`mm'ZNTnD78=*3W$qG7DIH=\nA%!.!X6J_I3RUO/:n^2^oS"9Nc`A!<[6+_9A1b?-r^`U6a⋙H:JP(Vp+WU%,NJ84X;WrKRJFY-\PP35/%<\Vl!;YHY1S<,]ap:j%R:%he*WEI`m:^m9!VR@#Fq\:*b)_7WjZ[)GR+_@_9W<.#S6_UW,@q,!TI9$k$@KC?aMJ^f"ALDIBH'.jCD^^=^lh>4>Rm33K$u&iFN1&4=D@MOil&:j=e=%`eG@BXt'VLa,D&VQ0/9MWK:8;9T*.2c?I%5Yj%^d!SWcIl@"Xmf2.O$XPmMqH%XHFK6m<ZSAi;/q?L:-+"g]_?:+J"$Od]0>&:Wk;b&8FoeB'5a@,Ws0rVmUI5i;MSdat^s+K9&nLW5_3k,Lf=I+p+%Nk=ShmVC-gR%tHbp^L;8>`Pj[=RY.%CSAarA5tQ!\eCnVV<O=\p%4VoTUB1aS"VITWa(Sh.3T*<`,B8fO1$61j(kX47#TR[2_>cKQO/)DdcO?E3,t?;+4E:h@mD[ZSAP"nbUM,ZFeX^-RM42(A&F#&.,7+nqp'4?Hso`(i$NBU,JsBouin4E`!\5uqCbUGIk'r*!1`Vh:=%`roc$k!Kk[17I[ZD:j2Z`GAR(f0jP`#43fJ_TGI`SLmZ?Pqo+u\`tYDrN``*8`9'267H40;LP`*Vb/%Q>IC@'e6[X'.]rPYBXW8VQ+`n9A\1E%1X,hafpak&YO`M()8>!N9_d7`r95-m*m<`Nj"D^TM]Z
uf/Aiufs34kj%q+^/X_S`6^Ue%2g!d-355cBtRJW<$lP:]E65FgP5#BnQ^<YSp[@Z'ZD")b@q/tT"gHCI1D\VU[YP1Vm=QUdXmH*tXg4ngZ]'bA\EC"[0!9E_M3;1g,;6_-j).qPK57D45g;2q_SU>>[qE1R;Xgn/e_#4LfCkee~>
|
||||
Gatm=D/\/e&H;*)+nSEk+]e8RGm+DJ:>BGsSLsPi!V9^G&nPjbV,W]>359p,/;JE^C!4=h1_;7$I.$C4j71ToF$&W6n!Mg\s8Um=)nM_gT#^jlDXlM)D--J=N,%t0Dk0[:J,Wm?C8Su-9VRPEc8,/Z;qoW?/_r3&D6hal]!u7&FT%SEP&M8AkgcN\[E9dK^(<2_PJrE>dVN39GEW6k4O#:G/DbT_5Dd3^P5CU"hqI/S)@YBMSHFfohh:'";TupkVKaHGpG<N/?T/![!!9kk^tZ99oCQKc(MfK#G=e#SI<be]m<UN,hi0gL]q(/XBK!O#H98f>mjSjfoQBIB8f`ec6i9L]hcLGcEt[Q#l\F5ag!0J'2P/\g*!N<>N2>9,#n0lo%MBnG?2#&7_3`om0B:s)]a^!sbOE$UXnr>cJ'a.nBd.-mguWR\__U&bm;`QJc'MhM3!nh*Qj7'F.@_/)8[>!S&+^g'/e!#=J1WSI*7>SHdG$>a2MKSH>Z))#j+sf$TFd-8'k&Y0(rVZ:id&B5RZSQqS++D4e-MdB.*rgc)\mbCUKN0!!C=o"T`?W6MY*L5c_*XucRL%[.$,sZ'gYe<7u3KcN5plHnZtPJA@6.Q\5\\l:DR>kHjQc(-n(;,j!o9a"=fsCD@N?gU6]"n*5Akd`b%Ah%L0n!F]Jc-a[r!fIE/>fCUfNG]]6UJ`,Fl)1CQJop<(XLFO81D*XN==10@D8E.dEuGG1iU.%Hs5d#K!rJ\XR7b.pPIdfZ+:@8rnW'hQlaC4i%LB]u0ieM&;,`Eg%eKCssb#@Gp;hcc!*=AIJ*:d[.8cS>GV$#4Q*d?30"9JlWFk@t&J\k[ERC<48&Sj%]o?YBY&Z)/dTm;1FlM5D9=CfDOG?HQR2#4+%tnFDI>'Kh-k(r`u4cUjFbSO2!u29-pNGR&U(9<C0gV\&(Ae`m-!/*b&LXp#KC=#EoMb\3)MiAi`67bfA=[p%:^!pmFBc3euV$4Kbn,RWW^,*;9%;f%2O(L`7Y\tWr,K3<qRgb+j<PdFNm1LVciAoWigYud*&Wl!nJ4_Y)r2JJg7T,*3]jFP<?oSABYJm3d*9MW-5NW:XMjblMi]q1lYL?b.X]J.lejhb21SaO<DLrjg'5tn<->pD,8H4/>C>nS.K&0fFq6%q^<-</`sH0Z.s;1W3b<iUq!9;sn&1ScudZbHst3@JcHYWP'L@7\m\o`_+-QZ96/B%\cgUd`OiUZ)R]"A7^.5S/K0b&TgtPd(Z>ZpDJGNQt$E4/`E3_TAR=Gn=i%%0fk-'B=R?>b0p%QsOUhHn!NpeTV1pctC'M5%a+P[[pfSP\XIe(+QF_!e$bAbBqf80o\:$F(?N+ld87og;d/bYEj)5id4`DSugq<)J]jT^k&_60]B*@lQO-)!.JZdaaMM+9L1FhDVBLQ0q(bhCu`O?=PeT*;,/B]rDn'"S/NuW-Xs^(2COL&0VbLLTL=7EX;MebC>I=7Rt.?,aD(fMD-R\#%.M4L/Zf2g;Uob-9pUTL4+\;a2GiafWZbOr"nS/e6C7h;\fkB?Ot]mq<th\-6]#1dB\l9;X]G)diWhUR<UpspSl,OVUg<K<q,;A=Qcl"#AT.)KD7,@s-KYrpOD)K2#I$?Sb&q$93O$hX;<ht--L6FGSE,07Lr$Pg'lo2KdHlM2/mAs!oOG^>qUEQ_O=D>B\P\qS0`%R;9&2X=a%6t!&oi=uE"l'#;=0BPWlhNXKQtL_5Zpo%q8^PkRkuL\]K[q`(/L7L:SIt&PVt;'K':$`<iPi21bIr&cKQNJ=u2PO0@b#s0Vt^3W3^(fgITiPK)2?e&umBo^-RM42(8iI#&./8+nqp(4?I!p`(haFBH9jOdNZTiH0ggW#"=*fO3TG:b-nIq:BqC4gB(_@k!K_W17IZ.2saMIR*BC-RM]l__/kd[X4g1BbTDS*k3Gc^eVV65RC7!:rOr!XO1b<?=6A]0Ia>@Y<Rg>1>r9,S>Sb$PEZdL76V/^3k:83"MsJs-qMh*'
F/p6`6atk1IPs7Bc^IbUI7O$Y($&`D!cQWp!h70\`T=8j><U"1=1j79pH;UL*A5$.)*`Q)Aas543ag8eaj5!%0sV5*';MRUYo`tjQ/cj&jkHf,g&EfY\@=)TFYI\$&c4l)2ZmRXnCa"=$Za5b#K11q?N~>
|
||||
endstream
|
||||
endobj
|
||||
82 0 obj
|
||||
|
@ -469,10 +469,10 @@ endobj
|
|||
>>
|
||||
endobj
|
||||
83 0 obj
|
||||
<< /Length 1887 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 1945 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
Gatm<=`<%a&:W67+T1:#d>#9onc4OK3U._Ui4?>C(9%,7,+&"t&ukA^m=4":.>fq2dS\^XE6MVU1%WXK(Z2>15HP].(>QHUhiE]JAZc[L!W;E0p3Gm-&X+tn9'r\r^Fn@`2Rk;q,M."j^HUiJ?g5Rmj-9;`r6*t]3eo5]5@21b=aK:d2cl]j4a=BW(2bLf$id`cDtQ7/F5a4/c<ln='C;!j'\u'AT8Vd&EHGbMqN^Z]Zhi@PUF042LIVs3M"sF?l#>$C5]7G?i8FS$Ws:dopK3k&%6#EQC\7>:,S/q^<9&%"(SsmS]6o"/nI6g@p!Ylo9rSfYq`IWUd;u<DeSOO&p.KJ7`HPX1d^*!<$O#[kCa)K,-3_>IR5Y,O+U5.VLdaj/QK_<1of/5%`V)MH&'O-tZe,Jq1n+r+:nN8[WjI]e`Wr=nre-mi!YAQV'HDnq69>rtHu*ipV*USoV9TM#>aK&`D>+*F=bB`/W>\X</m_$$U3._`4D9."!2@XY'+81WZd'M,\q>SLYi)o(N=OOf^+j?EQ9H8+JSbFhH8bTtGq?P=<,"s*(St.'+D0T*H)\9lSnq#WUtL>Fe_Ma/`ertU3ChKDH.;m#=D!9"U<'Ts76W$k$p<F'"hmi:6'/o"qJ*@ebdn,_lS0Gc(YJ,R<kHco89.DE]e1g6+=[];&Nr<:M8g[:9nbJa'm:'TV52a]9hd2pCT^0]#+``r68Zd'E16TpR8J"H,YC``j[KTi/"$p)qN)LVUtQ'edC!.fb.X/^rA\>:llSlNO9Oh.7R$r_6((K"RsFU#m0<n0MW0^hm!t$d*U"[f1q]3N>>jOi1-)dP$<t^YT>*"IRbfdbPe"e([Ha=ZJff10mK(.\V%Zq.a@'so77B\M3A3(`c]G8&U,Fu3!9JE!9pO(<.]CLh^b*:1SB.a7Wt`fXFqc4kB0[Unk%mT$&!X,;r]U5&pVQX[\,2FD@&"YLIcVfL]a&e?-:^2.l4KTY.iu%<kCX0EC<<f>:>^bPIG'Nt>H>%V;>no#]KZ!@cMSRO-R[nPMe#8e`Jdpb":Cle$?J+[Os8LoGF4dD0)(rOc+IBL0eoVuiJ=4IBes1<WiMpPla*VV>X'AP.k#.'lFFOV:PmrY"I>IN(q9m;W@J6*>(pAH6*!p"h-I<3K48;[=Y>3",^U%`E67ai#?@g3ZE2K![U".f2R@6AV^<A:PH=(%HGGt_d)0lZ&c;;u1P^dR"iDLY)De:=/m@OE:2)dL/BTj/.QQOt/rffD&VVIc:;J'NPj386F4+rmjnQ9j*f_W(j&Zbt3>E\Z?1r:dAbIHfe@bfT\jOPsH[E.JI"*EW[mh^\4@D=\U6-D'4n5UuG:,OpB(Q9:pM/18%17FuUeM!:^?]6aZR#tS5>Kt`K.?6tU%%iu@2s$D8Lf5pgQc^tkPU1F,i3>D/s*?c?)09ToR5JWb][W+$iZN+3c@q]cLaB#b5++`E9)`U?8IJ`2r;oHdFaH%`tolsRSlBj&f%8]:=3NiY;%W[\2L/,T(t8gTHl`k'1aq+kE4O$;?#g,ei5dD(1=3sTTe<*5HSpQ97aoHT?/eR"Uu&VDKNG%m?r'6Oi&?^H1%qgnK<=R;r_gG=NkV(etDnlnpnZD$'@cX9'V!KaFE[^>Xi4X.-aV<.3gV%1#VM9<Kh"VBYHE,_J)4X,.2f"0G-l(KX-c!6Y#h5VikAs0P#/?#TOV@;bJnB1c4L<RB$Fja,,ht`$k#.diMXNA>)-<_du0Ck/[aJn>Y%Q&h/Y'`":V>K@DjFT&.LnUc3LEHNG?Z#]q.QfQ&ED*(U;Q6SA.1md-N<F8sT!.Z?Et'U"M%iOZib3`)Ll+S<88D3<A0#6XZU=0;A#[Y/.(>>L*)d+r!$0gr<fh_emGGlIir&7f)~>
|
||||
Gatm<>Ar7S'Roe[&Fu%e%4b?+%2]<AZ4n=:Y$6j%^p42KU26R4jQYY\s*dJ$AXt&EM-Df*)NbZSF2%7:)rE1_^FU3$/\&-AhtN&]AZc[L!W4VLpA%0I0ams4.CHKcr4[FIliP"k5[V:;oc5[C`A1Z-h'*[7EMm1-$Pp8#=mhf0Y*4i?^/W@;]mrhg]rm\@;iN%"gf-<T7s-OM+*;5&8$jbLosut;ROU)"FJHlBB!uAPjQLlmF-Ee?esSYN?U=+0S`R%<4t_cQaRu(sYN3@plaq/t39Ak^J,.-3kD6&a=>V/C-$+Kq?lD)]lS55\$Gf%]H9;9V&<n8pjNBNom1^S[V<3F56Q)JUG9\GI/4r]^3V.XTT>&8=495@o\-6#AU:2)*D!Yc:lCu4O2Yg8e(l,(>FTQ?@d9#NleX:0\oh51U\_?!uY<(oN/+aPXoYZn]*L[*B5Dg:"!8.te=9QPdUaF9,WUh$JZRg:WXZUtpLRc=AkbZsM&VTuh,`C*EAFZ>be_fk;K)ihj#6i.Vqk5#<KtKT.s#"K;^=8ReRY1&NYjJ"^/ShS*HW;9ticnp"g7i`*+\Wrb4rF@JjfaLEY_G"$i&@<:"Y=n47AGLCZnG+AZDGk_EF;iJWudl>@cQtW>,<\D\gX\LbD>tHk,$oL#I5a0P%Fr^<;/%X"$>Ta7766<"!D!>(%R5+a!ql#Ph7e]Hm9F%6tHP<o>f3ZS"40BY9:'@ik_Mo],fF:?^_$f5"T[X_3mkKNp-#<8MgW;D(Du\Akb[iUtG'.9eA,9Xp9XbS]0bY1s#A8Z_EC)o2&o-3%rEF1''hf0e[iD[<HqtCUM$DHSRV(&FsQe:ZYHpJkL-Z$q..&0>Zgso1nZX#SN$SO@eehN(G&R.NL@YR\O(bjL$a'[UKS:bSKZ^.[lft1$b(7Sebu%^*%"2JcQFQ=?:GEMG3X\*J@'Eb5<UgA,6Jaas:_[@g,\XhpbU3IA)7mS-n'2Mp$siWjPIH9P$`1$^_o1aGW%t'(5qaC7YB$^Z#:H6@O\s\ZjEi#/?>BBQ!q.U<*uXDL4car5spB(.`o>,+t+4W9EjtT*acV,2C_KX@ZB@0K?73]o+[,A,bO_HtneF.!M$j/"JpkFB2gVeB:g'\&Z]<EOkg;CZb:V!+\nh_\#>rdUpAW<*)Y*<Yncd\c.J.XIuL8WTX7aInfRL-Et^1`P>3H9O0qld0dUD#-H0RFFVPu%#Tm]k2tf=9!Yq1!gcSF]i2?8/^#W'BA5((J^&"ha/R14FslVM(-Md'\3UYKc:%"`5Kmrlp0PW`UDbr#Hn8IdTC3tuE:f8Yn!Q?,lI*oCdWYG2(CCB"V,RtHSs+>u/DEfXg7<>HcYd?9.RH>qg!cLV3LI0NoDUQ1%O!JHm+%[=hFN08ND+qLLjDlc,<'Fci`Yg>c[+`tkUd\oe@*1$'_X.MRg>HMf>S;69,-rHS@5JX.d[\=i9nZp*,:R,.Y%`XQ#I`+r/%j-hWtM#OKn?S9hok=[k5^<GP5;2a#qgrS,.-%#=;GAlMqtN2]V=U;/Eu:L9?KTDHo@h!U4Yt!HCY0QtA<HMnt%cZrl;@O>;1FDM1AKIYoJ6At[Hp\HS`*&%tj#!OMi:IDL?O4ZTWaUntlb^;ml?lidh1B0PYPk,r@FV*tl\E3(!bE)fM!n2Lfdm:$O90.7?%>aZ.[PcCN3fRs)l:V^NuAr*/keW;O1UE'HN?`TVH1FMe-\eRteKo"6AY$`IESIiq9/rkqElb,o:jhd+u@3BMk4AFXnd>+Dm5L%t)=ZoN%6mbB;0;<f(h2m?id2jq4Loi\C_&RhP[L_U>7!>46NP_I?T@KC%a'GK"GC+Z<+[lZY+"tF?n$>"C*Uj.`pe^DF+%rrsFG(!rc7Oara,t>s?lU2dcjp\6h>".olA),8qjH3irDd(=0sn$[6pKn\Y0\"0=aB5"YlVN;!B3]L;Z~>
|
||||
endstream
|
||||
endobj
|
||||
84 0 obj
|
||||
|
@ -514,10 +514,10 @@ endobj
|
|||
>>
|
||||
endobj
|
||||
89 0 obj
|
||||
<< /Length 1768 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 1808 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
Gb"/(997gc&AJ$C#ei,'X;/E#?OAV]?#6bp[TGJTD@a$@fgI.792]Q;^V5qKR7!D7HBB>*U'MC!'Sl;<2]mg=%u\i]Mp_.;G?4;'0;FruH;9VKo%tj5Io#'3jou;_c>rRSkeimV^RrQ,+36*nEo_@@T)@V1^6c:&ESr0Mj=G'/cZbFE=lS7s1C:XphG#=]WkkH<R&k1q^RW9H_bEJTNVZ?j(m:BD,I.sJ>:'9*k&Z9K_+ZrgKIW+8\HbTJ_n!h&nYVd3lh479k]LYFJ%6f@-kak'Y1JR_esItU&m%V`i"o.V5_U+0pZhJSZ.d``5VchW9?IU[i'&H!>c;(bFuZBMQ>P24c`1_uegT*W:EfE0TK.S'_X)Od\nUUna_CsqS.T/Z(5k$s:g`pUN>-%R>dKP!JE5_X$Xt)S?qY8o[qL%)/uGIVAP$#6IF8qc0qj:-/t2uO*a!C=jG8<qA?&c0N^+Af(t+&57[l$JO:>ZH/VaMJ,P2qPVqDY69VeYSLmc/6#o5M$e3koHB=U-=(d1?(Zb$6/>YV-gmfh0plehg%Pc+\mf*tEHV7t>fK1H%79i)BZ4$<=h$K6hb3GdQW7F`mp?j_T0.mc9DL,uu%K.Ou"4I^s]bW$0P%"&sH.7%GOaNqLQ_aa(!+5^8?FOZ^Fi;&7u1c0+720PO](US!C2J69ZHAZG!/Sk>DJ9uiNLG?%HNfeZS'Xhf"@6.!JZJ7*bCA3>*^-Kak$N]p#aPpGe;fm&mIBQ3:S2S;PMZq@p)A0fZHWho]P_dOu\N!iX-<ijQ"fJri%EG]H>]Pstq!j'NBW8Gj$/IYSrj2c8cY!L0&+>>-_#S$@-=[,?9pIH#f]%e+.<&G4nq#k0DMc:I+e,`D':9trB4*@t/mOL//Z*df+PgD2nP(R4)uV&(>)iMIN.R.Xn=*`(kXF0ToPCdMj7%/#7X-lp<-,'[^E^T;Nq-8HI)C-=CTjKqS5'g\r/YkG9Hlo@s!<n60HPq*jM7h`]3r;#Va]^:PB\K_f'1f(qbg#!g^C6e0*PTp\>aS/C3n/M_M'h8%ida#+bc?;Da.+S[?]8qJ#gcCi438bId`tjFEcZ-%P+KC')3;,X/UX'M\Qhtk0?9VKo4f5/YBc26XZGsh9nA/cgN0\qL!QQpJ8JiIcmkd';Vlp9og4uC3b38r*W'W-AkqA(CG7^;E-/Q%\+PM#$52[9p,`QChoU^X$8:jjEl7bi?n6l0nF<H&k#!A_s>^?_.*11LWtd8[_4.]"V@t)YbHE?F3YY9R%b.EqIM30)5"n9"DKGWn99FC?4krP'L$%FapK^iQlu`W9V>pUC'oTlh,Oon.GQWRJd\MW$<eM>JLoGLP`XueinV2GG%B`#RC;iU+ec`A$eFC5d9:9dG&H4S*L@bt=;(EE*aAi=N\X<].uF]n)hn56/.d@0`kohQq%WoaPY5&c]tM%ZlT!?5Cd/*TG"]VpOu,3C=CJT7n8tE_=AQW@gOZYRPE5)mEql?SrBYWD9UkBA^pX&"gU.[gP$'8Z%3=dfE/r-o(-bDE2AJ*irf@><_23gI-Zu1?b+AI'lNac6N^%Bdr"*pP!md`8!;h5CWJNMoL%mJ\5N5[;FA9p7UTX.E[dZBW^,L24nZhEK\O#$Er6)h;mi-,87;b.LRf&8LFCfn,4SHJ+[H"f30TBQ:rs!K0cB?gO,O)plf]sd<lks-3]38U2G)27pWCDI%-#.PGkedCRXrBUR;J_C'c,7b16E00>g'(qcV$<K.%`X;t5]ofJ5'$Z)cl3k4~>
|
||||
Gb"/(95iiK&AJ$C#ekB_XL3@I^(ff!ejYJ'8.k;sRN?!rZAD2L6W(,E?XHEkbb7]k:)iPn-l,sQP_Sef2]mgMO3>Y+bK85O[FX@+Vod`!*@m!knUD+Y^Ls74fR_#'[@7"Oq=s7,5:f.Eou*&*0l]6'8,RE[f1B/nd'FrEYa1RIqn%W\..I5URX*^emq/opjkt\G9\)/R+4g#GnKP\_#K%b[%#IF@&6\VLHR\r^Ada?b^e?ifN(T[n\U-g?j10q0n[=q9lZNrr)g!S7J%[.5.h^-;gDNDF)dH%(!gcH<i00TbNcI-d*TZZ8@[4&*JRlp=P*G'a_0$+/C+b8fg>e-p=h<GI/1jMfj\aG$A[ub)]mGTS-\Hd!-=po2&X/@qc-bR2h@mU([L0#!4!ckb"I>7Y6=?9\!hSd5hd?N;:#FUFfTE[l9.)aRF'Gh'QqNYsDndJs@nP6qe-7MNj-OIgGdF'EK!A3\1h0Ml5C@>1JU(Y>?'?E5((_p2nCn$_em_hmQ5jbRbXX#Df4((C%eBi"%;A2uL_;&_<%GK90MIjANmeVk@DE,,*i#"W>rS#7T]d*h4_*3q.X1sW"e(#0<<En]V(OUZ,*G;7i"Ujh%*s5fd;VT;DUGZ?U?[F8d"T>V-'RtX+ZHU(62"e%T(PAgB,h1d$6DoCPb=uVjC_f_@3a>_9]$f-<7GVK&1!+n)$Q!]HAu3CqZSQ59_B8>7'U9k7E`9WNo6M(emV72KVRXH&;AqpehEWtG8\[h:6kECAT],+(HI9I8n;lnLa8P`<P]CVT@&B1BJHDf;^'"nkamrp)'qC&>0`2L\'A"bE5AM[H,V[XBe?j99`4ZP?F3\dl@a&G*NR<eYc"f:(l#Lfn":%DI%)mOZ_gD69Pf0sk2a$#6s2l40XjArLQKX1IhS23KmR2[1G0Z/SLp:TpiDPb0A6+MQ;`cl'P&dcIj%9Ia:%c:/Ip9>EC>,M=LW9p][7tZO(2;'pXT!s7/20`Il5BXCp1GD1<TPDOHf?$at5i"-*@-4:i=(Y2jYcpDNir\j./@[.W8)A^)I+$qT&*><)Tn0>cI#sjg(^CWDng+HHma5F('/REqEms&8$7'*FEj=:/o"$CL&P_f*rtq#7jh1gEWUq=@''R^e=)u9a0$QXG@/4G;QtC,71NU`0FN:#]*D5>'`i[r]R1@O=q[km\r]5XP@ra]\'R.UgMc<YD6bopP-UMCos'4FAbppb0_9s*FoAD@sh"6HQf=ue>>uJrIGh3mc+NX2kb_W$&fgb+<3unDia`'k2T)qCm($im@cfqJ<0qd--]>QF3Za4(Bko102l<$oIP/581QEH^N3B72)/B3iC0'+1fn.>)1k,o"A2^nU+hDG=k2"KKj.$=E=TA`6,Vg?'*\8i'%/<uFff"MfeRuKFNZb=OAOk@F4Om7;-JBLG&H4O'pge\YTWKe4LbYX'kN8-RcbJ^2@B=KQm2V,Xau\l3@HHraoj5ambV7#Fe#EA[Msd-]-ZB>5oQ:*X>!h,Ge?ll^SfrDqiS1-O\b@-Er9=drDD/a9^D$Q_6Nl1\o/n_"LF?^(cpBAisb0V/?U($CapopI1Z\ZhMHnKX6oP?.6D^$g\'^L'n=l5feMW`cC`.>#Q&tJWL5M&L%mL2kj^,M\gHOs;>3ANrNsr9GFdBH_u=G)30qGT>=:2nMf?'^cN?7cH^!pI[AqIDJ]Fp74E_]+n_t[Eh_W-rSk#K>"N#T]\CN%=4YXP*FL=B(FGLM6KcGAu'?O+DdDl7n>rR5*"lJt^c/[#75Ul.mg'2"t.0lF3)N7NF!7`qmLWKE'Wd?m.~>
|
||||
endstream
|
||||
endobj
|
||||
90 0 obj
|
||||
|
@ -1026,37 +1026,37 @@ endobj
|
|||
39 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [80 0 R /XYZ 85.0 659.0 null]
|
||||
/D [80 0 R /XYZ 85.0 637.8 null]
|
||||
>>
|
||||
endobj
|
||||
41 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [80 0 R /XYZ 85.0 606.666 null]
|
||||
/D [80 0 R /XYZ 85.0 585.466 null]
|
||||
>>
|
||||
endobj
|
||||
43 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [84 0 R /XYZ 85.0 659.0 null]
|
||||
/D [84 0 R /XYZ 85.0 637.8 null]
|
||||
>>
|
||||
endobj
|
||||
45 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [84 0 R /XYZ 85.0 520.547 null]
|
||||
/D [84 0 R /XYZ 85.0 499.347 null]
|
||||
>>
|
||||
endobj
|
||||
47 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [84 0 R /XYZ 85.0 442.894 null]
|
||||
/D [84 0 R /XYZ 85.0 421.694 null]
|
||||
>>
|
||||
endobj
|
||||
49 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [84 0 R /XYZ 85.0 190.441 null]
|
||||
/D [84 0 R /XYZ 85.0 169.241 null]
|
||||
>>
|
||||
endobj
|
||||
51 0 obj
|
||||
|
@ -1115,147 +1115,147 @@ endobj
|
|||
xref
|
||||
0 142
|
||||
0000000000 65535 f
|
||||
0000048685 00000 n
|
||||
0000048880 00000 n
|
||||
0000048973 00000 n
|
||||
0000048762 00000 n
|
||||
0000048957 00000 n
|
||||
0000049050 00000 n
|
||||
0000000015 00000 n
|
||||
0000000071 00000 n
|
||||
0000001276 00000 n
|
||||
0000001396 00000 n
|
||||
0000001568 00000 n
|
||||
0000049125 00000 n
|
||||
0000001703 00000 n
|
||||
0000049188 00000 n
|
||||
0000001838 00000 n
|
||||
0000049254 00000 n
|
||||
0000001975 00000 n
|
||||
0000049318 00000 n
|
||||
0000002112 00000 n
|
||||
0000049384 00000 n
|
||||
0000002249 00000 n
|
||||
0000049450 00000 n
|
||||
0000002386 00000 n
|
||||
0000049516 00000 n
|
||||
0000002523 00000 n
|
||||
0000049580 00000 n
|
||||
0000002660 00000 n
|
||||
0000049646 00000 n
|
||||
0000002797 00000 n
|
||||
0000049710 00000 n
|
||||
0000002934 00000 n
|
||||
0000049776 00000 n
|
||||
0000003071 00000 n
|
||||
0000049842 00000 n
|
||||
0000003208 00000 n
|
||||
0000049907 00000 n
|
||||
0000003345 00000 n
|
||||
0000049973 00000 n
|
||||
0000003482 00000 n
|
||||
0000050037 00000 n
|
||||
0000003618 00000 n
|
||||
0000050103 00000 n
|
||||
0000003755 00000 n
|
||||
0000050167 00000 n
|
||||
0000003891 00000 n
|
||||
0000050233 00000 n
|
||||
0000004028 00000 n
|
||||
0000050297 00000 n
|
||||
0000004165 00000 n
|
||||
0000050363 00000 n
|
||||
0000004301 00000 n
|
||||
0000050429 00000 n
|
||||
0000004438 00000 n
|
||||
0000050495 00000 n
|
||||
0000004574 00000 n
|
||||
0000005293 00000 n
|
||||
0000005416 00000 n
|
||||
0000005485 00000 n
|
||||
0000050559 00000 n
|
||||
0000005618 00000 n
|
||||
0000050623 00000 n
|
||||
0000005751 00000 n
|
||||
0000050687 00000 n
|
||||
0000005884 00000 n
|
||||
0000050751 00000 n
|
||||
0000006017 00000 n
|
||||
0000050815 00000 n
|
||||
0000006150 00000 n
|
||||
0000050879 00000 n
|
||||
0000006282 00000 n
|
||||
0000050944 00000 n
|
||||
0000006415 00000 n
|
||||
0000008563 00000 n
|
||||
0000008671 00000 n
|
||||
0000010822 00000 n
|
||||
0000010930 00000 n
|
||||
0000013211 00000 n
|
||||
0000013319 00000 n
|
||||
0000015395 00000 n
|
||||
0000015503 00000 n
|
||||
0000017991 00000 n
|
||||
0000018099 00000 n
|
||||
0000019975 00000 n
|
||||
0000020083 00000 n
|
||||
0000022353 00000 n
|
||||
0000022461 00000 n
|
||||
0000024441 00000 n
|
||||
0000024549 00000 n
|
||||
0000026017 00000 n
|
||||
0000026125 00000 n
|
||||
0000027509 00000 n
|
||||
0000027617 00000 n
|
||||
0000029478 00000 n
|
||||
0000029586 00000 n
|
||||
0000031316 00000 n
|
||||
0000031424 00000 n
|
||||
0000033608 00000 n
|
||||
0000033716 00000 n
|
||||
0000035499 00000 n
|
||||
0000035607 00000 n
|
||||
0000037552 00000 n
|
||||
0000037660 00000 n
|
||||
0000039066 00000 n
|
||||
0000039175 00000 n
|
||||
0000041074 00000 n
|
||||
0000041184 00000 n
|
||||
0000042398 00000 n
|
||||
0000051009 00000 n
|
||||
0000042508 00000 n
|
||||
0000042708 00000 n
|
||||
0000042926 00000 n
|
||||
0000043132 00000 n
|
||||
0000043340 00000 n
|
||||
0000043508 00000 n
|
||||
0000043708 00000 n
|
||||
0000043866 00000 n
|
||||
0000044041 00000 n
|
||||
0000044282 00000 n
|
||||
0000044411 00000 n
|
||||
0000044565 00000 n
|
||||
0000044719 00000 n
|
||||
0000044863 00000 n
|
||||
0000045013 00000 n
|
||||
0000045154 00000 n
|
||||
0000045394 00000 n
|
||||
0000045576 00000 n
|
||||
0000045749 00000 n
|
||||
0000045952 00000 n
|
||||
0000046140 00000 n
|
||||
0000046392 00000 n
|
||||
0000046533 00000 n
|
||||
0000046742 00000 n
|
||||
0000046928 00000 n
|
||||
0000047102 00000 n
|
||||
0000047347 00000 n
|
||||
0000047538 00000 n
|
||||
0000047744 00000 n
|
||||
0000047905 00000 n
|
||||
0000048019 00000 n
|
||||
0000048130 00000 n
|
||||
0000048242 00000 n
|
||||
0000048351 00000 n
|
||||
0000048458 00000 n
|
||||
0000048575 00000 n
|
||||
0000001278 00000 n
|
||||
0000001398 00000 n
|
||||
0000001570 00000 n
|
||||
0000049202 00000 n
|
||||
0000001705 00000 n
|
||||
0000049265 00000 n
|
||||
0000001840 00000 n
|
||||
0000049331 00000 n
|
||||
0000001977 00000 n
|
||||
0000049395 00000 n
|
||||
0000002114 00000 n
|
||||
0000049461 00000 n
|
||||
0000002251 00000 n
|
||||
0000049527 00000 n
|
||||
0000002388 00000 n
|
||||
0000049593 00000 n
|
||||
0000002525 00000 n
|
||||
0000049657 00000 n
|
||||
0000002662 00000 n
|
||||
0000049723 00000 n
|
||||
0000002799 00000 n
|
||||
0000049787 00000 n
|
||||
0000002936 00000 n
|
||||
0000049853 00000 n
|
||||
0000003073 00000 n
|
||||
0000049919 00000 n
|
||||
0000003210 00000 n
|
||||
0000049984 00000 n
|
||||
0000003347 00000 n
|
||||
0000050050 00000 n
|
||||
0000003484 00000 n
|
||||
0000050114 00000 n
|
||||
0000003620 00000 n
|
||||
0000050180 00000 n
|
||||
0000003757 00000 n
|
||||
0000050244 00000 n
|
||||
0000003893 00000 n
|
||||
0000050310 00000 n
|
||||
0000004030 00000 n
|
||||
0000050374 00000 n
|
||||
0000004167 00000 n
|
||||
0000050440 00000 n
|
||||
0000004303 00000 n
|
||||
0000050506 00000 n
|
||||
0000004440 00000 n
|
||||
0000050572 00000 n
|
||||
0000004576 00000 n
|
||||
0000005295 00000 n
|
||||
0000005418 00000 n
|
||||
0000005487 00000 n
|
||||
0000050636 00000 n
|
||||
0000005620 00000 n
|
||||
0000050700 00000 n
|
||||
0000005753 00000 n
|
||||
0000050764 00000 n
|
||||
0000005886 00000 n
|
||||
0000050828 00000 n
|
||||
0000006019 00000 n
|
||||
0000050892 00000 n
|
||||
0000006152 00000 n
|
||||
0000050956 00000 n
|
||||
0000006284 00000 n
|
||||
0000051021 00000 n
|
||||
0000006417 00000 n
|
||||
0000008565 00000 n
|
||||
0000008673 00000 n
|
||||
0000010824 00000 n
|
||||
0000010932 00000 n
|
||||
0000013213 00000 n
|
||||
0000013321 00000 n
|
||||
0000015397 00000 n
|
||||
0000015505 00000 n
|
||||
0000017954 00000 n
|
||||
0000018062 00000 n
|
||||
0000019976 00000 n
|
||||
0000020084 00000 n
|
||||
0000022332 00000 n
|
||||
0000022440 00000 n
|
||||
0000024478 00000 n
|
||||
0000024586 00000 n
|
||||
0000026054 00000 n
|
||||
0000026162 00000 n
|
||||
0000027546 00000 n
|
||||
0000027654 00000 n
|
||||
0000029555 00000 n
|
||||
0000029663 00000 n
|
||||
0000031393 00000 n
|
||||
0000031501 00000 n
|
||||
0000033685 00000 n
|
||||
0000033793 00000 n
|
||||
0000035576 00000 n
|
||||
0000035684 00000 n
|
||||
0000037629 00000 n
|
||||
0000037737 00000 n
|
||||
0000039143 00000 n
|
||||
0000039252 00000 n
|
||||
0000041151 00000 n
|
||||
0000041261 00000 n
|
||||
0000042475 00000 n
|
||||
0000051086 00000 n
|
||||
0000042585 00000 n
|
||||
0000042785 00000 n
|
||||
0000043003 00000 n
|
||||
0000043209 00000 n
|
||||
0000043417 00000 n
|
||||
0000043585 00000 n
|
||||
0000043785 00000 n
|
||||
0000043943 00000 n
|
||||
0000044118 00000 n
|
||||
0000044359 00000 n
|
||||
0000044488 00000 n
|
||||
0000044642 00000 n
|
||||
0000044796 00000 n
|
||||
0000044940 00000 n
|
||||
0000045090 00000 n
|
||||
0000045231 00000 n
|
||||
0000045471 00000 n
|
||||
0000045653 00000 n
|
||||
0000045826 00000 n
|
||||
0000046029 00000 n
|
||||
0000046217 00000 n
|
||||
0000046469 00000 n
|
||||
0000046610 00000 n
|
||||
0000046819 00000 n
|
||||
0000047005 00000 n
|
||||
0000047179 00000 n
|
||||
0000047424 00000 n
|
||||
0000047615 00000 n
|
||||
0000047821 00000 n
|
||||
0000047982 00000 n
|
||||
0000048096 00000 n
|
||||
0000048207 00000 n
|
||||
0000048319 00000 n
|
||||
0000048428 00000 n
|
||||
0000048535 00000 n
|
||||
0000048652 00000 n
|
||||
trailer
|
||||
<<
|
||||
/Size 142
|
||||
|
@ -1263,5 +1263,5 @@ trailer
|
|||
/Info 4 0 R
|
||||
>>
|
||||
startxref
|
||||
51063
|
||||
51140
|
||||
%%EOF
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.lucene.store.Directory;
|
|||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
|
@ -291,7 +292,7 @@ final class DocumentsWriter {
|
|||
assert docStoreSegment != null;
|
||||
fieldsWriter.close();
|
||||
fieldsWriter = null;
|
||||
assert numDocsInStore*8 == directory.fileLength(docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION):
|
||||
assert 4+numDocsInStore*8 == directory.fileLength(docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION):
|
||||
"after flush: fdx size mismatch: " + numDocsInStore + " docs vs " + directory.fileLength(docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) + " length in bytes of " + docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
|
||||
}
|
||||
|
||||
|
@ -754,27 +755,26 @@ final class DocumentsWriter {
|
|||
return segment + "." + extension;
|
||||
}
|
||||
|
||||
static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
|
||||
private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) {
|
||||
while(true) {
|
||||
final char c1 = text1[pos1++];
|
||||
final char c2 = text2[pos2++];
|
||||
if (c1 < c2)
|
||||
if (c1 != c2) {
|
||||
if (0xffff == c2)
|
||||
return 1;
|
||||
else
|
||||
return -1;
|
||||
else if (c2 < c1)
|
||||
if (0xffff == c1)
|
||||
return -1;
|
||||
else
|
||||
return 1;
|
||||
else if (0xffff == c1)
|
||||
return -1;
|
||||
else
|
||||
return c1-c2;
|
||||
} else if (0xffff == c1)
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private final TermInfo termInfo = new TermInfo(); // minimize consing
|
||||
|
||||
final UnicodeUtil.UTF8Result termsUTF8 = new UnicodeUtil.UTF8Result();
|
||||
|
||||
/* Walk through all unique text tokens (Posting
|
||||
* instances) found in this field and serialize them
|
||||
* into a single RAM segment. */
|
||||
|
@ -831,9 +831,6 @@ final class DocumentsWriter {
|
|||
|
||||
final char[] text = termStates[0].text;
|
||||
final int start = termStates[0].textOffset;
|
||||
int pos = start;
|
||||
while(text[pos] != 0xffff)
|
||||
pos++;
|
||||
|
||||
long freqPointer = freqOut.getFilePointer();
|
||||
long proxPointer = proxOut.getFilePointer();
|
||||
|
@ -932,7 +929,17 @@ final class DocumentsWriter {
|
|||
|
||||
// Write term
|
||||
termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
|
||||
termsOut.add(fieldNumber, text, start, pos-start, termInfo);
|
||||
|
||||
// TODO: we could do this incrementally
|
||||
UnicodeUtil.UTF16toUTF8(text, start, termsUTF8);
|
||||
|
||||
// TODO: we could save O(n) re-scan of the term by
|
||||
// computing the shared prefix with the last term
|
||||
// while during the UTF8 encoding
|
||||
termsOut.add(fieldNumber,
|
||||
termsUTF8.result,
|
||||
termsUTF8.length,
|
||||
termInfo);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1048,7 +1055,12 @@ final class DocumentsWriter {
|
|||
// This call is not synchronized and does all the work
|
||||
state.processDocument(analyzer);
|
||||
} finally {
|
||||
// This call is synchronized but fast
|
||||
// Note that we must call finishDocument even on
|
||||
// exception, because for a non-aborting
|
||||
// exception, a portion of the document has been
|
||||
// indexed (and its ID is marked for deletion), so
|
||||
// all index files must be updated to record this
|
||||
// document. This call is synchronized but fast.
|
||||
finishDocument(state);
|
||||
}
|
||||
success = true;
|
||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
|
@ -337,12 +338,36 @@ final class DocumentsWriterFieldData implements Comparable {
|
|||
|
||||
int code = 0;
|
||||
|
||||
// Compute hashcode
|
||||
// Compute hashcode & replace any invalid UTF16 sequences
|
||||
int downto = tokenTextLen;
|
||||
while (downto > 0)
|
||||
code = (code*31) + tokenText[--downto];
|
||||
while (downto > 0) {
|
||||
char ch = tokenText[--downto];
|
||||
|
||||
// System.out.println(" addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
|
||||
if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
|
||||
if (0 == downto) {
|
||||
// Unpaired
|
||||
ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
|
||||
} else {
|
||||
final char ch2 = tokenText[downto-1];
|
||||
if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
|
||||
// OK: high followed by low. This is a valid
|
||||
// surrogate pair.
|
||||
code = ((code*31) + ch)*31+ch2;
|
||||
downto--;
|
||||
continue;
|
||||
} else {
|
||||
// Unpaired
|
||||
ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
|
||||
}
|
||||
}
|
||||
} else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END)
|
||||
// Unpaired
|
||||
ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
|
||||
|
||||
code = (code*31) + ch;
|
||||
}
|
||||
|
||||
// System.out.println(" addPosition: field=" + fieldInfo.name + " buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
|
||||
|
||||
int hashPos = code & postingsHashMask;
|
||||
|
||||
|
@ -713,7 +738,8 @@ final class DocumentsWriterFieldData implements Comparable {
|
|||
|
||||
threadState.doVectorSort(postingsVectors, numPostingsVectors);
|
||||
|
||||
Posting lastPosting = null;
|
||||
int encoderUpto = 0;
|
||||
int lastTermBytesCount = 0;
|
||||
|
||||
final ByteSliceReader reader = vectorSliceReader;
|
||||
final char[][] charBuffers = threadState.charPool.buffers;
|
||||
|
@ -723,40 +749,37 @@ final class DocumentsWriterFieldData implements Comparable {
|
|||
Posting posting = vector.p;
|
||||
final int freq = posting.docFreq;
|
||||
|
||||
final int prefix;
|
||||
final char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
|
||||
final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
|
||||
int pos2 = start2;
|
||||
|
||||
// We swap between two encoders to save copying
|
||||
// last Term's byte array
|
||||
final UnicodeUtil.UTF8Result utf8Result = threadState.utf8Results[encoderUpto];
|
||||
|
||||
// TODO: we could do this incrementally
|
||||
UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
|
||||
final int termBytesCount = utf8Result.length;
|
||||
|
||||
// TODO: UTF16toUTF8 could tell us this prefix
|
||||
// Compute common prefix between last term and
|
||||
// this term
|
||||
if (lastPosting == null)
|
||||
prefix = 0;
|
||||
else {
|
||||
final char[] text1 = charBuffers[lastPosting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
|
||||
final int start1 = lastPosting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
|
||||
int pos1 = start1;
|
||||
while(true) {
|
||||
final char c1 = text1[pos1];
|
||||
final char c2 = text2[pos2];
|
||||
if (c1 != c2 || c1 == 0xffff) {
|
||||
prefix = pos1-start1;
|
||||
int prefix = 0;
|
||||
if (j > 0) {
|
||||
final byte[] lastTermBytes = threadState.utf8Results[1-encoderUpto].result;
|
||||
final byte[] termBytes = threadState.utf8Results[encoderUpto].result;
|
||||
while(prefix < lastTermBytesCount && prefix < termBytesCount) {
|
||||
if (lastTermBytes[prefix] != termBytes[prefix])
|
||||
break;
|
||||
}
|
||||
pos1++;
|
||||
pos2++;
|
||||
prefix++;
|
||||
}
|
||||
}
|
||||
lastPosting = posting;
|
||||
encoderUpto = 1-encoderUpto;
|
||||
lastTermBytesCount = termBytesCount;
|
||||
|
||||
// Compute length
|
||||
while(text2[pos2] != 0xffff)
|
||||
pos2++;
|
||||
|
||||
final int suffix = pos2 - start2 - prefix;
|
||||
final int suffix = termBytesCount - prefix;
|
||||
tvfLocal.writeVInt(prefix);
|
||||
tvfLocal.writeVInt(suffix);
|
||||
tvfLocal.writeChars(text2, start2 + prefix, suffix);
|
||||
tvfLocal.writeBytes(utf8Result.result, prefix, suffix);
|
||||
tvfLocal.writeVInt(freq);
|
||||
|
||||
if (doVectorPositions) {
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.store.IndexOutput;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
/** Used by DocumentsWriter to maintain per-thread state.
|
||||
* We keep a separate Posting hash and other state for each
|
||||
|
@ -311,6 +312,7 @@ final class DocumentsWriterThreadState {
|
|||
if (docWriter.fieldsWriter == null) {
|
||||
assert docWriter.docStoreSegment == null;
|
||||
assert docWriter.segment != null;
|
||||
docWriter.files = null;
|
||||
docWriter.docStoreSegment = docWriter.segment;
|
||||
// If we hit an exception while init'ing the
|
||||
// fieldsWriter, we must abort this segment
|
||||
|
@ -321,7 +323,6 @@ final class DocumentsWriterThreadState {
|
|||
} catch (Throwable t) {
|
||||
throw new AbortException(t, docWriter);
|
||||
}
|
||||
docWriter.files = null;
|
||||
}
|
||||
localFieldsWriter = new FieldsWriter(null, fdtLocal, docWriter.fieldInfos);
|
||||
}
|
||||
|
@ -331,17 +332,18 @@ final class DocumentsWriterThreadState {
|
|||
if (docHasVectors) {
|
||||
if (docWriter.tvx == null) {
|
||||
assert docWriter.docStoreSegment != null;
|
||||
docWriter.files = null;
|
||||
// If we hit an exception while init'ing the term
|
||||
// vector output files, we must abort this segment
|
||||
// because those files will be in an unknown
|
||||
// state:
|
||||
try {
|
||||
docWriter.tvx = docWriter.directory.createOutput(docWriter.docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
|
||||
docWriter.tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
|
||||
docWriter.tvx.writeInt(TermVectorsReader.FORMAT_CURRENT);
|
||||
docWriter.tvd = docWriter.directory.createOutput(docWriter.docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
|
||||
docWriter.tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
|
||||
docWriter.tvd.writeInt(TermVectorsReader.FORMAT_CURRENT);
|
||||
docWriter.tvf = docWriter.directory.createOutput(docWriter.docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
|
||||
docWriter.tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
|
||||
docWriter.tvf.writeInt(TermVectorsReader.FORMAT_CURRENT);
|
||||
|
||||
// We must "catch up" for all docs before us
|
||||
// that had no vectors:
|
||||
|
@ -353,7 +355,6 @@ final class DocumentsWriterThreadState {
|
|||
} catch (Throwable t) {
|
||||
throw new AbortException(t, docWriter);
|
||||
}
|
||||
docWriter.files = null;
|
||||
}
|
||||
numVectorFields = 0;
|
||||
}
|
||||
|
@ -672,21 +673,23 @@ final class DocumentsWriterThreadState {
|
|||
int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
|
||||
final char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
|
||||
int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
|
||||
|
||||
assert text1 != text2 || pos1 != pos2;
|
||||
|
||||
while(true) {
|
||||
final char c1 = text1[pos1++];
|
||||
final char c2 = text2[pos2++];
|
||||
if (c1 < c2)
|
||||
if (c1 != c2) {
|
||||
if (0xffff == c2)
|
||||
return 1;
|
||||
else
|
||||
return -1;
|
||||
else if (c2 < c1)
|
||||
if (0xffff == c1)
|
||||
return -1;
|
||||
else
|
||||
return 1;
|
||||
else if (0xffff == c1)
|
||||
return 0;
|
||||
return -1;
|
||||
else
|
||||
return c1-c2;
|
||||
} else
|
||||
// This method should never compare equal postings
|
||||
// unless p1==p2
|
||||
assert c1 != 0xffff;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -715,5 +718,8 @@ final class DocumentsWriterThreadState {
|
|||
|
||||
// Used to read a string value for a field
|
||||
ReusableStringReader stringReader = new ReusableStringReader();
|
||||
|
||||
final UnicodeUtil.UTF8Result utf8Results[] = {new UnicodeUtil.UTF8Result(),
|
||||
new UnicodeUtil.UTF8Result()};
|
||||
}
|
||||
|
||||
|
|
|
@ -51,6 +51,8 @@ final class FieldsReader {
|
|||
private int numTotalDocs;
|
||||
private int size;
|
||||
private boolean closed;
|
||||
private final int format;
|
||||
private final int formatSize;
|
||||
|
||||
// The docID offset where our docs begin in the index
|
||||
// file. This will be 0 if we have our own private file.
|
||||
|
@ -72,9 +74,33 @@ final class FieldsReader {
|
|||
try {
|
||||
fieldInfos = fn;
|
||||
|
||||
cloneableFieldsStream = d.openInput(segment + ".fdt", readBufferSize);
|
||||
cloneableFieldsStream = d.openInput(segment + "." + IndexFileNames.FIELDS_EXTENSION, readBufferSize);
|
||||
indexStream = d.openInput(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION, readBufferSize);
|
||||
|
||||
// First version of fdx did not include a format
|
||||
// header, but, the first int will always be 0 in that
|
||||
// case
|
||||
int firstInt = indexStream.readInt();
|
||||
if (firstInt == 0)
|
||||
format = 0;
|
||||
else
|
||||
format = firstInt;
|
||||
|
||||
if (format > FieldsWriter.FORMAT_CURRENT)
|
||||
throw new CorruptIndexException("Incompatible format version: " + format + " expected "
|
||||
+ FieldsWriter.FORMAT_CURRENT + " or lower");
|
||||
|
||||
if (format > FieldsWriter.FORMAT)
|
||||
formatSize = 4;
|
||||
else
|
||||
formatSize = 0;
|
||||
|
||||
if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
|
||||
cloneableFieldsStream.setModifiedUTF8StringsMode();
|
||||
|
||||
fieldsStream = (IndexInput) cloneableFieldsStream.clone();
|
||||
indexStream = d.openInput(segment + ".fdx", readBufferSize);
|
||||
|
||||
final long indexSize = indexStream.length()-formatSize;
|
||||
|
||||
if (docStoreOffset != -1) {
|
||||
// We read only a slice out of this shared fields file
|
||||
|
@ -83,13 +109,13 @@ final class FieldsReader {
|
|||
|
||||
// Verify the file is long enough to hold all of our
|
||||
// docs
|
||||
assert ((int) (indexStream.length() / 8)) >= size + this.docStoreOffset;
|
||||
assert ((int) (indexSize / 8)) >= size + this.docStoreOffset;
|
||||
} else {
|
||||
this.docStoreOffset = 0;
|
||||
this.size = (int) (indexStream.length() >> 3);
|
||||
this.size = (int) (indexSize >> 3);
|
||||
}
|
||||
|
||||
numTotalDocs = (int) (indexStream.length() >> 3);
|
||||
numTotalDocs = (int) (indexSize >> 3);
|
||||
success = true;
|
||||
} finally {
|
||||
// With lock-less commits, it's entirely possible (and
|
||||
|
@ -142,8 +168,12 @@ final class FieldsReader {
|
|||
return size;
|
||||
}
|
||||
|
||||
private final void seekIndex(int docID) throws IOException {
|
||||
indexStream.seek(formatSize + (docID + docStoreOffset) * 8L);
|
||||
}
|
||||
|
||||
final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
|
||||
indexStream.seek((n + docStoreOffset) * 8L);
|
||||
seekIndex(n);
|
||||
long position = indexStream.readLong();
|
||||
fieldsStream.seek(position);
|
||||
|
||||
|
@ -195,7 +225,7 @@ final class FieldsReader {
|
|||
* startDocID. Returns the IndexInput (the fieldStream),
|
||||
* already seeked to the starting point for startDocID.*/
|
||||
final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException {
|
||||
indexStream.seek((docStoreOffset+startDocID) * 8L);
|
||||
seekIndex(startDocID);
|
||||
long startOffset = indexStream.readLong();
|
||||
long lastOffset = startOffset;
|
||||
int count = 0;
|
||||
|
@ -225,11 +255,10 @@ final class FieldsReader {
|
|||
}
|
||||
|
||||
private void skipField(boolean binary, boolean compressed, int toRead) throws IOException {
|
||||
if (binary || compressed) {
|
||||
long pointer = fieldsStream.getFilePointer();
|
||||
fieldsStream.seek(pointer + toRead);
|
||||
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) {
|
||||
fieldsStream.seek(fieldsStream.getFilePointer() + toRead);
|
||||
} else {
|
||||
//We need to skip chars. This will slow us down, but still better
|
||||
// We need to skip chars. This will slow us down, but still better
|
||||
fieldsStream.skipChars(toRead);
|
||||
}
|
||||
}
|
||||
|
@ -265,6 +294,9 @@ final class FieldsReader {
|
|||
int length = fieldsStream.readVInt();
|
||||
long pointer = fieldsStream.getFilePointer();
|
||||
//Skip ahead of where we are by the length of what is stored
|
||||
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
|
||||
fieldsStream.seek(pointer+length);
|
||||
else
|
||||
fieldsStream.skipChars(length);
|
||||
f = new LazyField(fi.name, store, index, termVector, length, pointer, binary);
|
||||
f.setOmitNorms(fi.omitNorms);
|
||||
|
@ -470,12 +502,18 @@ final class FieldsReader {
|
|||
final byte[] b = new byte[toRead];
|
||||
localFieldsStream.readBytes(b, 0, b.length);
|
||||
fieldsData = new String(uncompress(b), "UTF-8");
|
||||
} else {
|
||||
if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
|
||||
byte[] bytes = new byte[toRead];
|
||||
localFieldsStream.readBytes(bytes, 0, toRead);
|
||||
fieldsData = new String(bytes, "UTF-8");
|
||||
} else {
|
||||
//read in chars b/c we already know the length we need to read
|
||||
char[] chars = new char[toRead];
|
||||
localFieldsStream.readChars(chars, 0, toRead);
|
||||
fieldsData = new String(chars);
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new FieldReaderException(e);
|
||||
}
|
||||
|
|
|
@ -34,6 +34,17 @@ final class FieldsWriter
|
|||
static final byte FIELD_IS_BINARY = 0x2;
|
||||
static final byte FIELD_IS_COMPRESSED = 0x4;
|
||||
|
||||
// Original format
|
||||
static final int FORMAT = 0;
|
||||
|
||||
// Changed strings to UTF8
|
||||
static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = 1;
|
||||
|
||||
// NOTE: if you introduce a new format, make it 1 higher
|
||||
// than the current one, and always change this if you
|
||||
// switch to a new format!
|
||||
static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
|
||||
|
||||
private FieldInfos fieldInfos;
|
||||
|
||||
private IndexOutput fieldsStream;
|
||||
|
@ -44,8 +55,34 @@ final class FieldsWriter
|
|||
|
||||
FieldsWriter(Directory d, String segment, FieldInfos fn) throws IOException {
|
||||
fieldInfos = fn;
|
||||
fieldsStream = d.createOutput(segment + ".fdt");
|
||||
indexStream = d.createOutput(segment + ".fdx");
|
||||
|
||||
boolean success = false;
|
||||
final String fieldsName = segment + "." + IndexFileNames.FIELDS_EXTENSION;
|
||||
try {
|
||||
fieldsStream = d.createOutput(fieldsName);
|
||||
fieldsStream.writeInt(FORMAT_CURRENT);
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
close();
|
||||
d.deleteFile(fieldsName);
|
||||
}
|
||||
}
|
||||
|
||||
success = false;
|
||||
final String indexName = segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
|
||||
try {
|
||||
indexStream = d.createOutput(indexName);
|
||||
indexStream.writeInt(FORMAT_CURRENT);
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
close();
|
||||
d.deleteFile(fieldsName);
|
||||
d.deleteFile(indexName);
|
||||
}
|
||||
}
|
||||
|
||||
doClose = true;
|
||||
}
|
||||
|
||||
|
@ -73,7 +110,9 @@ final class FieldsWriter
|
|||
|
||||
final void close() throws IOException {
|
||||
if (doClose) {
|
||||
if (fieldsStream != null)
|
||||
fieldsStream.close();
|
||||
if (indexStream != null)
|
||||
indexStream.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -67,6 +67,7 @@ import java.util.Iterator;
|
|||
(which just deletes and then adds the entire document).
|
||||
When finished adding, deleting and updating documents, <a href="#close()"><b>close</b></a> should be called.</p>
|
||||
|
||||
<a name="flush"></a>
|
||||
<p>These changes are buffered in memory and periodically
|
||||
flushed to the {@link Directory} (during the above method
|
||||
calls). A flush is triggered when there are enough
|
||||
|
@ -1843,26 +1844,30 @@ public class IndexWriter {
|
|||
* partially succeeded).</p>
|
||||
*
|
||||
* <p> This method periodically flushes pending documents
|
||||
* to the Directory (every {@link #setMaxBufferedDocs}),
|
||||
* and also periodically merges segments in the index
|
||||
* (every {@link #setMergeFactor} flushes). When this
|
||||
* occurs, the method will take more time to run (possibly
|
||||
* a long time if the index is large), and will require
|
||||
* free temporary space in the Directory to do the
|
||||
* merging.</p>
|
||||
* to the Directory (see <a href="#flush">above</a>), and
|
||||
* also periodically triggers segment merges in the index
|
||||
* according to the {@link MergePolicy} in use.</p>
|
||||
*
|
||||
* <p>The amount of free space required when a merge is triggered is
|
||||
* up to 1X the size of all segments being merged, when no
|
||||
* readers/searchers are open against the index, and up to 2X the
|
||||
* size of all segments being merged when readers/searchers are open
|
||||
* against the index (see {@link #optimize()} for details). The
|
||||
* sequence of primitive merge operations performed is governed by
|
||||
* the merge policy.
|
||||
* <p>Merges temporarily consume space in the
|
||||
* directory. The amount of space required is up to 1X the
|
||||
* size of all segments being merged, when no
|
||||
* readers/searchers are open against the index, and up to
|
||||
* 2X the size of all segments being merged when
|
||||
* readers/searchers are open against the index (see
|
||||
* {@link #optimize()} for details). The sequence of
|
||||
* primitive merge operations performed is governed by the
|
||||
* merge policy.
|
||||
*
|
||||
* <p>Note that each term in the document can be no longer
|
||||
* than 16383 characters, otherwise an
|
||||
* IllegalArgumentException will be thrown.</p>
|
||||
*
|
||||
* <p>Note that it's possible to create an invalid Unicode
|
||||
* string in java if a UTF16 surrogate pair is malformed.
|
||||
* In this case, the invalid characters are silently
|
||||
* replaced with the Unicode replacement character
|
||||
* U+FFFD.</p>
|
||||
*
|
||||
* @throws CorruptIndexException if the index is corrupt
|
||||
* @throws IOException if there is a low-level IO error
|
||||
*/
|
||||
|
|
|
@ -349,7 +349,7 @@ final class SegmentMerger {
|
|||
fieldsWriter.close();
|
||||
}
|
||||
|
||||
assert docCount*8 == directory.fileLength(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) :
|
||||
assert 4+docCount*8 == directory.fileLength(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) :
|
||||
"after mergeFields: fdx size mismatch: " + docCount + " docs vs " + directory.fileLength(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) + " length in bytes of " + segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
|
||||
|
||||
} else
|
||||
|
|
|
@ -61,8 +61,8 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
|
|||
format = firstInt;
|
||||
|
||||
// check that it is a format we can understand
|
||||
if (format < TermInfosWriter.FORMAT)
|
||||
throw new CorruptIndexException("Unknown format version:" + format);
|
||||
if (format < TermInfosWriter.FORMAT_CURRENT)
|
||||
throw new CorruptIndexException("Unknown format version:" + format + " expected " + TermInfosWriter.FORMAT_CURRENT + " or higher");
|
||||
|
||||
size = input.readLong(); // read the size
|
||||
|
||||
|
@ -77,13 +77,17 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
|
|||
} else {
|
||||
indexInterval = input.readInt();
|
||||
skipInterval = input.readInt();
|
||||
if (format == -3) {
|
||||
if (format <= TermInfosWriter.FORMAT) {
|
||||
// this new format introduces multi-level skipping
|
||||
maxSkipLevels = input.readInt();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (format > TermInfosWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
|
||||
termBuffer.setPreUTF8Strings();
|
||||
scanBuffer.setPreUTF8Strings();
|
||||
prevBuffer.setPreUTF8Strings();
|
||||
}
|
||||
}
|
||||
|
||||
protected Object clone() {
|
||||
|
|
|
@ -19,28 +19,31 @@ package org.apache.lucene.index;
|
|||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
final class TermBuffer implements Cloneable {
|
||||
private static final char[] NO_CHARS = new char[0];
|
||||
|
||||
private String field;
|
||||
private char[] text = NO_CHARS;
|
||||
private int textLength;
|
||||
private Term term; // cached
|
||||
private boolean preUTF8Strings; // true if strings are stored in modified UTF8 encoding (LUCENE-510)
|
||||
private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes)
|
||||
|
||||
private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
|
||||
private UnicodeUtil.UTF8Result bytes = new UnicodeUtil.UTF8Result();
|
||||
|
||||
public final int compareTo(TermBuffer other) {
|
||||
if (field == other.field) // fields are interned
|
||||
return compareChars(text, textLength, other.text, other.textLength);
|
||||
return compareChars(text.result, text.length, other.text.result, other.text.length);
|
||||
else
|
||||
return field.compareTo(other.field);
|
||||
}
|
||||
|
||||
private static final int compareChars(char[] v1, int len1,
|
||||
char[] v2, int len2) {
|
||||
int end = Math.min(len1, len2);
|
||||
private static final int compareChars(char[] chars1, int len1,
|
||||
char[] chars2, int len2) {
|
||||
final int end = len1 < len2 ? len1:len2;
|
||||
for (int k = 0; k < end; k++) {
|
||||
char c1 = v1[k];
|
||||
char c2 = v2[k];
|
||||
char c1 = chars1[k];
|
||||
char c2 = chars2[k];
|
||||
if (c1 != c2) {
|
||||
return c1 - c2;
|
||||
}
|
||||
|
@ -48,13 +51,11 @@ final class TermBuffer implements Cloneable {
|
|||
return len1 - len2;
|
||||
}
|
||||
|
||||
private final void setTextLength(int newLength) {
|
||||
if (text.length < newLength) {
|
||||
char[] newText = new char[newLength];
|
||||
System.arraycopy(text, 0, newText, 0, textLength);
|
||||
text = newText;
|
||||
}
|
||||
textLength = newLength;
|
||||
/** Call this if the IndexInput passed to {@link #read}
|
||||
* stores terms in the "modified UTF8" (pre LUCENE-510)
|
||||
* format. */
|
||||
void setPreUTF8Strings() {
|
||||
preUTF8Strings = true;
|
||||
}
|
||||
|
||||
public final void read(IndexInput input, FieldInfos fieldInfos)
|
||||
|
@ -63,8 +64,25 @@ final class TermBuffer implements Cloneable {
|
|||
int start = input.readVInt();
|
||||
int length = input.readVInt();
|
||||
int totalLength = start + length;
|
||||
setTextLength(totalLength);
|
||||
input.readChars(this.text, start, length);
|
||||
if (preUTF8Strings) {
|
||||
text.setLength(totalLength);
|
||||
input.readChars(text.result, start, length);
|
||||
} else {
|
||||
|
||||
if (dirty) {
|
||||
// Fully convert all bytes since bytes is dirty
|
||||
UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
|
||||
bytes.setLength(totalLength);
|
||||
input.readBytes(bytes.result, start, length);
|
||||
UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text);
|
||||
dirty = false;
|
||||
} else {
|
||||
// Incrementally convert only the UTF8 bytes that are new:
|
||||
bytes.setLength(totalLength);
|
||||
input.readBytes(bytes.result, start, length);
|
||||
UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text);
|
||||
}
|
||||
}
|
||||
this.field = fieldInfos.fieldName(input.readVInt());
|
||||
}
|
||||
|
||||
|
@ -73,27 +91,27 @@ final class TermBuffer implements Cloneable {
|
|||
reset();
|
||||
return;
|
||||
}
|
||||
|
||||
// copy text into the buffer
|
||||
setTextLength(term.text().length());
|
||||
term.text().getChars(0, term.text().length(), text, 0);
|
||||
|
||||
this.field = term.field();
|
||||
final String termText = term.text();
|
||||
final int termLen = termText.length();
|
||||
text.setLength(termLen);
|
||||
termText.getChars(0, termLen, text.result, 0);
|
||||
dirty = true;
|
||||
field = term.field();
|
||||
this.term = term;
|
||||
}
|
||||
|
||||
public final void set(TermBuffer other) {
|
||||
setTextLength(other.textLength);
|
||||
System.arraycopy(other.text, 0, text, 0, textLength);
|
||||
|
||||
this.field = other.field;
|
||||
this.term = other.term;
|
||||
text.copyText(other.text);
|
||||
dirty = true;
|
||||
field = other.field;
|
||||
term = other.term;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
this.field = null;
|
||||
this.textLength = 0;
|
||||
this.term = null;
|
||||
field = null;
|
||||
text.setLength(0);
|
||||
term = null;
|
||||
dirty = true;
|
||||
}
|
||||
|
||||
public Term toTerm() {
|
||||
|
@ -101,7 +119,7 @@ final class TermBuffer implements Cloneable {
|
|||
return null;
|
||||
|
||||
if (term == null)
|
||||
term = new Term(field, new String(text, 0, textLength), false);
|
||||
term = new Term(field, new String(text.result, 0, text.length), false);
|
||||
|
||||
return term;
|
||||
}
|
||||
|
@ -112,9 +130,10 @@ final class TermBuffer implements Cloneable {
|
|||
clone = (TermBuffer)super.clone();
|
||||
} catch (CloneNotSupportedException e) {}
|
||||
|
||||
clone.text = new char[text.length];
|
||||
System.arraycopy(text, 0, clone.text, 0, textLength);
|
||||
|
||||
clone.dirty = true;
|
||||
clone.bytes = new UnicodeUtil.UTF8Result();
|
||||
clone.text = new UnicodeUtil.UTF16Result();
|
||||
clone.text.copyText(text);
|
||||
return clone;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,12 +58,12 @@ final class TermInfosReader {
|
|||
segment = seg;
|
||||
fieldInfos = fis;
|
||||
|
||||
origEnum = new SegmentTermEnum(directory.openInput(segment + ".tis",
|
||||
origEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_EXTENSION,
|
||||
readBufferSize), fieldInfos, false);
|
||||
size = origEnum.size;
|
||||
totalIndexInterval = origEnum.indexInterval;
|
||||
|
||||
indexEnum = new SegmentTermEnum(directory.openInput(segment + ".tii",
|
||||
indexEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION,
|
||||
readBufferSize), fieldInfos, true);
|
||||
|
||||
success = true;
|
||||
|
|
|
@ -21,6 +21,7 @@ package org.apache.lucene.index;
|
|||
import java.io.IOException;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
|
||||
Directory. A TermInfos can be written once, in order. */
|
||||
|
@ -29,6 +30,13 @@ final class TermInfosWriter {
|
|||
/** The file format version, a negative number. */
|
||||
public static final int FORMAT = -3;
|
||||
|
||||
// Changed strings to true utf8 with length-in-bytes not
|
||||
// length-in-chars
|
||||
public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
|
||||
|
||||
// NOTE: always change this if you switch to a new format!
|
||||
public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
|
||||
|
||||
private FieldInfos fieldInfos;
|
||||
private IndexOutput output;
|
||||
private TermInfo lastTi = new TermInfo();
|
||||
|
@ -62,13 +70,12 @@ final class TermInfosWriter {
|
|||
|
||||
private long lastIndexPointer;
|
||||
private boolean isIndex;
|
||||
private char[] lastTermText = new char[10];
|
||||
private int lastTermTextLength;
|
||||
private byte[] lastTermBytes = new byte[10];
|
||||
private int lastTermBytesLength = 0;
|
||||
private int lastFieldNumber = -1;
|
||||
|
||||
private char[] termTextBuffer = new char[10];
|
||||
|
||||
private TermInfosWriter other;
|
||||
private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
|
||||
|
||||
TermInfosWriter(Directory directory, String segment, FieldInfos fis,
|
||||
int interval)
|
||||
|
@ -89,27 +96,32 @@ final class TermInfosWriter {
|
|||
fieldInfos = fis;
|
||||
isIndex = isi;
|
||||
output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis"));
|
||||
output.writeInt(FORMAT); // write format
|
||||
output.writeInt(FORMAT_CURRENT); // write format
|
||||
output.writeLong(0); // leave space for size
|
||||
output.writeInt(indexInterval); // write indexInterval
|
||||
output.writeInt(skipInterval); // write skipInterval
|
||||
output.writeInt(maxSkipLevels); // write maxSkipLevels
|
||||
assert initUTF16Results();
|
||||
}
|
||||
|
||||
void add(Term term, TermInfo ti) throws IOException {
|
||||
UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result);
|
||||
add(fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti);
|
||||
}
|
||||
|
||||
final int length = term.text.length();
|
||||
if (termTextBuffer.length < length)
|
||||
termTextBuffer = new char[(int) (length*1.25)];
|
||||
// Currently used only by assert statements
|
||||
UnicodeUtil.UTF16Result utf16Result1;
|
||||
UnicodeUtil.UTF16Result utf16Result2;
|
||||
|
||||
term.text.getChars(0, length, termTextBuffer, 0);
|
||||
|
||||
add(fieldInfos.fieldNumber(term.field), termTextBuffer, 0, length, ti);
|
||||
// Currently used only by assert statements
|
||||
private boolean initUTF16Results() {
|
||||
utf16Result1 = new UnicodeUtil.UTF16Result();
|
||||
utf16Result2 = new UnicodeUtil.UTF16Result();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Currently used only by assert statement
|
||||
private int compareToLastTerm(int fieldNumber, char[] termText, int start, int length) {
|
||||
int pos = 0;
|
||||
private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
|
||||
|
||||
if (lastFieldNumber != fieldNumber) {
|
||||
final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
|
||||
|
@ -121,45 +133,42 @@ final class TermInfosWriter {
|
|||
return cmp;
|
||||
}
|
||||
|
||||
while(pos < length && pos < lastTermTextLength) {
|
||||
final char c1 = lastTermText[pos];
|
||||
final char c2 = termText[pos + start];
|
||||
if (c1 < c2)
|
||||
return -1;
|
||||
else if (c1 > c2)
|
||||
return 1;
|
||||
pos++;
|
||||
}
|
||||
|
||||
if (pos < lastTermTextLength)
|
||||
// Last term was longer
|
||||
return 1;
|
||||
else if (pos < length)
|
||||
// Last term was shorter
|
||||
return -1;
|
||||
UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
|
||||
UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
|
||||
final int len;
|
||||
if (utf16Result1.length < utf16Result2.length)
|
||||
len = utf16Result1.length;
|
||||
else
|
||||
return 0;
|
||||
len = utf16Result2.length;
|
||||
|
||||
for(int i=0;i<len;i++) {
|
||||
final char ch1 = utf16Result1.result[i];
|
||||
final char ch2 = utf16Result2.result[i];
|
||||
if (ch1 != ch2)
|
||||
return ch1-ch2;
|
||||
}
|
||||
return utf16Result1.length - utf16Result2.length;
|
||||
}
|
||||
|
||||
/** Adds a new <<fieldNumber, termText>, TermInfo> pair to the set.
|
||||
/** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
|
||||
Term must be lexicographically greater than all previous Terms added.
|
||||
TermInfo pointers must be positive and greater than all previous.*/
|
||||
void add(int fieldNumber, char[] termText, int termTextStart, int termTextLength, TermInfo ti)
|
||||
void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
|
||||
throws IOException {
|
||||
|
||||
assert compareToLastTerm(fieldNumber, termText, termTextStart, termTextLength) < 0 ||
|
||||
(isIndex && termTextLength == 0 && lastTermTextLength == 0) :
|
||||
assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
|
||||
(isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
|
||||
"Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
|
||||
" lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
|
||||
" text=" + new String(termText, termTextStart, termTextLength) + " lastText=" + new String(lastTermText, 0, lastTermTextLength);
|
||||
" text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
|
||||
|
||||
assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
|
||||
assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
|
||||
|
||||
if (!isIndex && size % indexInterval == 0)
|
||||
other.add(lastFieldNumber, lastTermText, 0, lastTermTextLength, lastTi); // add an index term
|
||||
other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
|
||||
|
||||
writeTerm(fieldNumber, termText, termTextStart, termTextLength); // write term
|
||||
writeTerm(fieldNumber, termBytes, termBytesLength); // write term
|
||||
|
||||
output.writeVInt(ti.docFreq); // write doc freq
|
||||
output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
|
||||
|
@ -174,34 +183,36 @@ final class TermInfosWriter {
|
|||
lastIndexPointer = other.output.getFilePointer(); // write pointer
|
||||
}
|
||||
|
||||
if (lastTermText.length < termTextLength)
|
||||
lastTermText = new char[(int) (termTextLength*1.25)];
|
||||
System.arraycopy(termText, termTextStart, lastTermText, 0, termTextLength);
|
||||
lastTermTextLength = termTextLength;
|
||||
lastFieldNumber = fieldNumber;
|
||||
|
||||
lastTi.set(ti);
|
||||
size++;
|
||||
}
|
||||
|
||||
private void writeTerm(int fieldNumber, char[] termText, int termTextStart, int termTextLength)
|
||||
private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
|
||||
throws IOException {
|
||||
|
||||
// TODO: UTF16toUTF8 could tell us this prefix
|
||||
// Compute prefix in common with last term:
|
||||
int start = 0;
|
||||
final int limit = termTextLength < lastTermTextLength ? termTextLength : lastTermTextLength;
|
||||
final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
|
||||
while(start < limit) {
|
||||
if (termText[termTextStart+start] != lastTermText[start])
|
||||
if (termBytes[start] != lastTermBytes[start])
|
||||
break;
|
||||
start++;
|
||||
}
|
||||
|
||||
int length = termTextLength - start;
|
||||
|
||||
final int length = termBytesLength - start;
|
||||
output.writeVInt(start); // write shared prefix length
|
||||
output.writeVInt(length); // write delta length
|
||||
output.writeChars(termText, start+termTextStart, length); // write delta chars
|
||||
output.writeBytes(termBytes, start, length); // write delta bytes
|
||||
output.writeVInt(fieldNumber); // write field num
|
||||
if (lastTermBytes.length < termBytesLength) {
|
||||
byte[] newArray = new byte[(int) (termBytesLength*1.5)];
|
||||
System.arraycopy(lastTermBytes, 0, newArray, 0, start);
|
||||
lastTermBytes = newArray;
|
||||
}
|
||||
System.arraycopy(termBytes, start, lastTermBytes, start, length);
|
||||
lastTermBytesLength = termBytesLength;
|
||||
}
|
||||
|
||||
/** Called to complete TermInfos creation. */
|
||||
|
|
|
@ -32,8 +32,16 @@ class TermVectorsReader implements Cloneable {
|
|||
// NOTE: if you make a new format, it must be larger than
|
||||
// the current format
|
||||
static final int FORMAT_VERSION = 2;
|
||||
|
||||
// Changes to speed up bulk merging of term vectors:
|
||||
static final int FORMAT_VERSION2 = 3;
|
||||
|
||||
// Changed strings to UTF8 with length-in-bytes not length-in-chars
|
||||
static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
|
||||
|
||||
// NOTE: always change this if you switch to a new format!
|
||||
static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
|
||||
|
||||
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
|
||||
static final int FORMAT_SIZE = 4;
|
||||
|
||||
|
@ -134,7 +142,7 @@ class TermVectorsReader implements Cloneable {
|
|||
}
|
||||
|
||||
boolean canReadRawDocs() {
|
||||
return format >= FORMAT_VERSION2;
|
||||
return format >= FORMAT_UTF8_LENGTH_IN_BYTES;
|
||||
}
|
||||
|
||||
/** Retrieve the length (in bytes) of the tvd and tvf
|
||||
|
@ -190,9 +198,9 @@ class TermVectorsReader implements Cloneable {
|
|||
private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
|
||||
{
|
||||
int format = in.readInt();
|
||||
if (format > FORMAT_VERSION2) {
|
||||
if (format > FORMAT_CURRENT) {
|
||||
throw new CorruptIndexException("Incompatible format version: " + format + " expected "
|
||||
+ FORMAT_VERSION2 + " or less");
|
||||
+ FORMAT_CURRENT + " or less");
|
||||
}
|
||||
return format;
|
||||
}
|
||||
|
@ -434,24 +442,45 @@ class TermVectorsReader implements Cloneable {
|
|||
int start = 0;
|
||||
int deltaLength = 0;
|
||||
int totalLength = 0;
|
||||
char [] buffer = new char[10]; // init the buffer with a length of 10 character
|
||||
char[] previousBuffer = {};
|
||||
byte[] byteBuffer;
|
||||
char[] charBuffer;
|
||||
final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
|
||||
|
||||
// init the buffers
|
||||
if (preUTF8) {
|
||||
charBuffer = new char[10];
|
||||
byteBuffer = null;
|
||||
} else {
|
||||
charBuffer = null;
|
||||
byteBuffer = new byte[20];
|
||||
}
|
||||
|
||||
for (int i = 0; i < numTerms; i++) {
|
||||
start = tvf.readVInt();
|
||||
deltaLength = tvf.readVInt();
|
||||
totalLength = start + deltaLength;
|
||||
if (buffer.length < totalLength) { // increase buffer
|
||||
buffer = null; // give a hint to garbage collector
|
||||
buffer = new char[totalLength];
|
||||
|
||||
if (start > 0) // just copy if necessary
|
||||
System.arraycopy(previousBuffer, 0, buffer, 0, start);
|
||||
final String term;
|
||||
|
||||
if (preUTF8) {
|
||||
// Term stored as java chars
|
||||
if (charBuffer.length < totalLength) {
|
||||
char[] newCharBuffer = new char[(int) (1.5*totalLength)];
|
||||
System.arraycopy(charBuffer, 0, newCharBuffer, 0, start);
|
||||
charBuffer = newCharBuffer;
|
||||
}
|
||||
tvf.readChars(charBuffer, start, deltaLength);
|
||||
term = new String(charBuffer, 0, totalLength);
|
||||
} else {
|
||||
// Term stored as utf8 bytes
|
||||
if (byteBuffer.length < totalLength) {
|
||||
byte[] newByteBuffer = new byte[(int) (1.5*totalLength)];
|
||||
System.arraycopy(byteBuffer, 0, newByteBuffer, 0, start);
|
||||
byteBuffer = newByteBuffer;
|
||||
}
|
||||
tvf.readBytes(byteBuffer, start, deltaLength);
|
||||
term = new String(byteBuffer, 0, totalLength, "UTF-8");
|
||||
}
|
||||
|
||||
tvf.readChars(buffer, start, deltaLength);
|
||||
String term = new String(buffer, 0, totalLength);
|
||||
previousBuffer = buffer;
|
||||
int freq = tvf.readVInt();
|
||||
int [] positions = null;
|
||||
if (storePositions) { //read in the positions
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
@ -27,17 +28,19 @@ final class TermVectorsWriter {
|
|||
|
||||
private IndexOutput tvx = null, tvd = null, tvf = null;
|
||||
private FieldInfos fieldInfos;
|
||||
final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(),
|
||||
new UnicodeUtil.UTF8Result()};
|
||||
|
||||
public TermVectorsWriter(Directory directory, String segment,
|
||||
FieldInfos fieldInfos)
|
||||
throws IOException {
|
||||
// Open files for TermVector storage
|
||||
tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
|
||||
tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
|
||||
tvx.writeInt(TermVectorsReader.FORMAT_CURRENT);
|
||||
tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
|
||||
tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
|
||||
tvd.writeInt(TermVectorsReader.FORMAT_CURRENT);
|
||||
tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
|
||||
tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
|
||||
tvf.writeInt(TermVectorsReader.FORMAT_CURRENT);
|
||||
|
||||
this.fieldInfos = fieldInfos;
|
||||
}
|
||||
|
@ -97,15 +100,22 @@ final class TermVectorsWriter {
|
|||
final String[] terms = vectors[i].getTerms();
|
||||
final int[] freqs = vectors[i].getTermFrequencies();
|
||||
|
||||
String lastTermText = "";
|
||||
int utf8Upto = 0;
|
||||
utf8Results[1].length = 0;
|
||||
|
||||
for (int j=0; j<numTerms; j++) {
|
||||
final String termText = terms[j];
|
||||
int start = StringHelper.stringDifference(lastTermText, termText);
|
||||
int length = termText.length() - start;
|
||||
|
||||
UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]);
|
||||
|
||||
int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result,
|
||||
utf8Results[1-utf8Upto].length,
|
||||
utf8Results[utf8Upto].result,
|
||||
utf8Results[utf8Upto].length);
|
||||
int length = utf8Results[utf8Upto].length - start;
|
||||
tvf.writeVInt(start); // write shared prefix length
|
||||
tvf.writeVInt(length); // write delta length
|
||||
tvf.writeChars(termText, start, length); // write delta chars
|
||||
lastTermText = termText;
|
||||
tvf.writeBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
|
||||
utf8Upto = 1-utf8Upto;
|
||||
|
||||
final int termFreq = freqs[j];
|
||||
|
||||
|
|
|
@ -24,7 +24,9 @@ import java.io.IOException;
|
|||
* @see Directory
|
||||
*/
|
||||
public abstract class IndexInput implements Cloneable {
|
||||
private char[] chars; // used by readString()
|
||||
private byte[] bytes; // used by readString()
|
||||
private char[] chars; // used by readModifiedUTF8String()
|
||||
private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format
|
||||
|
||||
/** Reads and returns a single byte.
|
||||
* @see IndexOutput#writeByte(byte)
|
||||
|
@ -102,10 +104,28 @@ public abstract class IndexInput implements Cloneable {
|
|||
return i;
|
||||
}
|
||||
|
||||
/** Call this if readString should read characters stored
|
||||
* in the old modified UTF8 format (length in java chars
|
||||
* and java's modified UTF8 encoding). This is used for
|
||||
* indices written pre-2.4. See LUCENE-510 for details. */
|
||||
public void setModifiedUTF8StringsMode() {
|
||||
preUTF8Strings = true;
|
||||
}
|
||||
|
||||
/** Reads a string.
|
||||
* @see IndexOutput#writeString(String)
|
||||
*/
|
||||
public String readString() throws IOException {
|
||||
if (preUTF8Strings)
|
||||
return readModifiedUTF8String();
|
||||
int length = readVInt();
|
||||
if (bytes == null || length > bytes.length)
|
||||
bytes = new byte[(int) (length*1.25)];
|
||||
readBytes(bytes, 0, length);
|
||||
return new String(bytes, 0, length, "UTF-8");
|
||||
}
|
||||
|
||||
private String readModifiedUTF8String() throws IOException {
|
||||
int length = readVInt();
|
||||
if (chars == null || length > chars.length)
|
||||
chars = new char[length];
|
||||
|
@ -113,11 +133,15 @@ public abstract class IndexInput implements Cloneable {
|
|||
return new String(chars, 0, length);
|
||||
}
|
||||
|
||||
/** Reads UTF-8 encoded characters into an array.
|
||||
/** Reads Lucene's old "modified UTF-8" encoded
|
||||
* characters into an array.
|
||||
* @param buffer the array to read characters into
|
||||
* @param start the offset in the array to start storing characters
|
||||
* @param length the number of characters to read
|
||||
* @see IndexOutput#writeChars(String,int,int)
|
||||
* @deprecated -- please use readString or readBytes
|
||||
* instead, and construct the string
|
||||
* from those utf8 bytes
|
||||
*/
|
||||
public void readChars(char[] buffer, int start, int length)
|
||||
throws IOException {
|
||||
|
@ -144,6 +168,8 @@ public abstract class IndexInput implements Cloneable {
|
|||
* and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine
|
||||
* how many more bytes to read
|
||||
* @param length The number of chars to read
|
||||
* @deprecated this method operates on old "modified utf8" encoded
|
||||
* strings
|
||||
*/
|
||||
public void skipChars(int length) throws IOException{
|
||||
for (int i = 0; i < length; i++) {
|
||||
|
@ -194,6 +220,7 @@ public abstract class IndexInput implements Cloneable {
|
|||
clone = (IndexInput)super.clone();
|
||||
} catch (CloneNotSupportedException e) {}
|
||||
|
||||
clone.bytes = null;
|
||||
clone.chars = null;
|
||||
|
||||
return clone;
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.store;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
/** Abstract base class for output to a file in a Directory. A random-access
|
||||
* output stream. Used for all Lucene index output operations.
|
||||
|
@ -26,6 +27,8 @@ import java.io.IOException;
|
|||
*/
|
||||
public abstract class IndexOutput {
|
||||
|
||||
private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
|
||||
|
||||
/** Writes a single byte.
|
||||
* @see IndexInput#readByte()
|
||||
*/
|
||||
|
@ -96,16 +99,18 @@ public abstract class IndexOutput {
|
|||
* @see IndexInput#readString()
|
||||
*/
|
||||
public void writeString(String s) throws IOException {
|
||||
int length = s.length();
|
||||
writeVInt(length);
|
||||
writeChars(s, 0, length);
|
||||
UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result);
|
||||
writeVInt(utf8Result.length);
|
||||
writeBytes(utf8Result.result, 0, utf8Result.length);
|
||||
}
|
||||
|
||||
/** Writes a sequence of UTF-8 encoded characters from a string.
|
||||
/** Writes a sub sequence of characters from s as the old
|
||||
* format (modified UTF-8 encoded bytes).
|
||||
* @param s the source of the characters
|
||||
* @param start the first character in the sequence
|
||||
* @param length the number of characters in the sequence
|
||||
* @see IndexInput#readChars(char[],int,int)
|
||||
* @deprecated -- please pre-convert to utf8 bytes
|
||||
* instead or use {@link #writeString}
|
||||
*/
|
||||
public void writeChars(String s, int start, int length)
|
||||
throws IOException {
|
||||
|
@ -125,11 +130,12 @@ public abstract class IndexOutput {
|
|||
}
|
||||
}
|
||||
|
||||
/** Writes a sequence of UTF-8 encoded characters from a char[].
|
||||
/** Writes a sub sequence of characters from char[] as
|
||||
* the old format (modified UTF-8 encoded bytes).
|
||||
* @param s the source of the characters
|
||||
* @param start the first character in the sequence
|
||||
* @param length the number of characters in the sequence
|
||||
* @see IndexInput#readChars(char[],int,int)
|
||||
* @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString}
|
||||
*/
|
||||
public void writeChars(char[] s, int start, int length)
|
||||
throws IOException {
|
||||
|
|
|
@ -25,6 +25,22 @@ package org.apache.lucene.util;
|
|||
*/
|
||||
public abstract class StringHelper {
|
||||
|
||||
/**
|
||||
* Compares two byte[] arrays, element by element, and returns the
|
||||
* length of the prefix that the two arrays have in common.
|
||||
*
|
||||
* @param bytes1 The first byte[] to compare
|
||||
* @param bytes2 The second byte[] to compare
|
||||
* @return The number of common elements.
|
||||
*/
|
||||
public static final int bytesDifference(byte[] bytes1, int len1, byte[] bytes2, int len2) {
|
||||
int len = len1 < len2 ? len1 : len2;
|
||||
for (int i = 0; i < len; i++)
|
||||
if (bytes1[i] != bytes2[i])
|
||||
return i;
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two strings, character by character, and returns the
|
||||
* first position where the two strings differ from one another.
|
||||
|
@ -45,7 +61,6 @@ public abstract class StringHelper {
|
|||
return len;
|
||||
}
|
||||
|
||||
|
||||
private StringHelper() {
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,447 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* Some of this code came from the excellent Unicode
|
||||
* conversion examples from:
|
||||
*
|
||||
* http://www.unicode.org/Public/PROGRAMS/CVTUTF
|
||||
*
|
||||
* Full Copyright for that code follows:
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright 2001-2004 Unicode, Inc.
|
||||
*
|
||||
* Disclaimer
|
||||
*
|
||||
* This source code is provided as is by Unicode, Inc. No claims are
|
||||
* made as to fitness for any particular purpose. No warranties of any
|
||||
* kind are expressed or implied. The recipient agrees to determine
|
||||
* applicability of information provided. If this file has been
|
||||
* purchased on magnetic or optical media from Unicode, Inc., the
|
||||
* sole remedy for any claim will be exchange of defective media
|
||||
* within 90 days of receipt.
|
||||
*
|
||||
* Limitations on Rights to Redistribute This Code
|
||||
*
|
||||
* Unicode, Inc. hereby grants the right to freely use the information
|
||||
* supplied in this file in the creation of products supporting the
|
||||
* Unicode Standard, and to make copies of this file in any form
|
||||
* for internal or external distribution as long as this notice
|
||||
* remains attached.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Class to encode java's UTF16 char[] into UTF8 byte[]
|
||||
* without always allocating a new byte[] as
|
||||
* String.getBytes("UTF-8") does.
|
||||
*
|
||||
* <p><b>WARNING</b>: This API is new and experimental and
|
||||
* may suddenly change. </p>
|
||||
*/
|
||||
|
||||
final public class UnicodeUtil {

  /** First UTF-16 high (leading) surrogate code unit. */
  public static final int UNI_SUR_HIGH_START = 0xD800;
  /** Last UTF-16 high (leading) surrogate code unit. */
  public static final int UNI_SUR_HIGH_END = 0xDBFF;
  /** First UTF-16 low (trailing) surrogate code unit. */
  public static final int UNI_SUR_LOW_START = 0xDC00;
  /** Last UTF-16 low (trailing) surrogate code unit. */
  public static final int UNI_SUR_LOW_END = 0xDFFF;
  /** Code point written in place of an unpaired surrogate. */
  public static final int UNI_REPLACEMENT_CHAR = 0xFFFD;

  /** Largest code point that fits in a single UTF-16 code unit. */
  private static final int UNI_MAX_BMP = 0x0000FFFF;

  /** Subtracted from a supplementary code point before it is split into surrogates. */
  private static final int HALF_BASE = 0x0010000;
  /** Number of payload bits carried by each surrogate. */
  private static final int HALF_SHIFT = 10;
  /** Mask selecting the payload bits of the low surrogate. */
  private static final int HALF_MASK = 0x3FF;

  /** Reusable output buffer for the UTF16toUTF8 methods: the
   *  encoded bytes are result[0..length-1]. */
  public static final class UTF8Result {
    public byte[] result = new byte[10];
    public int length;

    /** Sets length to newLength, first growing result (and
     *  keeping its first length bytes) if it is too small. */
    public void setLength(int newLength) {
      if (result.length < newLength) {
        byte[] newArray = new byte[(int) (1.5*newLength)];
        System.arraycopy(result, 0, newArray, 0, length);
        result = newArray;
      }
      length = newLength;
    }
  }

  /** Reusable output buffer for UTF8toUTF16: the decoded chars
   *  are result[0..length-1].  offsets[i] is the index in
   *  result of the character whose encoding starts at UTF8
   *  byte i, or -1 if byte i is a continuation byte. */
  public static final class UTF16Result {
    public char[] result = new char[10];
    public int[] offsets = new int[10];
    public int length;

    /** Sets length to newLength, first growing result (and
     *  keeping its first length chars) if it is too small.
     *  The offsets array is not resized here. */
    public void setLength(int newLength) {
      if (result.length < newLength) {
        char[] newArray = new char[(int) (1.5*newLength)];
        System.arraycopy(result, 0, newArray, 0, length);
        result = newArray;
      }
      length = newLength;
    }

    /** Copies the chars and length (but not the offsets) of
     *  other into this result. */
    public void copyText(UTF16Result other) {
      setLength(other.length);
      System.arraycopy(other.result, 0, result, 0, length);
    }
  }

  /** Replaces result.result with a larger copy holding the
   *  first upto bytes of out, and returns the new array.  The
   *  new array always has room for 4 more bytes (the longest
   *  single UTF8 sequence), even if out was shorter than 4
   *  bytes, where plain doubling would not be enough. */
  private static byte[] grow(UTF8Result result, byte[] out, int upto) {
    final byte[] newOut = new byte[Math.max(2*out.length, upto+4)];
    System.arraycopy(out, 0, newOut, 0, upto);
    result.result = newOut;
    return newOut;
  }

  /** Returns true if code is a low (trailing) surrogate. */
  private static boolean isLowSurrogate(final int code) {
    return code >= UNI_SUR_LOW_START && code <= UNI_SUR_LOW_END;
  }

  /** Writes the 1, 2 or 3 byte UTF8 form of the non-surrogate
   *  code unit code at out[upto] and returns the new upto. */
  private static int writeBMP(final int code, final byte[] out, int upto) {
    if (code < 0x80)
      out[upto++] = (byte) code;
    else if (code < 0x800) {
      out[upto++] = (byte) (0xC0 | (code >> 6));
      out[upto++] = (byte) (0x80 | (code & 0x3F));
    } else {
      out[upto++] = (byte) (0xE0 | (code >> 12));
      out[upto++] = (byte) (0x80 | ((code >> 6) & 0x3F));
      out[upto++] = (byte) (0x80 | (code & 0x3F));
    }
    return upto;
  }

  /** Writes the 4 byte UTF8 form of the surrogate pair
   *  (high, low) at out[upto] and returns the new upto. */
  private static int writePair(final int high, final int low, final byte[] out, int upto) {
    // 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the removal of
    // the surrogate base and the addition of 0x10000 into one step
    final int utf32 = ((high - 0xD7C0) << 10) + (low & 0x3FF);
    out[upto++] = (byte) (0xF0 | (utf32 >> 18));
    out[upto++] = (byte) (0x80 | ((utf32 >> 12) & 0x3F));
    out[upto++] = (byte) (0x80 | ((utf32 >> 6) & 0x3F));
    out[upto++] = (byte) (0x80 | (utf32 & 0x3F));
    return upto;
  }

  /** Writes the UTF8 form of U+FFFD (EF BF BD) at out[upto]
   *  and returns the new upto. */
  private static int writeReplacement(final byte[] out, int upto) {
    out[upto++] = (byte) 0xEF;
    out[upto++] = (byte) 0xBF;
    out[upto++] = (byte) 0xBD;
    return upto;
  }

  /** Encode characters from a char[] source, starting at
   *  offset and stopping when the character 0xffff is seen
   *  (the terminator itself is not encoded).  The bytes are
   *  written to result.result and their count to
   *  result.length; unpaired surrogates are replaced by
   *  U+FFFD. */
  public static void UTF16toUTF8(final char[] source, final int offset, UTF8Result result) {

    int upto = 0;
    int i = offset;
    byte[] out = result.result;

    while(true) {

      final int code = (int) source[i++];

      if (upto+4 > out.length)
        out = grow(result, out, upto);

      if (code == 0xffff)
        // END
        break;

      if (code < UNI_SUR_HIGH_START || code > UNI_SUR_LOW_END)
        upto = writeBMP(code, out, upto);
      else if (code <= UNI_SUR_HIGH_END && isLowSurrogate(source[i]))
        // valid pair; a terminator following a high surrogate
        // is not a low surrogate, so it is left for the next
        // iteration to see
        upto = writePair(code, source[i++], out, upto);
      else
        // unpaired or out-of-order surrogate
        upto = writeReplacement(out, upto);
    }
    result.length = upto;
  }

  /** Encode characters from a char[] source, starting at
   *  offset for length chars.  The bytes are written to
   *  result.result and their count to result.length;
   *  unpaired surrogates are replaced by U+FFFD. */
  public static void UTF16toUTF8(final char[] source, final int offset, final int length, UTF8Result result) {

    int upto = 0;
    int i = offset;
    final int end = offset + length;
    byte[] out = result.result;

    while(i < end) {

      final int code = (int) source[i++];

      if (upto+4 > out.length)
        out = grow(result, out, upto);

      if (code < UNI_SUR_HIGH_START || code > UNI_SUR_LOW_END)
        upto = writeBMP(code, out, upto);
      else if (code <= UNI_SUR_HIGH_END && i < end && isLowSurrogate(source[i]))
        upto = writePair(code, source[i++], out, upto);
      else
        // unpaired or out-of-order surrogate
        upto = writeReplacement(out, upto);
    }
    result.length = upto;
  }

  /** Encode characters from this String, starting at offset
   *  for length characters.  The bytes are written to
   *  result.result and their count to result.length;
   *  unpaired surrogates are replaced by U+FFFD. */
  public static void UTF16toUTF8(final String s, final int offset, final int length, UTF8Result result) {
    final int end = offset + length;

    byte[] out = result.result;

    int upto = 0;
    for(int i=offset;i<end;i++) {
      final int code = (int) s.charAt(i);

      if (upto+4 > out.length)
        out = grow(result, out, upto);

      if (code < UNI_SUR_HIGH_START || code > UNI_SUR_LOW_END)
        upto = writeBMP(code, out, upto);
      else if (code <= UNI_SUR_HIGH_END && i < end-1 && isLowSurrogate(s.charAt(i+1)))
        upto = writePair(code, s.charAt(++i), out, upto);
      else
        // unpaired or out-of-order surrogate
        upto = writeReplacement(out, upto);
    }
    result.length = upto;
  }

  /** Convert UTF8 bytes into UTF16 characters.  If offset
   *  is non-zero, conversion starts at that starting point
   *  in utf8, re-using the results from the previous call
   *  up until offset.  The chars are written to
   *  result.result, their count to result.length, and
   *  result.offsets is updated for every byte decoded.  The
   *  input is not validated beyond the assertions below. */
  public static void UTF8toUTF16(final byte[] utf8, final int offset, final int length, final UTF16Result result) {

    final int end = offset + length;
    char[] out = result.result;
    // offsets needs one slot per byte plus one for the end marker
    if (result.offsets.length <= end) {
      int[] newOffsets = new int[2*end];
      System.arraycopy(result.offsets, 0, newOffsets, 0, result.offsets.length);
      result.offsets = newOffsets;
    }
    final int[] offsets = result.offsets;

    // If incremental decoding fell in the middle of a
    // single unicode character, rollback to its start:
    int upto = offset;
    while(offsets[upto] == -1)
      upto--;

    int outUpto = offsets[upto];

    // Pre-allocate for worst case 1-for-1
    if (outUpto+length >= out.length) {
      char[] newOut = new char[2*(outUpto+length)];
      System.arraycopy(out, 0, newOut, 0, outUpto);
      result.result = out = newOut;
    }

    while (upto < end) {

      final int b = utf8[upto]&0xff;
      final int ch;

      // lead byte maps to the next output slot; continuation
      // bytes are marked -1 below
      offsets[upto++] = outUpto;

      if (b < 0xc0) {
        assert b < 0x80;
        ch = b;
      } else if (b < 0xe0) {
        ch = ((b&0x1f)<<6) + (utf8[upto]&0x3f);
        offsets[upto++] = -1;
      } else if (b < 0xf0) {
        ch = ((b&0xf)<<12) + ((utf8[upto]&0x3f)<<6) + (utf8[upto+1]&0x3f);
        offsets[upto++] = -1;
        offsets[upto++] = -1;
      } else {
        assert b < 0xf8;
        ch = ((b&0x7)<<18) + ((utf8[upto]&0x3f)<<12) + ((utf8[upto+1]&0x3f)<<6) + (utf8[upto+2]&0x3f);
        offsets[upto++] = -1;
        offsets[upto++] = -1;
        offsets[upto++] = -1;
      }

      if (ch <= UNI_MAX_BMP) {
        // target is a character <= 0xFFFF
        out[outUpto++] = (char) ch;
      } else {
        // target is a character in range 0xFFFF - 0x10FFFF
        final int chHalf = ch - HALF_BASE;
        out[outUpto++] = (char) ((chHalf >> HALF_SHIFT) + UNI_SUR_HIGH_START);
        out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
      }
    }

    // end marker, so a later incremental call may start at end
    offsets[upto] = outUpto;
    result.length = outUpto;
  }
}
|
|
@ -736,10 +736,7 @@
|
|||
|
||||
<p>
|
||||
Lucene writes unicode
|
||||
character sequences using Java's
|
||||
<a href="http://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8">"modified
|
||||
UTF-8 encoding"</a>
|
||||
.
|
||||
character sequences as UTF-8 encoded bytes.
|
||||
</p>
|
||||
|
||||
|
||||
|
@ -748,8 +745,9 @@
|
|||
<section id="String"><title>String</title>
|
||||
|
||||
<p>
|
||||
Lucene writes strings as a VInt representing the length, followed by
|
||||
the character data.
|
||||
Lucene writes strings as UTF-8 encoded bytes.
|
||||
First the length, in bytes, is written as a VInt,
|
||||
followed by the bytes.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
|
@ -1233,10 +1231,12 @@
|
|||
<br/>
|
||||
--> VInt
|
||||
</p>
|
||||
<p>This
|
||||
file is sorted by Term. Terms are ordered first lexicographically
|
||||
by the term's field name, and within that lexicographically by the
|
||||
term's text.
|
||||
<p>
|
||||
This file is sorted by Term. Terms are
|
||||
ordered first lexicographically (by UTF-16
|
||||
character code) by the term's field name,
|
||||
and within that lexicographically (by
|
||||
UTF-16 character code) by the term's text.
|
||||
</p>
|
||||
<p>TIVersion names the version of the format
|
||||
of this file and is -2 in Lucene 1.4.
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Enumeration;
|
||||
import java.util.zip.ZipFile;
|
||||
import java.util.zip.ZipEntry;
|
||||
|
@ -39,6 +40,7 @@ import org.apache.lucene.store.Directory;
|
|||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
/*
|
||||
Verify we can read the pre-2.1 file format, do searches
|
||||
|
@ -131,7 +133,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
|
|||
for(int i=0;i<oldNames.length;i++) {
|
||||
String dirName = "src/test/org/apache/lucene/index/index." + oldNames[i];
|
||||
unzip(dirName, oldNames[i]);
|
||||
searchIndex(oldNames[i]);
|
||||
searchIndex(oldNames[i], oldNames[i]);
|
||||
rmDir(oldNames[i]);
|
||||
}
|
||||
}
|
||||
|
@ -171,7 +173,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
|
|||
}
|
||||
}
|
||||
|
||||
public void searchIndex(String dirName) throws IOException {
|
||||
public void searchIndex(String dirName, String oldName) throws IOException {
|
||||
//QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
|
||||
//Query query = parser.parse("handle:1");
|
||||
|
||||
|
@ -179,6 +181,29 @@ public class TestBackwardsCompatibility extends LuceneTestCase
|
|||
|
||||
Directory dir = FSDirectory.getDirectory(dirName);
|
||||
IndexSearcher searcher = new IndexSearcher(dir);
|
||||
IndexReader reader = searcher.getIndexReader();
|
||||
|
||||
_TestUtil.checkIndex(dir);
|
||||
|
||||
for(int i=0;i<35;i++) {
|
||||
if (!reader.isDeleted(i)) {
|
||||
Document d = reader.document(i);
|
||||
List fields = d.getFields();
|
||||
if (oldName.startsWith("23.")) {
|
||||
assertEquals(3, fields.size());
|
||||
Field f = (Field) d.getField("id");
|
||||
assertEquals(""+i, f.stringValue());
|
||||
|
||||
f = (Field) d.getField("utf8");
|
||||
assertEquals("Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", f.stringValue());
|
||||
|
||||
f = (Field) d.getField("content2");
|
||||
assertEquals("here is more content with aaa aaa aaa", f.stringValue());
|
||||
}
|
||||
} else
|
||||
// Only ID 7 is deleted
|
||||
assertEquals(7, i);
|
||||
}
|
||||
|
||||
Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
|
||||
|
||||
|
@ -189,6 +214,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase
|
|||
|
||||
testHits(hits, 34, searcher.getIndexReader());
|
||||
|
||||
if (oldName.startsWith("23.")) {
|
||||
hits = searcher.search(new TermQuery(new Term("utf8", "\u0000")));
|
||||
assertEquals(34, hits.length());
|
||||
hits = searcher.search(new TermQuery(new Term("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne")));
|
||||
assertEquals(34, hits.length());
|
||||
hits = searcher.search(new TermQuery(new Term("utf8", "ab\ud917\udc17cd")));
|
||||
assertEquals(34, hits.length());
|
||||
}
|
||||
|
||||
searcher.close();
|
||||
dir.close();
|
||||
}
|
||||
|
@ -421,6 +455,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase
|
|||
Document doc = new Document();
|
||||
doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
|
||||
doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED));
|
||||
doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
||||
doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
|
|
@ -24,16 +24,70 @@ import java.io.IOException;
|
|||
|
||||
public class TestIndexInput extends LuceneTestCase {
|
||||
public void testRead() throws IOException {
|
||||
IndexInput is = new MockIndexInput(new byte[]{(byte) 0x80, 0x01,
|
||||
IndexInput is = new MockIndexInput(new byte[] {
|
||||
(byte) 0x80, 0x01,
|
||||
(byte) 0xFF, 0x7F,
|
||||
(byte) 0x80, (byte) 0x80, 0x01,
|
||||
(byte) 0x81, (byte) 0x80, 0x01,
|
||||
0x06, 'L', 'u', 'c', 'e', 'n', 'e'});
|
||||
assertEquals(128, is.readVInt());
|
||||
assertEquals(16383, is.readVInt());
|
||||
assertEquals(16384, is.readVInt());
|
||||
assertEquals(16385, is.readVInt());
|
||||
assertEquals("Lucene", is.readString());
|
||||
0x06, 'L', 'u', 'c', 'e', 'n', 'e',
|
||||
|
||||
// 2-byte UTF-8 (U+00BF "INVERTED QUESTION MARK")
|
||||
0x02, (byte) 0xC2, (byte) 0xBF,
|
||||
0x0A, 'L', 'u', (byte) 0xC2, (byte) 0xBF,
|
||||
'c', 'e', (byte) 0xC2, (byte) 0xBF,
|
||||
'n', 'e',
|
||||
|
||||
// 3-byte UTF-8 (U+2620 "SKULL AND CROSSBONES")
|
||||
0x03, (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
|
||||
0x0C, 'L', 'u', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
|
||||
'c', 'e', (byte) 0xE2, (byte) 0x98, (byte) 0xA0,
|
||||
'n', 'e',
|
||||
|
||||
// surrogate pairs
|
||||
// (U+1D11E "MUSICAL SYMBOL G CLEF")
|
||||
// (U+1D160 "MUSICAL SYMBOL EIGHTH NOTE")
|
||||
0x04, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
|
||||
0x08, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
|
||||
(byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0,
|
||||
0x0E, 'L', 'u',
|
||||
(byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E,
|
||||
'c', 'e',
|
||||
(byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0,
|
||||
'n', 'e',
|
||||
|
||||
// null bytes
|
||||
0x01, 0x00,
|
||||
0x08, 'L', 'u', 0x00, 'c', 'e', 0x00, 'n', 'e',
|
||||
|
||||
// Modified UTF-8 null bytes
|
||||
0x02, (byte) 0xC0, (byte) 0x80,
|
||||
0x0A, 'L', 'u', (byte) 0xC0, (byte) 0x80,
|
||||
'c', 'e', (byte) 0xC0, (byte) 0x80,
|
||||
'n', 'e',
|
||||
|
||||
});
|
||||
|
||||
assertEquals(128,is.readVInt());
|
||||
assertEquals(16383,is.readVInt());
|
||||
assertEquals(16384,is.readVInt());
|
||||
assertEquals(16385,is.readVInt());
|
||||
assertEquals("Lucene",is.readString());
|
||||
|
||||
assertEquals("\u00BF",is.readString());
|
||||
assertEquals("Lu\u00BFce\u00BFne",is.readString());
|
||||
|
||||
assertEquals("\u2620",is.readString());
|
||||
assertEquals("Lu\u2620ce\u2620ne",is.readString());
|
||||
|
||||
assertEquals("\uD834\uDD1E",is.readString());
|
||||
assertEquals("\uD834\uDD1E\uD834\uDD60",is.readString());
|
||||
assertEquals("Lu\uD834\uDD1Ece\uD834\uDD60ne",is.readString());
|
||||
|
||||
assertEquals("\u0000",is.readString());
|
||||
assertEquals("Lu\u0000ce\u0000ne",is.readString());
|
||||
|
||||
assertEquals("\u0000",is.readString());
|
||||
assertEquals("Lu\u0000ce\u0000ne",is.readString());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -25,6 +25,7 @@ import java.util.ArrayList;
|
|||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
@ -3329,4 +3330,223 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
w.abort();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
final String[] utf8Data = new String[] {
|
||||
// unpaired low surrogate
|
||||
"ab\udc17cd", "ab\ufffdcd",
|
||||
"\udc17abcd", "\ufffdabcd",
|
||||
"\udc17", "\ufffd",
|
||||
"ab\udc17\udc17cd", "ab\ufffd\ufffdcd",
|
||||
"\udc17\udc17abcd", "\ufffd\ufffdabcd",
|
||||
"\udc17\udc17", "\ufffd\ufffd",
|
||||
|
||||
// unpaired high surrogate
|
||||
"ab\ud917cd", "ab\ufffdcd",
|
||||
"\ud917abcd", "\ufffdabcd",
|
||||
"\ud917", "\ufffd",
|
||||
"ab\ud917\ud917cd", "ab\ufffd\ufffdcd",
|
||||
"\ud917\ud917abcd", "\ufffd\ufffdabcd",
|
||||
"\ud917\ud917", "\ufffd\ufffd",
|
||||
|
||||
// backwards surrogates
|
||||
"ab\udc17\ud917cd", "ab\ufffd\ufffdcd",
|
||||
"\udc17\ud917abcd", "\ufffd\ufffdabcd",
|
||||
"\udc17\ud917", "\ufffd\ufffd",
|
||||
"ab\udc17\ud917\udc17\ud917cd", "ab\ufffd\ud917\udc17\ufffdcd",
|
||||
"\udc17\ud917\udc17\ud917abcd", "\ufffd\ud917\udc17\ufffdabcd",
|
||||
"\udc17\ud917\udc17\ud917", "\ufffd\ud917\udc17\ufffd"
|
||||
};
|
||||
|
||||
// LUCENE-510
|
||||
public void testInvalidUTF16() throws Throwable {
|
||||
MockRAMDirectory dir = new MockRAMDirectory();
|
||||
IndexWriter w = new IndexWriter(dir, false, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
|
||||
Document doc = new Document();
|
||||
|
||||
final int count = utf8Data.length/2;
|
||||
for(int i=0;i<count;i++)
|
||||
doc.add(new Field("f" + i, utf8Data[2*i], Field.Store.YES, Field.Index.TOKENIZED));
|
||||
w.addDocument(doc);
|
||||
w.close();
|
||||
|
||||
IndexReader ir = IndexReader.open(dir);
|
||||
Document doc2 = ir.document(0);
|
||||
for(int i=0;i<count;i++) {
|
||||
assertEquals("field " + i + " was not indexed correctly", 1, ir.docFreq(new Term("f"+i, utf8Data[2*i+1])));
|
||||
assertEquals("field " + i + " is incorrect", utf8Data[2*i+1], doc2.getField("f"+i).stringValue());
|
||||
}
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
// LUCENE-510
|
||||
public void testAllUnicodeChars() throws Throwable {
|
||||
|
||||
UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
|
||||
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
|
||||
char[] chars = new char[2];
|
||||
for(int ch=0;ch<0x0010FFFF;ch++) {
|
||||
|
||||
if (ch == 0xd800)
|
||||
// Skip invalid code points
|
||||
ch = 0xe000;
|
||||
|
||||
int len = 0;
|
||||
if (ch <= 0xffff) {
|
||||
chars[len++] = (char) ch;
|
||||
} else {
|
||||
chars[len++] = (char) (((ch-0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
|
||||
chars[len++] = (char) (((ch-0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
|
||||
}
|
||||
|
||||
UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);
|
||||
|
||||
String s1 = new String(chars, 0, len);
|
||||
String s2 = new String(utf8.result, 0, utf8.length, "UTF-8");
|
||||
assertEquals("codepoint " + ch, s1, s2);
|
||||
|
||||
UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
|
||||
assertEquals("codepoint " + ch, s1, new String(utf16.result, 0, utf16.length));
|
||||
|
||||
byte[] b = s1.getBytes("UTF-8");
|
||||
assertEquals(utf8.length, b.length);
|
||||
for(int j=0;j<utf8.length;j++)
|
||||
assertEquals(utf8.result[j], b[j]);
|
||||
}
|
||||
}
|
||||
|
||||
Random r = new Random();
|
||||
|
||||
// Returns the next value of r.nextInt(lim), i.e. a pseudo-random
// int from 0 (inclusive) up to lim (exclusive).
private int nextInt(int lim) {
|
||||
return r.nextInt(lim);
|
||||
}
|
||||
|
||||
// Returns a pseudo-random int where start is inclusive and end
// is exclusive, built on the single-argument nextInt.
private int nextInt(int start, int end) {
|
||||
return start + nextInt(end-start);
|
||||
}
|
||||
|
||||
private boolean fillUnicode(char[] buffer, char[] expected, int offset, int count) {
|
||||
final int len = offset + count;
|
||||
boolean hasIllegal = false;
|
||||
|
||||
if (offset > 0 && buffer[offset] >= 0xdc00 && buffer[offset] < 0xe000)
|
||||
// Don't start in the middle of a valid surrogate pair
|
||||
offset--;
|
||||
|
||||
for(int i=offset;i<len;i++) {
|
||||
int t = nextInt(6);
|
||||
if (0 == t && i < len-1) {
|
||||
// Make a surrogate pair
|
||||
// High surrogate
|
||||
expected[i] = buffer[i++] = (char) nextInt(0xd800, 0xdc00);
|
||||
// Low surrogate
|
||||
expected[i] = buffer[i] = (char) nextInt(0xdc00, 0xe000);
|
||||
} else if (t <= 1)
|
||||
expected[i] = buffer[i] = (char) nextInt(0x80);
|
||||
else if (2 == t)
|
||||
expected[i] = buffer[i] = (char) nextInt(0x80, 0x800);
|
||||
else if (3 == t)
|
||||
expected[i] = buffer[i] = (char) nextInt(0x800, 0xd800);
|
||||
else if (4 == t)
|
||||
expected[i] = buffer[i] = (char) nextInt(0xe000, 0xffff);
|
||||
else if (5 == t && i < len-1) {
|
||||
// Illegal unpaired surrogate
|
||||
if (nextInt(10) == 7) {
|
||||
if (r.nextBoolean())
|
||||
buffer[i] = (char) nextInt(0xd800, 0xdc00);
|
||||
else
|
||||
buffer[i] = (char) nextInt(0xdc00, 0xe000);
|
||||
expected[i++] = 0xfffd;
|
||||
expected[i] = buffer[i] = (char) nextInt(0x800, 0xd800);
|
||||
hasIllegal = true;
|
||||
} else
|
||||
expected[i] = buffer[i] = (char) nextInt(0x800, 0xd800);
|
||||
} else {
|
||||
expected[i] = buffer[i] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
return hasIllegal;
|
||||
}
|
||||
|
||||
// LUCENE-510
|
||||
public void testRandomUnicodeStrings() throws Throwable {
|
||||
|
||||
char[] buffer = new char[20];
|
||||
char[] expected = new char[20];
|
||||
|
||||
UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
|
||||
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
|
||||
|
||||
for(int iter=0;iter<100000;iter++) {
|
||||
boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);
|
||||
|
||||
UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
|
||||
if (!hasIllegal) {
|
||||
byte[] b = new String(buffer, 0, 20).getBytes("UTF-8");
|
||||
assertEquals(b.length, utf8.length);
|
||||
for(int i=0;i<b.length;i++)
|
||||
assertEquals(b[i], utf8.result[i]);
|
||||
}
|
||||
|
||||
UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
|
||||
assertEquals(utf16.length, 20);
|
||||
for(int i=0;i<20;i++)
|
||||
assertEquals(expected[i], utf16.result[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// LUCENE-510
|
||||
public void testIncrementalUnicodeStrings() throws Throwable {
|
||||
char[] buffer = new char[20];
|
||||
char[] expected = new char[20];
|
||||
|
||||
UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
|
||||
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
|
||||
UnicodeUtil.UTF16Result utf16a = new UnicodeUtil.UTF16Result();
|
||||
|
||||
boolean hasIllegal = false;
|
||||
byte[] last = new byte[60];
|
||||
|
||||
for(int iter=0;iter<100000;iter++) {
|
||||
|
||||
final int prefix;
|
||||
|
||||
if (iter == 0 || hasIllegal)
|
||||
prefix = 0;
|
||||
else
|
||||
prefix = nextInt(20);
|
||||
|
||||
hasIllegal = fillUnicode(buffer, expected, prefix, 20-prefix);
|
||||
|
||||
UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
|
||||
if (!hasIllegal) {
|
||||
byte[] b = new String(buffer, 0, 20).getBytes("UTF-8");
|
||||
assertEquals(b.length, utf8.length);
|
||||
for(int i=0;i<b.length;i++)
|
||||
assertEquals(b[i], utf8.result[i]);
|
||||
}
|
||||
|
||||
int bytePrefix = 20;
|
||||
if (iter == 0 || hasIllegal)
|
||||
bytePrefix = 0;
|
||||
else
|
||||
for(int i=0;i<20;i++)
|
||||
if (last[i] != utf8.result[i]) {
|
||||
bytePrefix = i;
|
||||
break;
|
||||
}
|
||||
System.arraycopy(utf8.result, 0, last, 0, utf8.length);
|
||||
|
||||
UnicodeUtil.UTF8toUTF16(utf8.result, bytePrefix, utf8.length-bytePrefix, utf16);
|
||||
assertEquals(20, utf16.length);
|
||||
for(int i=0;i<20;i++)
|
||||
assertEquals(expected[i], utf16.result[i]);
|
||||
|
||||
UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16a);
|
||||
assertEquals(20, utf16a.length);
|
||||
for(int i=0;i<20;i++)
|
||||
assertEquals(expected[i], utf16a.result[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -415,8 +415,56 @@ public class TestStressIndexing2 extends LuceneTestCase {
|
|||
return r.nextInt(lim);
|
||||
}
|
||||
|
||||
// Returns a pseudo-random int taken from r.
// start is inclusive and end is exclusive
|
||||
public int nextInt(int start, int end) {
|
||||
return start + r.nextInt(end-start);
|
||||
}
|
||||
|
||||
char[] buffer = new char[100];
|
||||
|
||||
private int addUTF8Token(int start) {
|
||||
final int end = start + nextInt(20);
|
||||
if (buffer.length < 1+end) {
|
||||
char[] newBuffer = new char[(int) ((1+end)*1.25)];
|
||||
System.arraycopy(buffer, 0, newBuffer, 0, buffer.length);
|
||||
buffer = newBuffer;
|
||||
}
|
||||
|
||||
for(int i=start;i<end;i++) {
|
||||
int t = nextInt(6);
|
||||
if (0 == t && i < end-1) {
|
||||
// Make a surrogate pair
|
||||
// High surrogate
|
||||
buffer[i++] = (char) nextInt(0xd800, 0xdc00);
|
||||
// Low surrogate
|
||||
buffer[i] = (char) nextInt(0xdc00, 0xe000);
|
||||
} else if (t <= 1)
|
||||
buffer[i] = (char) nextInt(0x80);
|
||||
else if (2 == t)
|
||||
buffer[i] = (char) nextInt(0x80, 0x800);
|
||||
else if (3 == t)
|
||||
buffer[i] = (char) nextInt(0x800, 0xd800);
|
||||
else if (4 == t)
|
||||
buffer[i] = (char) nextInt(0xe000, 0xffff);
|
||||
else if (5 == t) {
|
||||
// Illegal unpaired surrogate
|
||||
if (r.nextBoolean())
|
||||
buffer[i] = (char) nextInt(0xd800, 0xdc00);
|
||||
else
|
||||
buffer[i] = (char) nextInt(0xdc00, 0xe000);
|
||||
}
|
||||
}
|
||||
buffer[end] = ' ';
|
||||
return 1+end;
|
||||
}
|
||||
|
||||
public String getString(int nTokens) {
|
||||
nTokens = nTokens!=0 ? nTokens : r.nextInt(4)+1;
|
||||
|
||||
// Half the time make a random UTF8 string
|
||||
if (r.nextBoolean())
|
||||
return getUTF8String(nTokens);
|
||||
|
||||
// avoid StringBuffer because it adds extra synchronization.
|
||||
char[] arr = new char[nTokens*2];
|
||||
for (int i=0; i<nTokens; i++) {
|
||||
|
@ -426,6 +474,14 @@ public class TestStressIndexing2 extends LuceneTestCase {
|
|||
return new String(arr);
|
||||
}
|
||||
|
||||
public String getUTF8String(int nTokens) {
|
||||
int upto = 0;
|
||||
Arrays.fill(buffer, (char) 0);
|
||||
for(int i=0;i<nTokens;i++)
|
||||
upto = addUTF8Token(upto);
|
||||
return new String(buffer, 0, upto);
|
||||
}
|
||||
|
||||
// Returns base + nextInt(range) formatted as a decimal string.
// NOTE(review): base and range are declared outside this view;
// presumably the id falls in [base, base+range) -- confirm.
public String getIdString() {
|
||||
return Integer.toString(base + nextInt(range));
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue