lucene/sandbox/contributions/webcrawler-LARM/doc/webcrawler_tech_overview.rtf

857 lines
156 KiB
Plaintext

{\rtf1\ansi\ansicpg1252\uc1 \deff0\deflang1031\deflangfe1031{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
{\f2\fmodern\fcharset0\fprq1{\*\panose 02070309020205020404}Courier New;}{\f3\froman\fcharset2\fprq2{\*\panose 05050102010706020507}Symbol;}{\f14\fnil\fcharset2\fprq2{\*\panose 05000000000000000000}Wingdings;}
{\f29\fswiss\fcharset0\fprq2{\*\panose 020b0603020202020204}Trebuchet MS;}{\f31\froman\fcharset0\fprq2{\*\panose 00000000000000000000}Palatino{\*\falt Book Antiqua};}{\f165\froman\fcharset238\fprq2 Times New Roman CE;}
{\f166\froman\fcharset204\fprq2 Times New Roman Cyr;}{\f168\froman\fcharset161\fprq2 Times New Roman Greek;}{\f169\froman\fcharset162\fprq2 Times New Roman Tur;}{\f170\froman\fcharset177\fprq2 Times New Roman (Hebrew);}
{\f171\froman\fcharset178\fprq2 Times New Roman (Arabic);}{\f172\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f173\fswiss\fcharset238\fprq2 Arial CE;}{\f174\fswiss\fcharset204\fprq2 Arial Cyr;}{\f176\fswiss\fcharset161\fprq2 Arial Greek;}
{\f177\fswiss\fcharset162\fprq2 Arial Tur;}{\f178\fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f179\fswiss\fcharset178\fprq2 Arial (Arabic);}{\f180\fswiss\fcharset186\fprq2 Arial Baltic;}{\f181\fmodern\fcharset238\fprq1 Courier New CE;}
{\f182\fmodern\fcharset204\fprq1 Courier New Cyr;}{\f184\fmodern\fcharset161\fprq1 Courier New Greek;}{\f185\fmodern\fcharset162\fprq1 Courier New Tur;}{\f186\fmodern\fcharset177\fprq1 Courier New (Hebrew);}
{\f187\fmodern\fcharset178\fprq1 Courier New (Arabic);}{\f188\fmodern\fcharset186\fprq1 Courier New Baltic;}{\f397\fswiss\fcharset238\fprq2 Trebuchet MS CE;}{\f401\fswiss\fcharset162\fprq2 Trebuchet MS Tur;}}{\colortbl;\red0\green0\blue0;
\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;
\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;\red255\green255\blue255;}{\stylesheet{\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \snext0 Normal;}{\s1\ql \fi-432\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\adjustright\rin0\lin0\itap0
\b\fs36\lang2057\langfe1031\kerning28\cgrid\langnp2057\langfenp1031 \sbasedon0 \snext0 heading 1;}{\s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\adjustright\rin0\lin0\itap0
\b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 \sbasedon0 \snext0 heading 2;}{\s3\ql \fi-720\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl2\adjustright\rin0\lin0\itap0
\fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 heading 3;}{\s4\ql \fi-864\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl3\adjustright\rin0\lin0\itap0
\f29\fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 heading 4;}{\s5\ql \fi-1008\li1008\ri0\sb240\sa60\widctlpar\jclisttab\tx1008\aspalpha\aspnum\faauto\ls8\ilvl4\adjustright\rin0\lin1008\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 heading 5;}{\s6\ql \fi-1152\li1152\ri0\sb240\sa60\widctlpar\jclisttab\tx1152\aspalpha\aspnum\faauto\ls8\ilvl5\adjustright\rin0\lin1152\itap0
\i\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 heading 6;}{\s7\ql \fi-1296\li1296\ri0\sb240\sa60\widctlpar\jclisttab\tx1296\aspalpha\aspnum\faauto\ls8\ilvl6\adjustright\rin0\lin1296\itap0
\f1\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 heading 7;}{\s8\ql \fi-1440\li1440\ri0\sb240\sa60\widctlpar\jclisttab\tx1440\aspalpha\aspnum\faauto\ls8\ilvl7\adjustright\rin0\lin1440\itap0
\i\f1\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 heading 8;}{\s9\ql \fi-1584\li1584\ri0\sb240\sa60\widctlpar\jclisttab\tx1584\aspalpha\aspnum\faauto\ls8\ilvl8\adjustright\rin0\lin1584\itap0
\b\i\f1\fs18\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 heading 9;}{\*\cs10 \additive Default Paragraph Font;}{\s15\ql \li0\ri0\sa60\widctlpar\tqc\tx4536\tqr\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f1\fs16\lang1031\langfe1031\langnp1031\langfenp1031 \sbasedon0 \snext15 header;}{\s16\ql \li0\ri0\sl-240\slmult0\widctlpar\tqc\tx4536\tqr\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f1\fs16\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext16 footer;}{\*\cs17 \additive \b \sbasedon10 Strong;}{\*\cs18 \additive \i \sbasedon10 Emphasis;}{\s19\ql \li0\ri0\sa120\widctlpar
\tqc\tx4536\tqr\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f1\fs16\lang1031\langfe1031\langnp1031\langfenp1031 \sbasedon15 \snext19 Fu\'dfzeile 1;}{\s20\ql \li0\ri0\sl-240\slmult0\widctlpar
\tqc\tx4536\tqr\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f1\fs12\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon16 \snext20 Fu\'dfzeile 2;}{\s21\ql \li0\ri0\sa60\widctlpar
\tqc\tx4536\tqr\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f1\fs12\lang1031\langfe1031\langnp1031\langfenp1031 \sbasedon15 \snext21 Kopfzeile 2;}{\s22\ql \fi-357\li357\ri0\sa120\widctlpar
\jclisttab\tx360\aspalpha\aspnum\faauto\ls7\adjustright\rin0\lin357\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext22 Aufz\'e4hlung 1;}{\s23\ql \fi-357\li714\ri0\widctlpar
\jclisttab\tx720\aspalpha\aspnum\faauto\ls9\ilvl1\adjustright\rin0\lin714\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext23 Aufz\'e4hlung 2;}{\s24\ql \fi-357\li1077\ri0\widctlpar
\jclisttab\tx1080\aspalpha\aspnum\faauto\ls9\ilvl2\adjustright\rin0\lin1077\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext24 Aufz\'e4hlung 3;}{\s25\ql \fi-360\li1440\ri0\widctlpar
\jclisttab\tx1440\aspalpha\aspnum\faauto\ls7\ilvl3\adjustright\rin0\lin1440\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext25 Aufz\'e4hlung 4;}{\s26\ql \li0\ri0\sl360\slmult1\widctlpar
\tx440\tqr\tldot\tx9062\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 \sbasedon0 \snext0 \sautoupd toc 1;}{\s27\ql \li220\ri0\sl360\slmult1\widctlpar
\tx660\tx880\tqr\tldot\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin220\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 \sbasedon0 \snext0 \sautoupd toc 2;}{\s28\ql \li440\ri0\sl360\slmult1\widctlpar
\tx1100\tx1320\tqr\tldot\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin440\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 \sbasedon0 \snext0 \sautoupd toc 3;}{
\s29\ql \li660\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin660\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 \sautoupd toc 4;}{
\s30\ql \li880\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin880\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 \sautoupd toc 5;}{
\s31\ql \li1100\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin1100\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 \sautoupd toc 6;}{
\s32\ql \li1320\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin1320\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 \sautoupd toc 7;}{
\s33\ql \li1540\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin1540\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 \sautoupd toc 8;}{
\s34\ql \li1760\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin1760\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext0 \sautoupd toc 9;}{
\s35\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \b\fs40\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext35 Inhalt-\'dcberschrift;}{
\s36\ql \fi426\li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext36 Body Text 2;}{
\s37\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \b\fs56\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext37 Dokumenten-Titel;}{\s38\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs20\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext38 footnote text;}{\*\cs39 \additive \super \sbasedon10 footnote reference;}{\s40\ql \fi-360\li360\ri0\widctlpar\jclisttab\tx360{\*\pn \pnlvlbody\ilvl0\ls13\pnrnot0\pndec }
\aspalpha\aspnum\faauto\ls13\adjustright\rin0\lin360\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \sbasedon0 \snext40 \sautoupd List Bullet;}{\*\cs41 \additive \ul\cf2 \sbasedon10 Hyperlink;}{\*\cs42 \additive \ul\cf12 \sbasedon10
FollowedHyperlink;}}{\*\listtable{\list\listtemplateid1288180088\listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1
\chshdng0\chcfpat1\chcbpat1\fbias0 \s40\fi-360\li360\jclisttab\tx360 }{\listname ;}\listid-119}{\list\listtemplateid2025747750\listhybrid{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat9230\levelspace0\levelindent0
{\leveltext\leveltemplateid426543360\'01-;}{\levelnumbers;}\loch\af0\hich\af0\dbch\af0\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li720\jclisttab\tx720 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0
\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567619\'01o;}{\levelnumbers;}\f2\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li1440\jclisttab\tx1440 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0
\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567621\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2160\jclisttab\tx2160 }{\listlevel\levelnfc23\levelnfcn23
\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567617\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2880\jclisttab\tx2880 }{\listlevel\levelnfc23
\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567619\'01o;}{\levelnumbers;}\f2\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li3600\jclisttab\tx3600 }{\listlevel
\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567621\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li4320
\jclisttab\tx4320 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567617\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0
\fi-360\li5040\jclisttab\tx5040 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567619\'01o;}{\levelnumbers;}\f2\chbrdr\brdrnone\brdrcf1
\chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li5760\jclisttab\tx5760 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567621\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr
\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li6480\jclisttab\tx6480 }{\listname ;}\listid77531085}{\list\listtemplateid67567617\listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0
\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid130220302}{\list\listtemplateid1247607680{\listlevel\levelnfc23\levelnfcn23\leveljc0
\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3978 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \s22\fi-360\li360\jclisttab\tx360 }{\listlevel\levelnfc23\levelnfcn23\leveljc0
\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3880 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \s23\fi-360\li720\jclisttab\tx720 }{\listlevel\levelnfc23\levelnfcn23\leveljc0
\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \s24\fi-360\li1080\jclisttab\tx1080 }{\listlevel\levelnfc23\levelnfcn23\leveljc0
\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \s25\fi-360\li1440\jclisttab\tx1440 }{\listlevel\levelnfc23\levelnfcn23\leveljc0
\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3928 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li1800\jclisttab\tx1800 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0
\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3880 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2160\jclisttab\tx2160 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0
\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2520\jclisttab\tx2520 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0
\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2880\jclisttab\tx2880 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0
\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3928 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li3240\jclisttab\tx3240 }{\listname ;}\listid163085644}{\list\listtemplateid1464243652
\listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3928 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li360\jclisttab\tx360 }
{\listname ;}\listid278416750}{\list\listtemplateid67567617\listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat0\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1
\chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid450631953}{\list\listtemplateid67567617\listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid907614837}{\list\listtemplateid1148328050{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0
\levelstartat1\levelspace0\levelindent0{\leveltext\'01\'00;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s1\fi-432\li432\jclisttab\tx432 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1
\levelspace0\levelindent0{\leveltext\'03\'00.\'01;}{\levelnumbers\'01\'03;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s2\fi-576\li576\jclisttab\tx576 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0
\levelindent0{\leveltext\'05\'00.\'01.\'02;}{\levelnumbers\'01\'03\'05;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s3\fi-720\li720\jclisttab\tx720 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0
\levelindent0{\leveltext\'07\'00.\'01.\'02.\'03;}{\levelnumbers\'01\'03\'05\'07;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s4\fi-864\li864\jclisttab\tx864 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1
\levelspace0\levelindent0{\leveltext\'09\'00.\'01.\'02.\'03.\'04;}{\levelnumbers\'01\'03\'05\'07\'09;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s5\fi-1008\li1008\jclisttab\tx1008 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0
\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'0b\'00.\'01.\'02.\'03.\'04.\'05;}{\levelnumbers\'01\'03\'05\'07\'09\'0b;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s6\fi-1152\li1152\jclisttab\tx1152 }{\listlevel\levelnfc0
\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'0d\'00.\'01.\'02.\'03.\'04.\'05.\'06;}{\levelnumbers\'01\'03\'05\'07\'09\'0b\'0d;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s7\fi-1296\li1296
\jclisttab\tx1296 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'0f\'00.\'01.\'02.\'03.\'04.\'05.\'06.\'07;}{\levelnumbers\'01\'03\'05\'07\'09\'0b\'0d\'0f;}\chbrdr\brdrnone\brdrcf1
\chshdng0\chcfpat1\chcbpat1 \s8\fi-1440\li1440\jclisttab\tx1440 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'11\'00.\'01.\'02.\'03.\'04.\'05.\'06.\'07.\'08;}{\levelnumbers
\'01\'03\'05\'07\'09\'0b\'0d\'0f\'11;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s9\fi-1584\li1584\jclisttab\tx1584 }{\listname ;}\listid983581600}{\list\listtemplateid1464243652\listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0
\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3928 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid1104501034}{\list\listtemplateid-748938384
\listhybrid{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat0\levelspace0\levelindent0{\leveltext\leveltemplateid426543360\'01-;}{\levelnumbers;}\loch\af0\hich\af0\dbch\af0\chbrdr\brdrnone\brdrcf1
\chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li720\jclisttab\tx720 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567619\'01o;}{\levelnumbers;}\f2\chbrdr
\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li1440\jclisttab\tx1440 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567621\'01\u-3929 ?;}{\levelnumbers
;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2160\jclisttab\tx2160 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567617
\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2880\jclisttab\tx2880 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\leveltemplateid67567619\'01o;}{\levelnumbers;}\f2\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li3600\jclisttab\tx3600 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0
{\leveltext\leveltemplateid67567621\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li4320\jclisttab\tx4320 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0
\levelindent0{\leveltext\leveltemplateid67567617\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li5040\jclisttab\tx5040 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1
\levelspace0\levelindent0{\leveltext\leveltemplateid67567619\'01o;}{\levelnumbers;}\f2\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li5760\jclisttab\tx5760 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0
\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567621\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li6480\jclisttab\tx6480 }{\listname ;}\listid1374885547}
{\list\listtemplateid2068468094\listhybrid{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid-1418546572\'02\'00.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1
\chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li720\jclisttab\tx720 }{\listlevel\levelnfc4\levelnfcn4\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567641\'02\'01.;}{\levelnumbers\'01;}\chbrdr
\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-360\li1440\jclisttab\tx1440 }{\listlevel\levelnfc2\levelnfcn2\leveljc2\leveljcn2\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567643\'02\'02.;}{\levelnumbers\'01;}\chbrdr
\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-180\li2160\jclisttab\tx2160 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567631\'02\'03.;}{\levelnumbers\'01;}\chbrdr
\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-360\li2880\jclisttab\tx2880 }{\listlevel\levelnfc4\levelnfcn4\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567641\'02\'04.;}{\levelnumbers\'01;}\chbrdr
\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-360\li3600\jclisttab\tx3600 }{\listlevel\levelnfc2\levelnfcn2\leveljc2\leveljcn2\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567643\'02\'05.;}{\levelnumbers\'01;}\chbrdr
\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-180\li4320\jclisttab\tx4320 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567631\'02\'06.;}{\levelnumbers\'01;}\chbrdr
\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-360\li5040\jclisttab\tx5040 }{\listlevel\levelnfc4\levelnfcn4\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567641\'02\'07.;}{\levelnumbers\'01;}\chbrdr
\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-360\li5760\jclisttab\tx5760 }{\listlevel\levelnfc2\levelnfcn2\leveljc2\leveljcn2\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid67567643\'02\'08.;}{\levelnumbers\'01;}\chbrdr
\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-180\li6480\jclisttab\tx6480 }{\listname ;}\listid1380398570}{\list\listtemplateid648566982\listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat0\levelspace0
\levelindent0{\leveltext\'01-;}{\levelnumbers;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid1506508248}{\list\listtemplateid67567617\listsimple{\listlevel\levelnfc23\levelnfcn23\leveljc0
\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li360\jclisttab\tx360 }{\listname ;}\listid1542666708}
{\list\listtemplateid67567631\listsimple{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'02\'00.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0
\fi-360\li360\jclisttab\tx360 }{\listname ;}\listid2092700867}{\list\listtemplateid-1297428624\listhybrid{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\leveltemplateid1167366418
\'02\'00.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li720\jclisttab\tx720 }{\listlevel\levelnfc4\levelnfcn4\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\leveltemplateid67567641\'02\'01.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-360\li1440\jclisttab\tx1440 }{\listlevel\levelnfc2\levelnfcn2\leveljc2\leveljcn2\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\leveltemplateid67567643\'02\'02.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-180\li2160\jclisttab\tx2160 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\leveltemplateid67567631\'02\'03.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-360\li2880\jclisttab\tx2880 }{\listlevel\levelnfc4\levelnfcn4\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\leveltemplateid67567641\'02\'04.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-360\li3600\jclisttab\tx3600 }{\listlevel\levelnfc2\levelnfcn2\leveljc2\leveljcn2\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\leveltemplateid67567643\'02\'05.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-180\li4320\jclisttab\tx4320 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\leveltemplateid67567631\'02\'06.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-360\li5040\jclisttab\tx5040 }{\listlevel\levelnfc4\levelnfcn4\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\leveltemplateid67567641\'02\'07.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-360\li5760\jclisttab\tx5760 }{\listlevel\levelnfc2\levelnfcn2\leveljc2\leveljcn2\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\leveltemplateid67567643\'02\'08.;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \fi-180\li6480\jclisttab\tx6480 }{\listname ;}\listid2133397028}}{\*\listoverridetable{\listoverride\listid450631953\listoverridecount0\ls1}
{\listoverride\listid1506508248\listoverridecount0\ls2}{\listoverride\listid2092700867\listoverridecount0\ls3}{\listoverride\listid278416750\listoverridecount0\ls4}{\listoverride\listid1104501034\listoverridecount0\ls5}{\listoverride\listid1542666708
\listoverridecount0\ls6}{\listoverride\listid163085644\listoverridecount0\ls7}{\listoverride\listid983581600\listoverridecount0\ls8}{\listoverride\listid163085644\listoverridecount9{\lfolevel\listoverrideformat{\listlevel\levelnfc23\levelnfcn23\leveljc0
\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3928 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li360\jclisttab\tx360 }}{\lfolevel\listoverrideformat{\listlevel\levelnfc23
\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3979 ?;}{\levelnumbers;}\f14\fs12\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li720\jclisttab\tx720 }}{\lfolevel\listoverrideformat
{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li1080\jclisttab\tx1080 }}{\lfolevel
\listoverrideformat{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li1440
\jclisttab\tx1440 }}{\lfolevel\listoverrideformat{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3928 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1
\chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li1800\jclisttab\tx1800 }}{\lfolevel\listoverrideformat{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3880 ?;}{\levelnumbers;}\f14
\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2160\jclisttab\tx2160 }}{\lfolevel\listoverrideformat{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2520\jclisttab\tx2520 }}{\lfolevel\listoverrideformat{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0
\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2880\jclisttab\tx2880 }}{\lfolevel\listoverrideformat{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0
\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3928 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li3240\jclisttab\tx3240 }}\ls9}{\listoverride\listid130220302\listoverridecount0\ls10}
{\listoverride\listid907614837\listoverridecount0\ls11}{\listoverride\listid77531085\listoverridecount0\ls12}{\listoverride\listid-119\listoverridecount0\ls13}{\listoverride\listid1374885547\listoverridecount0\ls14}{\listoverride\listid2133397028
\listoverridecount0\ls15}{\listoverride\listid1380398570\listoverridecount0\ls16}}{\info{\title Meine \'dcberschrift}{\author Clemens Marschner}{\operator Clemens Marschner}{\creatim\yr2002\mo6\dy30\hr17\min10}{\revtim\yr2002\mo6\dy30\hr17\min10}
{\printim\yr2002\mo5\dy6\hr19\min52}{\version2}{\edmins0}{\nofpages16}{\nofwords5234}{\nofchars24603}{\*\company Ludwig-Maximilians-Universit\'e4t}{\nofcharsws36642}{\vern8249}}\paperw11906\paperh16838\margl1701\margr1133\margt1702\margb1985
\deftab708\widowctrl\ftnbj\aenddoc\hyphhotz425\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dghspace180\dgvspace180\dghorigin1701\dgvorigin1984\dghshow0\dgvshow0
\jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\nolnhtadjtbl \fet0{\*\template C:\\vorlagen\\Standard-Vorlage.dot}\sectd \psz9\linex0\footery866\endnhere\sectdefaultcl {\header \pard\plain \s21\ql \li0\ri0\sa60\widctlpar
\tx0\tqr\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f1\fs12\lang1031\langfe1031\langnp1031\langfenp1031 {\fs16\lang2057\langfe1031\langnp2057 The Fetcher Web Crawler \endash Technical Overview \endash Version 0.5}{
\lang2057\langfe1031\langnp2057
\par }}{\footer \pard\plain \s16\ql \li0\ri0\sl-240\slmult0\widctlpar\brdrt\brdrs\brdrw10\brsp20 \tqc\tx4536\tqr\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f1\fs16\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {Version: }{\field{\*\fldinst
{ REVNUM \\* MERGEFORMAT }}{\fldrslt {\lang1024\langfe1024\noproof 13}}}{\pard\plain \s16\ql \li0\ri0\sl-240\slmult0\widctlpar\brdrt\brdrs\brdrw10\brsp20 \tqc\tx4536\tqr\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\v\f1\fs16\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\tc {\tcf92l\tcn }}}{\tab \tab }{\field{\*\fldinst {\cgrid0 PAGE }}{\fldrslt {\lang1024\langfe1024\cgrid0\noproof 1}}}{\cgrid0 / }{\field{\*\fldinst {\cgrid0 NUMPAGES }}{\fldrslt {
\lang1024\langfe1024\cgrid0\noproof 16}}}{
\par }\pard \s16\ql \li0\ri0\sl-240\slmult0\widctlpar\tqc\tx4536\tqr\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\tab }{\fs24
\par }}{\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}}
{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8
\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}\pard\plain \s26\ql \li0\ri0\sl360\slmult1\widctlpar
\tx440\tqr\tldot\tx9062\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 {
\par
\par
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {
\par
\par
\par
\par Apache Jakarta Lucene
\par
\par }\pard\plain \s37\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \b\fs56\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 The Fetcher Web Crawler
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 Technical Overview
\par
\par Version 0.5
\par
\par
\par
\par Author:
\par Clemens Marschner\tab \tab LMU \endash University of Munich, Germany
\par }\pard \ql \fi708\li2124\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin2124\itap0 {\lang2057\langfe1031\langnp2057 Clemens.Marschner at campus.lmu.de
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057
\par
\par
\par
\par
\par }\pard\plain \s35\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \b\fs40\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 \page Table Of Contents
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
\par }\pard\plain \s26\ql \li0\ri0\sl360\slmult1\widctlpar\tx440\tqr\tldot\tx9062\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 {\field\fldedit{\*\fldinst { TOC \\o "1-3" }}{\fldrslt {1}{
\f0\fs24 \tab }{Overview\tab }{\field{\*\fldinst { PAGEREF _Toc8477592 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003500390032000000}}}{\fldrslt {3}}}{\f0\fs24
\par }\pard\plain \s27\ql \li220\ri0\sl360\slmult1\widctlpar\tx660\tx880\tqr\tldot\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin220\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 {1.1}{\f0\fs24 \tab }{
Purpose and Intended Audience}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477593 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003500390033000000}}}{\fldrslt {3}}}{\f0\fs24
\par }{1.2}{\f0\fs24 \tab }{Why do we need web crawlers?}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477594 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003500390034000000}}}{\fldrslt {3}}}{\f0\fs24
\par }{1.3}{\f0\fs24 \tab }{Implementation \endash the first attempt}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477595 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003500390035000000}}}{\fldrslt {4}
}}{\f0\fs24
\par }{1.4}{\f0\fs24 \tab }{Features of the Fetcher crawler}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477596 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003500390036000000}}}{\fldrslt {4}}}{
\f0\fs24
\par }{1.5}{\f0\fs24 \tab }{What the crawler can do for you, and what it cannot (yet)}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477597 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003500390037000000
}}}{\fldrslt {5}}}{\f0\fs24
\par }{1.6}{\f0\fs24 \tab }{Syntax and runtime behaviour}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477598 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003500390038000000}}}{\fldrslt {6}}}{\f0\fs24
\par }\pard\plain \s26\ql \li0\ri0\sl360\slmult1\widctlpar\tx440\tqr\tldot\tx9062\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 {2}{\f0\fs24 \tab }{Architecture\tab }{\field{\*\fldinst {
PAGEREF _Toc8477599 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003500390039000000}}}{\fldrslt {7}}}{\f0\fs24
\par }\pard\plain \s27\ql \li220\ri0\sl360\slmult1\widctlpar\tx660\tx880\tqr\tldot\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin220\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 {2.1}{\f0\fs24 \tab }{Performance}{\tab }
{\field{\*\fldinst { PAGEREF _Toc8477600 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600300030000000}}}{\fldrslt {8}}}{\f0\fs24
\par }{2.2}{\f0\fs24 \tab }{Memory Usage}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477601 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600300031000000}}}{\fldrslt {10}}}{\f0\fs24
\par }{2.3}{\f0\fs24 \tab }{The Filters}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477602 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600300032000000}}}{\fldrslt {12}}}{\f0\fs24
\par }\pard\plain \s28\ql \li440\ri0\sl360\slmult1\widctlpar\tx1100\tx1320\tqr\tldot\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin440\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 {2.3.1}{\f0\fs24 \tab }{RobotExclusionFilter}{
\tab }{\field{\*\fldinst { PAGEREF _Toc8477603 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600300033000000}}}{\fldrslt {12}}}{\f0\fs24
\par }{2.3.2}{\f0\fs24 \tab }{URLLengthFilter}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477604 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600300034000000}}}{\fldrslt {12}}}{\f0\fs24
\par }{2.3.3}{\f0\fs24 \tab }{KnownPathsFilter}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477605 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600300035000000}}}{\fldrslt {12}}}{\f0\fs24
\par }{2.3.4}{\f0\fs24 \tab }{URLScopeFilter}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477606 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600300036000000}}}{\fldrslt {12}}}{\f0\fs24
\par }{2.3.5}{\f0\fs24 \tab }{URLVisitedFilter}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477607 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600300037000000}}}{\fldrslt {12}}}{\f0\fs24
\par }{2.3.6}{\f0\fs24 \tab }{Fetcher}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477608 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600300038000000}}}{\fldrslt {12}}}{\f0\fs24
\par }{2.3.7}{\f0\fs24 \tab }{A Note on DNS}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477609 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600300039000000}}}{\fldrslt {13}}}{\f0\fs24
\par }\pard\plain \s26\ql \li0\ri0\sl360\slmult1\widctlpar\tx440\tqr\tldot\tx9062\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 {3}{\f0\fs24 \tab }{Future Enhancements\tab }
{\field{\*\fldinst { PAGEREF _Toc8477610 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600310030000000}}}{\fldrslt {14}}}{\f0\fs24
\par }\pard\plain \s27\ql \li220\ri0\sl360\slmult1\widctlpar\tx660\tx880\tqr\tldot\tx9072\aspalpha\aspnum\faauto\adjustright\rin0\lin220\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 {3.1}{\f0\fs24 \tab }{\'93Politeness\'94}{\tab }
{\field{\*\fldinst { PAGEREF _Toc8477611 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600310031000000}}}{\fldrslt {14}}}{\f0\fs24
\par }{3.2}{\f0\fs24 \tab }{The processing pipeline}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477612 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600310032000000}}}{\fldrslt {14}}}{\f0\fs24
\par }{3.3}{\f0\fs24 \tab }{Lucene integration}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477613 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600310033000000}}}{\fldrslt {14}}}{\f0\fs24
\par }{3.4}{\f0\fs24 \tab }{A Real Server}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477614 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600310034000000}}}{\fldrslt {14}}}{\f0\fs24
\par }{3.5}{\f0\fs24 \tab }{Distribution}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477615 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600310035000000}}}{\fldrslt {14}}}{\f0\fs24
\par }{3.6}{\f0\fs24 \tab }{URL Reordering}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477616 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600310036000000}}}{\fldrslt {15}}}{\f0\fs24
\par }{3.7}{\f0\fs24\lang1024\langfe1024\langnp1031 \tab }{Recovery}{\tab }{\field{\*\fldinst { PAGEREF _Toc8477617 \\h }{{\*\datafield 08d0c9ea79f9bace118c8200aa004ba90b02000000080000000c0000005f0054006f00630038003400370037003600310037000000}}}{\fldrslt {15}
}}{\f0\fs24\lang1024\langfe1024\langnp1031
\par }\pard\plain \s26\ql \li0\ri0\sl360\slmult1\widctlpar\tx440\tqr\tldot\tx9062\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 }}\pard\plain \s26\ql \li0\ri0\sl360\slmult1\widctlpar
\tx440\tqr\tldot\tx9062\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 {
\par {\*\bkmkstart _Toc8477592}{\listtext\pard\plain\s1 \b\fs36\lang2057\langfe1031\kerning28\langnp2057 \hich\af0\dbch\af0\loch\f0 1\tab}}\pard\plain \s1\ql \fi-432\li0\ri0\sb240\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\outlinelevel0\adjustright\rin0\lin0\itap0 \b\fs36\lang2057\langfe1031\kerning28\cgrid\langnp2057\langfenp1031 {Overview{\*\bkmkend _Toc8477592}
\par {\*\bkmkstart _Toc8477593}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 1.1\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {Purpose and Intended Audience{\*\bkmkend _Toc8477593}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
This document was made for Lucene developers, not necessarily with any background knowledge on crawlers, t
o understand the inner workings of the Fetcher crawler, the current problems and some directions for future development. The aim is to keep the entry costs low for people who have an interest in developing this piece of software further.
\par {\*\bkmkstart _Toc8477594}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 1.2\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {Why do we need web crawlers?{\*\bkmkend _Toc8477594}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
The answer is: Because the web is not perfect. It became necessary because the web standard protocols didn\rquote t contain any mechanisms to inform search engines that the data on a web server had been changed. If this were possible, a search engine c
ould be notified in a \'93push\'94 fashion, which would simplify the total process and would make indexes as current as possible.
\par Imagine a web server that notifies another web server that a link was created from one of its pages to the other server. That other server could then send a message back if the page was removed.}{\cs39\lang2057\langfe1031\super\langnp2057 \chftn
{\footnote \pard\plain \s38\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs20\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\cs39\super \chftn }{\lang2057\langfe1031\langnp2057
I know that there is research on that matter. }}}{\lang2057\langfe1031\langnp2057
\par On the other hand, this system would be a lot more complicated to handle. Keeping distributed information up to date is an error-prone task. Even in a single relational database it is often co
mplicated to define and handle dependencies between relations. Should it be possible to allow inconsistencies for a short period of time? Should dependent data be deleted if a record is removed? Handling relationships between clusters of information well
introduces a new level of complexity.
\par In order to keep the software (web servers and browsers) simple, the inventors of the web concentrated on just a few core elements \endash URLs for (more or less) uniquely identifying distributed information, HTTP for handl
ing the information, and HTML for structuring it. That system was so simple that one could understand it in a very short time. This is probably one of the main reasons why the WWW became so popular. Well, another one would probably be coloured, moving gra
phics of naked people.
\par But the WWW has some major disadvantages: There is no single index of all available pages. Information can change without notice. URLs can point to pages that no longer exist. There is no mechanism to get \'93all\'94 pages from a web server
. The whole system is in a constant process of change. And after all, the whole thing is growing at phenomenal rates. Building a search engine on top of that is not something you can do on a Saturday afternoon. Given the sheer size, it would take months t
o search through all the pages in order to answer a single query, even if we had a means to get from server to server, get the pages from there, and search them. But we don\rquote t even know how to do }{\i\lang2057\langfe1031\langnp2057 that}{
\lang2057\langfe1031\langnp2057 , since we don\rquote t know all the web servers.
\par That first problem was addressed by bookmark collections, which soon became very popular. The most popular was probably Yahoo, which evolved into one of the most popular pages on the web just a year after it emerged from a college dorm room.
\par The second problem was how to get the information from all those pages lying around. This is where a web crawler comes in.
\par Ok, those engineers said, we are not able to get a list of all the pages. But almost every page contains links to other pages. We can save a page, extract all the
links, and load all of the pages these links point to. If we start at a popular location which contains a lot of links, like Yahoo for example, chances are that we can get \'93all\'94 pages on the web.
\par A little more formally, the web can be seen as a directed graph, with pages as nodes and links as edges between them. A web crawler, also called \'93spider\'94 or \'93fetcher\'94
, uses the graph structure of the web to get documents in order to be able to index them. Since there is no \'93push\'94 mechanism for updating our index, we need to \'93pull\'94 the information on our own, by repeatedly crawling the web.
\par {\*\bkmkstart _Toc8477595}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 1.3\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {Implementation \endash the first attempt{\*\bkmkend _Toc8477595}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 \'93Easy\'94, you may think now, \'93
just implement what he said in the paragraph before.\'94 So you start getting a page, extracting the links, following all the pages you have not already visited\'85 In Perl that can be done in a few lines of code.
\par But then, very soon (I can tell you), you run into a lot of problems:
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls14\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057 a server doesn
\rquote t respond. Your program always waits for it to time out
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}you get OutOfMemory errors soon after the beginning
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}your hard drive fills up
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}You notice that one page is loaded again time after time, because the URL changed a little
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Some servers will behave very strangely. They will respond after 30 seconds, sometimes they time out, sometimes they are not accessible at all
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}some URLs will get longer and longer. Suddenly you will get URLs with a length of thousands of characters.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}But the main problem will be: you notice that your network interface card (NIC) is waiting, and your CPU is waiting. What\rquote
s going on? The overall process will take days
\par {\*\bkmkstart _Toc8477596}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 1.4\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {Features of the Fetcher crawler{\*\bkmkend _Toc8477596}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
The Fetcher web crawler is a result of experiences with the errors mentioned above, combined with a lot of monitoring to get the maximum out of the given system resources. It was designed with several different aspects in mind:
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls14\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057
Speed. This involves balancing the resources to prevent bottlenecks. The crawler is multithreaded. A lot of work went into avoiding synchronization between threads, which slows down multithreaded programs a lot, e.g. by rewriting or replacing the standard Java classes
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Simplicity. The underlying scheme is quite modular and comprehensible. See the description of the pipeline below
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Power. The modular design and the ease of the Java language make customisation simple
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Java. Although there were many crawlers around at the time when I started to think about it (in summer 2000), I couldn\rquote
t find a good available implementation in Java. If this crawler were to be integrated into a Java search engine, a homogeneous system would be an advantage. And after all, I wanted to see if a fast implementation could be done in this language.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057
\par {\*\bkmkstart _Toc8477597}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 1.5\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {What the crawler can do for you, and what it cannot (yet){\*\bkmkend _Toc8477597}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 What it can do for you:
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls14\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057
Crawl a distinct set of the web, on
ly restricted by a given regular expression all pages have to match. The pages are saved into page files of max. 50 MB and an index file that contains the links between the URL and the position in the page file. Links are logged as well. This is part of t
he standard LogStorage. Other storages exist as well (see below)
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}
Crawling is done breadth first. Hosts are accessed in a round-robin manner, to prevent the situation that all threads access one host at once. However, at the moment there is no means to throttle access to a server \endash
the crawler works as fast as it can. There are also some problems with this technique, as will be described below.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}The main part of the crawler is implemented as a pool of concurrent threads, which speeds up I/O access
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}The HTML link extractor has been optimised for speed. It was made 10 x faster than a generic SAX parser implementation
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}A lot of logging and monitoring is done, to be able to track down what is going on inside
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}A lot of parts of the crawler have already been optimi
sed to consume no more memory than needed. A lot of the internal queues are cached on hard drive, for example. Only the HashMap of already crawled pages and the HostInfo structures still completely remain in memory, thus limiting the number of crawled h
osts and the number of crawled pages. At the moment, OutOfMemory errors are not prevented, so beware.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}URLs are passed through a pipeline of filters that limit, for example, the length of a URL, load robots.txt the first time a host is accessed, etc. This p
ipeline can be extended easily by adding a Java class to the pipeline.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}
The storage mechanism is also pluggable. One of the next issues would be to include this storage mechanism into the pipeline, to allow a separation of logging, processing, and storage
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057
\par On the other hand, at the time of this writing, the crawler has not yet evolved into a production release. The reason is: until now, it just served me alone. These issues remain:
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls14\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057
The missing things as noted above
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}There may be bugs which prevent it from ru
nning for longer than a couple of hours. I noticed, for example, that system sockets were slowly being used up, although the Java code seemed to be ok. One reason why I wanted to publish it now was to have other people have a look at the code, to learn from th
eir experiences and to let them find errors I couldn\rquote t see anymore.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}
Only some of the configuration can be done with command line parameters. The pipeline is put together in the startup procedure. It should not be very hard to put that into a property file
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}The ThreadMonitor is very experimental. It has evolved from a pure monitoring mechanism to a central part of the whole crawler. It should probably be refactored.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Speed could still be optimised. Synchronization takes place too often
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}After all, the crawler is not yet incorporated into the Lucene engine.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}URLs should be handled in a more intelligent manner. At the moment \'93http://host?id=1\'94, \'93http://host/?id=1\'94, and \'93http://host/index.shtml?id=1\'94
are handled as three different URLs. The crawler also doesn\rquote t recognize host aliases or mirrors. Other crawlers also calculate fingerprints of the pages loaded, to prevent loading mirrors. This one does not.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}No processing whatsoever is done on the documents (except extracting the links). It should be decided how much of this is supp
osed to be done within the crawler, and what should be done in a post processing step
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Unix is the favoured operating system. I used a SUSE Linux with 2.2 kernel. I remember that I ran into problems with the I/O routines on Windows machines. I haven
\rquote t tried it for a long time now, though.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Only HTTP is supported; there is no file server crawling with recursive directory options, etc.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}It\rquote s not polite. It hits the servers as fast as it can, which can cause DoS (Denial of Service) problems
\par {\*\bkmkstart _Toc8477598}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 1.6\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {Syntax and runtime behaviour{\*\bkmkend _Toc8477598}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 The command line options are very simple:
\par }{\b\lang2057\langfe1031\langnp2057 java [-server] [-Xmx<ZZ>mb] -classpath fetcher.jar de.lanlab.larm.fetcher.FetcherMain
\par \tab \tab -start STARTURL
\par \tab \tab -restrictto REGEX
\par \tab \tab [-threads[=10]]
\par }\pard \ql \fi-1416\li1416\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin1416\itap0 {\lang2057\langfe1031\langnp2057 -start\tab a start URL. Currently only one. It must be a valid http-URL, including the http prefix
\par -restrictto\tab a (Perl5) regular expression that all URLs have to match
\par \tab If you are not familiar with regular expressions
\par -threads\tab the number of concurrent threads that crawl the pages. At this time, more than 25 threads don\rquote t provide any advantages because synchronization effects and (probably) the overhead of the scheduler slow the system down
\par }{\b\lang2057\langfe1031\langnp2057 Java runtime options:
\par }{\lang2057\langfe1031\langnp2057 -server\tab starts the HotSpot VM in server mode, which starts up a little slower, but is faster during the run
\par -Xmx<ZZ>mb\tab sets the maximum size of the heap to <ZZ> mb. Should be a lot. Set it to what you have
\par
\par You also have to provide a \'93logs/\'94 directory (won\rquote t be created for you).
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057 You may also want to have a look at the source code, because some options cannot be dealt with from the outside at this time.
\par
\par }\pard \ql \fi-1416\li1416\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin1416\itap0 {\lang2057\langfe1031\langnp2057 What happens now?
\par {\listtext\pard\plain\f31\fs22\lang2057\langfe1031\langnp2057 \hich\af31\dbch\af0\loch\f31 1.\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls15\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057
The filter pipeline is built. The ScopeFilter is initialised with the expression given by restrictto
\par {\listtext\pard\plain\f31\fs22\lang2057\langfe1031\langnp2057 \hich\af31\dbch\af0\loch\f31 2.\tab}The URL is put into the pipeline
\par {\listtext\pard\plain\f31\fs22\lang2057\langfe1031\langnp2057 \hich\af31\dbch\af0\loch\f31 3.\tab}The documents are fetched. If the mime type is text/html, links are extracted and put back into the queue
. The documents and URLs are forwarded to the storage, which saves them
\par {\listtext\pard\plain\f31\fs22\lang2057\langfe1031\langnp2057 \hich\af31\dbch\af0\loch\f31 4.\tab}
Meanwhile, every 5 seconds, the ThreadMonitor gathers statistics, flushes log files, starts the garbage collection, and stops the fetcher when everything seems to be done: all threads are idle, and nothing is remaining in the queues
\par {\listtext\pard\plain\s1 \b\fs36\lang2057\langfe1031\kerning28\langnp2057 \hich\af0\dbch\af0\loch\f0 2\tab}}\pard\plain \s1\ql \fi-432\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\outlinelevel0\adjustright\rin0\lin0\itap0
\b\fs36\lang2057\langfe1031\kerning28\cgrid\langnp2057\langfenp1031 {\page {\*\bkmkstart _Toc8477599}Architecture{\*\bkmkend _Toc8477599}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 I studied the Mercator web crawler}{
\cs39\lang2057\langfe1031\super\langnp2057 \chftn {\footnote \pard\plain \s38\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs20\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\cs39\super \chftn }{
\lang2057\langfe1031\langnp2057 see }{\field{\*\fldinst {\lang2057\langfe1031\langnp2057 HYPERLINK "http://citeseer.nj.nec.com/heydon99mercator.html" }{\lang2057\langfe1031\langnp2057 {\*\datafield
00d0c9ea79f9bace118c8200aa004ba90b02000000170000003100000068007400740070003a002f002f00630069007400650073006500650072002e006e006a002e006e00650063002e0063006f006d002f0068006500790064006f006e00390039006d00650072006300610074006f0072002e00680074006d006c000000
e0c9ea79f9bace118c8200aa004ba90b6200000068007400740070003a002f002f00630069007400650073006500650072002e006e006a002e006e00650063002e0063006f006d002f0068006500790064006f006e00390039006d00650072006300610074006f0072002e00680074006d006c000000}}}{\fldrslt {
\cs41\ul\cf2\lang2057\langfe1031\langnp2057 http://citeseer.nj.nec.com/heydon99mercator.html}}}{\lang2057\langfe1031\langnp2057 }}}{\lang2057\langfe1031\langnp2057
 but decided to implement a somewhat different architecture. Here is a high-level overview of the default configuration:
\par }{\fs20\lang1024\langfe1024\noproof {\shpgrp{\*\shpinst\shpleft-180\shptop140\shpright8280\shpbottom6980\shpfhdr0\shpbxcolumn\shpbxignore\shpbypara\shpbyignore\shpwr3\shpwrk0\shpfblwtxt0\shpz7\shplid1110
{\sp{\sn groupLeft}{\sv 1521}}{\sp{\sn groupTop}{\sv 3150}}{\sp{\sn groupRight}{\sv 9981}}{\sp{\sn groupBottom}{\sv 9990}}{\sp{\sn fFlipH}{\sv 0}}{\sp{\sn fFlipV}{\sv 0}}
{\sp{\sn fLayoutInCell}{\sv 1}}{\shp{\*\shpinst\shplid1046{\sp{\sn relLeft}{\sv 2961}}{\sp{\sn relTop}{\sv 3150}}{\sp{\sn relRight}{\sv 3501}}{\sp{\sn relBottom}{\sv 8910}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}
{\sp{\sn lTxid}{\sv 65536}}{\sp{\sn txflTextFlow}{\sv 3}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18\lang2057\langfe1031\langnp2057 Message Handler }{\i\fs18\lang2057\langfe1031\langnp2057 (Thread)}{\fs18\lang2057\langfe1031\langnp2057
\par }}}}{\shp{\*\shpinst\shplid1047{\sp{\sn relLeft}{\sv 4041}}{\sp{\sn relTop}{\sv 4144}}{\sp{\sn relRight}{\sv 6741}}{\sp{\sn relBottom}{\sv 4504}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 131072}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 URLScopeFilter}{
\par }}}}{\shp{\*\shpinst\shplid1048{\sp{\sn relLeft}{\sv 4041}}{\sp{\sn relTop}{\sv 3604}}{\sp{\sn relRight}{\sv 6741}}{\sp{\sn relBottom}{\sv 3964}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 196608}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 URLLengthFilter}{
\par }}}}{\shp{\*\shpinst\shplid1049{\sp{\sn relLeft}{\sv 4041}}{\sp{\sn relTop}{\sv 4684}}{\sp{\sn relRight}{\sv 6741}}{\sp{\sn relBottom}{\sv 5044}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 262144}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 RobotExclusionFilter}{
\par }}}}{\shp{\*\shpinst\shplid1050{\sp{\sn relLeft}{\sv 4041}}{\sp{\sn relTop}{\sv 5223}}{\sp{\sn relRight}{\sv 6741}}{\sp{\sn relBottom}{\sv 5583}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 327680}}{\sp{\sn hspNext}{\sv 1050}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 URLVisitedFilter}{
\par }}}}{\shp{\*\shpinst\shplid1051{\sp{\sn relLeft}{\sv 4041}}{\sp{\sn relTop}{\sv 5763}}{\sp{\sn relRight}{\sv 6741}}{\sp{\sn relBottom}{\sv 6123}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 393216}}{\sp{\sn hspNext}{\sv 1051}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 KnownPathsFilter}{
\par }}}}{\shp{\*\shpinst\shplid1053{\sp{\sn relLeft}{\sv 4041}}{\sp{\sn relTop}{\sv 9270}}{\sp{\sn relRight}{\sv 6741}}{\sp{\sn relBottom}{\sv 9990}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 524288}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 Storage}{
\par }}}}{\shp{\*\shpinst\shplid1054{\sp{\sn relLeft}{\sv 8718}}{\sp{\sn relTop}{\sv 4050}}{\sp{\sn relRight}{\sv 9978}}{\sp{\sn relBottom}{\sv 4770}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 589824}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 Host\line Manager}{
\par }}}}{\shp{\*\shpinst\shplid1055{\sp{\sn relLeft}{\sv 4038}}{\sp{\sn relTop}{\sv 6390}}{\sp{\sn relRight}{\sv 6738}}{\sp{\sn relBottom}{\sv 8910}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 655360}}{\sp{\sn hspNext}{\sv 1055}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 Fetcher}{
\par }}}}{\shp{\*\shpinst\shplid1056{\sp{\sn relLeft}{\sv 5301}}{\sp{\sn relTop}{\sv 3963}}{\sp{\sn relRight}{\sv 5301}}{\sp{\sn relBottom}{\sv 4143}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1057{\sp{\sn relLeft}{\sv 5301}}{\sp{\sn relTop}{\sv 5043}}{\sp{\sn relRight}{\sv 5301}}{\sp{\sn relBottom}{\sv 5223}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1058{\sp{\sn relLeft}{\sv 5301}}{\sp{\sn relTop}{\sv 4503}}{\sp{\sn relRight}{\sv 5301}}{\sp{\sn relBottom}{\sv 4683}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1059{\sp{\sn relLeft}{\sv 5301}}{\sp{\sn relTop}{\sv 5583}}{\sp{\sn relRight}{\sv 5301}}{\sp{\sn relBottom}{\sv 5763}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1061{\sp{\sn relLeft}{\sv 5298}}{\sp{\sn relTop}{\sv 6124}}{\sp{\sn relRight}{\sv 5301}}{\sp{\sn relBottom}{\sv 6390}}{\sp{\sn fRelFlipH}{\sv 1}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1062{\sp{\sn relLeft}{\sv 3501}}{\sp{\sn relTop}{\sv 3784}}{\sp{\sn relRight}{\sv 4038}}{\sp{\sn relBottom}{\sv 3784}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1064{\sp{\sn relLeft}{\sv 6738}}{\sp{\sn relTop}{\sv 4410}}{\sp{\sn relRight}{\sv 8718}}{\sp{\sn relBottom}{\sv 4410}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 1}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineDashing}{\sv 6}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn fLine}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1065{\sp{\sn relLeft}{\sv 4218}}{\sp{\sn relTop}{\sv 6750}}{\sp{\sn relRight}{\sv 6558}}{\sp{\sn relBottom}{\sv 8730}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 720896}}
{\sp{\sn hspNext}{\sv 1065}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18
FetcherPool }{\i\fs18 (Thread)}{
\par }}}}{\shp{\*\shpinst\shplid1066{\sp{\sn relLeft}{\sv 4398}}{\sp{\sn relTop}{\sv 7110}}{\sp{\sn relRight}{\sv 4938}}{\sp{\sn relBottom}{\sv 8550}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 786432}}{\sp{\sn txflTextFlow}{\sv 3}}{\sp{\sn hspNext}{\sv 1066}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 FetcherThread
\par }}}}{\shp{\*\shpinst\shplid1067{\sp{\sn relLeft}{\sv 5118}}{\sp{\sn relTop}{\sv 7110}}{\sp{\sn relRight}{\sv 5658}}{\sp{\sn relBottom}{\sv 8550}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 851968}}{\sp{\sn txflTextFlow}{\sv 3}}{\sp{\sn hspNext}{\sv 1067}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 FetcherThread
\par }}}}{\shp{\*\shpinst\shplid1068{\sp{\sn relLeft}{\sv 5838}}{\sp{\sn relTop}{\sv 7110}}{\sp{\sn relRight}{\sv 6378}}{\sp{\sn relBottom}{\sv 8550}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 917504}}{\sp{\sn txflTextFlow}{\sv 3}}{\sp{\sn hspNext}{\sv 1068}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 FetcherThread
\par }}}}{\shp{\*\shpinst\shplid1071{\sp{\sn relLeft}{\sv 2241}}{\sp{\sn relTop}{\sv 3510}}{\sp{\sn relRight}{\sv 2781}}{\sp{\sn relBottom}{\sv 5310}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 983040}}{\sp{\sn dxTextRight}{\sv 0}}{\sp{\sn txflTextFlow}{\sv 3}}{\sp{\sn hspNext}{\sv 1071}}
{\sp{\sn fLine}{\sv 0}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\i\fs18 puts U
RLs into
\par }}}}{\shp{\*\shpinst\shplid1073{\sp{\sn relLeft}{\sv 5841}}{\sp{\sn relTop}{\sv 8550}}{\sp{\sn relRight}{\sv 7281}}{\sp{\sn relBottom}{\sv 8910}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 1114112}}{\sp{\sn dyTextTop}{\sv 0}}{\sp{\sn hspNext}{\sv 1073}}{\sp{\sn fFilled}{\sv 0}}
{\sp{\sn fLine}{\sv 0}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\i\fs18
WebDocument}{\i
\par }}}}{\shp{\*\shpinst\shplid1075{\sp{\sn relLeft}{\sv 6381}}{\sp{\sn relTop}{\sv 4590}}{\sp{\sn relRight}{\sv 8721}}{\sp{\sn relBottom}{\sv 7290}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 0}}{\sp{\sn rotation}{\sv 0}}
{\sp{\sn geoRight}{\sv 2340}}{\sp{\sn geoBottom}{\sv 2700}}{\sp{\sn shapePath}{\sv 4}}{\sp{\sn pVerticies}{\sv 8;4;(0,2700);(720,2700);(720,0);(2340,0)}}{\sp{\sn pSegmentInfo}{\sv 2;9;16384;44032;1;44032;1;44032;1;44032
;32768}}{\sp{\sn fFillOK}{\sv 1}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineOpacity}{\sv 65536}}{\sp{\sn lineType}{\sv 0}}{\sp{\sn lineDashing}{\sv 6}}{\sp{\sn lineEndArrowhead}{\sv 1}}
{\sp{\sn lineEndCapStyle}{\sv 2}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn fLine}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn posh}{\sv 0}}{\sp{\sn posv}{\sv 0}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1076{\sp{\sn relLeft}{\sv 6738}}{\sp{\sn relTop}{\sv 4050}}{\sp{\sn relRight}{\sv 8718}}{\sp{\sn relBottom}{\sv 4410}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 1179648}}
{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fLine}{\sv 0}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\i\fs18 use}{\i
\par }}}}{\shpgrp{\*\shpinst\shplid1078{\sp{\sn groupLeft}{\sv 1881}}{\sp{\sn groupTop}{\sv 9544}}{\sp{\sn groupRight}{\sv 4941}}{\sp{\sn groupBottom}{\sv 15124}}{\sp{\sn relLeft}{\sv 2781}}
{\sp{\sn relTop}{\sv 3510}}{\sp{\sn relRight}{\sv 5301}}{\sp{\sn relBottom}{\sv 9090}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn rotation}{\sv 0}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn posh}{\sv 0}}{\sp{\sn posv}{\sv 0}}
{\sp{\sn fLayoutInCell}{\sv 1}}{\shp{\*\shpinst\shplid1070{\sp{\sn relLeft}{\sv 1881}}{\sp{\sn relTop}{\sv 9544}}{\sp{\sn relRight}{\sv 3141}}{\sp{\sn relBottom}{\sv 15124}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 0}}
{\sp{\sn geoRight}{\sv 1260}}{\sp{\sn geoBottom}{\sv 4860}}{\sp{\sn shapePath}{\sv 4}}{\sp{\sn pVerticies}{\sv 8;4;(1260,4860);(0,4860);(0,0);(180,0)}}{\sp{\sn pSegmentInfo}{\sv 2;9;16384;44032;1;44032;1;44032;1;44032
;32768}}{\sp{\sn fFillOK}{\sv 1}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineOpacity}{\sv 65536}}{\sp{\sn lineType}{\sv 0}}{\sp{\sn lineDashing}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}
{\sp{\sn lineEndCapStyle}{\sv 2}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn fLine}{\sv 1}}{\sp{\sn fLayoutInCell}{\sv 1}}}}{\shp{\*\shpinst\shplid1077{\sp{\sn relLeft}{\sv 3141}}{\sp{\sn relTop}{\sv 14584}}
{\sp{\sn relRight}{\sv 4941}}{\sp{\sn relBottom}{\sv 15124}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 0}}{\sp{\sn geoRight}{\sv 1800}}{\sp{\sn geoBottom}{\sv 540}}{\sp{\sn shapePath}{\sv 4}}{\sp{\sn pVerticies}{\sv 8;3
;(0,540);(1800,540);(1800,0)}}{\sp{\sn pSegmentInfo}{\sv 2;7;16384;44032;1;44032;1;44032;32768}}{\sp{\sn fFillOK}{\sv 1}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineOpacity}{\sv 65536}}{\sp{\sn lineType}{\sv 0}}{\sp{\sn lineDashing}{\sv 0}}
{\sp{\sn lineEndCapStyle}{\sv 2}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn fLine}{\sv 1}}{\sp{\sn fLayoutInCell}{\sv 1}}}}}}{\shp{\*\shpinst\shplid1080{\sp{\sn relLeft}{\sv 8721}}{\sp{\sn relTop}{\sv 5786}}
{\sp{\sn relRight}{\sv 9981}}{\sp{\sn relBottom}{\sv 6570}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 1245184}}
{\sp{\sn hspNext}{\sv 1080}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {
\fs16\lang2057\langfe1031\langnp2057 Thread\line Monitor }{\i\fs16\lang2057\langfe1031\langnp2057 (Thread)}{\fs20\lang2057\langfe1031\langnp2057
\par }}}}{\shp{\*\shpinst\shplid1081{\sp{\sn relLeft}{\sv 8181}}{\sp{\sn relTop}{\sv 5426}}{\sp{\sn relRight}{\sv 8721}}{\sp{\sn relBottom}{\sv 5966}}{\sp{\sn fRelFlipH}{\sv 1}}{\sp{\sn fRelFlipV}{\sv 1}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1082{\sp{\sn relLeft}{\sv 8181}}{\sp{\sn relTop}{\sv 6146}}{\sp{\sn relRight}{\sv 8721}}{\sp{\sn relBottom}{\sv 6146}}{\sp{\sn fRelFlipH}{\sv 1}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1083{\sp{\sn relLeft}{\sv 8181}}{\sp{\sn relTop}{\sv 6326}}{\sp{\sn relRight}{\sv 8721}}{\sp{\sn relBottom}{\sv 6866}}{\sp{\sn fRelFlipH}{\sv 1}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1084{\sp{\sn relLeft}{\sv 7281}}{\sp{\sn relTop}{\sv 5786}}{\sp{\sn relRight}{\sv 8361}}{\sp{\sn relBottom}{\sv 6686}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 1310720}}
{\sp{\sn dyTextTop}{\sv 0}}{\sp{\sn hspNext}{\sv 1084}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fLine}{\sv 0}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\i\fs18 monitors every 5 seconds}{\i
\par }}}}{\shp{\*\shpinst\shplid1086{\sp{\sn relLeft}{\sv 9081}}{\sp{\sn relTop}{\sv 6750}}{\sp{\sn relRight}{\sv 9981}}{\sp{\sn relBottom}{\sv 7650}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 22}}{\sp{\sn lTxid}{\sv 1376256}}{\sp{\sn hspNext}{\sv 1086}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {log
\par }}}}{\shp{\*\shpinst\shplid1089{\sp{\sn relLeft}{\sv 7461}}{\sp{\sn relTop}{\sv 9090}}{\sp{\sn relRight}{\sv 8361}}{\sp{\sn relBottom}{\sv 9990}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 22}}{\sp{\sn lTxid}{\sv 1507328}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {Log
\par }}}}{\shp{\*\shpinst\shplid1092{\sp{\sn relLeft}{\sv 6201}}{\sp{\sn relTop}{\sv 8550}}{\sp{\sn relRight}{\sv 6201}}{\sp{\sn relBottom}{\sv 9270}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1093{\sp{\sn relLeft}{\sv 5481}}{\sp{\sn relTop}{\sv 8550}}{\sp{\sn relRight}{\sv 5481}}{\sp{\sn relBottom}{\sv 9270}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1094{\sp{\sn relLeft}{\sv 4761}}{\sp{\sn relTop}{\sv 8550}}{\sp{\sn relRight}{\sv 4761}}{\sp{\sn relBottom}{\sv 9270}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1095{\sp{\sn relLeft}{\sv 7461}}{\sp{\sn relTop}{\sv 8010}}{\sp{\sn relRight}{\sv 8361}}{\sp{\sn relBottom}{\sv 8910}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 22}}{\sp{\sn lTxid}{\sv 1572864}}
{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {Store
\par }}}}{\shp{\*\shpinst\shplid1096{\sp{\sn relLeft}{\sv 3501}}{\sp{\sn relTop}{\sv 3964}}{\sp{\sn relRight}{\sv 3861}}{\sp{\sn relBottom}{\sv 6304}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 1048576}}{\sp{\sn dxTextLeft}{\sv 0}}{\sp{\sn dyTextTop}{\sv 0}}{\sp{\sn dxTextRight}{\sv 0}}
{\sp{\sn dyTextBottom}{\sv 0}}{\sp{\sn txflTextFlow}{\sv 3}}{\sp{\sn hspNext}{\sv 1096}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fLine}{\sv 0}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 URLMessage}{
\par }}}}{\shp{\*\shpinst\shplid1097{\sp{\sn relLeft}{\sv 2421}}{\sp{\sn relTop}{\sv 4950}}{\sp{\sn relRight}{\sv 2781}}{\sp{\sn relBottom}{\sv 7290}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 202}}{\sp{\sn lTxid}{\sv 1441792}}{\sp{\sn dxTextLeft}{\sv 0}}{\sp{\sn dyTextTop}{\sv 0}}{\sp{\sn dxTextRight}{\sv 0}}
{\sp{\sn dyTextBottom}{\sv 0}}{\sp{\sn txflTextFlow}{\sv 3}}{\sp{\sn hspNext}{\sv 1097}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fLine}{\sv 0}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 URLMessage}{
\par }}}}{\shp{\*\shpinst\shplid1098{\sp{\sn relLeft}{\sv 9441}}{\sp{\sn relTop}{\sv 6570}}{\sp{\sn relRight}{\sv 9441}}{\sp{\sn relBottom}{\sv 6750}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 1}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}{\shp{\*\shpinst\shplid1099{\sp{\sn relLeft}{\sv 6741}}{\sp{\sn relTop}{\sv 9810}}
{\sp{\sn relRight}{\sv 7461}}{\sp{\sn relBottom}{\sv 9810}}{\sp{\sn fRelFlipH}{\sv 1}}{\sp{\sn fRelFlipV}{\sv 1}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fArrowheadsOK}{\sv 1}}
{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}{\shp{\*\shpinst\shplid1100{\sp{\sn relLeft}{\sv 6741}}{\sp{\sn relTop}{\sv 8730}}{\sp{\sn relRight}{\sv 7461}}{\sp{\sn relBottom}{\sv 9450}}{\sp{\sn fRelFlipH}{\sv 1}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn lidRegroup}{\sv 5}}{\sp{\sn fLayoutInCell}{\sv 1}}}}
{\shp{\*\shpinst\shplid1105{\sp{\sn relLeft}{\sv 7461}}{\sp{\sn relTop}{\sv 7024}}{\sp{\sn relRight}{\sv 8361}}{\sp{\sn relBottom}{\sv 7924}}{\sp{\sn fRelFlipH}{\sv 0}}{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 22}}{\sp{\sn lTxid}{\sv 458752}}
{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs20 Queue}{
\par }}}}{\shp{\*\shpinst\shplid1106{\sp{\sn relLeft}{\sv 1521}}{\sp{\sn relTop}{\sv 6484}}{\sp{\sn relRight}{\sv 2421}}{\sp{\sn relBottom}{\sv 7384}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 22}}{\sp{\sn lTxid}{\sv 1638400}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs20 Queue
\par }}}}{\shp{\*\shpinst\shplid1107{\sp{\sn relLeft}{\sv 6561}}{\sp{\sn relTop}{\sv 7564}}{\sp{\sn relRight}{\sv 7461}}{\sp{\sn relBottom}{\sv 7564}}{\sp{\sn fRelFlipH}{\sv 1}}{\sp{\sn fRelFlipV}{\sv 1}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}
{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn fLayoutInCell}{\sv 1}}}}{\shp{\*\shpinst\shplid1108{\sp{\sn relLeft}{\sv 2421}}{\sp{\sn relTop}{\sv 6844}}
{\sp{\sn relRight}{\sv 2961}}{\sp{\sn relBottom}{\sv 6844}}{\sp{\sn fRelFlipH}{\sv 1}}{\sp{\sn fRelFlipV}{\sv 1}}{\sp{\sn shapeType}{\sv 20}}{\sp{\sn shapePath}{\sv 4}}{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fArrowheadsOK}{\sv 1}}
{\sp{\sn fLayoutInCell}{\sv 1}}}}{\shp{\*\shpinst\shplid1109{\sp{\sn relLeft}{\sv 3321}}{\sp{\sn relTop}{\sv 5404}}{\sp{\sn relRight}{\sv 4221}}{\sp{\sn relBottom}{\sv 6304}}{\sp{\sn fRelFlipH}{\sv 0}}
{\sp{\sn fRelFlipV}{\sv 0}}{\sp{\sn shapeType}{\sv 22}}{\sp{\sn lTxid}{\sv 1703936}}{\sp{\sn lineDashing}{\sv 2}}{\sp{\sn fLine}{\sv 1}}{\sp{\sn fLayoutInCell}{\sv 1}}{\shptxt \pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {Logs
\par }}}}}{\shprslt{\*\do\dobxcolumn\dobypara\dodhgt8199\dpgroup\dpcount47\dpx-180\dpy140\dpxsize8460\dpysize6840\dptxbx\dptxtbrl{\dptxbxtext\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18\lang2057\langfe1031\langnp2057 Message Handler }{\i\fs18\lang2057\langfe1031\langnp2057 (Thread)}{\fs18\lang2057\langfe1031\langnp2057
\par }}\dpx1440\dpy0\dpxsize540\dpysize5760\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 URLScopeFilter}{
\par }}\dpx2520\dpy994\dpxsize2700\dpysize360\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 URLLengthFilter}{
\par }}\dpx2520\dpy454\dpxsize2700\dpysize360\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 RobotExclusionFilter}{
\par }}\dpx2520\dpy1534\dpxsize2700\dpysize360\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 URLVisitedFilter}{
\par }}\dpx2520\dpy2073\dpxsize2700\dpysize360\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 KnownPathsFilter}{
\par }}\dpx2520\dpy2613\dpxsize2700\dpysize360\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 Storage}{
\par }}\dpx2520\dpy6120\dpxsize2700\dpysize720\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 Host\line Manager}{
\par }}\dpx7197\dpy900\dpxsize1260\dpysize720\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 Fetcher}{
\par }}\dpx2517\dpy3240\dpxsize2700\dpysize2520\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840
\dpx3780\dpy813\dpxsize0\dpysize180\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840\dpx3780\dpy1893\dpxsize0\dpysize180\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840
\dpx3780\dpy1353\dpxsize0\dpysize180\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840\dpx3780\dpy2433\dpxsize0\dpysize180\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx8460\dppty0\dpptx0\dppty6840
\dpx3777\dpy2974\dpxsize3\dpysize266\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840\dpx1980\dpy634\dpxsize537\dpysize0\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx8460\dppty0\dpptx0\dppty6840
\dpx5217\dpy1260\dpxsize1980\dpysize0\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {
\fs18 FetcherPool }{\i\fs18 (Thread)}{
\par }}\dpx2697\dpy3600\dpxsize2340\dpysize1980\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxtbrl{\dptxbxtext\pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 FetcherThread
\par }}\dpx2877\dpy3960\dpxsize540\dpysize1440\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxtbrl{\dptxbxtext\pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 FetcherThread
\par }}\dpx3597\dpy3960\dpxsize540\dpysize1440\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxtbrl{\dptxbxtext\pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 FetcherThread
\par }}\dpx4317\dpy3960\dpxsize540\dpysize1440\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxtbrl{\dptxbxtext\pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\i\fs18 puts URLs into
\par }}\dpx720\dpy360\dpxsize540\dpysize1800\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinehollow\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\i\fs18 WebDocument}{\i
\par }}\dpx4320\dpy5400\dpxsize1440\dpysize360\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat0\dplinehollow\dppolygon\dppolycount4\dpptx0\dppty2700\dpptx720\dppty2700\dpptx720\dppty0\dpptx2340\dppty0
\dpx4860\dpy1440\dpxsize2340\dpysize2700\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat0\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\i\fs18 use}{\i
\par }}\dpx5217\dpy900\dpxsize1980\dpysize360\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat0\dplinehollow\dpgroup\dpcount3\dpx1260\dpy360\dpxsize2520\dpysize5580\dppolygon\dppolycount4\dpptx1260\dppty5580
\dpptx0\dppty5580\dpptx0\dppty0\dpptx180\dppty0\dpx0\dpy0\dpxsize1038\dpysize5580\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat0\dplinew15\dplinecor0\dplinecog0\dplinecob0\dppolygon\dppolycount3
\dpptx0\dppty540\dpptx1800\dppty540\dpptx1800\dppty0\dpx1038\dpy5040\dpxsize1482\dpysize540\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat0\dplinew15\dplinecor0\dplinecog0\dplinecob0
\dpendgroup\dpx0\dpy0\dpxsize0\dpysize0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs16\lang2057\langfe1031\langnp2057
Thread\line Monitor }{\i\fs16\lang2057\langfe1031\langnp2057 (Thread)}{\fs20\lang2057\langfe1031\langnp2057
\par }}\dpx7200\dpy2636\dpxsize1260\dpysize784\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840
\dpx6660\dpy2276\dpxsize540\dpysize540\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx8460\dppty0\dpptx0\dppty6840\dpx6660\dpy2996\dpxsize540\dpysize0\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx8460\dppty0\dpptx0\dppty6840
\dpx6660\dpy3176\dpxsize540\dpysize540\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031
{\i\fs18 monitors every 5 seconds}{\i
\par }}\dpx5760\dpy2636\dpxsize1080\dpysize900\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat0\dplinehollow\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {log
\par }}\dpx7560\dpy3600\dpxsize900\dpysize900\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {Log
\par }}\dpx5940\dpy5940\dpxsize900\dpysize900\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840\dpx4680\dpy5400\dpxsize0\dpysize720
\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840\dpx3960\dpy5400\dpxsize0\dpysize720\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840\dpx3240\dpy5400\dpxsize0\dpysize720
\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {Store
\par }}\dpx5940\dpy4860\dpxsize900\dpysize900\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxtbrl{\dptxbxtext\pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 URLMessage}{
\par }}\dpx1980\dpy814\dpxsize360\dpysize2340\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat0\dplinehollow\dptxbx\dptxtbrl{\dptxbxtext\pard\plain
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs18 URLMessage}{
\par }}\dpx900\dpy1800\dpxsize360\dpysize2340\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat0\dplinehollow\dpline\dpptx8460\dppty0\dpptx0\dppty6840\dpx7920\dpy3420\dpxsize0\dpysize180
\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840\dpx5220\dpy6660\dpxsize720\dpysize0\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx8460\dppty0\dpptx0\dppty6840\dpx5220\dpy5580\dpxsize720\dpysize720
\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs20 Queue}{
\par }}\dpx5940\dpy3874\dpxsize900\dpysize900\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\fs20 Queue
\par }}\dpx0\dpy3334\dpxsize900\dpysize900\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840\dpx5040\dpy4414\dpxsize900\dpysize0
\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpline\dpptx0\dppty0\dpptx8460\dppty6840\dpx900\dpy3694\dpxsize540\dpysize0\dplinew15\dplinecor0\dplinecog0\dplinecob0\dptxbx\dptxlrtb{\dptxbxtext\pard\plain
\qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {Logs
\par }}\dpx1800\dpy2254\dpxsize900\dpysize900\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0\dpendgroup\dpx0\dpy0\dpxsize0\dpysize0}}}}{\lang2057\langfe1031\langnp2057
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par
\par The message handler is an implementation of a simple }{\i\lang2057\langfe1031\langnp2057 chain of responsibility}{\lang2057\langfe1031\langnp2057 . Implementations of }{\i\lang2057\langfe1031\langnp2057 Message}{\lang2057\langfe1031\langnp2057
are passed down a filter chain. Each of the filters can decide whether to send the message along, change it, or even delete it. In this case, Messages of type URLMessage are used. The message handler runs in its own thread. Thus, a call of }{
\i\lang2057\langfe1031\langnp2057 putMessage()}{\lang2057\langfe1031\langnp2057  or }{\i\lang2057\langfe1031\langnp2057 putMessages()}{\lang2057\langfe1031\langnp2057 , respectively, involves a }{\i\lang2057\langfe1031\langnp2057 producer-consumer-}{
\lang2057\langfe1031\langnp2057 like message transfer. The filters themselves run within the message handler thread.
\par At the end of the pipeline the Fetcher distributes the incoming messages to its worker threads. They are implemented as a }{\i\lang2057\langfe1031\langnp2057 thread pool}{\lang2057\langfe1031\langnp2057 : Several }{\i\lang2057\langfe1031\langnp2057
ServerThreads}{\lang2057\langfe1031\langnp2057 are running concurrently and wait for }{\i\lang2057\langfe1031\langnp2057 Tasks}{\lang2057\langfe1031\langnp2057 which include the procedure to be executed. If more tasks are to be done than th
reads are available, they are kept in a queue, which will be read whenever a task is finished.
\par At this point the pipeline pattern is left}{\cs39\lang2057\langfe1031\super\langnp2057 \chftn {\footnote \pard\plain \s38\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs20\lang1031\langfe1031\cgrid\langnp1031\langfenp1031
{\cs39\super \chftn }{\lang2057\langfe1031\langnp2057 probably this will be one of the foremost places to work on}}}{\lang2057\langfe1031\langnp2057 . The }{\i\lang2057\langfe1031\langnp2057 FetcherTask}{\lang2057\langfe1031\langnp2057
itself is still quite monolithic. It gets the document, parses it if possible, and stores it into a
storage. In the future one might think of additional configurable processing steps within another processing pipeline. I thought about incorporating it into the filter pipeline, but since the filters are passive components and the }{
\i\lang2057\langfe1031\langnp2057 FetcherThreads}{\lang2057\langfe1031\langnp2057 are active, this didn\rquote t work.
\par {\*\bkmkstart _Toc8477600}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 2.1\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {Performance{\*\bkmkend _Toc8477600}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 The performance was improved about 10-15 times compared to the first na\'ef
ve attempts with a pre-built parser and Sun\rquote s network classes. And there is still room left. On a network with about 150 web servers, which the crawler
server was connected to by a 100 MBit FDDI connection, I was able to crawl an average of 60 documents per second, or 3.7 MB, after 10 minutes in the startup period. In this first period, crawling is slower because the number of servers is small, so the se
rver output limits crawling. There may also be servers that don\rquote t respond. They are excluded from the crawl after a few attempts.
\par Overall, performance is affected by a lot of factors: The operating system, the native interface, the Java libraries, the web servers, the number of threads, whether dynamic pages are included in the crawl, etc.
\par From a development side, the speed is affected by the balance between I/O and CPU usage. Both have to be kept at 100%, otherwise one of them becomes the bottleneck. Managing these resources is the central part of a crawler.
\par Imagine that only one thread is crawling. This is the worst case, as can be seen very quickly:
\par
\par }\trowd \trgaph70\trrh564\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt
\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10
\clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt
\brdrs\brdrw10 \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\pard \ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\b\fs18\lang1036\langfe1031\langnp1036 Action
\cell CPU Usage\cell }{\b\fs18\lang1040\langfe1031\langnp1040 I/O Usage\cell }\pard \qc \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\b\fs18\lang2057\langfe1031\langnp2057 :}{\b\fs18\ul\lang2057\langfe1031\langnp2057 Crawler}{
\b\fs18\lang2057\langfe1031\langnp2057 \cell Network}{\b\fs18\ul\lang2057\langfe1031\langnp2057 \cell :Web Server\cell }\pard \ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\b\fs18\lang2057\langfe1031\langnp2057 \trowd
\trgaph70\trrh564\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt
\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10
\clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt
\brdrs\brdrw10 \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\row }\trowd \trgaph70\trrh407\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr
\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10
\clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt
\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417
\cellx8574\pard \ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs18\lang2057\langfe1031\langnp2057 1. Process URL\cell 100%\cell 0%\cell }\pard \qc \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {
\fs20\lang1024\langfe1024\noproof {\shp{\*\shpinst\shpleft647\shptop381\shpright3559\shpbottom686\shpfhdr0\shpbxcolumn\shpbxignore\shpbypara\shpbyignore\shpwr3\shpwrk0\shpfblwtxt0\shpz4\shplid1042
{\sp{\sn shapeType}{\sv 20}}{\sp{\sn fFlipH}{\sv 0}}{\sp{\sn fFlipV}{\sv 0}}{\sp{\sn shapePath}{\sv 4}}{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}
{\sp{\sn fLayoutInCell}{\sv 1}}}{\shprslt{\*\do\dobxcolumn\dobypara\dodhgt8196\dpline\dpptx0\dppty0\dpptx2912\dppty305\dpx647\dpy381\dpxsize2912\dpysize305\dplinew15\dplinecor0\dplinecog0\dplinecob0}}}
{\shp{\*\shpinst\shpleft557\shptop21\shpright737\shpbottom381\shpfhdr0\shpbxcolumn\shpbxignore\shpbypara\shpbyignore\shpwr3\shpwrk0\shpfblwtxt0\shpz2\shplid1040{\sp{\sn shapeType}{\sv 1}}{\sp{\sn fFlipH}{\sv 0}}{\sp{\sn fFlipV}{\sv 0}}
{\sp{\sn fLayoutInCell}{\sv 1}}}{\shprslt{\*\do\dobxcolumn\dobypara\dodhgt8194\dprect\dpx557\dpy21\dpxsize180\dpysize360
\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0}}}
{\shp{\*\shpinst\shpleft647\shptop21\shpright647\shpbottom3801\shpfhdr0\shpbxcolumn\shpbxignore\shpbypara\shpbyignore\shpwr3\shpwrk0\shpfblwtxt0\shpz0\shplid1038{\sp{\sn shapeType}{\sv 20}}{\sp{\sn fFlipH}{\sv 0}}{\sp{\sn fFlipV}{\sv 0}}
{\sp{\sn shapePath}{\sv 4}}{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn fLayoutInCell}{\sv 1}}}{\shprslt{\*\do\dobxcolumn\dobypara\dodhgt8192\dpline\dpptx0\dppty0\dpptx0\dppty3780
\dpx647\dpy21\dpxsize0\dpysize3780\dplinew15\dplinecor0\dplinecog0\dplinecob0}}}}{\fs18\lang2057\langfe1031\langnp2057 \cell \cell }{\fs20\lang1024\langfe1024\noproof
{\shp{\*\shpinst\shpleft694\shptop31\shpright694\shpbottom3811\shpfhdr0\shpbxcolumn\shpbxignore\shpbypara\shpbyignore\shpwr3\shpwrk0\shpfblwtxt0\shpz1\shplid1039{\sp{\sn shapeType}{\sv 20}}{\sp{\sn fFlipH}{\sv 0}}{\sp{\sn fFlipV}{\sv 0}}
{\sp{\sn shapePath}{\sv 4}}{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn fLayoutInCell}{\sv 1}}}{\shprslt{\*\do\dobxcolumn\dobypara\dodhgt8193\dpline\dpptx0\dppty0\dpptx0\dppty3780
\dpx694\dpy31\dpxsize0\dpysize3780\dplinew15\dplinecor0\dplinecog0\dplinecob0}}}}{\fs18\lang2057\langfe1031\langnp2057 \cell }\pard \ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs18\lang2057\langfe1031\langnp2057 \trowd
\trgaph70\trrh407\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt
\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrs\brdrw10 \clbrdrb
\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt\brdrnone
\clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\row }\trowd \trgaph70\trrh270\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh
\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418
\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb
\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\pard
\ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs18\lang2057\langfe1031\langnp2057 2. Send Request\cell <10%?\cell <100%\cell }\pard \qc \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {
\fs18\lang2057\langfe1031\langnp2057 \cell \cell \cell }\pard \ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs18\lang2057\langfe1031\langnp2057 \trowd \trgaph70\trrh270\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10
\trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr
\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl
\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740
\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\row }\trowd \trgaph70\trrh702\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10
\trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl
\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323
\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\pard
\ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs18\lang2057\langfe1031\langnp2057 3. Wait\cell 0%\cell 0%\cell }\pard \qc \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {
\fs18\lang2057\langfe1031\langnp2057 \cell \cell }{\fs20\lang1024\langfe1024\noproof {\shp{\*\shpinst\shpleft598\shptop2\shpright847\shpbottom723\shpfhdr0\shpbxcolumn\shpbxignore\shpbypara\shpbyignore\shpwr3\shpwrk0\shpfblwtxt0\shpz3\shplid1041
{\sp{\sn shapeType}{\sv 1}}{\sp{\sn fFlipH}{\sv 0}}{\sp{\sn fFlipV}{\sv 0}}{\sp{\sn fLayoutInCell}{\sv 1}}}{\shprslt{\*\do\dobxcolumn\dobypara\dodhgt8195\dprect\dpx598\dpy2\dpxsize249\dpysize721
\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0}}}}{\fs18\lang2057\langfe1031\langnp2057 \cell }\pard
\ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs18\lang2057\langfe1031\langnp2057 \trowd \trgaph70\trrh702\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh
\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418
\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb
\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\row }\trowd
\trgaph70\trrh1264\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt
\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrs\brdrw10 \clbrdrb
\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt\brdrnone
\clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\pard \ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs18\lang2057\langfe1031\langnp2057 4. Receive\cell <10%?
\cell <100%\cell }\pard \qc \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs20\lang1024\langfe1024\noproof
{\shp{\*\shpinst\shpleft652\shptop0\shpright3532\shpbottom1274\shpfhdr0\shpbxcolumn\shpbxignore\shpbypara\shpbyignore\shpwr3\shpwrk0\shpfblwtxt0\shpz5\shplid1043{\sp{\sn shapeType}{\sv 20}}{\sp{\sn fFlipH}{\sv 1}}{\sp{\sn fFlipV}{\sv 0}}
{\sp{\sn shapePath}{\sv 4}}{\sp{\sn fFillOK}{\sv 0}}{\sp{\sn fFilled}{\sv 0}}{\sp{\sn lineEndArrowhead}{\sv 1}}{\sp{\sn fArrowheadsOK}{\sv 1}}{\sp{\sn fLayoutInCell}{\sv 1}}}{\shprslt{\*\do\dobxcolumn\dobypara\dodhgt8197
\dpline\dpptx2880\dppty0\dpptx0\dppty1274\dpx652\dpy0\dpxsize2880\dpysize1274\dplinew15\dplinecor0\dplinecog0\dplinecob0}}}}{\fs18\lang2057\langfe1031\langnp2057 \cell \cell \cell }\pard
\ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs18\lang2057\langfe1031\langnp2057 \trowd \trgaph70\trrh1264\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh
\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418
\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb
\brdrnone \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrnone \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\row }\trowd
\trgaph70\trrh976\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt
\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrs\brdrw10 \clbrdrb
\brdrs\brdrw10 \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt
\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\pard \ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs18\lang2057\langfe1031\langnp2057
Process Doc.\cell 100%\cell 0%\cell }\pard \qc \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs20\lang1024\langfe1024\noproof
{\shp{\*\shpinst\shpleft599\shptop34\shpright807\shpbottom969\shpfhdr0\shpbxcolumn\shpbxignore\shpbypara\shpbyignore\shpwr3\shpwrk0\shpfblwtxt0\shpz6\shplid1044{\sp{\sn shapeType}{\sv 1}}{\sp{\sn fFlipH}{\sv 0}}{\sp{\sn fFlipV}{\sv 0}}
{\sp{\sn fLayoutInCell}{\sv 1}}}{\shprslt{\*\do\dobxcolumn\dobypara\dodhgt8198\dprect\dpx599\dpy34\dpxsize208\dpysize935
\dpfillfgcr255\dpfillfgcg255\dpfillfgcb255\dpfillbgcr255\dpfillbgcg255\dpfillbgcb255\dpfillpat1\dplinew15\dplinecor0\dplinecog0\dplinecob0}}}}{\fs18\lang2057\langfe1031\langnp2057 \cell \cell \cell }\pard
\ql \li0\ri0\widctlpar\intbl\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\fs18\lang2057\langfe1031\langnp2057 \trowd \trgaph70\trrh976\trleft-70\trbrdrt\brdrs\brdrw10 \trbrdrl\brdrs\brdrw10 \trbrdrb\brdrs\brdrw10 \trbrdrr\brdrs\brdrw10 \trbrdrh
\brdrs\brdrw10 \trbrdrv\brdrs\brdrw10 \trftsWidth1\trautofit1\trpaddl70\trpaddr70\trpaddfl3\trpaddfr3 \clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx1418
\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1488 \cellx2906\clvertalt\clbrdrt\brdrs\brdrw10 \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10
\cltxlrtb\clftsWidth3\clwWidth1417 \cellx4323\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrs\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx5740\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb
\brdrs\brdrw10 \clbrdrr\brdrdash\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx7157\clvertalt\clbrdrt\brdrnone \clbrdrl\brdrdash\brdrw10 \clbrdrb\brdrs\brdrw10 \clbrdrr\brdrs\brdrw10 \cltxlrtb\clftsWidth3\clwWidth1417 \cellx8574\row }\pard
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057
\par The diagram to the right resembles a UML sequence diagram, except that it stresses the time that a message needs to traverse the network.
\par 1. The URL is processed somehow. That\rquote s the filter part as stated above.
\par 2. The request is sent. It goes through the different network layers of the crawler server. A TCP/IP connection is established. Several packets are sent back and forth. Then the crawler waits until the web server proc
esses the request, looks up the file or renders the page (which can take several seconds or even minutes), then sends the file to the crawler.
\par 3. The crawler receives packet after packet, combines them to a file. Probably it is copied through several buffe
rs until it is complete. This will take some CPU time, but mostly it will wait for the next packet to arrive. The network transfer by itself is also affected by a lot of factors, e.g. the speed of the web server, acknowledgement messages, resent pac
kets etc., so 100% network utilization will almost never be reached.
\par 4. The document is processed, which will take up the whole CPU. The network will be idle at that time.
\par The storage process, which by itself uses CPU and disk I/O resources, was left out here. That process will be very similar, although the traversal will be faster.
\par As you can see, both CPU and I/O are idle most of the time, each waiting for the other one (or the network) to complete. This is the reason why single-threaded web crawlers tend to b
e very slow (wget for example). The slowest component always becomes the bottleneck.
\par Two strategies can be followed to make this situation better:
\par {\listtext\pard\plain\f31\fs22\lang2057\langfe1031\langnp2057 \hich\af31\dbch\af0\loch\f31 1.\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls16\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057
use asynchronous I/O
\par {\listtext\pard\plain\f31\fs22\lang2057\langfe1031\langnp2057 \hich\af31\dbch\af0\loch\f31 2.\tab}use several threads
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057 Asynchronous I/O means that I/O requests are sent, but the crawler then continues to process documents it has already crawled.
\par Actually I haven\rquote t seen an implementation of this technique. Well, asynchronous I/O was not available in Java until version 1.4. An advantage would be that thread handling is also an expensive process in
terms of CPU and memory usage. Threads are resources and, thus, limited. I heard that application server developers wanted asynchronous I/O, to be able to cope with hundreds of simultaneous requests without spawning extra threads for each of them. Probab
ly this can be a solution in the future. But from what I know about it today, it will not be necessary.
\par The way this problem is usually solved in Java is with the use of several threads. If many threads are used, chances are good that at any given moment, at least one thread is in one of the states above, which means both CPU and I/O will be at a maximum.
\par The problem with this is that multithreaded programming is considered to be one of the most difficult areas in computer science. But given the simple line
ar structure of web crawlers, it is not very hard to avoid race conditions or deadlock problems. You always get into problems when threads are supposed to access shared resources, though. Don\rquote
t touch this until you have read the standard literature and have made at least 10 mistakes (and solved them!)}{\cs39\lang2057\langfe1031\super\langnp2057 \chftn {\footnote \pard\plain
\s38\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs20\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\cs39\super \chftn }{\lang2057\langfe1031\langnp2057
see for example Magee, Kramer: Concurrency. State Models and Java Programs. Wiley 1999; Lea, Doug: Concurrent Programming in Java, Second Edition. Design Principles and Patterns. Addison-Wesley 2000}}}{\lang2057\langfe1031\langnp2057 .
\par Multithreading doesn\rquote t come without a cost, however. First, there is the cost of thread scheduling. I don\rquote t have numbers for that in Java, but I suppose that this should not be very expensive. MutExes can affect the whole program a lot}{
\cs39\lang2057\langfe1031\super\langnp2057 \chftn {\footnote \pard\plain \s38\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs20\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\cs39\super \chftn }{
\lang2057\langfe1031\langnp2057 the sequential part of a parallel program has a massive effect on the maximum speed gain of parallelization. See e.g. \'93Amdahl\rquote s law\'94 (I hope this can be transferred to a single-processor, multithreaded system) in Amdahl, G.: The validity of the single processor approach to achieving large
\par scale computing capabilities. In: AFIPS conference proceedings, Spring Joint Computing Conference, Issue 30, pp. 483-485, 1967. Cited by Pizka, Markus: }{\lang2057\langfe1031\langnp2057 Integrated Management of Extensible Distributed Systems}{
\i\lang2057\langfe1031\langnp2057 }{\lang2057\langfe1031\langnp2057 (Ph.D. thesis), online at }{\field{\*\fldinst {\lang2057\langfe1031\langnp2057 HYPERLINK "http://wwwbroy.in.tum.de/~pizka/dissertation.pdf" }{\lang2057\langfe1031\langnp2057
{\*\datafield
00d0c9ea79f9bace118c8200aa004ba90b02000000170000003100000068007400740070003a002f002f00770077007700620072006f0079002e0069006e002e00740075006d002e00640065002f007e00700069007a006b0061002f0064006900730073006500720074006100740069006f006e002e007000640066000000
e0c9ea79f9bace118c8200aa004ba90b6200000068007400740070003a002f002f00770077007700620072006f0079002e0069006e002e00740075006d002e00640065002f007e00700069007a006b0061002f0064006900730073006500720074006100740069006f006e002e007000640066000000}}}{\fldrslt {
\cs41\ul\cf2\lang2057\langfe1031\langnp2057 http://wwwbroy.in.tum.de/~pizka/dissertation.pdf}}}{\lang2057\langfe1031\langnp2057 (in German)}}}{\lang2057\langfe1031\langnp2057
. I noticed that they should be avoided like hell. In a crawler, a MutEx is used, for example, when a new URL is passed to the thread, or when the fetched documents are supposed to be stored linearly, one after the other.
\par For
example, the tasks used to insert a new URL into the global message handler each time when a new URL was found in the document. I was able to speed it up considerably when I changed this so that the URLs are collected locally and then inserted only once p
er document. Probably this can be augmented even further if each task is comprised of several documents which are fetched one after the other and then stored together.
\par Nonetheless, keeping the right balance between the two resources is a big concern. At the
moment, the number of threads and the number of processing steps are static, and are only optimised by trial and error. Few hosts, slow network -> few threads. Slow CPU -> few processing steps. Many hosts, fast network -> many threads. Probably those heuri
stics will do well, but I wonder if these figures could also be fine-tuned dynamically during runtime.
\par Another issue that was optimised was the use of very fine-grained method calls. For example, the original implementation of the HTML parser used to call the read() method for each character. This call probably had to traverse several }{
\i\lang2057\langfe1031\langnp2057 Decorators}{\lang2057\langfe1031\langnp2057 until it got to a \endash synchronized call. That\rquote
s why the CharArrayReader was replaced by a SimpleCharArrayReader, because only one thread works on a document at a time.
\par These issues can only be tracked down with special tools, i.e. profilers. The work is worth it, because it allows one to work on the 20% of the code that costs 80% of the time.
\par {\*\bkmkstart _Toc8477601}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 2.2\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {Memory Usage{\*\bkmkend _Toc8477601}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 One \'93web crawler law\'94 could be defined as:
\par }\pard \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\i\lang2057\langfe1031\langnp2057 What can get infinite, will get infinite. Eventually. Very soon.
\par }\pard\plain \s26\ql \li0\ri0\sl360\slmult1\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1024\langfe1024\cgrid\noproof\langnp2057\langfenp1031 {
A major task during the development was to get memory usage low. But a lot of work still needs to be done here. Most of the optimizations incorporated now move the problem from main memory to the hard disk, which doesn\rquote t solve the problem.
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 Here are some means that were used:
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls14\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057
CachingQueues: The message queue, the Fetcher queue, the robot exclusion queue (see below) \endash a lot of queues can fill up the whole main memory in a very short period of time. But since queues are only acc
essed at their ends, a very simple mechanism was implemented to keep memory usage low: The queue was divided into blocks of fixed size. Only the two blocks at its end are kept in RAM. The rest is serialized on disk. In the end, only a list of block refere
nces has to be managed.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Define a maximum value for everything, and keep an eye on it. Downloaded files can get \'93infinitely\'94
large. URLs can get infinitely long. Servers may contain an infinite set of documents. A lot of these checks had to be included even
in the university network mentioned. The URLs were a special case. Some .shtml pages on a web server pointed to a subdirectory that didn\rquote
t exist but returned the same page. If such errors are introduced deliberately, they are called crawler traps: an infinite URL space. The only way of dealing with this is manually excluding the hosts.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}
Optimized HTML parser. Current parsers tend to create a huge amount of very small objects. Most of that work is unnecessary for the task to be done. This can only be optimised b
y stripping down the parser to do only what it is supposed to do in that special task.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057 However, there still remains a problem: The HashMap of already visited URLs needs to be accessed randomly while reading }{
\i\lang2057\langfe1031\langnp2057 and}{\lang2057\langfe1031\langnp2057 writing. I can imagine only two ways to overcome this:
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls14\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057
Limiting, in some way, the number of URLs in RAM. If the crawler were distributed, this could be done by assigning only a certain number of hosts to each crawler node, while at the same time limiting the number of pages read from one host. In t
he end this will only limit the number of hosts that can be crawled by the number of crawler nodes available. Another solution would be to store complete hosts on disk, together with the list of unresolved URLs. Again, this only shifts the problem from R
AM to the hard drive.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}
Something worthwhile would be to compress the URLs. Many parts of URLs are shared between hundreds of URLs (e.g. the host name). And since only a limited number of characters are allowed in URLs, Huffman compression will lead to a good compression rate}{\cs39\lang2057\langfe1031\super\langnp2057 \chftn {\footnote \pard\plain \s38\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs20\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\cs39\super
\chftn }{\lang2057\langfe1031\langnp2057 see Randall, Stata et al.: The Link Database: Fast Access to Graphs of the Web, 2000; Witten, Moffat, Bell: Managing Gigabytes, Morgan Kaufmann 1999}}}{\lang2057\langfe1031\langnp2057
. A first attempt would be to incorporate the visited URLs hash into the HostInfo structure.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057 After all, the VisitedFilter hash map turned out to be the data structure that will take up most of the RAM after some time.
\par {\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 2.3\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0
\b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {\page {\*\bkmkstart _Toc8477602}The Filters{\*\bkmkend _Toc8477602}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
Most of the functionality of the different filters has already been described. Here\rquote s another, more detailed view}{\cs39\lang2057\langfe1031\super\langnp2057 \chftn {\footnote \pard\plain
\s38\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs20\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\cs39\super \chftn }{\lang2057\langfe1031\langnp2057 this chapter will
probably be left out in future revisions, since that information can also be found in the Javadoc and the source code. Or do you disagree?}}}{\lang2057\langfe1031\langnp2057 :
\par {\*\bkmkstart _Toc8477603}{\listtext\pard\plain\s3 \hich\af0\dbch\af0\loch\f0 2.3.1\tab}}\pard\plain \s3\ql \fi-720\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl2\outlinelevel2\adjustright\rin0\lin0\itap0
\fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {RobotExclusionFilter{\*\bkmkend _Toc8477603}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
The first implementation of this filter just kept a list of hosts, and every time a new URLMessage with an unknown host came by, it attempted to read the robots.txt file first to determine whether the URL should be filtered.
\par A major drawback of that was that when the server was not accessible for some reason, the whole crawler was held up until the connection timed out (well, with Sun\rquote s classes even that didn\rquote t happen, causing the whole program to die).
\par The second implementation has its own little ThreadPool, and keeps a state machine of each host in the HostInfo structure.
\par If the host manager doesn\rquote t contain a HostInfo structure at all, the filter creates it and creates a task to get the robots.txt file. During this time, the host state is set to \'93isLoadingRobotsTxt\'94
, which means further requests to that host are put into a queue. When loading is finished, these URLs (and all subsequent ones) are put back to the beginning of the queue.
\par After this initial step, every URL that enters the filter is compared to the disallow rules set (if present), and is filtered if necessary.
\par Since the URLs are put back to the beginning of the queue, the filter has to be put in front of the VisitedFilter.
\par In the host info structure, which is also used by the FetcherTasks, some information about the health of the hosts is stored as well. If the server is in a bad state several times, it is excluded from
the crawl. Note that it is possible that a server will be accessed more than the (predefined) 5 times that it can time out, since a FetcherThread may already have started to get a document when another one marks it as bad.
\par {\*\bkmkstart _Toc8477604}{\listtext\pard\plain\s3 \hich\af0\dbch\af0\loch\f0 2.3.2\tab}}\pard\plain \s3\ql \fi-720\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl2\outlinelevel2\adjustright\rin0\lin0\itap0
\fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {URLLengthFilter{\*\bkmkend _Toc8477604}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 This very simple
filter just filters a URL if a certain (total) length is exceeded.
\par {\*\bkmkstart _Toc8477605}{\listtext\pard\plain\s3 \hich\af0\dbch\af0\loch\f0 2.3.3\tab}}\pard\plain \s3\ql \fi-720\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl2\outlinelevel2\adjustright\rin0\lin0\itap0
\fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {KnownPathsFilter{\*\bkmkend _Toc8477605}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
This one filters some very common URLs (e.g. different views of an Apache directory index), or hosts known to cause problems. Should be more configurable from outside in the future\'85
\par {\*\bkmkstart _Toc8477606}{\listtext\pard\plain\s3 \hich\af0\dbch\af0\loch\f0 2.3.4\tab}}\pard\plain \s3\ql \fi-720\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl2\outlinelevel2\adjustright\rin0\lin0\itap0
\fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {URLScopeFilter{\*\bkmkend _Toc8477606}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 The scope filter filters a URL if it doesn\rquote
t match a given regular expression.
\par {\*\bkmkstart _Toc8477607}{\listtext\pard\plain\s3 \hich\af0\dbch\af0\loch\f0 2.3.5\tab}}\pard\plain \s3\ql \fi-720\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl2\outlinelevel2\adjustright\rin0\lin0\itap0
\fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {URLVisitedFilter{\*\bkmkend _Toc8477607}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
This filter keeps a HashMap of already visited URLs, and filters out what it already knows.
\par {\*\bkmkstart _Toc8477608}{\listtext\pard\plain\s3 \hich\af0\dbch\af0\loch\f0 2.3.6\tab}}\pard\plain \s3\ql \fi-720\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl2\outlinelevel2\adjustright\rin0\lin0\itap0
\fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {Fetcher{\*\bkmkend _Toc8477608}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 The fetcher itself is also a filter that filters all URLs \endash
they are passed along to the storage as WebDocuments, in a different manner. It contains a ThreadPool that runs in its own thread of control, which takes tasks from the queue and distributes them to the different FetcherThreads.
\par In the first implementation the fetcher would simply distribute the incoming URLs to the threads. The thread pool would use a simple queue to store the remaining tasks. But this can lead to a very \'93unpolite\'94 distribution of the tasks: Since \'be
of the links in a
page point to the same server, and all links of a page are added to the message handler at once, groups of successive tasks would all try to access the same server, probably causing denial of service, while other hosts present in the queue are not accesse
d.
\par To overcome this, the queue is divided into different parts, one for each host. Each host contains its own (caching) queue. But the methods used to pull tasks from the \'93end\'94
of this queue cycle through the hosts and always get a URL from a different host.
\par One major problem still remains with this technique: If one host is very slow, it can still slow down everything. Since with n hosts every n}{\lang2057\langfe1031\sub\langnp2057 th}{\lang2057\langfe1031\langnp2057
task will access this host, it can eat one thread after the other if loading a document takes longer t
han loading it from the (n-1) other servers. Then two concurrent requests will hit the same server, which slows down the response times even more, and so on. In reality, this will clog up the queue very fast. A little more work has to be done to avo
id these situations, e.g. by limiting the number of threads that access one host at a time.
\par {\*\bkmkstart _Toc8477609}{\listtext\pard\plain\s3 \hich\af0\dbch\af0\loch\f0 2.3.7\tab}}\pard\plain \s3\ql \fi-720\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl2\outlinelevel2\adjustright\rin0\lin0\itap0
\fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {A Note on DNS{\*\bkmkend _Toc8477609}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
The Mercator crawler document places a lot of emphasis on resolving host names. Because of that, a DNSResolver filter was implemented at the very beginning. Two reasons led to it no longer being used:
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls14\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057
newer versions of the JDK than the one Mercator used resolve the IP address of a host the first time it is accessed, and keep a cache of already resolved host names.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}the crawler itself was designed to crawl large local networks, and not the internet. Thus, the number of hosts is very limited.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057
\par
\par {\listtext\pard\plain\s1 \b\fs36\lang2057\langfe1031\kerning28\langnp2057 \hich\af0\dbch\af0\loch\f0 3\tab}}\pard\plain \s1\ql \fi-432\li0\ri0\sb240\sa60\keepn\widctlpar\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\outlinelevel0\adjustright\rin0\lin0\itap0
\b\fs36\lang2057\langfe1031\kerning28\cgrid\langnp2057\langfenp1031 {\page {\*\bkmkstart _Toc8477610}Future Enhancements{\*\bkmkend _Toc8477610}
\par {\*\bkmkstart _Toc8477611}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 3.1\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {\'93Politeness\'94{\*\bkmkend _Toc8477611}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
A crawler should not cause a Denial of Service attack. So this has to be addressed.
\par {\*\bkmkstart _Toc8477612}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 3.2\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {The processing pipeline{\*\bkmkend _Toc8477612}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 The FetcherTask, as already
stated, is very monolithic at this time. Probably some more processing should be done at this step (the problem with balanced CPU/IO usage taken into account). At least different handlers for different MIME types should be provided, e.g. to extract links from PDF documents. The Storage should also be broken up. I only used the LogStorage within the last months, which now doesn\rquote
t only write to log files, but also stores the files on disk. This should probably be replaced by a storage chain where different stores could be appended.
\par {\*\bkmkstart _Toc8477613}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 3.3\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {Lucene integration{\*\bkmkend _Toc8477613}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
A very simple enhancement would be a LuceneStorage, which takes the document, parses it, and puts it into a Lucene store. But this will probably be very CPU intensive. Probably this should be done in a distributed environment.
\par {\*\bkmkstart _Toc8477614}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 3.4\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {A Real Server{\*\bkmkend _Toc8477614}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
The only way to start a crawl today is starting the crawler from the shell. But it could also remain idle and wait for commands from an RMI connection or expose a Web Service. Monitoring could be done by a simple included web s
erver that provides current statistics via HTML.
\par {\*\bkmkstart _Toc8477615}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 3.5\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {Distribution{\*\bkmkend _Toc8477615}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 Distribution is a big issue. Some people say \'93
Distribute your program late. And then later.\'94 But as others have implemented distributed crawlers, this should not be very hard.
\par I see two possible architectures for that:
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls14\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057
Write a single dispatcher (a star network) that contains the whole MessageHandler except the Fetcher itself. The crawlers are run as servers (see above), and are configured with a URL source that gets their input from the dispatcher
and a MessageHandler that stores URLs back to the dispatcher. The main drawback is that the dispatcher can become a bottleneck.
\par {\listtext\pard\plain\fs22\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Partition the domain to be crawled into several parts. This could be done for example by dividing up different intervals of the hash v
alue of the host names. Then plugging in another crawler could be done dynamically, even within a peer to peer network. Each node knows which node is responsible for which interval, and sends all URLs to the right node. This could even be implemented as a
filter.
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057 One thing to keep in mind is that the number of URLs transferred to other nodes should be as large as possible.
\par The next thing to be distributed is the storage mechanism. Here, the number of pure crawling nodes and the number of storing (post processing) nodes could possibly diverge. An issue here is that the whole documents have to be transferred over the net.
\par {\*\bkmkstart _Toc8477616}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 3.6\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {URL Reordering{\*\bkmkend _Toc8477616}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 One paper discussed different types of reordering URLs while crawling}{
\cs39\lang2057\langfe1031\super\langnp2057 \chftn {\footnote \pard\plain \s38\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs20\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\cs39\super \chftn }{
\lang2057\langfe1031\langnp2057 see J. Cho, H. Garcia-Molina, and L. Page. Efficient crawling through url ordering. In Proc. 7th Intl. World Wide Web Conference, Brisbane, Australia, 1998}}}{\lang2057\langfe1031\langnp2057
. One of the most promising attempts was to take the calculated PageRank into account}{\cs39\lang2057\langfe1031\super\langnp2057 \chftn {\footnote \pard\plain \s38\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f31\fs20\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\cs39\super \chftn }{\lang2057\langfe1031\langnp2057 see Brin, S., Page, L.: The Anatomy of a large scale Hypertextual Web Search Engine, 1998}}}{\lang2057\langfe1031\langnp2057
. Crawling pages with higher PageRanks first seemed to get important pages earlier. Yes, this is not rocket science, folks, the research was already done years ago.
\par
\par {\*\bkmkstart _Toc8477617}{\listtext\pard\plain\s2 \b\fs28\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 3.7\tab}}\pard\plain \s2\ql \fi-578\li0\ri0\sb480\sa60\keepn\widctlpar
\jclisttab\tx0\aspalpha\aspnum\faauto\ls8\ilvl1\outlinelevel1\adjustright\rin0\lin0\itap0 \b\fs28\lang2057\langfe1031\cgrid\langnp2057\langfenp1031 {Recovery{\*\bkmkend _Toc8477617}
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \f31\fs22\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 At the moment there is no way of stopping and restarting a crawl.
\par
\par
\par }}