mirror of https://github.com/apache/lucene.git
113 lines
21 KiB
Plaintext
113 lines
21 KiB
Plaintext
{\rtf1\ansi\ansicpg1252\uc1 \deff0\deflang1031\deflangfe1031{\fonttbl{\f0\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
|
|
{\f2\fmodern\fcharset0\fprq1{\*\panose 02070309020205020404}Courier New;}{\f3\froman\fcharset2\fprq2{\*\panose 05050102010706020507}Symbol;}{\f4\froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times;}
|
|
{\f5\fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Helvetica;}{\f14\fnil\fcharset2\fprq2{\*\panose 05000000000000000000}Wingdings;}{\f28\froman\fcharset238\fprq2 Times New Roman CE;}{\f29\froman\fcharset204\fprq2 Times New Roman Cyr;}
|
|
{\f31\froman\fcharset161\fprq2 Times New Roman Greek;}{\f32\froman\fcharset162\fprq2 Times New Roman Tur;}{\f33\froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f34\froman\fcharset178\fprq2 Times New Roman (Arabic);}
|
|
{\f35\froman\fcharset186\fprq2 Times New Roman Baltic;}{\f36\fswiss\fcharset238\fprq2 Arial CE;}{\f37\fswiss\fcharset204\fprq2 Arial Cyr;}{\f39\fswiss\fcharset161\fprq2 Arial Greek;}{\f40\fswiss\fcharset162\fprq2 Arial Tur;}
|
|
{\f41\fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f42\fswiss\fcharset178\fprq2 Arial (Arabic);}{\f43\fswiss\fcharset186\fprq2 Arial Baltic;}{\f60\froman\fcharset238\fprq2 Times CE;}{\f61\froman\fcharset204\fprq2 Times Cyr;}
|
|
{\f63\froman\fcharset161\fprq2 Times Greek;}{\f64\froman\fcharset162\fprq2 Times Tur;}{\f65\froman\fcharset177\fprq2 Times (Hebrew);}{\f66\froman\fcharset178\fprq2 Times (Arabic);}{\f67\froman\fcharset186\fprq2 Times Baltic;}
|
|
{\f68\fswiss\fcharset238\fprq2 Helvetica CE;}{\f69\fswiss\fcharset204\fprq2 Helvetica Cyr;}{\f71\fswiss\fcharset161\fprq2 Helvetica Greek;}{\f72\fswiss\fcharset162\fprq2 Helvetica Tur;}{\f73\fswiss\fcharset177\fprq2 Helvetica (Hebrew);}
|
|
{\f74\fswiss\fcharset178\fprq2 Helvetica (Arabic);}{\f75\fswiss\fcharset186\fprq2 Helvetica Baltic;}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;
|
|
\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\stylesheet{
|
|
\ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 \snext0 Normal;}{\s1\ql \fi-432\li432\ri0\sb240\sa60\keepn\widctlpar
|
|
\jclisttab\tx432\aspalpha\aspnum\faauto\ls1\adjustright\rin0\lin432\itap0 \b\f1\fs32\lang1033\langfe1031\kerning32\cgrid\langnp1033\langfenp1031 \sbasedon0 \snext0 heading 1;}{\s2\ql \fi-576\li576\ri0\sb240\sa60\keepn\widctlpar
|
|
\jclisttab\tx576\aspalpha\aspnum\faauto\ls1\ilvl1\adjustright\rin0\lin576\itap0 \b\i\f1\fs28\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 \sbasedon0 \snext0 heading 2;}{\s3\ql \fi-720\li720\ri0\sb240\sa60\keepn\widctlpar
|
|
\jclisttab\tx720\aspalpha\aspnum\faauto\ls1\ilvl2\adjustright\rin0\lin720\itap0 \b\f1\fs26\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 \sbasedon0 \snext0 heading 3;}{\s4\ql \fi-864\li864\ri0\sb240\sa60\keepn\widctlpar
|
|
\jclisttab\tx864\aspalpha\aspnum\faauto\ls1\ilvl3\adjustright\rin0\lin864\itap0 \b\fs28\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 \sbasedon0 \snext0 heading 4;}{\s5\ql \fi-1008\li1008\ri0\sb240\sa60\widctlpar
|
|
\jclisttab\tx1008\aspalpha\aspnum\faauto\ls1\ilvl4\adjustright\rin0\lin1008\itap0 \b\i\fs26\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 \sbasedon0 \snext0 heading 5;}{\s6\ql \fi-1152\li1152\ri0\sb240\sa60\widctlpar
|
|
\jclisttab\tx1152\aspalpha\aspnum\faauto\ls1\ilvl5\adjustright\rin0\lin1152\itap0 \b\f4\fs22\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 \sbasedon0 \snext0 heading 6;}{\s7\ql \fi-1296\li1296\ri0\sb240\sa60\widctlpar
|
|
\jclisttab\tx1296\aspalpha\aspnum\faauto\ls1\ilvl6\adjustright\rin0\lin1296\itap0 \f4\fs24\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 \sbasedon0 \snext0 heading 7;}{\s8\ql \fi-1440\li1440\ri0\sb240\sa60\widctlpar
|
|
\jclisttab\tx1440\aspalpha\aspnum\faauto\ls1\ilvl7\adjustright\rin0\lin1440\itap0 \i\f4\fs24\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 \sbasedon0 \snext0 heading 8;}{\s9\ql \fi-1584\li1584\ri0\sb240\sa60\widctlpar
|
|
\jclisttab\tx1584\aspalpha\aspnum\faauto\ls1\ilvl8\adjustright\rin0\lin1584\itap0 \f5\fs22\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 \sbasedon0 \snext0 heading 9;}{\*\cs10 \additive Default Paragraph Font;}{
|
|
\s15\qc \li0\ri0\sb240\sa60\widctlpar\aspalpha\aspnum\faauto\outlinelevel0\adjustright\rin0\lin0\itap0 \b\f5\fs32\lang1033\langfe1031\kerning28\cgrid\langnp1033\langfenp1031 \sbasedon0 \snext15 Title;}}{\*\listtable{\list\listtemplateid-100094782
|
|
\listhybrid{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat0\levelspace0\levelindent0{\leveltext\'01-;}{\levelnumbers;}\loch\af0\hich\af0\dbch\af0\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li720
|
|
\jclisttab\tx720 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01o;}{\levelnumbers;}\f2\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li1440\jclisttab\tx1440 }
|
|
{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2160\jclisttab\tx2160 }{\listlevel
|
|
\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li2880\jclisttab\tx2880 }{\listlevel\levelnfc23
|
|
\levelnfcn23\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01o;}{\levelnumbers;}\f2\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li3600\jclisttab\tx3600 }{\listlevel\levelnfc23\levelnfcn23
|
|
\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li4320\jclisttab\tx4320 }{\listlevel\levelnfc23\levelnfcn23\leveljc0
|
|
\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3913 ?;}{\levelnumbers;}\f3\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li5040\jclisttab\tx5040 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0
|
|
\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01o;}{\levelnumbers;}\f2\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li5760\jclisttab\tx5760 }{\listlevel\levelnfc23\levelnfcn23\leveljc0\leveljcn0\levelfollow0
|
|
\levelstartat1\levelspace0\levelindent0{\leveltext\'01\u-3929 ?;}{\levelnumbers;}\f14\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1\fbias0 \fi-360\li6480\jclisttab\tx6480 }{\listname ;}\listid72943879}{\list\listtemplateid1804128586{\listlevel
|
|
\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'01\'00;}{\levelnumbers\'01;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s1\fi-432\li432\jclisttab\tx432 }{\listlevel\levelnfc0\levelnfcn0
|
|
\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'03\'00.\'01;}{\levelnumbers\'01\'03;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s2\fi-576\li576\jclisttab\tx576 }{\listlevel\levelnfc0\levelnfcn0\leveljc0
|
|
\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'05\'00.\'01.\'02;}{\levelnumbers\'01\'03\'05;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s3\fi-720\li720\jclisttab\tx720 }{\listlevel\levelnfc0\levelnfcn0\leveljc0
|
|
\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'07\'00.\'01.\'02.\'03;}{\levelnumbers\'01\'03\'05\'07;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s4\fi-864\li864\jclisttab\tx864 }{\listlevel\levelnfc0\levelnfcn0
|
|
\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'09\'00.\'01.\'02.\'03.\'04;}{\levelnumbers\'01\'03\'05\'07\'09;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s5\fi-1008\li1008\jclisttab\tx1008 }{\listlevel
|
|
\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'0b\'00.\'01.\'02.\'03.\'04.\'05;}{\levelnumbers\'01\'03\'05\'07\'09\'0b;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s6\fi-1152\li1152
|
|
\jclisttab\tx1152 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'0d\'00.\'01.\'02.\'03.\'04.\'05.\'06;}{\levelnumbers\'01\'03\'05\'07\'09\'0b\'0d;}\chbrdr\brdrnone\brdrcf1
|
|
\chshdng0\chcfpat1\chcbpat1 \s7\fi-1296\li1296\jclisttab\tx1296 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext\'0f\'00.\'01.\'02.\'03.\'04.\'05.\'06.\'07;}{\levelnumbers
|
|
\'01\'03\'05\'07\'09\'0b\'0d\'0f;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s8\fi-1440\li1440\jclisttab\tx1440 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace0\levelindent0{\leveltext
|
|
\'11\'00.\'01.\'02.\'03.\'04.\'05.\'06.\'07.\'08;}{\levelnumbers\'01\'03\'05\'07\'09\'0b\'0d\'0f\'11;}\chbrdr\brdrnone\brdrcf1 \chshdng0\chcfpat1\chcbpat1 \s9\fi-1584\li1584\jclisttab\tx1584 }{\listname ;}\listid854879813}}{\*\listoverridetable
|
|
{\listoverride\listid854879813\listoverridecount0\ls1}{\listoverride\listid72943879\listoverridecount0\ls2}}{\info{\title Usage Scenarios}{\author Clemens Marschner}{\operator Clemens Marschner}{\creatim\yr2002\mo12\dy2\min42}
|
|
{\revtim\yr2002\mo12\dy2\min42}{\version2}{\edmins0}{\nofpages2}{\nofwords638}{\nofchars3642}{\*\company Dell Computer Corporation}{\nofcharsws4472}{\vern8249}}\paperw11906\paperh16838\margl1417\margr1417\margt1417\margb1134
|
|
\deftab708\widowctrl\ftnbj\aenddoc\hyphhotz425\noxlattoyen\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1417\dgvorigin1417\dghshow1\dgvshow1
|
|
\jexpand\viewkind1\viewscale137\viewzk2\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule \fet0\sectd \linex0\headery708\footery708\colsx708\endnhere\sectlinegrid360\sectdefaultcl
|
|
{\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang{\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang{\pntxta )}}{\*\pnseclvl5
|
|
\pndec\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang
|
|
{\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang{\pntxtb (}{\pntxta )}}\pard\plain \s15\qc \li0\ri0\sb240\sa60\widctlpar\aspalpha\aspnum\faauto\outlinelevel0\adjustright\rin0\lin0\itap0
|
|
\b\f5\fs32\lang1033\langfe1031\kerning28\cgrid\langnp1033\langfenp1031 {\fs28\lang1040\langfe1031\langnp1040 {\*\bkmkstart _Toc26538554}Lucene Retrieval Machine
|
|
\par Lucene Framework
|
|
\par }{\fs28\lang1031\langfe1031\langnp1031 Usage Scenarios Document
|
|
\par }\pard\plain \qc \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang1040\langfe1031\langnp1040
|
|
\par }{Revision: 5 (cmarschn, 2002-12-01)
|
|
\par
|
|
\par Clemens Marschner - Otis Gospodnetic - Peter Carlson - Kelvin Tan
|
|
\par }\pard\plain \s1\ql \li0\ri0\sb240\sa60\keepn\widctlpar\aspalpha\aspnum\faauto\outlinelevel0\adjustright\rin0\lin0\itap0 \b\f1\fs32\lang1033\langfe1031\kerning32\cgrid\langnp1033\langfenp1031 {
|
|
\par Usage Scenarios{\*\bkmkend _Toc26538554}
|
|
\par {\*\bkmkstart _Toc26538555}{\listtext\pard\plain\s2 \b\i\f1\fs28\lang1033\langfe1031\langnp1033 \hich\af1\dbch\af0\loch\f1 1.1\tab}}\pard\plain \s2\ql \fi-576\li576\ri0\sb240\sa60\keepn\widctlpar
|
|
\jclisttab\tx576\aspalpha\aspnum\faauto\ls1\ilvl1\outlinelevel1\adjustright\rin0\lin576\itap0 \b\i\f1\fs28\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 {File System Indexer{\*\bkmkend _Toc26538555}
|
|
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 A file system indexer would work like the \'93Microsoft Index server\'94
|
|
. It may consist of only one pipeline.
|
|
\par The Scheduler puts document locations (i.e. file URLs) in an asynchronous (Request-) pipeline. The first MP loads the document, replacing the file URL message by a document message, and tries to detec
|
|
t a MIME type. After that, the MessageDispatcher component dispatches the messages to different MPs, depending on the MIME type, that extract the text of each document. In the end, a LuceneStorage takes the resulting message and saves it in a Lucene index
|
|
.
|
|
\par In an extension of that, one MP first detects the MIME type, a second checks whether that MIME type can be handled by the application, a third then loads the document, a fourth analyses the document, and a fifth saves it to a LuceneStorage.
|
|
\par In an incremental operation the Source is connected to the LuceneStorage and checks if documents have to be refreshed. An additional MP may check if the document loaded is newer than the one already indexed, and may discard the message if not.
|
|
\par
|
|
\par {\*\bkmkstart _Toc26538556}{\listtext\pard\plain\s2 \b\i\f1\fs28\lang1033\langfe1031\langnp1033 \hich\af1\dbch\af0\loch\f1 1.2\tab}}\pard\plain \s2\ql \fi-576\li576\ri0\sb240\sa60\keepn\widctlpar
|
|
\jclisttab\tx576\aspalpha\aspnum\faauto\ls1\ilvl1\outlinelevel1\adjustright\rin0\lin576\itap0 \b\i\f1\fs28\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 {Intranet Web Crawler{\*\bkmkend _Toc26538556}
|
|
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 An
|
|
intranet web crawler (only a few hosts) is not that different from the file system indexer, except that the loading process may be multithreaded and loads documents over the network instead of from the file system. In this case there have to be at least two Messag
|
|
e
|
|
Pipelines, since the crawling parts are again active components. An additional processing step extracts links from the loaded documents and puts them back into the queue. A URLSeenFilter (called URLVisitedFilter at this time) makes sure no URL is put into
|
|
the pipeline twice. A RobotExclusionFilter makes sure the robot exclusion standard is followed, and filters out URLs that are marked as \'93disallowed\'94. At the end there is again a LuceneStorage.
|
|
\par (This is how LARM is implemented right now. Some efforts have already been made to move some of the data structures to the hard drive.)
|
|
\par
|
|
\par {\*\bkmkstart _Toc26538557}{\listtext\pard\plain\s2 \b\i\f1\fs28\lang1033\langfe1031\langnp1033 \hich\af1\dbch\af0\loch\f1 1.3\tab}}\pard\plain \s2\ql \fi-576\li576\ri0\sb240\sa60\keepn\widctlpar
|
|
\jclisttab\tx576\aspalpha\aspnum\faauto\ls1\ilvl1\outlinelevel1\adjustright\rin0\lin576\itap0 \b\i\f1\fs28\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 {Small WWW Web Crawler{\*\bkmkend _Toc26538557}
|
|
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
|
|
If the system is supposed to scale to more than a few hosts, memory, efficiency and fault tolerance becomes a major concern, and it more and more becomes a matter
|
|
of juggling with the system resources (network bandwidth, CPU time, RAM, hard drive space). If one of them becomes a bottleneck, the whole system may become very slow or (in case of RAM or HD shortage) may crash.
|
|
\par Suppose LuceneStorage is much slower than t
|
|
he crawler. Since the indexer is pretty much CPU bound, it becomes necessary to distribute the indexing across two hosts. This can be done easily with the pipeline framework if the pipeline is broken up into two parts and connected via JMS. The loaded document i
|
|
s
|
|
put into a JMS topic which is configured such that the JMS messages are routed to one of the destinations in a round-robin manner. On the other side there are indexing components on different hosts that build Lucene indexes that are merged from time to t
|
|
ime.
|
|
\par {\*\bkmkstart _Toc26538558}{\listtext\pard\plain\s2 \b\i\f1\fs28\lang1033\langfe1031\langnp1033 \hich\af1\dbch\af0\loch\f1 1.4\tab}}\pard\plain \s2\ql \fi-576\li576\ri0\sb240\sa60\keepn\widctlpar
|
|
\jclisttab\tx576\aspalpha\aspnum\faauto\ls1\ilvl1\outlinelevel1\adjustright\rin0\lin576\itap0 \b\i\f1\fs28\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 {Large WWW Web Crawler{\*\bkmkend _Toc26538558}
|
|
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 (todo)
|
|
\par If a web crawler is supposed to scale to the whole WWW, a whole set of precautions has to be taken.
|
|
\par {\listtext\pard\plain\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}}\pard \ql \fi-360\li720\ri0\widctlpar\jclisttab\tx720\aspalpha\aspnum\faauto\ls2
|
|
\jclisttab\tx720\aspalpha\aspnum\faauto\ls2\pnrauth1\pnrdate1718329849\pnrstart0\pnrxst1\pnrxst0\pnrxst45\pnrxst0\pnrstop4\pnrstart1\pnrrgb0\pnrrgb0\pnrrgb0\pnrrgb0\pnrrgb0\pnrrgb0\pnrrgb0\pnrrgb0\pnrrgb0\pnrstop9\pnrstart2\pnrnfc23\pnrnfc23
|
|
\pnrnfc23\pnrnfc23\pnrnfc23\pnrnfc23\pnrnfc23\pnrnfc23\pnrnfc23\pnrnfc0\pnrnfc0\pnrnfc3\pnrnfc0\pnrnfc0\pnrnfc0\pnrnfc0\pnrnfc0\pnrnfc0\pnrstop18\pnrstart3\pnrpnbr3\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0
|
|
\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrpnbr0\pnrstop36
|
|
\adjustright\rin0\lin720\itap0 {\lang2057\langfe1031\langnp2057 The URLSeen structure must scale to billions of URLs (i.e. constant memory usage) and must also be distributed
|
|
\par {\listtext\pard\plain\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Crawlers and indexers must be distributed
|
|
\par {\listtext\pard\plain\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Most of the data must be kept on disk
|
|
\par {\listtext\pard\plain\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}The server must be able to save its state on disk and recover after failures
|
|
\par {\listtext\pard\plain\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Exchange of messages should take place in batch operation
|
|
\par {\listtext\pard\plain\lang2057\langfe1031\langnp2057 \hich\af0\dbch\af0\loch\f0 -\tab}Special services, e.g. DNS resolvers, have to be installed to prevent bottlenecks
|
|
\par }\pard \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 {\lang2057\langfe1031\langnp2057
|
|
\par {\*\bkmkstart _Toc26538559}{\listtext\pard\plain\s2 \b\i\f1\fs28\lang1033\langfe1031\langnp1033 \hich\af1\dbch\af0\loch\f1 1.5\tab}}\pard\plain \s2\ql \fi-576\li576\ri0\sb240\sa60\keepn\widctlpar
|
|
\jclisttab\tx576\aspalpha\aspnum\faauto\ls1\ilvl1\outlinelevel1\adjustright\rin0\lin576\itap0 \b\i\f1\fs28\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 {Database Indexer{\*\bkmkend _Toc26538559}
|
|
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057
|
|
A database indexer may consist of a message source that is connected to the messaging mechanism of the database (e.g. triggers). It then reads the contents of changed database fields and passes them alon
|
|
g to the indexer. That way Lucene may be integrated into HSQLDB or even Oracle.
|
|
\par Problem here: transaction safety may make it necessary to operate largely with disk-based structures (e.g. transaction logs).
|
|
\par {\*\bkmkstart _Toc26538560}{\listtext\pard\plain\s2 \b\i\f1\fs28\lang1033\langfe1031\langnp1033 \hich\af1\dbch\af0\loch\f1 1.6\tab}}\pard\plain \s2\ql \fi-576\li576\ri0\sb240\sa60\keepn\widctlpar
|
|
\jclisttab\tx576\aspalpha\aspnum\faauto\ls1\ilvl1\outlinelevel1\adjustright\rin0\lin576\itap0 \b\i\f1\fs28\lang1033\langfe1031\cgrid\langnp1033\langfenp1031 {Single Subsystem Test Scenario{\*\bkmkend _Toc26538560}
|
|
\par }\pard\plain \ql \li0\ri0\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \fs24\lang1031\langfe1031\cgrid\langnp1031\langfenp1031 {\lang2057\langfe1031\langnp2057 For testing purposes
|
|
it may be viable to interface the system under test (SUT) with dummy or utility components. That way, if the document processor subsystem is to be tested, the LuceneStorage may be replaced by a LogStorage, which does nothing but log everything it gets into log files. This lo
|
|
g storage may also be placed between different processing steps.
|
|
\par
|
|
\par }} |