mirror of https://github.com/apache/lucene.git
SOLR-3691: SimplePostTool: Mode for indexing a web page
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1374497 13f79535-47bb-0310-9956-ffa450edef68
parent 1e2fa0ad45
commit 92171e49a5
@@ -79,6 +79,9 @@ Other Changes
 
 * SOLR-2857: The /update/json and /update/csv URLs were restored to aid
   in the migration of existing clients. (yonik)
 
+* SOLR-3691: SimplePostTool: Mode for crawling/posting web pages
+  See http://wiki.apache.org/solr/ExtractingRequestHandler for examples (janhoy)
+
 ==================  4.0.0-BETA ===================
 
File diff suppressed because it is too large
@@ -0,0 +1,49 @@
<html>
<head>
<title>Welcome to Solr</title>
</head>
<body>
<p>
Here is some text
</p>
<div>Here is some text in a div</div>
<div>This has a <a href="http://www.apache.org">link</a>.</div>
<a href="#news">News</a>
<ul class="minitoc">
<li>
<a href="#03+October+2008+-+Solr+Logo+Contest">03 October 2008 - Solr Logo Contest</a>
</li>
<li>
<a href="#15+September+2008+-+Solr+1.3.0+Available">15 September 2008 - Solr 1.3.0 Available</a>
</li>
<li>
<a href="#28+August+2008+-+Lucene%2FSolr+at+ApacheCon+New+Orleans">28 August 2008 - Lucene/Solr at ApacheCon New Orleans</a>
</li>
<li>
<a href="#03+September+2007+-+Lucene+at+ApacheCon+Atlanta">03 September 2007 - Lucene at ApacheCon Atlanta</a>
</li>
<li>
<a href="#06+June+2007%3A+Release+1.2+available">06 June 2007: Release 1.2 available</a>
</li>
<li>
<a href="#17+January+2007%3A+Solr+graduates+from+Incubator">17 January 2007: Solr graduates from Incubator</a>
</li>
<li>
<a href="#22+December+2006%3A+Release+1.1.0+available">22 December 2006: Release 1.1.0 available</a>
</li>
<li>
<a href="#15+August+2006%3A+Solr+at+ApacheCon+US">15 August 2006: Solr at ApacheCon US</a>
</li>
<li>
<a href="#21+April+2006%3A+Solr+at+ApacheCon">21 April 2006: Solr at ApacheCon</a>
</li>
<li>
<a href="#21+February+2006%3A+nightly+builds">21 February 2006: nightly builds</a>
</li>
<li>
<a href="#17+January+2006%3A+Solr+Joins+Apache+Incubator">17 January 2006: Solr Joins Apache Incubator</a>
</li>
</ul>

</body>
</html>

@@ -0,0 +1,3 @@
Example text document

This is a simple example for a plain text document, indexed to Solr

@@ -0,0 +1,237 @@
package org.apache.solr.util;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.util.SimplePostTool.PageFetcher;
import org.apache.solr.util.SimplePostTool.PageFetcherResult;
import org.junit.Before;
import org.junit.Test;

public class SimplePostToolTest extends SolrTestCaseJ4 {
  SimplePostTool t_file, t_file_auto, t_file_rec, t_web, t_test;
  PageFetcher pf;

  @Before
  public void setUp() throws Exception {
    super.setUp();
    // The tool is configured through system properties; each parseArgsAndInit
    // call picks up the properties set so far.
    String[] args = {"-"};
    System.setProperty("data", "files");
    t_file = SimplePostTool.parseArgsAndInit(args);

    System.setProperty("auto", "yes");
    t_file_auto = SimplePostTool.parseArgsAndInit(args);

    System.setProperty("recursive", "yes");
    t_file_rec = SimplePostTool.parseArgsAndInit(args);

    System.setProperty("data", "web");
    t_web = SimplePostTool.parseArgsAndInit(args);

    System.setProperty("params", "param1=foo&param2=bar");
    t_test = SimplePostTool.parseArgsAndInit(args);

    pf = new MockPageFetcher();
    SimplePostTool.pageFetcher = pf;
    SimplePostTool.mockMode = true;
  }

  @Test
  public void testParseArgsAndInit() {
    assertEquals(false, t_file.auto);
    assertEquals(true, t_file_auto.auto);
    assertEquals(0, t_file_auto.recursive);
    assertEquals(999, t_file_rec.recursive);
    assertEquals(true, t_file.commit);
    assertEquals(false, t_file.optimize);
    assertEquals(null, t_file.out);

    assertEquals(1, t_web.recursive);
    assertEquals(10, t_web.delay);

    assertNotNull(t_test.solrUrl);
  }

  @Test
  public void testNormalizeUrlEnding() {
    assertEquals("http://example.com", SimplePostTool.normalizeUrlEnding("http://example.com/"));
    assertEquals("http://example.com", SimplePostTool.normalizeUrlEnding("http://example.com/#foo?bar=baz"));
    assertEquals("http://example.com/index.html", SimplePostTool.normalizeUrlEnding("http://example.com/index.html#hello"));
  }

  @Test
  public void testComputeFullUrl() throws MalformedURLException {
    assertEquals("http://example.com/index.html", t_web.computeFullUrl(new URL("http://example.com/"), "/index.html"));
    assertEquals("http://example.com/index.html", t_web.computeFullUrl(new URL("http://example.com/foo/bar/"), "/index.html"));
    assertEquals("http://example.com/fil.html", t_web.computeFullUrl(new URL("http://example.com/foo.htm?baz#hello"), "fil.html"));
    // TODO: How to know what is the base if URL path ends with "foo"??
    // assertEquals("http://example.com/fil.html", t_web.computeFullUrl(new URL("http://example.com/foo?baz#hello"), "fil.html"));
    assertEquals(null, t_web.computeFullUrl(new URL("http://example.com/"), "fil.jpg"));
    assertEquals(null, t_web.computeFullUrl(new URL("http://example.com/"), "mailto:hello@foo.bar"));
    assertEquals(null, t_web.computeFullUrl(new URL("http://example.com/"), "ftp://server/file"));
  }

  @Test
  public void testTypeSupported() {
    assertTrue(t_web.typeSupported("application/pdf"));
    assertTrue(t_web.typeSupported("text/xml"));
    assertFalse(t_web.typeSupported("text/foo"));

    t_web.fileTypes = "doc,xls,ppt";
    t_web.globFileFilter = t_web.getFileFilterFromFileTypes(t_web.fileTypes);
    assertFalse(t_web.typeSupported("application/pdf"));
    assertTrue(t_web.typeSupported("application/msword"));
  }

  @Test
  public void testIsOn() {
    assertTrue(SimplePostTool.isOn("true"));
    assertTrue(SimplePostTool.isOn("1"));
    assertFalse(SimplePostTool.isOn("off"));
  }

  @Test
  public void testAppendParam() {
    assertEquals("http://example.com?foo=bar", SimplePostTool.appendParam("http://example.com", "foo=bar"));
    assertEquals("http://example.com/?a=b&foo=bar", SimplePostTool.appendParam("http://example.com/?a=b", "foo=bar"));
  }

  @Test
  public void testAppendUrlPath() throws MalformedURLException {
    assertEquals(new URL("http://example.com/a?foo=bar"), SimplePostTool.appendUrlPath(new URL("http://example.com?foo=bar"), "/a"));
  }

  @Test
  public void testGuessType() {
    File f = new File("foo.doc");
    assertEquals("application/msword", SimplePostTool.guessType(f));
    f = new File("foobar");
    assertEquals(null, SimplePostTool.guessType(f));
  }

  @Test
  public void testDoFilesMode() {
    t_file_auto.recursive = 0;
    File dir = getFile("exampledocs");
    int num = t_file_auto.postFiles(new File[] {dir}, 0, null, null);
    assertEquals(2, num);
  }

  @Test
  public void testDoWebMode() {
    // Uses mock pageFetcher
    t_web.delay = 0;
    t_web.recursive = 5;
    int num = t_web.postWebPages(new String[] {"http://example.com/#removeme"}, 0, null);
    assertEquals(5, num);

    t_web.recursive = 1;
    num = t_web.postWebPages(new String[] {"http://example.com/"}, 0, null);
    assertEquals(3, num);

    // Without respecting robots.txt
    SimplePostTool.pageFetcher.robotsCache.clear();
    t_web.recursive = 5;
    num = t_web.postWebPages(new String[] {"http://example.com/#removeme"}, 0, null);
    assertEquals(6, num);
  }

  @Test
  public void testRobotsExclusion() throws MalformedURLException {
    assertFalse(SimplePostTool.pageFetcher.isDisallowedByRobots(new URL("http://example.com/")));
    assertTrue(SimplePostTool.pageFetcher.isDisallowedByRobots(new URL("http://example.com/disallowed")));
    assertTrue("There should be two entries parsed from robots.txt",
        SimplePostTool.pageFetcher.robotsCache.get("example.com").size() == 2);
  }
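
  /**
   * Fetcher stub: serves a small in-memory site graph (page bodies, their
   * outgoing links, and a parsed robots.txt for example.com) so the web-mode
   * tests above run without any network access.
   */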
  class MockPageFetcher extends PageFetcher {
    HashMap<String,String> htmlMap = new HashMap<String,String>();
    HashMap<String,Set<URL>> linkMap = new HashMap<String,Set<URL>>();

    public MockPageFetcher() throws IOException {
      (new SimplePostTool()).super();
      htmlMap.put("http://example.com", "<html><body><a href=\"http://example.com/page1\">page1</a><a href=\"http://example.com/page2\">page2</a></body></html>");
      htmlMap.put("http://example.com/index.html", "<html><body><a href=\"http://example.com/page1\">page1</a><a href=\"http://example.com/page2\">page2</a></body></html>");
      htmlMap.put("http://example.com/page1", "<html><body><a href=\"http://example.com/page1/foo\"></body></html>");
      htmlMap.put("http://example.com/page1/foo", "<html><body><a href=\"http://example.com/page1/foo/bar\"></body></html>");
      htmlMap.put("http://example.com/page1/foo/bar", "<html><body><a href=\"http://example.com/page1\"></body></html>");
      htmlMap.put("http://example.com/page2", "<html><body><a href=\"http://example.com/\"><a href=\"http://example.com/disallowed\"/></body></html>");
      htmlMap.put("http://example.com/disallowed", "<html><body><a href=\"http://example.com/\"></body></html>");

      Set<URL> s = new HashSet<URL>();
      s.add(new URL("http://example.com/page1"));
      s.add(new URL("http://example.com/page2"));
      linkMap.put("http://example.com", s);
      linkMap.put("http://example.com/index.html", s);
      s = new HashSet<URL>();
      s.add(new URL("http://example.com/page1/foo"));
      linkMap.put("http://example.com/page1", s);
      s = new HashSet<URL>();
      s.add(new URL("http://example.com/page1/foo/bar"));
      linkMap.put("http://example.com/page1/foo", s);
      s = new HashSet<URL>();
      s.add(new URL("http://example.com/disallowed"));
      linkMap.put("http://example.com/page2", s);

      // Simulate a robots.txt file with comments and a few disallows
      StringBuilder sb = new StringBuilder();
      sb.append("# Comments appear after the \"#\" symbol at the start of a line, or after a directive\n");
      sb.append("User-agent: * # match all bots\n");
      sb.append("Disallow: # This is void\n");
      sb.append("Disallow: /disallow # Disallow this path\n");
      sb.append("Disallow: /nonexistingpath # Disallow this path\n");
      this.robotsCache.put("example.com", SimplePostTool.pageFetcher.
          parseRobotsTxt(new ByteArrayInputStream(sb.toString().getBytes("UTF-8"))));
    }

    @Override
    public PageFetcherResult readPageFromUrl(URL u) {
      PageFetcherResult res = (new SimplePostTool()).new PageFetcherResult();
      if (isDisallowedByRobots(u)) {
        res.httpStatus = 403;
        return res;
      }
      res.httpStatus = 200;
      res.contentType = "text/html";
      try {
        res.content = htmlMap.get(u.toString()).getBytes("UTF-8");
      } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e); // preserve the cause instead of swallowing it
      }
      return res;
    }

    @Override
    public Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) {
      Set<URL> s = linkMap.get(SimplePostTool.normalizeUrlEnding(u.toString()));
      if (s == null)
        s = new HashSet<URL>();
      return s;
    }
  }
}